In [8]:
#pip install pandas dask scikit-learn numpy

In [9]:
#BO
import os  
import pandas as pd
import dask.dataframe as dd
from dask import delayed
from dask.distributed import Client
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import gc

# --- FUNCIONES DE CARGA Y PREPROCESADO ---

def inicialize_dask():
    client = Client(memory_limit='8GB', processes=False)
    print(client)

def cargar_datos():
    inicialize_dask()
    data_path = '../data'
    files = os.listdir(data_path)
    
    selected_files = []
    for file in files:
        if file.endswith('.csv'):
            parts = file.split('_')
            if len(parts) < 3:
                continue
            year = parts[0]
            #if year not in ['2024', '2023', '2022']:
            #   continue
            if year not in ['2024','2023']:
                continue
            month = parts[1]
            #if year == '2024' and month not in ['05']:
            #   continue
            if year == '2023' and month not in ['06','07','08','09','10','11','12']:
               continue
            if year == '2024' and month not in ['06','07','08','09','10','11','12']:
               continue
            #if year == '2023' and month not in ['01', '11']:
            #    continue
            #if year == '2022' and month not in ['06']:
            #    continue
            selected_files.append(os.path.join(data_path, file))
    
    @delayed
    def process_file(file_path):
        try:
            df = pd.read_csv(file_path, low_memory=True, dtype=str, 
             usecols=['station_id', 'last_reported', 'num_bikes_available',
             'num_docks_available', 'num_bikes_available_types.mechanical', 'num_bikes_available_types.ebike', 
             'is_installed', 'is_renting', 'is_returning', 'is_charging_station'],
            skiprows=lambda i: i > 0 and i % 3 != 0)
            df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')
            df['year'] = df['last_reported'].dt.year
            df['month'] = df['last_reported'].dt.month
            df['day'] = df['last_reported'].dt.day
            df['hour'] = df['last_reported'].dt.hour

            cols_to_drop = [col for col in ['traffic', 'V1'] if col in df.columns]
            if cols_to_drop:
                df.drop(columns=cols_to_drop, inplace=True)

            numeric_cols = ['num_bikes_available', 'num_docks_available', 
                            'num_bikes_available_types.mechanical', 'num_bikes_available_types.ebike']
            for col in numeric_cols:
                if col in df.columns:
                    df[col] = df[col].astype(float)
            
            df = df.dropna(how='any')
            return df
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            return pd.DataFrame()

    delayed_dfs = [process_file(file_path) for file_path in selected_files]
    
    if not delayed_dfs:
        return None

    ddf = dd.from_delayed(delayed_dfs)
    df_meta = pd.read_csv('../Informacio_Estacions_Bicing_2025.csv',
                          usecols=['station_id', 'lat', 'lon', 'capacity'],
                          low_memory=False)
    ddf['station_id'] = ddf['station_id'].astype('Int64')
    df_meta['station_id'] = df_meta['station_id'].astype('Int64')

    for col in ['is_installed', 'is_renting', 'is_returning']:
        if col in ddf.columns:
            ddf[col] = ddf[col].astype(str).replace({'nan': '0', '<NA>': '0'}).astype(int)
    
    if 'is_charging_station' in ddf.columns:
        ddf['is_charging_station'] = ddf['is_charging_station'].astype(str).map({'TRUE': 1, 'FALSE': 0}).fillna(0).astype(int)
    
    ddf = ddf.dropna(how='any')
    ddf = ddf.merge(df_meta, on='station_id', how='inner')
    ddf['sum_capacity'] = ddf['num_bikes_available'] + ddf['num_docks_available']
    df_final = ddf.compute()

    median_capacity = df_final.groupby('station_id')['sum_capacity'].median()
    df_final['capacity'] = df_final['capacity'].fillna(df_final['station_id'].map(median_capacity))
    df_final['num_docks_available'] = df_final['num_docks_available'].clip(lower=0, upper=df_final['capacity'])
    df_final['target'] = df_final['num_docks_available'] / df_final['capacity']
    
    aggregated_df = df_final.groupby(['station_id', 'year', 'month', 'day', 'hour']).agg(
        num_bikes_available=('num_bikes_available', 'mean'),
        num_docks_available=('num_docks_available', 'mean'),
        num_mechanical=('num_bikes_available_types.mechanical', 'median'),
        num_ebike=('num_bikes_available_types.ebike', 'median'),
        is_renting=('is_renting', 'mean'),
        is_returning=('is_returning', 'mean'),
        target=('target', 'mean'),
        lat=('lat', 'first'),
        lon=('lon', 'first'),
        capacity=('capacity', 'first')
    ).reset_index()

    id_df = pd.read_csv('../data/metadata_sample_submission_2025.csv')
    station_list = pd.unique(id_df['station_id'])
    aggregated_df = aggregated_df[aggregated_df['station_id'].isin(station_list)]
    
    return aggregated_df

def crear_campos_optimized(df):
    df = df.sort_values(by=['station_id', 'year', 'month', 'day', 'hour']).reset_index(drop=True)

    for lag in range(1, 5):
        df[f'ctx-{lag}'] = df.groupby('station_id')['target'].shift(lag)

    mask = df.groupby('station_id').cumcount() >= 4
    df = df[mask]
    df = df.iloc[::5].reset_index(drop=True)

    return df

def day_categorization_bcn(df):
    holiday_set = set([ "2020-01-01", "2020-01-06", "2020-04-10", "2020-04-13", "2020-05-01", "2020-06-24", 
                        "2020-09-11", "2020-09-24", "2020-10-12", "2020-11-01", "2020-12-06", "2020-12-08", 
                        "2020-12-25", "2020-12-26", "2021-01-01", "2021-01-06", "2021-04-02", "2021-04-05", 
                        "2021-05-01", "2021-06-24", "2021-09-11", "2021-09-24", "2021-10-12", "2021-11-01", 
                        "2021-12-06", "2021-12-08", "2021-12-25", "2021-12-26", "2022-01-01", "2022-01-06", 
                        "2022-04-15", "2022-04-18", "2022-05-01", "2022-06-24", "2022-09-11", "2022-09-24", 
                        "2022-10-12", "2022-11-01", "2022-12-06", "2022-12-08", "2022-12-25", "2022-12-26", 
                        "2023-01-01", "2023-01-06", "2023-04-07", "2023-04-10", "2023-05-01", "2023-06-24", 
                        "2023-09-11", "2023-09-24", "2023-10-12", "2023-11-01", "2023-12-06", "2023-12-08", 
                        "2023-12-25", "2023-12-26", "2024-01-01", "2024-01-06", "2024-03-29", "2024-04-01", 
                        "2024-05-01", "2024-06-24", "2024-09-11", "2024-09-24", "2024-10-12", "2024-11-01", 
                        "2024-12-06", "2024-12-08", "2024-12-25", "2024-12-26", "2025-01-01", "2025-01-06", 
                        "2025-04-18", "2025-04-21", "2025-05-01", "2025-06-24", "2025-09-11", "2025-09-24", 
                        "2025-10-12", "2025-11-01", "2025-12-06", "2025-12-08", "2025-12-25", "2025-12-26"])
    
    df['date'] = pd.to_datetime(df[['year', 'month', 'day']])

    df['day_type'] = 0  # Default: workday
    df.loc[df['date'].dt.weekday >= 5, 'day_type'] = 1  # Weekend
    df.loc[df['date'].astype(str).isin(holiday_set), 'day_type'] = 2  # Holiday

    return df.drop(columns=['date'])



In [10]:
# --- PREPARACIÓN Y PREDICCIÓN ---

# Cargar y preparar los datos una vez
df_merge = cargar_datos()
df_merge = crear_campos_optimized(df_merge)
df_merge_final = day_categorization_bcn(df_merge)

X = df_merge_final[['station_id', 'month', 'day', 'hour', 'ctx-1', 'ctx-2', 'ctx-3', 'ctx-4', 'lat', 'lon', 'day_type']]
y = df_merge_final['target']

def neural_network_model(X, y, test_size=0.2):
    # Filtrar datos de validación (junio a diciembre)
    df_validation = X[(X['month'] >= 6) & (X['month'] <= 12)]
    y_validation = y.loc[df_validation.index]
    
    # Separar los datos de entrenamiento (eliminando los de validación)
    #df_train = X[~X.index.isin(df_validation.index)] no ho trec de moment
    df_train = X[X.index.isin(df_validation.index)] 
    y_train = y.loc[df_train.index]

    # Especificación de las columnas de entrada
    percent_features = ['ctx-1', 'ctx-2', 'ctx-3', 'ctx-4']
    bounded_features = ['month', 'day', 'hour']
    continuous_features = ['lat', 'lon']
    categorical_features = ['station_id', 'day_type']

    # Asegurar que las columnas necesarias están en X_train
    missing_cols = [col for col in percent_features + bounded_features + continuous_features + categorical_features if col not in df_train.columns]
    if missing_cols:
        raise ValueError(f"Missing columns in X: {missing_cols}")

    # Definir las matrices X_train_data y X_validation_data como DataFrames
    X_train_data = df_train[percent_features + bounded_features + continuous_features + categorical_features]
    X_validation_data = df_validation[percent_features + bounded_features + continuous_features + categorical_features]

    # Escalado de las etiquetas (target)
    y_train_data = np.array(y_train).reshape(-1, 1)
    y_validation_data = np.array(y_validation).reshape(-1, 1)

    preprocessor = ColumnTransformer(
        transformers=[('percent', MinMaxScaler(), percent_features),
                      ('bounded', MinMaxScaler(), bounded_features),
                      ('continuous', StandardScaler(), continuous_features),
                      ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)]
    )

    # Escalar las etiquetas (target)
    y_scaler = MinMaxScaler()
    y_train_scaled = y_scaler.fit_transform(y_train_data)
    y_validation_scaled = y_scaler.transform(y_validation_data)

    # Definir el modelo
    model = MLPRegressor(
        hidden_layer_sizes=(128, 64),
        activation='relu',
        solver='adam',
        alpha=0.001,
        random_state=42,
        max_iter=1000,
        early_stopping=True,
        validation_fraction=0.1,
        batch_size=128,
        n_iter_no_change=10,
        verbose=1
    )

    # Crear el pipeline con preprocesamiento y modelo
    pipeline = Pipeline([('preprocess', preprocessor), ('regressor', model)])
    
    # Ajustar el modelo
    pipeline.fit(X_train_data, y_train_scaled.ravel())

    # Realizar predicciones en el conjunto de validación
    y_pred_scaled = pipeline.predict(X_validation_data)
    y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

    # Evaluar el modelo
    r2 = r2_score(y_validation_data, y_pred)
    mse = mean_squared_error(y_validation_data, y_pred)
    mae = mean_absolute_error(y_validation_data, y_pred)

    return r2, mse, mae, pipeline, X_validation_data, y_validation_data

# Resultados del modelo
r2_nn, mse_nn, mae_nn, nn_model, _, _ = neural_network_model(X, y, 0.3)

# Verificar valores antes de construir el DataFrame
print(f"R² NN: {r2_nn}, MSE NN: {mse_nn}, MAE NN: {mae_nn}")

metrics_df = pd.DataFrame({
    'Métrica': ['R²', 'MSE', 'MAE'],
    'Neural Network': [r2_nn, mse_nn, mae_nn]
})

print(metrics_df)


Perhaps you already have a cluster running?
Hosting the HTTP server on port 54043 instead


<Client: 'inproc://192.168.1.37/23936/18' processes=1 threads=12, memory=7.45 GiB>


  df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')
You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map function that you are using.
  Before: .map(func)
  After:  .map(func, meta=('is_charging_station', 'float64'))

  df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')
  df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')
  df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')
  df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')
  df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')
  df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')


Iteration 1, loss = 0.00787872
Validation score: 0.847848
Iteration 2, loss = 0.00608672
Validation score: 0.854368
Iteration 3, loss = 0.00584731
Validation score: 0.857552
Iteration 4, loss = 0.00575164
Validation score: 0.857557
Iteration 5, loss = 0.00570788
Validation score: 0.859082
Iteration 6, loss = 0.00567058
Validation score: 0.860029
Iteration 7, loss = 0.00564978
Validation score: 0.860486
Iteration 8, loss = 0.00563027
Validation score: 0.859034
Iteration 9, loss = 0.00561900
Validation score: 0.856939
Iteration 10, loss = 0.00561563
Validation score: 0.861289
Iteration 11, loss = 0.00560471
Validation score: 0.860484
Iteration 12, loss = 0.00560343
Validation score: 0.861179
Iteration 13, loss = 0.00559218
Validation score: 0.861197
Iteration 14, loss = 0.00558468
Validation score: 0.860910
Iteration 15, loss = 0.00558139
Validation score: 0.862348
Iteration 16, loss = 0.00558125
Validation score: 0.862188
Iteration 17, loss = 0.00556700
Validation score: 0.862047
Iterat



Iteration 23, loss = 0.00554830
Validation score: 0.859141
Iteration 24, loss = 0.00554975
Validation score: 0.862884
Iteration 25, loss = 0.00554332
Validation score: 0.862831
Iteration 26, loss = 0.00554221
Validation score: 0.862218
Iteration 27, loss = 0.00554259
Validation score: 0.863102
Iteration 28, loss = 0.00553917
Validation score: 0.861827
Iteration 29, loss = 0.00553867
Validation score: 0.858918
Iteration 30, loss = 0.00553707
Validation score: 0.861466
Iteration 31, loss = 0.00553084
Validation score: 0.863127
Iteration 32, loss = 0.00552924
Validation score: 0.862850
Iteration 33, loss = 0.00552581
Validation score: 0.858674
Iteration 34, loss = 0.00552667
Validation score: 0.862932
Iteration 35, loss = 0.00551876
Validation score: 0.862615
Iteration 36, loss = 0.00552626
Validation score: 0.863102
Iteration 37, loss = 0.00552458
Validation score: 0.862937
Iteration 38, loss = 0.00552261
Validation score: 0.863013
Validation score did not improve more than tol=0.000100 

In [12]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import gc

# --- CARGA PARCIAL DEL DATASET, INCLUYENDO 'index' ---
use_cols = ['index', 'station_id', 'month', 'day', 'hour', 'ctx-4', 'ctx-3', 'ctx-2', 'ctx-1']
df = pd.read_csv('../data/metadata_sample_submission_2025.csv', usecols=use_cols)

df['year'] = 2024  # Necesario para `day_categorization_bcn`

# --- IMPUTACIÓN DE LAT/LON ---
df_merge_final = df_merge_final[['station_id', 'lat', 'lon']].drop_duplicates()
df = df.merge(df_merge_final, on='station_id', how='left')

# --- CLASIFICACIÓN DEL DÍA ---
df = day_categorization_bcn(df)

# --- CONVERSIÓN DE CATEGÓRICAS (si hace falta) ---
if df['day_type'].dtype == 'object':
    df['day_type'] = LabelEncoder().fit_transform(df['day_type'])

# --- PREDICCIÓN POR LOTES ---
features = ['station_id', 'month', 'day', 'hour', 'ctx-4', 'ctx-3', 'ctx-2', 'ctx-1', 'lat', 'lon', 'day_type']
X_predict = df[features]

batch_size = 5000
predictions = []

for start in range(0, len(X_predict), batch_size):
    end = start + batch_size
    batch = X_predict.iloc[start:end]
    preds = nn_model.predict(batch)
    predictions.extend(preds)
    del batch
    gc.collect()

# --- CREACIÓN DEL DF FINAL PARA ENTREGABLE ---
df['percentage_docks_available'] = predictions
df_final = df[['index', 'percentage_docks_available']]
df_final.to_csv('predictions.csv', index=False)

print("✅ Archivo 'predictions.csv' creado correctamente.")


✅ Archivo 'predictions.csv' creado correctamente.
