In [1]:
import os
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import dask.dataframe as dd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin


def cargar_datos():
    data_path = '../data'
    files = os.listdir(data_path)
    
    df_list = []

    for file in files:
        if file.endswith('.csv'):  
            file_path = os.path.join(data_path, file)

            parts = file.split('_')

            if len(parts) < 3:
                continue
            
            year = parts[0]  
            month_name = parts[2]  

            if year not in ['2024']:
            # if year not in ['2024']:

                continue

            #if month_name not in ['Gener', 'Febrer', 'Marc', 'Abril', 'Maig']:
            if month_name not in ['Maig']:

                continue

            try:
                #df = pd.read_csv(file_path, low_memory=True, dtype=str, skiprows=lambda i: i % 1000 != 0)
                df = dd.read_csv(file_path, low_memory=True, dtype=str)


                df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')

                df['year'] = df['last_reported'].dt.year
                df['month'] = df['last_reported'].dt.month
                df['day'] = df['last_reported'].dt.day
                df['hour'] = df['last_reported'].dt.hour

                if 'traffic' in df.columns:
                    df.drop(columns=['traffic'], inplace=True)
                
                if 'V1' in df.columns:
                    df.drop(columns=['V1'], inplace=True)

                # Convertir columnas a float
                for col in ['num_bikes_available','num_docks_available','num_bikes_available_types.mechanical', 'num_bikes_available_types.ebike']:
                    df[col] = df[col].astype('float64')

                df_list.append(df)

            except Exception:
                continue

    merged_df = pd.concat(df_list, ignore_index=True) if df_list else None

    if merged_df is not None:
        df_2 = pd.read_csv('../Informacio_Estacions_Bicing_2025.csv', usecols=['station_id', 'lat', 'lon','capacity'], low_memory=False)

        merged_df['station_id'] = merged_df['station_id'].astype('Int64')
        df_2['station_id'] = df_2['station_id'].astype('Int64')
        
        merged_df = merged_df.merge(df_2, on='station_id', how='inner')

    # Crear columna sum_capacity
    merged_df['sum_capacity'] = merged_df['num_bikes_available'] + merged_df['num_docks_available']

    # Calcular la mediana de sum_capacity por estación
    median_capacity = merged_df.groupby('station_id')['sum_capacity'].median()

    def impute_capacity(row):
        return median_capacity[row['station_id']] if pd.isna(row['capacity']) else row['capacity']

    merged_df['capacity'] = merged_df.apply(impute_capacity, axis=1)
    # merged_df['diff_capacity_available'] = merged_df['capacity'] - (merged_df['num_bikes_available'] + merged_df['num_docks_available'])

    # Asegurar límites de num_docks_available
    merged_df['num_docks_available'] = merged_df.apply(
        lambda row: min(max(row['num_docks_available'], 0), row['capacity']),
        axis=1
    )

    # Crear columna target (% de bicis disponibles)
    merged_df['target'] = merged_df['num_docks_available'] / merged_df['capacity']

    # **Agrupar a nivel de hora por estación**
    aggregated_df = merged_df.groupby(['station_id', 'year', 'month', 'day', 'hour']).agg(
        num_bikes_available=('num_bikes_available', 'mean'),
        num_docks_available=('num_docks_available', 'mean'),
        num_mechanical=('num_bikes_available_types.mechanical', 'median'),
        num_ebike=('num_bikes_available_types.ebike', 'median'),
        target=('target', 'mean'),
        lat=('lat', 'first'),
        lon=('lon', 'first'),
        capacity=('capacity', 'first')
    ).reset_index()

    id = pd.read_csv('../data/metadata_sample_submission_2025.csv')

    llista_stations = pd.unique(id['station_id'])

    aggregated_df = aggregated_df[aggregated_df['station_id'].isin(llista_stations)]

    return aggregated_df

In [2]:
import os
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import dask.dataframe as dd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin

def cargar_datos():
    data_path = '../data'
    files = os.listdir(data_path)
    
    df_list = []

    for file in files:
        if file.endswith('.csv'):  
            file_path = os.path.join(data_path, file)

            parts = file.split('_')

            if len(parts) < 3:
                continue
            
            year = parts[0]  
            month_name = parts[2]  

            if year not in ['2024']:
                continue

            if month_name not in ['Gener']:
                continue
            

            try:
                df = dd.read_csv(file_path, low_memory=True, dtype=str)

                df['last_reported'] = dd.to_datetime(df['last_reported'], unit='s', errors='coerce')

                df['year'] = df['last_reported'].dt.year
                df['month'] = df['last_reported'].dt.month
                df['day'] = df['last_reported'].dt.day
                df['hour'] = df['last_reported'].dt.hour

                if 'traffic' in df.columns:
                    df = df.drop(columns=['traffic'])
                
                if 'V1' in df.columns:
                    df = df.drop(columns=['V1'])

                for col in ['num_bikes_available','num_docks_available','num_bikes_available_types.mechanical', 'num_bikes_available_types.ebike']:
                    df[col] = df[col].astype('float64')

                df_list.append(df)

            except Exception:
                continue

    if df_list:
        merged_df = dd.concat(df_list, axis=0, interleave_partitions=True)
    else:
        merged_df = None

    if merged_df is not None:
        df_2 = pd.read_csv('../Informacio_Estacions_Bicing_2025.csv', usecols=['station_id', 'lat', 'lon','capacity'], low_memory=False)

        merged_df['station_id'] = merged_df['station_id'].astype('Int64')
        df_2['station_id'] = df_2['station_id'].astype('Int64')
        
        merged_df = merged_df.merge(df_2, on='station_id', how='inner')

        merged_df['sum_capacity'] = merged_df['num_bikes_available'] + merged_df['num_docks_available']

        median_capacity = merged_df.groupby('station_id')['sum_capacity'].median().compute()

        def impute_capacity(row):
            return median_capacity[row['station_id']] if pd.isna(row['capacity']) else row['capacity']

        merged_df['capacity'] = merged_df.apply(impute_capacity, axis=1, meta=('capacity', 'float64'))

        merged_df['num_docks_available'] = merged_df.apply(
            lambda row: min(max(row['num_docks_available'], 0), row['capacity']),
            axis=1, meta=('num_docks_available', 'float64')
        )

        merged_df['target'] = merged_df['num_docks_available'] / merged_df['capacity']

        aggregated_df = merged_df.groupby(['station_id', 'year', 'month', 'day', 'hour']).agg(
            num_bikes_available=('num_bikes_available', 'mean'),
            num_docks_available=('num_docks_available', 'mean'),
            num_mechanical=('num_bikes_available_types.mechanical', 'median'),
            num_ebike=('num_bikes_available_types.ebike', 'median'),
            target=('target', 'mean'),
            lat=('lat', 'first'),
            lon=('lon', 'first'),
            capacity=('capacity', 'first')
        ).reset_index().compute()

        id = pd.read_csv('../data/metadata_sample_submission_2025.csv')

        llista_stations = pd.unique(id['station_id'])

        aggregated_df = aggregated_df[aggregated_df['station_id'].isin(llista_stations)]

        return aggregated_df
    else:
        return None

In [4]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class CyclicalFeaturesTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer para crear variables cíclicas (seno y coseno) a partir de columnas temporales.
    Por defecto, transforma: month, day, hour y minute.
    La columna 'year' se deja sin transformar.
    """
    def __init__(self, columns=None):
        """
        columns: diccionario que define para cada columna el valor máximo (p.ej. {"month": 12, "day": 31, "hour": 24, "minute": 60})
        Si se omite, se usan valores por defecto.
        """
        if columns is None:
            self.columns = {"month": 12, "day": 31, "hour": 24, "minute": 60}
        else:
            self.columns = columns

    def fit(self, X, y=None):
        # No es necesario aprender nada de X
        return self

    def transform(self, X):
        X_ = X.copy()
        for col, max_val in self.columns.items():
            if col in X_.columns:
                X_[f'{col}_sin'] = np.sin(2 * np.pi * X_[col].astype(float) / max_val)
                X_[f'{col}_cos'] = np.cos(2 * np.pi * X_[col].astype(float) / max_val)
        return X_

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder


status_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='NOT_IN_SERVICE')),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore', categories=[['IN_SERVICE', 'MAINTENANCE', 'NOT_IN_SERVICE', 'PLANNED']])), # One-Hot Encoding
])

is_charging_station_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse_output=False, drop='if_binary')),
])

ordinal_pipeline = Pipeline([
    ('encoder', OrdinalEncoder())
])

cyclic_pipeline = Pipeline([
    ('cyclic_transformer', CyclicalFeaturesTransformer())
])

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('status', status_pipeline, ['status']),
        ('is_charging_station', is_charging_station_pipeline, ['is_charging_station']),
        ('ordinal', ordinal_pipeline, ['month','day','hour']),
        ('cyclic', cyclic_pipeline, ['month','day','hour','minute']),
        ('year', 'passthrough', ['year']),
        # CREAR COLUMNA FESTIVO (0/1) A PARTIR DE 'day' y 'month'
    ],
    remainder='passthrough'
)

In [None]:
df_merge = cargar_datos()

# transformed_data = preprocessor.fit_transform(df_merge)

  return get_meta_library(args[0]).to_datetime(*args, **kwargs)
  return get_meta_library(args[0]).to_datetime(*args, **kwargs)
  return get_meta_library(args[0]).to_datetime(*args, **kwargs)
  return get_meta_library(args[0]).to_datetime(*args, **kwargs)
  return get_meta_library(args[0]).to_datetime(*args, **kwargs)
  return get_meta_library(args[0]).to_datetime(*args, **kwargs)
  return get_meta_library(args[0]).to_datetime(*args, **kwargs)
  return get_meta_library(args[0]).to_datetime(*args, **kwargs)
  return get_meta_library(args[0]).to_datetime(*args, **kwargs)
  return get_meta_library(args[0]).to_datetime(*args, **kwargs)
  return get_meta_library(args[0]).to_datetime(*args, **kwargs)
  return get_meta_library(args[0]).to_datetime(*args, **kwargs)
  return get_meta_library(args[0]).to_datetime(*args, **kwargs)
  return get_meta_library(args[0]).to_datetime(*args, **kwargs)
  return get_meta_library(args[0]).to_datetime(*args, **kwargs)
  return get_meta_library(args[0]).to_da

In [6]:
df_capacity = df_merge[['capacity', 'num_bikes_available', 'num_mechanical', 'num_ebike', 'num_docks_available']].copy()

df_capacity['diff_bike_types'] = df_capacity['num_mechanical'] + df_capacity['num_ebike'] - df_capacity['num_bikes_available']
df_capacity['diff_capacity_available'] = df_capacity['capacity'] - (df_capacity['num_bikes_available'] + df_capacity['num_docks_available'])

# Dime el total de diff_capacity_available / entre los diff_capacity_available que no son iguales a 0
df_capacity[df_capacity['diff_capacity_available'] == 0].shape[0] / df_capacity.shape[0]

0.27085833976558277

In [None]:
df_merge.sort_values(by=['station_id','year', 'month', 'day', 'hour'], ascending=True, inplace=True)
df_merge.head()  


Unnamed: 0,station_id,year,month,day,hour,num_bikes_available,num_docks_available,num_mechanical,num_ebike,target,lat,lon,capacity
0,1,2024.0,4.0,30.0,21.0,20.0,25.0,19.0,1.0,0.543478,41.397978,2.180107,46
1,1,2024.0,4.0,30.0,22.0,25.25,19.75,20.0,6.0,0.429348,41.397978,2.180107,46
2,1,2024.0,4.0,30.0,23.0,32.333333,12.666667,24.5,9.0,0.275362,41.397978,2.180107,46
3,1,2024.0,5.0,1.0,0.0,25.916667,19.083333,16.5,6.5,0.414855,41.397978,2.180107,46
4,1,2024.0,5.0,1.0,1.0,18.416667,26.583333,15.0,4.0,0.577899,41.397978,2.180107,46


Unnamed: 0,station_id,year,month,day,hour,num_bikes_available,num_docks_available,num_mechanical,num_ebike,target,lat,lon,capacity
4,1,2024.0,5.0,1.0,1.0,18.416667,26.583333,15.0,4.0,0.577899,41.397978,2.180107,46
5,1,2024.0,5.0,1.0,2.0,16.75,28.25,14.0,2.0,0.61413,41.397978,2.180107,46
6,1,2024.0,5.0,1.0,3.0,17.0,28.0,16.0,1.0,0.608696,41.397978,2.180107,46
7,1,2024.0,5.0,1.0,4.0,17.166667,27.833333,17.0,0.0,0.605072,41.397978,2.180107,46
8,1,2024.0,5.0,1.0,5.0,18.666667,26.333333,19.0,0.0,0.572464,41.397978,2.180107,46


In [None]:
#suposant que comencem per l'inici dun dia(elimino les 4 hores del dia anterior del mes anterior, ja que per defecte surten)
df_merge = df_merge.iloc[4:]
df_merge.head()  

#funcio per afegir les 4h anteriors
def crear_campos_lags(df):
    resultados = []
    # Agrupamos por station_id
    for station, grupo in df.groupby('station_id'):
        # Aseguramos el orden cronológico
        grupo = grupo.sort_values(by=['year', 'month', 'day', 'hour']).reset_index(drop=True)
        n = len(grupo)
        # Iteramos empezando en el índice 4 y avanzamos de 5 en 5
        for i in range(4, n, 5):
            if i - 4 >= 0:
                fila = grupo.loc[i].copy()
                # Agregamos los valores de 'target' de las 4 horas previas
                fila['ctx-4'] = grupo.loc[i - 4, 'target']
                fila['ctx-3'] = grupo.loc[i - 3, 'target']
                fila['ctx-2'] = grupo.loc[i - 2, 'target']
                fila['ctx-1'] = grupo.loc[i - 1, 'target']
                
                resultados.append(fila)
    return pd.DataFrame(resultados)

# Aplicamos la función al DataFrame df_merge
df_merge_final = crear_campos_lags(df_merge)

print(df_merge_final.head())  # Ver las primeras filas del nuevo DataFrame


    station_id    year  month  day  hour  num_bikes_available  \
4          1.0  2024.0    1.0  1.0   2.0            14.500000   
9          1.0  2024.0    1.0  1.0   7.0            13.000000   
14         1.0  2024.0    1.0  1.0  12.0             4.666667   
19         1.0  2024.0    1.0  1.0  17.0             5.666667   
24         1.0  2024.0    1.0  1.0  22.0            10.000000   

    num_docks_available  num_mechanical  num_ebike    target        lat  \
4             30.500000             5.5        9.5  0.663043  41.397978   
9             32.000000             9.0        4.0  0.695652  41.397978   
14            40.333333             1.0        4.0  0.876812  41.397978   
19            39.333333             2.0        4.0  0.855072  41.397978   
24            35.000000             4.0        6.0  0.760870  41.397978   

         lon  capacity  4h_before  3h_before  2h_before  1h_before  
4   2.180107      46.0   0.717391   0.768116   0.748188   0.764493  
9   2.180107      46

In [18]:
X = df_merge_final[['station_id','month', 'day', 'hour', '1h_before', '2h_before', '3h_before', '4h_before']]
y = df_merge_final['target']

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Random forest

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.02, random_state=42)

# Apply preprocessing pipeline
X_train_transformed = X_train
X_test_transformed = X_test

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_transformed, y_train)

# Predictions
y_train_pred_rf = rf_model.predict(X_train_transformed)
y_test_pred_rf = rf_model.predict(X_test_transformed)

# Evaluate Model
train_mse_rf = mean_squared_error(y_train, y_train_pred_rf)
test_mse_rf = mean_squared_error(y_test, y_test_pred_rf)

print(f'Train MSE: {train_mse_rf}')
print(f'Test MSE: {test_mse_rf}')

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

def linear_regression(X,y, size):
#Separamos train y test
    X_train, X_test, y_train, y_test = train_test_split(X ,y , test_size= size, random_state=42)

    lm = LinearRegression()
#Entrenamiento modelo
    lm.fit(X_train, y_train)
#Predict
    y_pred = lm.predict(X_test)
#Metricas
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    return  r2, mse, mae


#RandomForest
from sklearn.ensemble import RandomForestRegressor

def RandomForest(X,y, size):
#Separamos train y test
    X_train, X_test, y_train, y_test = train_test_split(X ,y , test_size= size, random_state=42)

    rf = RandomForestRegressor(n_estimators = 100, random_state= 42)
#Entrenamiento modelo
    rf.fit(X_train, y_train)
#Predict
    y_pred = rf.predict(X_test)
#Metricas
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
#Factoring Importance
    feature_importance = rf.feature_importances_
    feature_names = X_test.columns
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    return  r2, mse, mae, importance_df, rf

In [19]:
r2_linear, mse_linear, mae_linear = linear_regression(X,y, 0.3)
r2_rf, mse_rf, mae_rf, importance_df,rf = RandomForest(X, y, 0.3)
metrics_df = pd.DataFrame({
    'Métrica': ['R²', 'MSE', 'MAE'],
    'Regresión Lineal': [r2_linear, mse_linear, mae_linear],
    'Random Forest': [r2_rf, mse_rf, mae_rf]
})
print(metrics_df)
print("Importancia de las características:")
print(importance_df)

  Métrica  Regresión Lineal  Random Forest
0      R²          0.803255       0.803612
1     MSE          0.011856       0.011834
2     MAE          0.073037       0.073680
Importancia de las características:
      Feature  Importance
4   1h_before    0.827825
5   2h_before    0.045118
6   3h_before    0.029135
0  station_id    0.028805
7   4h_before    0.028627
3        hour    0.021051
2         day    0.019440
1       month    0.000000


In [24]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import pandas as pd

# Cargar el archivo CSV
df = pd.read_csv('../metadata_sample_submission_2025.csv')

# Seleccionar las características para predecir
X_predict = df[['station_id', 'month', 'day', 'hour', '1h_before', '2h_before', '3h_before', '4h_before']]

# Hacer predicciones
predictions = rf.predict(X_predict)

# Añadir las predicciones como una nueva columna en el DataFrame
df['predictions'] = predictions

# Guardar el DataFrame con las predicciones en un nuevo archivo CSV
df.to_csv('metadata_sample_submission_with_predictions.csv', index=False)

print("Predicciones añadidas y guardadas en 'metadata_sample_submission_with_predictions.csv'")


Predicciones añadidas y guardadas en 'metadata_sample_submission_with_predictions.csv'


In [25]:
df_final = df[['index','predictions']]
df_final.to_csv('predictions.csv', index=False)