In [1]:
import os
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin

def cargar_datos():
    data_path = '../data'
    files = os.listdir(data_path)
    
    df_list = []

    for file in files:
        if file.endswith('.csv'):  
            file_path = os.path.join(data_path, file)

            parts = file.split('_')

            if len(parts) < 3:
                continue
            
            year = parts[0]  
            month_name = parts[2]  

            if year not in ['2024']:
                continue

            if month_name not in ['Gener']:
                continue
            
            try:
                df = pd.read_csv(file_path, low_memory=True, dtype=str)

                df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')

                df['year'] = df['last_reported'].dt.year
                df['month'] = df['last_reported'].dt.month
                df['day'] = df['last_reported'].dt.day
                df['hour'] = df['last_reported'].dt.hour

                # Eliminar columnas innecesarias si existen
                df.drop(columns=[col for col in ['traffic', 'V1'] if col in df.columns], inplace=True)

                # Convertir a float64 las columnas numéricas
                numeric_cols = ['num_bikes_available', 'num_docks_available', 
                                'num_bikes_available_types.mechanical', 'num_bikes_available_types.ebike']
                for col in numeric_cols:
                    if col in df.columns:
                        df[col] = df[col].astype(float)

                df_list.append(df)

            except Exception as e:
                print(f"Error al procesar {file}: {e}")
                continue

    if df_list:
        merged_df = pd.concat(df_list, axis=0)
    else:
        return None

    # Cargar datos adicionales
    df_2 = pd.read_csv('../Informacio_Estacions_Bicing_2025.csv', usecols=['station_id', 'lat', 'lon', 'capacity'], low_memory=False)

    # Convertir station_id a Int64 para evitar errores de merge
    merged_df['station_id'] = merged_df['station_id'].astype('Int64')
    df_2['station_id'] = df_2['station_id'].astype('Int64')

    # Pasar is_installed, is_renting, is_returning a int, manejando NaN
    for col in ['is_installed', 'is_renting', 'is_returning']:
        if col in merged_df.columns:
            merged_df[col] = merged_df[col].fillna(0).astype(int)

    # Convertir 'TRUE'/'FALSE' de is_charging_station a 1/0
    if 'is_charging_station' in merged_df.columns:
        merged_df['is_charging_station'] = merged_df['is_charging_station'].map({'TRUE': 1, 'FALSE': 0})

    # Merge con datos de estaciones
    merged_df = merged_df.merge(df_2, on='station_id', how='inner')

    # Calcular capacidad total y completar valores nulos
    merged_df['sum_capacity'] = merged_df['num_bikes_available'] + merged_df['num_docks_available']
    median_capacity = merged_df.groupby('station_id')['sum_capacity'].median()

    merged_df['capacity'] = merged_df.apply(
        lambda row: median_capacity[row['station_id']] if pd.isna(row['capacity']) else row['capacity'],
        axis=1
    )

    # Limitar num_docks_available a [0, capacity]
    merged_df['num_docks_available'] = merged_df.apply(
        lambda row: min(max(row['num_docks_available'], 0), row['capacity']),
        axis=1
    )

    # Calcular el target
    merged_df['target'] = merged_df['num_docks_available'] / merged_df['capacity']
    
    # Agregación de datos
    aggregated_df = merged_df.groupby(['station_id', 'year', 'month', 'day', 'hour']).agg(
        num_bikes_available=('num_bikes_available', 'mean'),
        num_docks_available=('num_docks_available', 'mean'),
        num_mechanical=('num_bikes_available_types.mechanical', 'median'),
        num_ebike=('num_bikes_available_types.ebike', 'median'),
        # is_installed=('is_installed', 'mean'), No lo ponemos porque todos son true
        is_renting=('is_renting', 'mean'),
        is_returning=('is_returning', 'mean'),
        target=('target', 'mean'),
        lat=('lat', 'first'),
        lon=('lon', 'first'),
        capacity=('capacity', 'first')
    ).reset_index()

    # Filtrar por estaciones disponibles en metadata
    id = pd.read_csv('../data/metadata_sample_submission_2025.csv')
    llista_stations = pd.unique(id['station_id'])
    aggregated_df = aggregated_df[aggregated_df['station_id'].isin(llista_stations)]

    return aggregated_df


In [2]:
df_merge = cargar_datos()

  df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')


In [3]:
#suposant que comencem per l'inici dun dia(elimino les 4 hores del dia anterior del mes anterior, ja que per defecte surten)
df_merge = df_merge.iloc[4:]
df_merge.head()  

#funcio per afegir les 4h anteriors
def crear_campos_lags(df):
    resultados = []
    # Agrupamos por station_id
    for station, grupo in df.groupby('station_id'):
        # Aseguramos el orden cronológico
        grupo = grupo.sort_values(by=['year', 'month', 'day', 'hour']).reset_index(drop=True)
        n = len(grupo)
        # Iteramos empezando en el índice 4 y avanzamos de 5 en 5
        for i in range(4, n):
            if i - 4 >= 0:
                fila = grupo.loc[i].copy()
                # Agregamos los valores de 'target' de las 4 horas previas
                fila['ctx-4'] = grupo.loc[i - 4, 'target']
                fila['ctx-3'] = grupo.loc[i - 3, 'target']
                fila['ctx-2'] = grupo.loc[i - 2, 'target']
                fila['ctx-1'] = grupo.loc[i - 1, 'target']
                
                resultados.append(fila)
    return pd.DataFrame(resultados)

# Aplicamos la función al DataFrame df_merge
df_merge_final = crear_campos_lags(df_merge)

print(df_merge_final.head())  # Ver las primeras filas del nuevo DataFrame


   station_id    year  month  day  hour  num_bikes_available  \
4         1.0  2024.0    1.0  1.0   6.0            11.818182   
5         1.0  2024.0    1.0  1.0   7.0            13.000000   
6         1.0  2024.0    1.0  1.0   8.0            13.000000   
7         1.0  2024.0    1.0  1.0   9.0            12.076923   
8         1.0  2024.0    1.0  1.0  10.0             7.833333   

   num_docks_available  num_mechanical  num_ebike  is_renting  is_returning  \
4            33.181818             9.0        3.0         1.0           1.0   
5            32.000000             9.0        4.0         1.0           1.0   
6            32.000000             9.0        4.0         1.0           1.0   
7            32.923077             9.0        3.0         1.0           1.0   
8            37.166667             4.0        3.0         1.0           1.0   

     target        lat       lon  capacity     ctx-4     ctx-3     ctx-2  \
4  0.721344  41.397978  2.180107      46.0  0.663043  0.681159  

In [4]:
# Ahora con el df_merge_final crear un df nuevo que sea df_train que coja 1 de cada 5 filas
df_train = df_merge_final.iloc[::5]
df_train = df_train.reset_index(drop=True)
print(df_train.head())  # Ver las primeras filas del nuevo DataFrame

   station_id    year  month  day  hour  num_bikes_available  \
0         1.0  2024.0    1.0  1.0   6.0            11.818182   
1         1.0  2024.0    1.0  1.0  11.0             7.666667   
2         1.0  2024.0    1.0  1.0  16.0             7.750000   
3         1.0  2024.0    1.0  1.0  21.0            12.916667   
4         1.0  2024.0    1.0  2.0   2.0             9.000000   

   num_docks_available  num_mechanical  num_ebike  is_renting  is_returning  \
0            33.181818             9.0        3.0         1.0           1.0   
1            37.250000             3.0        5.0         1.0           1.0   
2            37.166667             0.0        7.5         1.0           1.0   
3            32.083333             3.0        9.5         1.0           1.0   
4            36.000000             4.0        5.0         1.0           1.0   

     target        lat       lon  capacity     ctx-4     ctx-3     ctx-2  \
0  0.721344  41.397978  2.180107      46.0  0.663043  0.681159  

In [5]:
# Para separar los datos usamos df_train
X = df_train[['station_id','month', 'day', 'hour', 'ctx-1', 'ctx-2', 'ctx-3', 'ctx-4','lat','lon']]
y = df_train['target']

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

def linear_regression(X,y, size):
#Separamos train y test
    X_train, X_test, y_train, y_test = train_test_split(X ,y , test_size= size, random_state=42)

    lm = LinearRegression()
#Entrenamiento modelo
    lm.fit(X_train, y_train)
#Predict
    y_pred = lm.predict(X_test)
#Metricas
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    return  r2, mse, mae


#RandomForest
from sklearn.ensemble import RandomForestRegressor

def RandomForest(X,y, size):
#Separamos train y test
    X_train, X_test, y_train, y_test = train_test_split(X ,y , test_size= size, random_state=42)

    rf = RandomForestRegressor(n_estimators = 100, random_state= 42)
#Entrenamiento modelo
    rf.fit(X_train, y_train)
#Predict
    y_pred = rf.predict(X_test)
#Metricas
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
#Factoring Importance
    feature_importance = rf.feature_importances_
    feature_names = X_test.columns
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    return  r2, mse, mae, importance_df, rf

In [7]:
r2_linear, mse_linear, mae_linear = linear_regression(X,y, 0.3)
r2_rf, mse_rf, mae_rf, importance_df,rf = RandomForest(X, y, 0.3)
metrics_df = pd.DataFrame({
    'Métrica': ['R²', 'MSE', 'MAE'],
    'Regresión Lineal': [r2_linear, mse_linear, mae_linear],
    'Random Forest': [r2_rf, mse_rf, mae_rf]
})
print(metrics_df)
print("Importancia de las características:")
print(importance_df)

  Métrica  Regresión Lineal  Random Forest
0      R²          0.807260       0.812416
1     MSE          0.011483       0.011176
2     MAE          0.072257       0.071597
Importancia de las características:
      Feature  Importance
4       ctx-1    0.824966
5       ctx-2    0.038820
6       ctx-3    0.022982
7       ctx-4    0.021907
0  station_id    0.020130
9         lon    0.019853
8         lat    0.018047
3        hour    0.017378
2         day    0.015916
1       month    0.000000


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import pandas as pd

# Cargar el archivo CSV
df = pd.read_csv('../metadata_sample_submission_2025.csv')


# Función para imputar las columnas 'lat, lon' del df_merge_final en base (FK) la columa station_id
# ---- IMPUTACIÓN DE COLUMNAS ADICIONALES ----
cols_to_impute = ['lat', 'lon']

# Usamos `merge` para traer la información de df_merge_final a df
df = df.merge(df_merge_final[['station_id'] + cols_to_impute], on='station_id', how='left')

# Seleccionar las características para predecir
X_predict = df[['station_id', 'month', 'day', 'hour', 'ctx-4', 'ctx-3', 'ctx-2', 'ctx-1','lat','lon']]

# Hacer predicciones
predictions = rf.predict(X_predict)

# Añadir las predicciones como una nueva columna en el DataFrame
df['predictions'] = predictions

# Guardar el DataFrame con las predicciones en un nuevo archivo CSV
df.to_csv('metadata_sample_submission_with_predictions.csv', index=False)

print("Predicciones añadidas y guardadas en 'metadata_sample_submission_with_predictions.csv'")


In [25]:
df_final = df[['index','predictions']]
df_final.to_csv('predictions.csv', index=False)