In [None]:

import os
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin


def cargar_datos():
    data_path = '../data'
    files = os.listdir(data_path)
    
    df_list = []

    for file in files:
        if file.endswith('.csv'):  
            file_path = os.path.join(data_path, file)

            parts = file.split('_')

            if len(parts) < 3:
                continue
            
            year = parts[0]  
            month_name = parts[2]  

            if year not in ['2024']:
                continue

            if month_name not in ['Gener']:
                continue
            
            try:
                df = pd.read_csv(file_path, low_memory=True, dtype=str)

                df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')

                df['year'] = df['last_reported'].dt.year
                df['month'] = df['last_reported'].dt.month
                df['day'] = df['last_reported'].dt.day
                df['hour'] = df['last_reported'].dt.hour

                # Eliminar columnas innecesarias si existen
                df.drop(columns=[col for col in ['traffic', 'V1'] if col in df.columns], inplace=True)

                # Convertir a float64 las columnas numéricas
                numeric_cols = ['num_bikes_available', 'num_docks_available', 
                                'num_bikes_available_types.mechanical', 'num_bikes_available_types.ebike']
                for col in numeric_cols:
                    if col in df.columns:
                        df[col] = df[col].astype(float)

                df_list.append(df)

            except Exception as e:
                print(f"Error al procesar {file}: {e}")
                continue

    if df_list:
        merged_df = pd.concat(df_list, axis=0)
    else:
        return None

    # Cargar datos adicionales
    df_2 = pd.read_csv('../Informacio_Estacions_Bicing_2025.csv', usecols=['station_id', 'lat', 'lon', 'capacity'], low_memory=False)

    # Convertir station_id a Int64 para evitar errores de merge
    merged_df['station_id'] = merged_df['station_id'].astype('Int64')
    df_2['station_id'] = df_2['station_id'].astype('Int64')

    # Pasar is_installed, is_renting, is_returning a int, manejando NaN
    for col in ['is_installed', 'is_renting', 'is_returning']:
        if col in merged_df.columns:
            merged_df[col] = merged_df[col].fillna(0).astype(int)

    # Convertir 'TRUE'/'FALSE' de is_charging_station a 1/0
    if 'is_charging_station' in merged_df.columns:
        merged_df['is_charging_station'] = merged_df['is_charging_station'].map({'TRUE': 1, 'FALSE': 0})

    # Merge con datos de estaciones
    merged_df = merged_df.merge(df_2, on='station_id', how='inner')

    # Calcular capacidad total y completar valores nulos
    merged_df['sum_capacity'] = merged_df['num_bikes_available'] + merged_df['num_docks_available']
    median_capacity = merged_df.groupby('station_id')['sum_capacity'].median()

    merged_df['capacity'] = merged_df.apply(
        lambda row: median_capacity[row['station_id']] if pd.isna(row['capacity']) else row['capacity'],
        axis=1
    )

    # Limitar num_docks_available a [0, capacity]
    merged_df['num_docks_available'] = merged_df.apply(
        lambda row: min(max(row['num_docks_available'], 0), row['capacity']),
        axis=1
    )

    # Calcular el target
    merged_df['target'] = merged_df['num_docks_available'] / merged_df['capacity']
    
    # Agregación de datos
    aggregated_df = merged_df.groupby(['station_id', 'year', 'month', 'day', 'hour']).agg(
        num_bikes_available=('num_bikes_available', 'mean'),
        num_docks_available=('num_docks_available', 'mean'),
        num_mechanical=('num_bikes_available_types.mechanical', 'median'),
        num_ebike=('num_bikes_available_types.ebike', 'median'),
        # is_installed=('is_installed', 'mean'), No lo ponemos porque todos son true
        is_renting=('is_renting', 'mean'),
        is_returning=('is_returning', 'mean'),
        target=('target', 'mean'),
        lat=('lat', 'first'),
        lon=('lon', 'first'),
        capacity=('capacity', 'first')
    ).reset_index()

    # Filtrar por estaciones disponibles en metadata
    id = pd.read_csv('../data/metadata_sample_submission_2025.csv')
    llista_stations = pd.unique(id['station_id'])
    aggregated_df = aggregated_df[aggregated_df['station_id'].isin(llista_stations)]

    return aggregated_df


In [6]:
#AFEGIM ALTRES VARIABLES ( 4h anteriors + tipo de dia: festiu cap setmana laborable)

def crear_campos_lags(df):
   # df= df.iloc[4:] 
  
    resultados = []
    # Agrupamos por station_id
    for station, grupo in df.groupby('station_id'):
        # Aseguramos el orden cronológico
        grupo = grupo.sort_values(by=['year', 'month', 'day', 'hour']).reset_index(drop=True)
        n = len(grupo)
        # Iteramos empezando en el índice 4 y avanzamos de 5 en 5
        for i in range(4, n):
            if i - 4 >= 0:
                fila = grupo.loc[i].copy()
                # Agregamos los valores de 'target' de las 4 horas previas
                fila['ctx-4'] = grupo.loc[i - 4, 'target']
                fila['ctx-3'] = grupo.loc[i - 3, 'target']
                fila['ctx-2'] = grupo.loc[i - 2, 'target']
                fila['ctx-1'] = grupo.loc[i - 1, 'target']
                
                resultados.append(fila)
    return pd.DataFrame(resultados)


#funcio per afegir el tipo de dia, festiu laborable o cap setmana 
def day_categorization_bcn(df):
    """
    Añade una columna 'day_type' al DataFrame con la clasificación numérica de cada día:
    0 = Laborable, 1 = Fin de semana, 2 = Festivo.
    """
    import pandas as pd

    df['date'] = pd.to_datetime(df[['year', 'month', 'day']])

    # Festivos en Barcelona (2020 - marzo 2025)
    holidays = [
        "2020-01-01", "2020-01-06", "2020-04-10", "2020-04-13", "2020-05-01", "2020-06-24", "2020-09-11", "2020-09-24",
        "2020-10-12", "2020-11-01", "2020-12-06", "2020-12-08", "2020-12-25", "2020-12-26",
        "2021-01-01", "2021-01-06", "2021-04-02", "2021-04-05", "2021-05-01", "2021-06-24", "2021-09-11", "2021-09-24",
        "2021-10-12", "2021-11-01", "2021-12-06", "2021-12-08", "2021-12-25", "2021-12-26",
        "2022-01-01", "2022-01-06", "2022-04-15", "2022-04-18", "2022-05-01", "2022-06-24", "2022-09-11", "2022-09-24",
        "2022-10-12", "2022-11-01", "2022-12-06", "2022-12-08", "2022-12-25", "2022-12-26",
        "2023-01-01", "2023-01-06", "2023-04-07", "2023-04-10", "2023-05-01", "2023-06-24", "2023-09-11", "2023-09-24",
        "2023-10-12", "2023-11-01", "2023-12-06", "2023-12-08", "2023-12-25", "2023-12-26",
        "2024-01-01", "2024-01-06", "2024-03-29", "2024-04-01", "2024-05-01", "2024-06-24", "2024-09-11", "2024-09-24",
        "2024-10-12", "2024-11-01", "2024-12-06", "2024-12-08", "2024-12-25", "2024-12-26",
        "2025-01-01", "2025-01-06", "2025-04-18", "2025-04-21", "2025-05-01", "2025-06-24", "2025-09-11", "2025-09-24",
        "2025-10-12", "2025-11-01", "2025-12-06", "2025-12-08", "2025-12-25", "2025-12-26"
    ]
    
    holiday_dates = pd.to_datetime(holidays)

    # Función para clasificar el día en valores numéricos
    def classify_day(date):
        if date in holiday_dates:
            return 2  # Festivo
        elif date.weekday() >= 5:  # Sábado (5) o domingo (6)
            return 1  # Fin de semana
        else:
            return 0  # Laborable

    df['day_type'] = df['date'].apply(classify_day)

    df.drop(columns=['date'], inplace=True)

    return df



In [7]:
df_merge = cargar_datos()
df_merge= crear_campos_lags(df_merge)
df_merge_final=day_categorization_bcn(df_merge)


  df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')


In [8]:
# Ahora con el df_merge_final crear un df nuevo que sea df_train que coja 1 de cada 5 filas
df_train = df_merge_final.iloc[::5]
df_train = df_train.reset_index(drop=True)
print(df_train.head())  # Ver las primeras filas del nuevo DataFrame

   station_id    year  month  day  hour  num_bikes_available  \
0         1.0  2024.0    1.0  1.0   2.0            14.500000   
1         1.0  2024.0    1.0  1.0   7.0            13.000000   
2         1.0  2024.0    1.0  1.0  12.0             4.666667   
3         1.0  2024.0    1.0  1.0  17.0             5.666667   
4         1.0  2024.0    1.0  1.0  22.0            10.000000   

   num_docks_available  num_mechanical  num_ebike  is_renting  is_returning  \
0            30.500000             5.5        9.5         1.0           1.0   
1            32.000000             9.0        4.0         1.0           1.0   
2            40.333333             1.0        4.0         1.0           1.0   
3            39.333333             2.0        4.0         1.0           1.0   
4            35.000000             4.0        6.0         1.0           1.0   

     target        lat       lon  capacity     ctx-4     ctx-3     ctx-2  \
0  0.663043  41.397978  2.180107      46.0  0.717391  0.768116  

In [9]:
# Para separar los datos usamos df_train
X = df_train[['station_id','month', 'day', 'hour', 'ctx-1', 'ctx-2', 'ctx-3', 'ctx-4','lat','lon', 'day_type']]
y = df_train['target']

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

def linear_regression(X,y, size):
#Separamos train y test
    X_train, X_test, y_train, y_test = train_test_split(X ,y , test_size= size, random_state=42)

    lm = LinearRegression()
#Entrenamiento modelo
    lm.fit(X_train, y_train)
#Predict
    y_pred = lm.predict(X_test)
#Metricas
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    return  r2, mse, mae


#RandomForest
from sklearn.ensemble import RandomForestRegressor

def RandomForest(X,y, size):
#Separamos train y test
    X_train, X_test, y_train, y_test = train_test_split(X ,y , test_size= size, random_state=42)

    rf = RandomForestRegressor(n_estimators = 100, random_state= 42)
#Entrenamiento modelo
    rf.fit(X_train, y_train)
#Predict
    y_pred = rf.predict(X_test)
#Metricas
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
#Factoring Importance
    feature_importance = rf.feature_importances_
    feature_names = X_test.columns
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    return  r2, mse, mae, importance_df, rf

In [11]:
r2_linear, mse_linear, mae_linear = linear_regression(X,y, 0.3)
r2_rf, mse_rf, mae_rf, importance_df,rf = RandomForest(X, y, 0.3)
metrics_df = pd.DataFrame({
    'Métrica': ['R²', 'MSE', 'MAE'],
    'Regresión Lineal': [r2_linear, mse_linear, mae_linear],
    'Random Forest': [r2_rf, mse_rf, mae_rf]
})
print(metrics_df)
print("Importancia de las características:")
print(importance_df)

  Métrica  Regresión Lineal  Random Forest
0      R²          0.813425       0.821507
1     MSE          0.011177       0.010693
2     MAE          0.071522       0.070555
Importancia de las características:
       Feature  Importance
4        ctx-1    0.826631
5        ctx-2    0.038013
6        ctx-3    0.022586
7        ctx-4    0.021565
0   station_id    0.019756
9          lon    0.019517
8          lat    0.017714
3         hour    0.017167
2          day    0.014763
10    day_type    0.002288
1        month    0.000000


In [12]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import pandas as pd

# Cargar el archivo CSV
df = pd.read_csv('../metadata_sample_submission_2025.csv')
df['year'] = 2025 #em faltava crear columna de year=2025 per poder aplicar funcio day_categorization_bcn 


# Función para imputar las columnas 'lat, lon' del df_merge_final en base (FK) la columa station_id
# ---- IMPUTACIÓN DE COLUMNAS ADICIONALES ----
cols_to_impute = ['lat', 'lon']

# Usamos `merge` para traer la información de df_merge_final a df
df = df.merge(df_merge_final[['station_id'] + cols_to_impute], on='station_id', how='left')
df= day_categorization_bcn(df) 

# Seleccionar las características para predecir
X_predict = df[['station_id', 'month', 'day', 'hour', 'ctx-4', 'ctx-3', 'ctx-2', 'ctx-1','lat','lon', 'day_type']]

# Hacer predicciones
predictions = rf.predict(X_predict)

# Añadir las predicciones como una nueva columna en el DataFrame
df['percentage_docks_available'] = predictions

# Guardar el DataFrame con las predicciones en un nuevo archivo CSV
df.to_csv('metadata_sample_submission_with_predictions.csv', index=False)

print("Predicciones añadidas y guardadas en 'metadata_sample_submission_with_predictions.csv'")

: 

In [None]:
df_final = df[['index','percentage_docks_available']]
df_final.to_csv('predictions.csv', index=False)