In [118]:
import os
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin


def cargar_datos():
    data_path = '../data'
    files = os.listdir(data_path)
    
    df_list = []

    for file in files:
        if file.endswith('.csv'):  
            file_path = os.path.join(data_path, file)

            parts = file.split('_')

            if len(parts) < 3:
                continue
            
            year = parts[0]  
            month_name = parts[2]  

            if year not in ['2023', '2024']:
                continue

            # if month_name not in ['Gener', 'Febrer', 'Marc', 'Abril', 'Maig']:
            if month_name not in ['Gener', 'Febrer', 'Marc', 'Abril', 'Maig']:
                continue

            try:
                df = pd.read_csv(file_path, low_memory=True, dtype=str, skiprows=lambda i: i % 1000 != 0)

                df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')
                df['last_updated'] = pd.to_datetime(df['last_updated'], unit='s', errors='coerce')

                # en base el last_reported creame la columna, year, month, day, hour, minute
                df['year'] = df['last_reported'].dt.year
                df['month'] = df['last_reported'].dt.month
                df['day'] = df['last_reported'].dt.day
                df['hour'] = df['last_reported'].dt.hour
                df['minute'] = df['last_reported'].dt.minute

                if 'traffic' in df.columns:
                    df.drop(columns=['traffic'], inplace=True)
                
                if 'V1' in df.columns:
                    df.drop(columns=['V1'], inplace=True)

                # Parsear las columnas 'num_bikes_available','num_bikes_available_types.mechanical', 'num_bikes_available_types.ebike' a int
                for col in ['num_bikes_available','num_docks_available','num_bikes_available_types.mechanical', 'num_bikes_available_types.ebike']:
                    df[col] = df[col].astype('float64')

                df_list.append(df)

            except Exception:
                continue

    merged_df = pd.concat(df_list, ignore_index=True) if df_list else None

    if merged_df is not None:
        df_2 = pd.read_csv('../data/Informacio_Estacions_Bicing_2025.csv', usecols=['station_id', 'lat', 'lon','capacity'], low_memory=False) #aqui faig el merge amb el segon dataset, només agafo les variables geoespacials
        
        merged_df['station_id'] = merged_df['station_id'].astype(str)
        df_2['station_id'] = df_2['station_id'].astype(str)
        
        merged_df = merged_df.merge(df_2, on='station_id', how='left')

    # Crea una columna que sea la suman 'num_bikes_available' + 'num_docks_available' que se llame sum_capacity
    merged_df['sum_capacity'] = merged_df['num_bikes_available'] + merged_df['num_docks_available']

    # Calcular la mediana de 'sum_capacity' para cada 'station_id'
    median_capacity = merged_df.groupby('station_id')['sum_capacity'].median()

    def impute_capacity(row):
        if pd.isna(row['capacity']):
            return median_capacity[row['station_id']]
        else:
            return row['capacity']

    merged_df['capacity'] = merged_df.apply(impute_capacity, axis=1)
    merged_df['diff_capacity_available'] = merged_df['capacity'] - (merged_df['num_bikes_available'] + merged_df['num_docks_available'])


    # En las columnas que el diff_capacity_available sea > 0 sumamos este valor al num_docks_avaiable.
    merged_df.loc[merged_df['diff_capacity_available'] > 0, 'num_docks_available'] += merged_df['diff_capacity_available']

    # En las columnas que el diff_capacity_available sea < 0 restamos este valor al num_docks_available.
    merged_df.loc[merged_df['diff_capacity_available'] < 0, 'num_docks_available'] += merged_df['diff_capacity_available']

    # num_docks_available que nunca sea mas grande que capacity ni mas pequeño que 0
    # Aseguramos que num_docks_available esté entre 0 y capacity, de forma row-wise.
    merged_df['num_docks_available'] = merged_df.apply(
        lambda row: min(max(row['num_docks_available'], 0), row['capacity']),
        axis=1
    )

    # Creamos la columna target que sera el % de bicis disponibles
    merged_df['target'] = merged_df['num_docks_available'] / merged_df['capacity']

    return merged_df

In [106]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class CyclicalFeaturesTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer para crear variables cíclicas (seno y coseno) a partir de columnas temporales.
    Por defecto, transforma: month, day, hour y minute.
    La columna 'year' se deja sin transformar.
    """
    def __init__(self, columns=None):
        """
        columns: diccionario que define para cada columna el valor máximo (p.ej. {"month": 12, "day": 31, "hour": 24, "minute": 60})
        Si se omite, se usan valores por defecto.
        """
        if columns is None:
            self.columns = {"month": 12, "day": 31, "hour": 24, "minute": 60}
        else:
            self.columns = columns

    def fit(self, X, y=None):
        # No es necesario aprender nada de X
        return self

    def transform(self, X):
        X_ = X.copy()
        for col, max_val in self.columns.items():
            if col in X_.columns:
                X_[f'{col}_sin'] = np.sin(2 * np.pi * X_[col].astype(float) / max_val)
                X_[f'{col}_cos'] = np.cos(2 * np.pi * X_[col].astype(float) / max_val)
        return X_

In [52]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder


status_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='NOT_IN_SERVICE')),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore', categories=[['IN_SERVICE', 'MAINTENANCE', 'NOT_IN_SERVICE', 'PLANNED']])), # One-Hot Encoding
])

is_charging_station_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse_output=False, drop='if_binary')),
])

ordinal_pipeline = Pipeline([
    ('encoder', OrdinalEncoder())
])

cyclic_pipeline = Pipeline([
    ('cyclic_transformer', CyclicalFeaturesTransformer())
])

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('status', status_pipeline, ['status']),
        ('is_charging_station', is_charging_station_pipeline, ['is_charging_station']),
        ('ordinal', ordinal_pipeline, ['month','day','hour']),
        ('cyclic', cyclic_pipeline, ['month','day','hour','minute']),
        ('year', 'passthrough', ['year']),
    ],
    remainder='passthrough'
)

In [119]:
df_merge = cargar_datos()

# transformed_data = preprocessor.fit_transform(df_merge)

  df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')
  df['last_updated'] = pd.to_datetime(df['last_updated'], unit='s', errors='coerce')
  df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')
  df['last_updated'] = pd.to_datetime(df['last_updated'], unit='s', errors='coerce')
  df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')
  df['last_updated'] = pd.to_datetime(df['last_updated'], unit='s', errors='coerce')
  df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')
  df['last_updated'] = pd.to_datetime(df['last_updated'], unit='s', errors='coerce')
  df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')
  df['last_updated'] = pd.to_datetime(df['last_updated'], unit='s', errors='coerce')
  df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s', errors='coerce')
  df['last_updated'] = pd.to_datetime(df['last_update

In [112]:
df_capacity = df_merge[['capacity','num_bikes_available','num_bikes_available_types.mechanical', 'num_bikes_available_types.ebike', 'num_docks_available']]

df_capacity['diff_bike_types'] = df_capacity['num_bikes_available_types.mechanical'] + df_capacity['num_bikes_available_types.ebike'] - df_capacity['num_bikes_available']
df_capacity['diff_capacity_available'] = df_capacity['capacity'] - (df_capacity['num_bikes_available'] + df_capacity['num_docks_available'])

# Dime el total de diff_capacity_available / entre los diff_capacity_available que no son iguales a 0
df_capacity[df_capacity['diff_capacity_available'] == 0].shape[0] / df_capacity.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_capacity['diff_bike_types'] = df_capacity['num_bikes_available_types.mechanical'] + df_capacity['num_bikes_available_types.ebike'] - df_capacity['num_bikes_available']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_capacity['diff_capacity_available'] = df_capacity['capacity'] - (df_capacity['num_bikes_available'] + df_capacity['num_docks_available'])


1.0