In [None]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import plotly.express as px
from prophet import Prophet

In [None]:
df = pd.read_csv('../data/processed/merged_data.csv')

# Number of swaps per station, busiest first.
# rename_axis + reset_index(name=...) yields the columns
# ['swap_station_id', 'n_swaps'] on every pandas version. The previous
# rename({'index': ..., 'swap_station_id': 'n_swaps'}) only worked on
# pandas < 2.0: from 2.0 on, value_counts().reset_index() already names the
# columns 'swap_station_id' / 'count', so the rename turned the id column
# into 'n_swaps' and the lookup below failed.
station_counts = (
    df['swap_station_id']
    .value_counts()
    .rename_axis('swap_station_id')
    .reset_index(name='n_swaps')
)

# Keep only the events of the five busiest stations.
top_station_ids = station_counts['swap_station_id'].head(5)
df = df[df['swap_station_id'].isin(top_station_ids)].reset_index(drop=True)

# Hourly bucket (timestamp floored to the hour) and calendar day of each event.
df['datetime'] = pd.to_datetime(df['created_at']).dt.floor('h')
df['date'] = df['datetime'].dt.date

def group_data(df, time_col, id_col):
    """Aggregate raw events into a gap-free count series per station with features.

    Events are counted per (``id_col``, ``time_col``) pair, the result is
    re-indexed onto the full station x time grid (missing periods get a count
    of 0), and calendar, lag and rolling-mean features are added.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per event; must contain ``time_col`` and ``id_col``.
    time_col : str
        Name of the time-bucket column. ``'date'`` produces a daily grid, any
        other name an hourly grid. Values are coerced with ``pd.to_datetime``.
    id_col : str
        Name of the station identifier column.

    Returns
    -------
    pandas.DataFrame
        Columns: ``id_col``, ``time_col``, ``counts``, ``hour``,
        ``day_of_week`` (category), ``is_weekend``, ``month`` (category),
        ``lag_{1,2,3,7,30,24,48}`` and ``rolling_mean_{3,24,7,30}``.
        Rows are ordered by station, then by time.

    Notes
    -----
    Lags and rolling means are computed *within each station*. Shifting the
    stacked frame as a whole would feed the last observations of one station
    into the first rows of the next one.
    """
    # The 'date' column may hold plain datetime.date objects; coerce so the
    # keys match the DatetimeIndex grid below and the .dt accessors work.
    events = df[[id_col]].assign(**{time_col: pd.to_datetime(df[time_col])})

    # Count events per station and time bucket.
    df_model = (
        events
        .groupby([id_col, time_col])
        .size()
        .reset_index(name='counts')
    )

    # Full range of time buckets covered by the data ('h' is the
    # non-deprecated spelling of the hourly alias).
    freq = 'D' if time_col == 'date' else 'h'
    all_periods = pd.date_range(
        start=df_model[time_col].min(),
        end=df_model[time_col].max(),
        freq=freq
    )

    # Cartesian product: every station paired with every time bucket.
    cabinets = df_model[id_col].unique()
    full_index = pd.MultiIndex.from_product(
        [cabinets, all_periods],
        names=[id_col, time_col]
    )

    # Re-index onto the full grid; buckets without events get a count of 0.
    df_model = (
        df_model
        .set_index([id_col, time_col])
        .reindex(full_index, fill_value=0)
        .reset_index()
    )

    # Calendar features.
    df_model['hour'] = df_model[time_col].dt.hour
    df_model['day_of_week'] = df_model[time_col].dt.dayofweek
    df_model['is_weekend'] = (df_model['day_of_week'] >= 5).astype(int)
    df_model['month'] = df_model[time_col].dt.month

    # Group the counts by station so that no feature crosses a station boundary.
    counts_by_station = df_model['counts'].groupby(df_model[id_col], sort=False)

    # Lags (in rows of the grid) to capture autocorrelation; positions without
    # enough history are filled with 0.
    for lag in [1, 2, 3, 7, 30, 24, 48]:
        df_model[f'lag_{lag}'] = counts_by_station.shift(lag).fillna(0)

    # Categorical dtype so these can be used as factors in a GLM.
    df_model['day_of_week'] = df_model['day_of_week'].astype('category')
    df_model['month'] = df_model['month'].astype('category')

    # Rolling means over the *previous* rows only (shift(1) excludes the
    # current bucket); incomplete windows fall back to the overall mean count.
    overall_mean = df_model['counts'].mean()
    for window in [3, 24, 7, 30]:
        df_model[f'rolling_mean_{window}'] = (
            counts_by_station
            .transform(lambda s, w=window: s.shift(1).rolling(window=w).mean())
            .fillna(overall_mean)
        )

    return df_model

In [None]:
# Hourly count series (with features) for the selected stations.
df_model = group_data(df, time_col='datetime', id_col='swap_station_id')

In [None]:
# Station ids present in the modelling frame.
df_model['swap_station_id'].unique()

In [None]:
def make_predictions_prophet(df, time_col, id_col, id, future_scale=1.1):
    """Fit Prophet on one station's count series and append a one-week forecast.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id_col``, ``time_col`` and ``counts``.
    time_col : str
        Time column. ``'datetime'`` means hourly data (forecast horizon of
        24*7 periods); any other name is treated as daily data (7 periods).
    id_col : str
        Station identifier column.
    id : scalar
        Station to model. (The name shadows the builtin but is kept so
        existing callers keep working.)
    future_scale : float, default 1.1
        Multiplier applied to the forecast for periods after the last
        observed one. NOTE(review): the 1.1 uplift comes from the original
        code without explanation -- confirm it is intended.

    Returns
    -------
    pandas.DataFrame
        The station's rows plus one row per forecast-only period, sorted by
        ``time_col``. ``predicted`` holds the rounded, non-negative forecast.
        ``id_col``, ``time_col`` and ``datetime`` are populated on every row;
        all other input columns are NaN on forecast-only rows.
    """
    # Select the station's series.
    df_sub = df[df[id_col] == id].copy().reset_index(drop=True)
    df_sub[time_col] = pd.to_datetime(df_sub[time_col])

    if time_col == 'datetime':
        freq, horizon = 'h', 24 * 7
    else:
        freq, horizon = 'D', 7

    # Prophet expects the columns 'ds' (timestamp) and 'y' (target).
    prophet_df = df_sub[[time_col, 'counts']].rename(columns={time_col: 'ds', 'counts': 'y'})
    last_observed = prophet_df['ds'].max()

    model = Prophet(
        daily_seasonality=True,
        weekly_seasonality=True,
        yearly_seasonality=False,
        interval_width=0.95
    )
    model.fit(prophet_df)

    # History plus `horizon` future periods.
    future = model.make_future_dataframe(periods=horizon, freq=freq)
    forecast = model.predict(future)

    # Merge on the time column itself so forecast-only rows keep their
    # timestamp. (With left_on/right_on of different names the left key is
    # left empty on right-only rows, which previously had to be patched via
    # positional index alignment.)
    predictions = forecast[['ds', 'yhat']].rename(columns={'ds': time_col, 'yhat': 'predicted'})
    df_pred = pd.merge(df_sub, predictions, on=time_col, how='outer')

    # Uplift the future part first, then round, so every prediction is a
    # whole number; counts cannot be negative, so clip at 0.
    is_future = df_pred[time_col] > last_observed
    df_pred.loc[is_future, 'predicted'] = df_pred.loc[is_future, 'predicted'] * future_scale
    df_pred['predicted'] = df_pred['predicted'].round().clip(lower=0)

    # Every row belongs to the requested station, including forecast-only rows.
    df_pred[id_col] = id
    # Kept for backward compatibility: the result always exposes 'datetime'.
    df_pred['datetime'] = df_pred[time_col]

    # Chronological order.
    df_pred = df_pred.sort_values(time_col).reset_index(drop=True)

    return df_pred

In [None]:
# Single-station forecast for station 553.
df_prophet = make_predictions_prophet(
    df_model,
    time_col='datetime',
    id_col='swap_station_id',
    id=553,
)

In [None]:
def make_predictions(df, time_col, id_col, future_scale=1.1):
    """Fit one Prophet model per station and stack history + one-week forecasts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id_col``, ``time_col`` and ``counts``.
    time_col : str
        Time column. ``'datetime'`` means hourly data (forecast horizon of
        24*7 periods); any other name is treated as daily data (7 periods).
    id_col : str
        Station identifier column; one model is fitted per distinct value.
    future_scale : float, default 1.1
        Multiplier applied to the forecast for periods after the last
        observed one. NOTE(review): the 1.1 uplift comes from the original
        code without explanation -- confirm it is intended.

    Returns
    -------
    pandas.DataFrame
        Per-station frames concatenated in order of first appearance, each
        sorted by ``time_col`` with its own 0-based index. Contains the input
        columns plus ``ds`` (timestamp from the forecast) and ``predicted``
        (rounded, non-negative forecast). ``id_col`` and ``time_col`` are
        populated on every row, including forecast-only rows, so the result
        can be filtered by station. An empty frame is returned when ``df``
        has no stations.
    """
    # Granularity depends only on the time column, so resolve it once.
    if time_col == 'datetime':
        freq, horizon = 'h', 24 * 7
    else:
        freq, horizon = 'D', 7

    per_station = []
    for station_id in df[id_col].unique():
        df_sub = df[df[id_col] == station_id].copy().reset_index(drop=True)
        df_sub[time_col] = pd.to_datetime(df_sub[time_col])

        # Prophet expects the columns 'ds' (timestamp) and 'y' (target).
        prophet_df = df_sub[[time_col, 'counts']].rename(columns={time_col: 'ds', 'counts': 'y'})
        last_observed = prophet_df['ds'].max()

        model = Prophet(
            daily_seasonality=True,
            weekly_seasonality=True,
            yearly_seasonality=False,
            interval_width=0.95
        )
        model.fit(prophet_df)

        # History plus `horizon` future periods.
        future = model.make_future_dataframe(periods=horizon, freq=freq)
        forecast = model.predict(future)

        # Outer merge keeps forecast-only rows; 'ds' is retained in the output.
        df_pred = pd.merge(
            df_sub,
            forecast[['ds', 'yhat']].rename(columns={'yhat': 'predicted'}),
            left_on=time_col,
            right_on='ds',
            how='outer'
        )

        # Uplift the future part, then round and clip: counts are
        # non-negative whole numbers.
        is_future = df_pred['ds'] > last_observed
        df_pred.loc[is_future, 'predicted'] = df_pred.loc[is_future, 'predicted'] * future_scale
        df_pred['predicted'] = df_pred['predicted'].round().clip(lower=0)

        # Forecast-only rows come out of the merge without station id and
        # without a value in time_col; restore both so the rows survive
        # filtering by station and sort chronologically.
        df_pred[time_col] = df_pred[time_col].fillna(df_pred['ds'])
        df_pred[id_col] = station_id

        per_station.append(df_pred.sort_values(time_col).reset_index(drop=True))

    # Concatenate once instead of growing a frame inside the loop.
    if not per_station:
        return pd.DataFrame()
    return pd.concat(per_station)

In [None]:
# Hourly forecasts for every station in the modelling frame.
df_preds = make_predictions(df_model, time_col='datetime', id_col='swap_station_id')

In [None]:
# Last few predicted values of the stacked result.
df_preds['predicted'].tail()

In [None]:
# Number of rows per station in the stacked result.
df_preds['swap_station_id'].value_counts()

In [None]:
# Observed vs. predicted counts over time for station 1.
# NOTE(review): station id 1 is hard-coded here while the single-station
# forecast cell uses 553 -- confirm station 1 is among the selected stations,
# otherwise this plot is empty.
station_mask = df_preds['swap_station_id'] == 1
px.line(df_preds[station_mask], x='ds', y=['counts', 'predicted'])