In [49]:
pip install pydantic

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from prophet import Prophet

In [2]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from matplotlib import pyplot as plt

In [11]:
df = pd.read_csv('C:\\Users\\Utilisateur\\Downloads\\scooter_demand_simulation.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   datetime  840 non-null    object
 1   zone_id   840 non-null    object
 2   demand    840 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 19.8+ KB


In [5]:
import os
import pickle
import json
import optuna

In [6]:
def train_and_save_models(df, model_dir="models", use_optuna=False, n_trials=30):
    os.makedirs(f"{model_dir}/prophet_zones", exist_ok=True)

    # Feature engineering
    df['datetime'] = pd.to_datetime(df['datetime'])
    df['hour'] = df['datetime'].dt.hour
    df['dayofweek'] = df['datetime'].dt.dayofweek
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
    df['ds'] = df['datetime']
    df['y'] = df['demand']

    prophet_preds = []

    # Prophet по зонам
    for zone in df['zone_id'].unique():
        df_zone = df[df['zone_id'] == zone][['ds', 'y']].copy()
        model = Prophet(daily_seasonality=True, weekly_seasonality=False)
        model.fit(df_zone)

        with open(f"{model_dir}/prophet_zones/prophet_zone_{zone}.pkl", "wb") as f:
            pickle.dump(model, f)

        future = model.make_future_dataframe(periods=0, freq='H')
        forecast = model.predict(future)
        #df_zone['yhat'] = forecast['yhat'].values
        forecast['zone_id'] = zone
        prophet_preds.append(forecast[['ds', 'zone_id', 'yhat']])

    prophet_all = pd.concat(prophet_preds, ignore_index=True)
    df = df.merge(prophet_all, on=['ds', 'zone_id'], how='left')
    df['residual'] = df['y'] - df['yhat']

    df['lag_1'] = df.groupby('zone_id')['residual'].shift(1)
    df['lag_24'] = df.groupby('zone_id')['residual'].shift(24)
    df['rolling_mean_24'] = df.groupby('zone_id')['residual'].shift(1).rolling(24).mean().reset_index(0, drop=True)
    df_model = df.dropna()

    features = ['hour', 'dayofweek', 'is_weekend', 'zone_id', 'lag_1', 'lag_24', 'rolling_mean_24']
    target = 'residual'

    cutoff = df_model['ds'].max() - pd.Timedelta(days=2)
    train = df_model[df_model['ds'] <= cutoff]
    test = df_model[df_model['ds'] > cutoff]

    X_train, y_train = train[features], train[target]
    X_test, y_test = test[features], test[target]

    if use_optuna:
        def objective(trial):
            params = {
                "iterations": 100,
                "depth": trial.suggest_int("depth", 4, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
                "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
                "loss_function": "RMSE",
                "random_seed": 42,
                "verbose": 0
            }
            model = CatBoostRegressor(**params)
            model.fit(X_train, y_train, cat_features=['zone_id'], eval_set=(X_test, y_test), early_stopping_rounds=10)
            preds = model.predict(X_test)
            # Calculate the final forecast within the objective function
            trial_final_forecast = test.loc[X_test.index, 'yhat'].values + preds
            return mean_squared_error(y_test, trial_final_forecast) ** 0.5

        study = optuna.create_study(direction="minimize")
        study.optimize(objective, n_trials=n_trials)
        best_params = study.best_params
        best_params.update({"iterations": 100, "loss_function": "RMSE", "random_seed": 42, "verbose": 0})
        model = CatBoostRegressor(**best_params)
    else:
        model = CatBoostRegressor(verbose=0)

    model.fit(X_train, y_train, cat_features=['zone_id'])

    # Сохраняем модель и фичи
    model.save_model(f"{model_dir}/catboost_residual_model.cbm")
    with open(f"{model_dir}/features.json", "w") as f:
        json.dump(features, f)

    # Метрики
    pred_resid = model.predict(X_test)
    final_forecast = test['yhat'].values + pred_resid # This calculation is still needed for the final metrics reported outside Optuna
    rmse = mean_squared_error(y_test, final_forecast) ** 0.5
    mae = mean_absolute_error(y_test, final_forecast)

    print(f"RMSE: {rmse:.2f} | MAE: {mae:.2f}")

In [14]:
train_and_save_models(df, use_optuna=True, n_trials=25)

10:08:38 - cmdstanpy - INFO - Chain [1] start processing
10:08:39 - cmdstanpy - INFO - Chain [1] done processing
  dates = pd.date_range(
10:08:39 - cmdstanpy - INFO - Chain [1] start processing
10:08:39 - cmdstanpy - INFO - Chain [1] done processing
  dates = pd.date_range(
10:08:39 - cmdstanpy - INFO - Chain [1] start processing
10:08:39 - cmdstanpy - INFO - Chain [1] done processing
  dates = pd.date_range(
10:08:39 - cmdstanpy - INFO - Chain [1] start processing
10:08:39 - cmdstanpy - INFO - Chain [1] done processing
  dates = pd.date_range(
10:08:40 - cmdstanpy - INFO - Chain [1] start processing
10:08:40 - cmdstanpy - INFO - Chain [1] done processing
  dates = pd.date_range(
[I 2025-06-03 10:08:40,371] A new study created in memory with name: no-name-55799679-0a80-41e7-af6f-795b1b34b2f3
[I 2025-06-03 10:08:40,821] Trial 0 finished with value: 14.97566936159995 and parameters: {'depth': 9, 'learning_rate': 0.12627376482774966, 'l2_leaf_reg': 6.992278793138479}. Best is trial 0 wit

RMSE: 15.02 | MAE: 14.27


In [8]:
df.head(120)

Unnamed: 0,datetime,zone_id,demand,hour,dayofweek,is_weekend,ds,y
0,2024-01-01 00:00:00,zone_0,11,0,0,0,2024-01-01 00:00:00,11
1,2024-01-01 01:00:00,zone_0,11,1,0,0,2024-01-01 01:00:00,11
2,2024-01-01 02:00:00,zone_0,13,2,0,0,2024-01-01 02:00:00,13
3,2024-01-01 03:00:00,zone_0,16,3,0,0,2024-01-01 03:00:00,16
4,2024-01-01 04:00:00,zone_0,14,4,0,0,2024-01-01 04:00:00,14
...,...,...,...,...,...,...,...,...
115,2024-01-05 19:00:00,zone_0,6,19,4,0,2024-01-05 19:00:00,6
116,2024-01-05 20:00:00,zone_0,6,20,4,0,2024-01-05 20:00:00,6
117,2024-01-05 21:00:00,zone_0,5,21,4,0,2024-01-05 21:00:00,5
118,2024-01-05 22:00:00,zone_0,9,22,4,0,2024-01-05 22:00:00,9


NameError: name 'mae' is not defined

In [13]:
import os
import pickle
import json
import pandas as pd
from prophet import Prophet
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error, mean_absolute_error
import optuna

def train_and_save_models(df, model_dir="models", use_optuna=False, n_trials=30):
    os.makedirs(f"{model_dir}/prophet_zones", exist_ok=True)

    # Feature engineering
    df['datetime'] = pd.to_datetime(df['datetime'])
    df['hour'] = df['datetime'].dt.hour
    df['dayofweek'] = df['datetime'].dt.dayofweek
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
    df['ds'] = df['datetime']
    df['y'] = df['demand']

    prophet_preds = []

    # Prophet по зонам
    for zone in df['zone_id'].unique():
        df_zone = df[df['zone_id'] == zone][['ds', 'y']].copy()
        model = Prophet(daily_seasonality=True, weekly_seasonality=False)
        model.fit(df_zone)

        with open(f"{model_dir}/prophet_zones/prophet_zone_{zone}.pkl", "wb") as f:
            pickle.dump(model, f)

        future = model.make_future_dataframe(periods=0, freq='H')
        forecast = model.predict(future)
        df_zone['yhat'] = forecast['yhat'].values
        df_zone['zone_id'] = zone
        prophet_preds.append(df_zone[['ds', 'zone_id', 'yhat']])

    prophet_all = pd.concat(prophet_preds)
    df = df.merge(prophet_all, on=['ds', 'zone_id'], how='left')
    df['residual'] = df['y'] - df['yhat']

    df['lag_1'] = df.groupby('zone_id')['residual'].shift(1)
    df['lag_24'] = df.groupby('zone_id')['residual'].shift(24)
    df['rolling_mean_24'] = df.groupby('zone_id')['residual'].shift(1).rolling(24).mean().reset_index(0, drop=True)
    df_model = df.dropna()

    features = ['hour', 'dayofweek', 'is_weekend', 'zone_id', 'lag_1', 'lag_24', 'rolling_mean_24']
    target = 'residual'

    cutoff = df_model['ds'].max() - pd.Timedelta(days=2)
    train = df_model[df_model['ds'] <= cutoff]
    test = df_model[df_model['ds'] > cutoff]

    X_train, y_train = train[features], train[target]
    X_test, y_test = test[features], test[target]

    if use_optuna:
        def objective(trial):
            params = {
                "iterations": 100,
                "depth": trial.suggest_int("depth", 4, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
                "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
                "loss_function": "RMSE",
                "random_seed": 42,
                "verbose": 0
            }
            model = CatBoostRegressor(**params)
            model.fit(X_train, y_train, cat_features=['zone_id'], eval_set=(X_test, y_test), early_stopping_rounds=10)
            preds = model.predict(X_test)
            # Calculate the final forecast within the objective function
            trial_final_forecast = test.loc[X_test.index, 'yhat'].values + preds
            return mean_squared_error(y_test, trial_final_forecast) ** 0.5

        study = optuna.create_study(direction="minimize")
        study.optimize(objective, n_trials=n_trials)
        best_params = study.best_params
        best_params.update({"iterations": 100, "loss_function": "RMSE", "random_seed": 42, "verbose": 0})
        model = CatBoostRegressor(**best_params)
    else:
        model = CatBoostRegressor(verbose=0)

    model.fit(X_train, y_train, cat_features=['zone_id'])

    # Сохраняем модель и фичи
    model.save_model(f"{model_dir}/catboost_residual_model.cbm")
    with open(f"{model_dir}/features.json", "w") as f:
        json.dump(features, f)

    # Метрики
    pred_resid = model.predict(X_test)
    final_forecast = test['yhat'].values + pred_resid # This calculation is still needed for the final metrics reported outside Optuna
    rmse = mean_squared_error(y_test, final_forecast) ** 0.5
    mae = mean_absolute_error(y_test, final_forecast)

    print(f"RMSE: {rmse:.2f} | MAE: {mae:.2f}")

In [20]:
import requests
import json

def predict_model(payload):
    url = 'http://127.0.0.1:8000/predict'
    response = requests.post(url, json=payload)
    return response.json()

# Загружаем весь JSON как словарь
with open('predict_payload.json', 'r') as f:
    payload = json.load(f)

predictions = predict_model(payload)
print(predictions)


[{'datetime': '2024-01-02T00:00:00', 'zone_id': 'zone_0', 'final_forecast': 10.228759659518735}, {'datetime': '2024-01-02T01:00:00', 'zone_id': 'zone_0', 'final_forecast': 11.114449945777897}, {'datetime': '2024-01-02T02:00:00', 'zone_id': 'zone_0', 'final_forecast': 11.96505587549384}, {'datetime': '2024-01-02T03:00:00', 'zone_id': 'zone_0', 'final_forecast': 12.964011372556621}, {'datetime': '2024-01-02T04:00:00', 'zone_id': 'zone_0', 'final_forecast': 13.992614524159482}, {'datetime': '2024-01-02T05:00:00', 'zone_id': 'zone_0', 'final_forecast': 14.710436843887623}, {'datetime': '2024-01-02T06:00:00', 'zone_id': 'zone_0', 'final_forecast': 14.865754183393655}, {'datetime': '2024-01-02T07:00:00', 'zone_id': 'zone_0', 'final_forecast': 14.532684442760239}, {'datetime': '2024-01-02T08:00:00', 'zone_id': 'zone_0', 'final_forecast': 13.996801826486275}, {'datetime': '2024-01-02T09:00:00', 'zone_id': 'zone_0', 'final_forecast': 13.434694798084964}, {'datetime': '2024-01-02T10:00:00', 'zon