# 02 Forecast
- Import necessary data for prediction
- Feature creation
- Perform forecasts
- Export prediction

In [0]:
!pip install flaml

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting flaml
  Using cached FLAML-2.3.2-py3-none-any.whl (313 kB)
Installing collected packages: flaml
Successfully installed flaml-2.3.2
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m


In [0]:
#logging.getLogger().setLevel(logging.ERROR)

In [0]:
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import math
import matplotlib.pyplot as plt
import numpy as np

from flaml import AutoML

In [0]:
# Importing data necessary for prediction
input_path = '/dbfs/mnt/02_SILVER/Growth_Navigator_STRATIS/hf/'
input_path_sj = '/dbfs/mnt/02_SILVER/Growth_Navigator_STRATIS/sj/'

filename = 'data_for_prediction_sj.csv'
data = pd.read_csv(input_path_sj + filename,  parse_dates=['SES_TRX_DATE'])

custom_scaler = pd.read_csv(input_path + 'custom_scaler_sj.csv')

In [0]:
# Dropping irrelevant columns
data.drop(columns=['month_1', 'month_2', 'month_3','month_4', 'month_5', 'month_6','month_7', 'month_8', 'month_9','month_10', 'month_11', 'month_12', 
                   'dayofweek_0', 'dayofweek_1','dayofweek_2', 'dayofweek_3', 'dayofweek_4', 'dayofweek_5','dayofweek_6',
                   'year_2023',  'year_2024'], inplace=True)

data.drop(columns=['VOUCHER_FLAG'], inplace=True)

## Create features
- Using past data as features (lag features) for prediction

In [0]:
# Creating the features used for prediction
def create_features(data, n_lags=7, feature_name=None, horizont=7):
    features = data.sort_values(['id', 'SES_TRX_DATE'])
    features.drop(columns=[feature_name], inplace=True)

    # Create lag features
    for lag in range(horizont, 2*horizont+1, 1):
        features[f'{feature_name}_lag{lag}'] = data[['id', f'{feature_name}']].groupby('id').shift(lag)

    for lag in range(2*horizont, 8*horizont + 1, 7):
        features[f'{feature_name}_lag{lag}'] = data[['id', f'{feature_name}']].groupby('id').shift(lag)

    for lag in range(8*horizont, 26*horizont + 1, 28):
        features[f'{feature_name}_lag{lag}'] = data[['id', f'{feature_name}']].groupby('id').shift(lag)
    
    features[feature_name] = data[f'{feature_name}']
    feature_list = features.columns.tolist()

    return features, feature_list

## Forecast function
- Define LGBM forecast function
- Perform predictions
- Calculate MAPE

In [0]:
# Defining parameters for the prediction
# The target variable we want to predict
target = 'value_scaled'

# The start date for the predictions
prediction_start_date = '2024-10-01'

# Confidence level for the lower/upper bound
alpha = 0.7

# Horizont of the predictions
horizont = 7

# Preparing data
data.reset_index(drop=True, inplace=True)
data.loc[data.SES_TRX_DATE >= prediction_start_date, target] = None

# Creating features
data_all, feature_list = create_features(data, 
                                         n_lags= 2*horizont, 
                                         feature_name=target, 
                                         horizont=horizont)

# LightGBM parameters
params = {
       'objective': 'regression',  # Multivariate regression task
       'metric': 'mape',
       'boosting_type': 'gbdt',  # Gradient boosting decision tree
       'learning_rate': 0.05,
       'num_leaves': 100, #31
       'feature_fraction': 0.9,
       'bagging_fraction': 0.8,
       'bagging_freq': 5
}

# Alternative parameters for quantile regression
lower_params = {
    'objective': 'quantile',  
    'alpha': 1 - alpha, 
    'boosting_type': 'gbdt', 
    'learning_rate': 0.05,
    'num_leaves': 100,  # 31
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
}

upper_params = {
    'objective': 'quantile',  
    'alpha': alpha, 
    'boosting_type': 'gbdt', 
    'learning_rate': 0.05,
    'num_leaves': 100,  # 31
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
}

In [0]:
# Finding the start and end of the prediction period
prediction_start_date = pd.to_datetime(prediction_start_date)
prediction_end_date = pd.to_datetime(prediction_start_date) + pd.offsets.MonthEnd(0)
  
# Creating the list of dates for prediction
dates = pd.date_range(start=prediction_start_date, end=prediction_end_date, freq='7D')
dates = dates.append(pd.DatetimeIndex([prediction_end_date])+pd.Timedelta(days=1))

# Define prediction intervals
intervals = [(dates[i], dates[i + 1] - pd.Timedelta(days=1)) for i in range(len(dates) - 1)]

In [0]:
# Defining the function for the prediction
def predict_future(data, prediction_start_date, prediction_end_date, target, horizont):

  # Separating past and future data
  dt_past = data[data.SES_TRX_DATE < prediction_start_date].dropna()
  dt_future = data[(data.SES_TRX_DATE >= prediction_start_date) & (data.SES_TRX_DATE <= prediction_end_date)]      

  # Separating predictors and target 
  X = dt_past.drop(columns=['value', 'id', 'SES_TRX_DATE', target])
  y = dt_past[target]  

  # Split data into training and test sets
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

  # Create LightGBM dataset
  train_data = lgb.Dataset(X_train, label=y_train)
  test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

  # Train the main regression model
  model = lgb.train(
      params,
      train_data,
      valid_sets=[train_data, test_data],
      num_boost_round=100,
      callbacks=[lgb.early_stopping(stopping_rounds=100)]
  )

  # Train lower quantile model
  lower = lgb.train(
      lower_params,
      train_data,
      valid_sets=[train_data, test_data],
      num_boost_round=100,
      callbacks=[lgb.early_stopping(stopping_rounds=100)]
  )

  # Train upper quantile model
  upper = lgb.train(
      upper_params,
      train_data,
      valid_sets=[train_data, test_data],
      num_boost_round=100,
      callbacks=[lgb.early_stopping(stopping_rounds=100)]
  )

  # Calculate contributions and predictions
  contributions = model.predict(
      dt_future.drop(columns=['value', 'id', 'SES_TRX_DATE', target]),
      num_iteration=model.best_iteration,
      pred_contrib=True
  )

  dt_future['predicted'] = model.predict(
      dt_future.drop(columns=['value', 'id', 'SES_TRX_DATE', target]),
      num_iteration=model.best_iteration, predict_disable_shape_check=True
  )
  dt_future['lower_scaled'] = lower.predict(
      dt_future.drop(columns=['value', 'id', 'SES_TRX_DATE', target]),
      num_iteration=model.best_iteration, predict_disable_shape_check=True
  )
  dt_future['upper_scaled'] = upper.predict(
      dt_future.drop(columns=['value', 'id', 'SES_TRX_DATE', target]),
      num_iteration=model.best_iteration, predict_disable_shape_check=True
  )

  # Apply custom scaling
  dt_future = dt_future.merge(custom_scaler, how='left', on=['id'])
  dt_future['prediction'] = dt_future.apply(lambda x: max(0, x['predicted'] * (x['max'] - x['min']) + x['min']), axis=1)
  dt_future['lower_bound'] = dt_future.apply(lambda x: max(0, x['lower_scaled'] * (x['max'] - x['min']) + x['min']), axis=1)
  dt_future['upper_bound'] = dt_future.apply(lambda x: min(max(0, x['upper_scaled'] * (x['max'] - x['min']) + x['min']), x['max'] * 1.5), axis=1)

  # Add the prediction horizon
  dt_future['horizont'] = horizont

  return dt_future[['id', 'SES_TRX_DATE', 'horizont', 'prediction', 'lower_bound', 'upper_bound']]

In [0]:
# Performing the prediction for the intervals
prediction_all = pd.DataFrame() 

for i in intervals:
  # Get the start and end date for the prediction
  start = pd.to_datetime(i[0])
  end = pd.to_datetime(i[1])
  h = (end - pd.to_datetime(prediction_start_date)).days + 1 
  
  # Get the feature list
  features_wo_lag = [f for f in feature_list if "lag" not in f]
  features_w_lag = [f for f in feature_list if "lag" in f and int(f.split('lag')[-1]) >= h]
  features = features_wo_lag + features_w_lag

  df = data_all[features]
  print(f"horizont = {h}")

  # Prediction
  prediction = predict_future(df, start, end, target, horizont=h)
  prediction_all = pd.concat([prediction_all, prediction])

horizont = 7
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007634 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5367
[LightGBM] [Info] Number of data points in the train set: 651522, number of used features: 280
[LightGBM] [Info] Start training from score 0.437076
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	training's mape: 0.0723902	valid_1's mape: 0.0770645
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037728 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5367
[LightGBM] [Info] Number of data points in the train set: 651522, number of used features: 280
[LightGBM] [Info] Start training from score 0.339623
Training until validation scores don't improve for 100 rounds
Did 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt_future['predicted'] = model.predict(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt_future['lower_scaled'] = lower.predict(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt_future['upper_scaled'] = upper.predict(


horizont = 14
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022815 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3582
[LightGBM] [Info] Number of data points in the train set: 651522, number of used features: 273
[LightGBM] [Info] Start training from score 0.437076
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	training's mape: 0.0751022	valid_1's mape: 0.081259
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028785 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3582
[LightGBM] [Info] Number of data points in the train set: 651522, number of used features: 273
[LightGBM] [Info] Start training from score 0.339623
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	training's qua

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt_future['predicted'] = model.predict(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt_future['lower_scaled'] = lower.predict(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt_future['upper_scaled'] = upper.predict(


horizont = 21
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028713 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3327
[LightGBM] [Info] Number of data points in the train set: 651522, number of used features: 272
[LightGBM] [Info] Start training from score 0.437076
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	training's mape: 0.0765381	valid_1's mape: 0.08361
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005856 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3327
[LightGBM] [Info] Number of data points in the train set: 651522, number of used features: 272
[LightGBM] [Info] Start training from score 0.339623
Training until validation scores don't improve for 100 rounds
Did n

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt_future['predicted'] = model.predict(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt_future['lower_scaled'] = lower.predict(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt_future['upper_scaled'] = upper.predict(


horizont = 28
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005084 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3072
[LightGBM] [Info] Number of data points in the train set: 651522, number of used features: 271
[LightGBM] [Info] Start training from score 0.437076
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	training's mape: 0.0777902	valid_1's mape: 0.0858443
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026815 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3072
[LightGBM] [Info] Number of data points in the train set: 651522, number of used features: 271
[LightGBM] [Info] Start training from score 0.339623
Training until validation scores don't improve for 100 rounds
Did

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt_future['predicted'] = model.predict(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt_future['lower_scaled'] = lower.predict(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt_future['upper_scaled'] = upper.predict(


horizont = 31
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004818 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2817
[LightGBM] [Info] Number of data points in the train set: 651522, number of used features: 270
[LightGBM] [Info] Start training from score 0.437076
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[99]	training's mape: 0.0792624	valid_1's mape: 0.0877647
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004713 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2817
[LightGBM] [Info] Number of data points in the train set: 651522, number of used features: 270
[LightGBM] [Info] Start training from score 0.339623
Tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt_future['predicted'] = model.predict(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt_future['lower_scaled'] = lower.predict(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt_future['upper_scaled'] = upper.predict(


In [0]:

# Calculate MAPE fpr all predictions
dt_plot = data_all[['SES_TRX_DATE', 'id', 'value']].merge(prediction_all, how='left', on = ['id', 'SES_TRX_DATE'])
a = dt_plot[(~dt_plot.prediction.isna())&(dt_plot.value > 0)]

mean_absolute_percentage_error(a['value'], a['prediction'])

0.2160614426051421

In [0]:
# Export predictions
prediction_all.to_csv(input_path_sj+f'pred/lightgbm_4weeks_pred_{prediction_start_date}_eval_bound{alpha}_old.csv', index=False)

In [0]:
# Calculate MAPE for each horizont
a.groupby('horizont').apply(lambda x: mean_absolute_percentage_error(x['value'], x['prediction']))

horizont
7.0     0.208862
14.0    0.203308
21.0    0.192915
28.0    0.258383
31.0    0.217832
dtype: float64