<center style="font-size:160%; color:brown; font-family:verdana;"> Forecasting Sales of Courses </center>


## Import dependancies

In [1]:
import pandas as pd
import numpy as np
import holidays
from datetime import datetime
from pycaret.regression import *

In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136950 entries, 0 to 136949
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        136950 non-null  int64 
 1   date      136950 non-null  object
 2   country   136950 non-null  object
 3   store     136950 non-null  object
 4   product   136950 non-null  object
 5   num_sold  136950 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 6.3+ MB


In [4]:
data_full=pd.concat([train,test],axis=0,sort=False,ignore_index=True)


In [5]:
data_full['date'] = pd.to_datetime(data_full['date'])

In [6]:
data_full.describe(datetime_is_numeric=True)

Unnamed: 0,id,date,num_sold
count,164325.0,164325,136950.0
mean,82162.0,2020-01-01 00:00:00,165.522636
min,0.0,2017-01-01 00:00:00,2.0
25%,41081.0,2018-07-02 00:00:00,46.0
50%,82162.0,2020-01-01 00:00:00,98.0
75%,123243.0,2021-07-02 00:00:00,184.0
max,164324.0,2022-12-31 00:00:00,1380.0
std,47436.68583,,183.691575


## Preprocessing

In [7]:
# Create year, month, day column
data_full['year'] = data_full['date'].dt.year
data_full['month'] = data_full['date'].dt.month
data_full['day'] = data_full['date'].dt.day
data_full['quarter'] = data_full['date'].dt.quarter
data_full['dayofweek'] = data_full['date'].dt.dayofweek
data_full['weekofyear'] = data_full['date'].dt.weekofyear
data_full['dayofyear'] = data_full['date'].dt.dayofyear
data_full['is_month_end'] = data_full['date'].dt.is_month_end
data_full['is_month_start'] = data_full['date'].dt.is_month_start
data_full['is_quarter_end'] = data_full['date'].dt.is_quarter_end
data_full['is_quarter_start'] = data_full['date'].dt.is_quarter_start
data_full['is_year_end'] = data_full['date'].dt.is_year_end
data_full['is_year_start'] = data_full['date'].dt.is_year_start
data_full['weekend'] = (data_full['dayofweek'] > 4).astype(int)
data_full['sunday'] = (data_full['dayofweek'] == 6).astype(int)

# Create Fourier Features
data_full['month_sin'] = np.sin(2*np.pi*data_full.month/12)
data_full['month_cos'] = np.cos(2*np.pi*data_full.month/12)
data_full['day_sin'] = np.sin(2*np.pi*data_full.day/24)
data_full['day_cos'] = np.cos(2*np.pi*data_full.day/24)




In [8]:
# remove covid period
data_full = data_full[data_full['year']!=2020]

In [9]:
# Add holidays
# Find unique years in the dataset
unique_years = data_full['date'].dt.year.unique()

# Create separate arrays for each country with holiday dates for each unique year
argentina_holidays = []
canada_holidays = []
estonia_holidays = []
japan_holidays = []


argentina_holidays.extend(holidays.AR(years=unique_years))
canada_holidays.extend(holidays.CA(years=unique_years))
estonia_holidays.extend(holidays.EE(years=unique_years))
japan_holidays.extend(holidays.JP(years=unique_years))

# Function to check if the date is a holiday for a specific country
def is_holiday_in_country(date, country):
    if country == 'Argentina':
        return date in argentina_holidays
    elif country == 'Canada':
        return date in canada_holidays
    elif country == 'Estonia':
        return date in estonia_holidays
    elif country == 'Japan':
        return date in japan_holidays
    else:
        return False

# Add a new 'holiday' column for each country
for country in data_full['country'].unique():
    data_full[f'holiday_{country}'] = data_full.apply(lambda row: 1 if is_holiday_in_country(row['date'], country) else 0, axis=1)



In [10]:
train = data_full.loc[data_full.index[0]:train.index[-1]]
test = data_full.loc[train.index[-1]+1:]
test.drop(['num_sold'],axis=1,inplace=True)

In [11]:
train.head()

Unnamed: 0,id,date,country,store,product,num_sold,year,month,day,quarter,...,sunday,month_sin,month_cos,day_sin,day_cos,holiday_Argentina,holiday_Canada,holiday_Estonia,holiday_Japan,holiday_Spain
0,0,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,63.0,2017,1,1,1,...,1,0.5,0.866025,0.258819,0.965926,1,1,1,1,0
1,1,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Train More LLMs,66.0,2017,1,1,1,...,1,0.5,0.866025,0.258819,0.965926,1,1,1,1,0
2,2,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win Friends and Influence People,9.0,2017,1,1,1,...,1,0.5,0.866025,0.258819,0.965926,1,1,1,1,0
3,3,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions,59.0,2017,1,1,1,...,1,0.5,0.866025,0.258819,0.965926,1,1,1,1,0
4,4,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Write Better,49.0,2017,1,1,1,...,1,0.5,0.866025,0.258819,0.965926,1,1,1,1,0


## Modeling

In [12]:
from pycaret.regression import *

# initialize setup**
s = setup(data = train, 
          target = 'num_sold', 
          categorical_features = ['product','country', 'store'], 
          ignore_features= ['id','date'],
          fold_strategy = 'timeseries', 
          fold = 5, 
          normalize = True, normalize_method = 'robust', 
          transform_target = True, 
          session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,num_sold
2,Target type,Regression
3,Original data shape,"(109500, 30)"
4,Transformed data shape,"(109500, 38)"
5,Transformed train set shape,"(76650, 38)"
6,Transformed test set shape,"(32850, 38)"
7,Ignore features,2
8,Numeric features,18
9,Categorical features,3


In [13]:
best_3models = compare_models(n_select = 3,sort= 'mae', fold=5)

In [14]:
best_3models[0]

In [15]:
ensembled_top3 = [ensemble_model(i) for i in best_3models]



Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,8.6148,294.9737,17.1748,0.9916,0.0647,0.0521
1,8.2795,251.7532,15.8667,0.993,0.0616,0.0497
2,8.1222,230.8393,15.1934,0.9932,0.0621,0.0501
3,8.1568,232.0114,15.2319,0.9933,0.0616,0.0499
4,8.1206,236.8434,15.3897,0.9932,0.0612,0.0497
Mean,8.2588,249.2842,15.7713,0.9929,0.0622,0.0503
Std,0.1873,24.029,0.7415,0.0006,0.0013,0.0009


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,10.5399,455.1276,21.3337,0.9871,0.0818,0.0635
1,9.4737,336.6428,18.3478,0.9906,0.0717,0.0564
2,8.9717,303.4003,17.4184,0.9911,0.0711,0.0552
3,8.7462,277.2967,16.6522,0.992,0.0674,0.0532
4,8.5689,296.503,17.2193,0.9915,0.0653,0.0515
Mean,9.2601,333.7941,18.1943,0.9905,0.0715,0.056
Std,0.7083,63.6181,1.6619,0.0017,0.0057,0.0041


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,10.5568,481.9047,21.9523,0.9864,0.0841,0.0635
1,9.6951,373.9746,19.3384,0.9896,0.0738,0.0572
2,9.2039,337.7869,18.379,0.9901,0.0741,0.0565
3,9.0237,309.8791,17.6034,0.9911,0.0706,0.0547
4,8.9507,353.4856,18.8012,0.9898,0.0686,0.0534
Mean,9.486,371.4062,19.2149,0.9894,0.0742,0.057
Std,0.595,59.0822,1.4816,0.0016,0.0053,0.0035


In [16]:
# blender
blender = blend_models(best_3models, choose_better = True)
blender_weighted = blend_models(best_3models,weights=[0.6, 0.3, 0.1])



Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,9.3184,351.9952,18.7615,0.99,0.0719,0.0566
1,8.7137,282.3807,16.8042,0.9921,0.0653,0.052
2,8.3377,251.1387,15.8474,0.9926,0.0649,0.0515
3,8.3188,250.3821,15.8235,0.9928,0.0631,0.0505
4,8.216,267.947,16.3691,0.9923,0.062,0.0497
Mean,8.5809,280.7688,16.7211,0.992,0.0655,0.0521
Std,0.4056,37.5273,1.0829,0.001,0.0034,0.0024


Original model was better than the blended model, hence it will be returned. NOTE: The display metrics are for the blended model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,8.8043,307.1955,17.527,0.9913,0.0668,0.0534
1,8.3358,252.5735,15.8926,0.9929,0.0622,0.05
2,8.0687,228.7138,15.1233,0.9933,0.062,0.0498
3,8.0774,231.8656,15.2271,0.9933,0.0609,0.0491
4,8.0131,243.3745,15.6005,0.993,0.0604,0.0487
Mean,8.2599,252.7446,15.8741,0.9928,0.0624,0.0502
Std,0.2942,28.5172,0.8706,0.0008,0.0023,0.0017


In [17]:
# stacker
stacker = stack_models(best_3models)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,8.4041,260.5139,16.1404,0.9926,0.0637,0.0514
1,8.1825,234.5345,15.3145,0.9935,0.061,0.0493
2,7.9406,214.9648,14.6617,0.9937,0.0607,0.0491
3,7.9296,218.0128,14.7653,0.9937,0.06,0.0486
4,7.9415,227.9442,15.0978,0.9934,0.0598,0.0485
Mean,8.0797,231.194,15.1959,0.9934,0.061,0.0494
Std,0.188,16.2389,0.5266,0.0004,0.0014,0.0011


In [18]:
# check leaderboard
leaderboard = get_leaderboard()
# Sort the leaderboard based on a specific metric (e.g., 'Accuracy')
sorted_leaderboard = leaderboard.sort_values(by='MSE')

sorted_leaderboard

Unnamed: 0_level_0,Model Name,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
26,Stacking Regressor,(TransformerWrapperWithInverse(transformer=Tar...,8.0797,231.194,15.1959,0.9934,0.061,0.0494
22,Light Gradient Boosting Machine,(TransformerWrapperWithInverse(transformer=Tar...,8.3162,248.9515,15.768,0.9929,0.0628,0.0508
16,Light Gradient Boosting Machine,(TransformerWrapperWithInverse(transformer=Tar...,8.3162,248.9515,15.768,0.9929,0.0628,0.0508
18,Bagging Regressor,(TransformerWrapperWithInverse(transformer=Tar...,8.2588,249.2842,15.7713,0.9929,0.0622,0.0503
25,Voting Regressor,(TransformerWrapperWithInverse(transformer=Tar...,8.2599,252.7446,15.8741,0.9928,0.0624,0.0502
21,Voting Regressor,(TransformerWrapperWithInverse(transformer=Tar...,8.5809,280.7688,16.7211,0.992,0.0655,0.0521
23,Random Forest Regressor,(TransformerWrapperWithInverse(transformer=Tar...,9.2438,327.8951,18.0547,0.9906,0.0714,0.0561
12,Random Forest Regressor,(TransformerWrapperWithInverse(transformer=Tar...,9.2438,327.8951,18.0547,0.9906,0.0714,0.0561
19,Bagging Regressor,(TransformerWrapperWithInverse(transformer=Tar...,9.2601,333.7941,18.1943,0.9905,0.0715,0.056
20,Bagging Regressor,(TransformerWrapperWithInverse(transformer=Tar...,9.486,371.4062,19.2149,0.9894,0.0742,0.057


In [19]:
# Get the best model (top row) from the sorted leaderboard
best_model = sorted_leaderboard.iloc[0]['Model']

In [20]:
best_model

In [21]:
evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

## Prediction of Unseen Data

In [22]:
predictions = predict_model(best_model, data=test)

In [23]:
def multipliers(predictors, prediction, canada = 1, japan = 1, spain = 1, estonia = 1, argentina = 1):
    prediction[predictors.country == 'Canada'] *= canada
    prediction[predictors.country == 'Japan'] *= japan
    prediction[predictors.country == 'Spain'] *= spain
    prediction[predictors.country == 'Estonia'] *= estonia
    prediction[predictors.country == 'Argentina'] *= argentina
    return prediction

predictions['num_sold'] = multipliers(predictions, np.round(predictions["prediction_label"] ) * 1.5, .58, .76, 1, 1.08, 2.82)
kaggle_predictions = pd.DataFrame({"id": predictions["id"],
                                   "num_sold": predictions["num_sold"] })
kaggle_predictions.to_csv('submission.csv', index = False)


In [24]:
kaggle_predictions.head()

Unnamed: 0,id,num_sold
136950,136950,164.97
136951,136951,160.74
136952,136952,25.38
136953,136953,152.28
136954,136954,131.13
