# Faux Kaggle Data Time Series Models

This is a notebook for testing some time series models on sales data for 3 products in 3 countries. 

## Import needed libraries

In [None]:
import os
import datetime

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import altair as alt

from sklearn import metrics
from sklearn.base import clone
from sklearn.metrics import make_scorer
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder,PolynomialFeatures, OrdinalEncoder
from sklearn.model_selection import TimeSeriesSplit, train_test_split, cross_val_score, RandomizedSearchCV

import tensorflow as tf

In [None]:
# helper function for testing new regression models efficiently

def regression_results(y_true, y_pred):
    """Print a summary of common regression metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, passed to the sklearn metric functions together
        with ``y_true``.

    Returns
    -------
    None
        Each metric is printed, rounded to 4 decimal places.
    """
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    median_ae = metrics.median_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    # The log-based metric rejects negative values with a ValueError. Report
    # it as unavailable instead of losing every other metric. Generic input
    # problems would already have raised in the calls above.
    try:
        msle = round(metrics.mean_squared_log_error(y_true, y_pred), 4)
    except ValueError:
        msle = 'n/a (requires non-negative values)'

    print('explained_variance: ', round(explained_variance,4))
    print('mean_squared_log_error: ', msle)
    print('r2: ', round(r2,4))
    print('MAE: ', round(mae,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))
    # Previously computed but never shown.
    print('MedAE: ', round(median_ae,4))

def model_fit(model, X, y):
    """Fit ``model`` on ``X`` and ``y`` and hand back what ``model.fit`` returns."""
    return model.fit(X, y)

def smape(A, F):
    """Symmetric mean absolute percentage error (SMAPE), in percent.

    Computes ``100 / n * sum(2 * |F - A| / (|A| + |F|))`` where ``n`` is
    ``len(A)``.

    Parameters
    ----------
    A : array-like
        Actual values.
    F : array-like
        Forecast values, compared with ``A`` element by element.

    Returns
    -------
    float
        The SMAPE value; lower is better.

    Raises
    ------
    ValueError
        If ``A`` contains no observations.
    """
    # Work on plain float arrays so lists are accepted and pandas objects are
    # compared by position rather than re-aligned on their index labels.
    actual = np.asarray(A, dtype=float)
    forecast = np.asarray(F, dtype=float)
    if actual.size == 0:
        raise ValueError('smape requires at least one observation')

    numerator = 2.0 * np.abs(forecast - actual)
    denominator = np.abs(actual) + np.abs(forecast)
    # When actual and forecast are both zero the term is 0/0; the forecast is
    # exact there, so count it as zero error instead of letting nan spread.
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return float(100.0 / len(actual) * np.sum(terms))

## Data Preprocessing

In [None]:
# Directory holding the competition CSVs; change this one constant to point
# the notebook at a different copy of the data.
DATA_DIR = '/work/Machine-Learning/tabular-playground-series-jan-2022'

raw_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [None]:
# Only the last 5000 rows of the raw training data are plotted.
sales_sample = raw_train.iloc[-5000:]

# One small line chart per (country, product) pair, with one line per store.
sales_chart = alt.Chart(
    sales_sample,
    title='Sample of Faux Sales for Kaggle-branded Products',
    width=250,
    height=100,
).mark_line().encode(
    column=alt.Column('product', title='Products'),
    row=alt.Row('country', title='Units sold per Country'),
    color='store',
    x=alt.X('date:O', title=None),
    y=alt.Y('num_sold:Q', title=None),
)
sales_chart

In [None]:
# Build the working frame from the raw data without touching `raw_train`:
# parse `date` into datetimes, then drop any row that has a missing value.
# NOTE(review): the format string expects '.' separators; confirm that this
# matches how dates are written in train.csv.
#
# A lag-feature experiment is currently disabled here: previous-row `num_sold`
# and `date` per (country, store, product) group via groupby(...).shift(1),
# plus a diff of the lagged sales.
train_exp = (
    raw_train
    .assign(date=lambda frame: pd.to_datetime(frame.date, format='%Y.%m.%d'))
    .dropna(axis=0)
)

train_exp.tail()

Unnamed: 0,row_id,date,country,store,product,num_sold
26293,26293,2018-12-31,Sweden,KaggleMart,Kaggle Hat,823
26294,26294,2018-12-31,Sweden,KaggleMart,Kaggle Sticker,250
26295,26295,2018-12-31,Sweden,KaggleRama,Kaggle Mug,1004
26296,26296,2018-12-31,Sweden,KaggleRama,Kaggle Hat,1441
26297,26297,2018-12-31,Sweden,KaggleRama,Kaggle Sticker,388


In [None]:
enc = OrdinalEncoder()

# Encode every object-dtype column as ordinal codes. The encoder is fitted
# once on all of those columns together, so `enc` keeps the category mapping
# for each of them instead of only the last column processed.
categorical_cols = list(train_exp.select_dtypes('object').columns)

if categorical_cols:  # nothing left to encode when the cell is re-run
    encoded = enc.fit_transform(train_exp[categorical_cols])
    for position, col in enumerate(categorical_cols):
        # Assign the raw array column so values land by row position; going
        # through a fresh DataFrame would align on index labels and yield NaN
        # wherever train_exp's index is not exactly 0..n-1.
        train_exp[col] = encoded[:, position]

In [None]:
# Split the prepared frame into the value to predict (`num_sold`) and the
# model inputs (everything except `num_sold` and the `row_id` identifier).
target = train_exp['num_sold'].copy()
features = train_exp.drop(columns=['num_sold', 'row_id'])
print(len(features), len(target))

26298 26298


## CV Models

In [None]:
# Candidate regressors, each paired with the short label printed in the results.
models = [
    ('LR', LinearRegression()),
    ('NN', MLPRegressor(solver='lbfgs', max_iter=1500)),
    ('KNN', KNeighborsRegressor()),
    ('RF', RandomForestRegressor(n_estimators=20, random_state=42)),
    ('SVR', SVR(gamma='auto')),
]

results = []  # one SMAPE value per (model, fold)
names = []    # parallel to `results`: (label, estimator fitted on that fold)
# Scorer wrapper around smape (negated, since lower SMAPE is better).
# The manual loop below calls smape directly and does not use it.
smape_score = make_scorer(smape, greater_is_better=False)

# One splitter serves every model, so all models see the same folds.
# NOTE(review): TimeSeriesSplit assumes rows are in chronological order;
# confirm `features` is sorted by date.
tscv = TimeSeriesSplit(n_splits=3)

for name, model in models:
    for train_index, val_index in tscv.split(features, target):
        # split() yields row positions, so select with .iloc rather than
        # matching them against index labels.
        # Moving `date` into the index leaves the remaining columns as the
        # only inputs the estimator sees.
        X_train = features.iloc[train_index].set_index('date')
        X_val = features.iloc[val_index].set_index('date')
        y_train, y_val = target.iloc[train_index], target.iloc[val_index]

        # Fit a fresh copy per fold so each entry stored in `names` keeps its
        # own fitted state instead of all pointing at one re-fitted object.
        fitted = model_fit(clone(model), X_train, y_train)
        pred = fitted.predict(X_val)
        res = smape(y_val, pred)
        results.append(res)
        names.append((name, fitted))
        print('%s: %f' % (name, res))

LR: 34.249462
LR: 33.958488
LR: 35.368194
NN: 17.814316
NN: 21.669809
NN: 16.507994
KNN: 39.013082
KNN: 28.731590
KNN: 28.688705
RF: 15.211539
RF: 14.882231
RF: 16.486316
SVR: 16.077222
SVR: 16.920754
SVR: 19.091796


For the SMAPE measure, lower is better. Thus, the Random Forest regression model performed marginally better on average than the other models. Nonetheless, it is always good to have a baseline model such as a linear regression - a standard best-fit line with its assumptions of normality - to compare performance against. For production or competition-winning results, hyperparameter tuning (e.g. with GridSearchCV) would be necessary to differentiate the Support Vector, Multilayer Perceptron (NN), and Random Forest regressors. I have also recently learned that tsmoothie - a newer library available for review on GitHub - could subsequently further improve the regression and help demonstrate a level of uncertainty in the visualization. This data has been used as a training exercise for some of the techniques learned in an academic setting. I did not submit results from any of the models created here to the Kaggle competition, but I will continue using newly learned techniques to improve my modeling skills and compete.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=24d50205-0d01-4bd6-a263-9bbd6567f7de' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>