In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import HistGradientBoostingRegressor as hgbr
from sklearn.neural_network import MLPRegressor as nn
import sklearn.metrics as metrics
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from panelsplit.cross_validation import PanelSplit
from panelsplit.application import cross_val_fit
from panelsplit.plot import plot_splits
from panelsplit.pipeline import SequentialCVPipeline
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import shap

# https://www.youtube.com/watch?v=-5l3g91NZfQ

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Define a utility function to take CV models and average them to output an ensemble
def extract_cv_fit(fit, X):
    predicts = [m.predict(X) for m in fit]
    mean = np.mean(predicts, axis=0)
    # create a DataFrame to hold both the mean and the individual predictions
    df = pd.DataFrame(predicts).T
    df.columns = [f'pred_{i+1}' for i in range(len(predicts))]
    df['pred_mean'] = mean

    return df

In [3]:
reg_data = pd.read_csv(r'../data/reg_data.csv')

In [4]:
def downcast(df, verbose = True):
    start_memory = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == "object":
            pass
        elif dtype_name == "bool":
            df[col] = df[col].astype("int8")
        elif dtype_name.startswith("int") or (df[col].round() == df[col]).all():
            df[col] = pd.to_numeric(df[col], downcast = "integer")
        else:
            df[col] = pd.to_numeric(df[col], downcast = "float")
    end_memory = df.memory_usage().sum() / 1024 ** 2

    if verbose:
        print("{:.1f}% compressed".format(100 * (start_memory - end_memory) / start_memory))

    return df

reg_data = downcast(reg_data, verbose = True)

55.2% compressed


In [6]:
# Recode the ym variable to an index
# Create a time trend variable from ym values
unique_ym = sorted(reg_data['ym'].unique())
ym_to_trend = {ym: i for i, ym in enumerate(unique_ym)}
reg_data['month'] = reg_data['ym'].map(ym_to_trend)

In [7]:
# Declare some variables as categorical
reg_data['shop_id']          = reg_data['shop_id'].astype('category')
reg_data['item_id']          = reg_data['item_id'].astype('category')
reg_data['item_category_id'] = reg_data['item_category_id'].astype('category')

# month of year
reg_data['month_of_year'] = reg_data['month'].mod(12)

reg_data['month_of_year'] = reg_data['month_of_year'].astype('category')

In [8]:
# Clipping variables: the test data has range (0,20)
vars_to_clip = ['qty', 'qty_lag1', 'qty_lag2', 'qty_lag3', 'qty_lag4', 'qty_lag5', 'qty_lag6', 'qty_roll3_lag1', 'qty_roll3_lag2', 'qty_roll3_lag3', 'qty_roll6_lag1', 'qty_roll6_lag2', 'qty_roll6_lag3', 'qty_roll12_lag1', 'qty_roll12_lag2', 'qty_roll12_lag3']

for var in vars_to_clip:
    # Clip the variable to the range (0, 20)
    reg_data[var].clip(0, 20, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reg_data[var].clip(0, 20, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reg_data[var].clip(0, 20, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behave

In [9]:
# One-hot encode categorical variables
item_category = pd.get_dummies(reg_data['item_category_id'], prefix='item_category', dtype=np.int8)
stores = pd.get_dummies(reg_data['shop_id'], prefix='shop', dtype=np.int8)
months = pd.get_dummies(reg_data['month_of_year'], prefix='month', dtype=np.int8)

# Combine the one-hot encoded variables with the original DataFrame
reg_data = pd.concat([reg_data, item_category, stores, months], axis=1)

reg_data

Unnamed: 0,ID,shop_id,item_id,item_category_id,ym,qty,qty_lag1,qty_lag2,qty_lag3,qty_lag4,...,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11
0,0,5,5037,19,516,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,5,5037,19,517,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,5,5037,19,518,0.0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,5,5037,19,519,0.0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,5,5037,19,520,0.0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496995,214199,45,969,37,546,0.0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7496996,214199,45,969,37,547,0.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7496997,214199,45,969,37,548,0.0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7496998,214199,45,969,37,549,0.0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
# Final Test data is where month == 34 (2015 Nov)
test_2015_nov = reg_data[reg_data['month'] == 34]
reg_data = reg_data[reg_data['month'] != 34]
reg_data = reg_data[reg_data['month'] > 3] # chopping the data
reg_data.index = range(len(reg_data))

In [11]:
cols = ""
for c in reg_data.columns:
    cols += "'" + c + "', "
cols = cols[:-2]  # remove last comma and space
print(cols)

'ID', 'shop_id', 'item_id', 'item_category_id', 'ym', 'qty', 'qty_lag1', 'qty_lag2', 'qty_lag3', 'qty_lag4', 'qty_lag5', 'qty_lag6', 'qty_roll3_lag1', 'qty_roll3_lag2', 'qty_roll3_lag3', 'qty_roll6_lag1', 'qty_roll6_lag2', 'qty_roll6_lag3', 'qty_roll12_lag1', 'qty_roll12_lag2', 'qty_roll12_lag3', 'relative_price_lag1', 'relative_price_lag2', 'relative_price_lag3', 'relative_price_lag4', 'relative_price_lag5', 'relative_price_lag6', 'relative_price3_lag1', 'relative_price3_lag2', 'relative_price3_lag3', 'product_qty_pclag1', 'product_qty_pclag2', 'product_qty_pclag3', 'shop_qty_pclag1', 'shop_qty_pclag2', 'shop_qty_pclag3', 'qty_substitute_prod_pclag1', 'qty_substitute_prod_pclag2', 'qty_substitute_prod_pclag3', 'price_mean_complement_prod_1_pclag1', 'price_mean_complement_prod_1_pclag2', 'price_mean_complement_prod_1_pclag3', 'price_mean_complement_prod_2_pclag1', 'price_mean_complement_prod_2_pclag2', 'price_mean_complement_prod_2_pclag3', 'price_mean_complement_prod_3_pclag1', 'price

In [12]:
Xcols = [
    'qty_lag1', 'qty_lag2', 'qty_lag3', 'qty_lag4', 'qty_lag5', 'qty_lag6', 'qty_roll3_lag1', 'qty_roll3_lag2', 'qty_roll3_lag3', 'qty_roll6_lag1', 'qty_roll6_lag2', 'qty_roll6_lag3', 'qty_roll12_lag1', 'qty_roll12_lag2', 'qty_roll12_lag3', 'relative_price_lag1', 'relative_price_lag2', 'relative_price_lag3', 'relative_price_lag4', 'relative_price_lag5', 'relative_price_lag6', 'relative_price3_lag1', 'relative_price3_lag2', 'relative_price3_lag3', 'product_qty_pclag1', 'product_qty_pclag2', 'product_qty_pclag3', 'shop_qty_pclag1', 'shop_qty_pclag2', 'shop_qty_pclag3', 'qty_substitute_prod_pclag1', 'qty_substitute_prod_pclag2', 'qty_substitute_prod_pclag3', 'price_mean_complement_prod_1_pclag1', 'price_mean_complement_prod_1_pclag2', 'price_mean_complement_prod_1_pclag3', 'price_mean_complement_prod_2_pclag1', 'price_mean_complement_prod_2_pclag2', 'price_mean_complement_prod_2_pclag3', 'price_mean_complement_prod_3_pclag1', 'price_mean_complement_prod_3_pclag2', 'price_mean_complement_prod_3_pclag3', 'price_mean_substitute_shop_1_pclag1', 'price_mean_substitute_shop_1_pclag2', 'price_mean_substitute_shop_1_pclag3', 'price_mean_substitute_shop_2_pclag1', 'price_mean_substitute_shop_2_pclag2', 'price_mean_substitute_shop_2_pclag3', 'price_mean_substitute_shop_3_pclag1', 'price_mean_substitute_shop_3_pclag2', 'price_mean_substitute_shop_3_pclag3', 
    'item_category_0', 'item_category_2', 'item_category_3', 'item_category_5', 'item_category_6', 'item_category_7', 'item_category_9', 'item_category_11', 'item_category_12', 'item_category_15', 'item_category_16', 'item_category_19', 'item_category_20', 'item_category_21', 'item_category_22', 'item_category_23', 'item_category_24', 'item_category_25', 'item_category_26', 'item_category_27', 'item_category_28', 'item_category_29', 'item_category_30', 'item_category_31', 'item_category_33', 'item_category_34', 'item_category_35', 'item_category_36', 'item_category_37', 'item_category_38', 'item_category_40', 'item_category_41', 'item_category_42', 'item_category_43', 'item_category_44', 'item_category_45', 'item_category_47', 'item_category_49', 'item_category_54', 'item_category_55', 'item_category_56', 'item_category_57', 'item_category_58', 'item_category_60', 'item_category_61', 'item_category_62', 'item_category_63', 'item_category_64', 'item_category_65', 'item_category_67', 'item_category_69', 'item_category_70', 'item_category_71', 'item_category_72', 'item_category_73', 'item_category_74', 'item_category_75', 'item_category_76', 'item_category_77', 'item_category_78', 'item_category_79', 'item_category_83', 'shop_2', 'shop_3', 'shop_4', 'shop_5', 'shop_6', 'shop_7', 'shop_10', 'shop_12', 'shop_14', 'shop_15', 'shop_16', 'shop_18', 'shop_19', 'shop_21', 'shop_22', 'shop_24', 'shop_25', 'shop_26', 'shop_28', 'shop_31', 'shop_34', 'shop_35', 'shop_36', 'shop_37', 'shop_38', 'shop_39', 'shop_41', 'shop_42', 'shop_44', 'shop_45', 'shop_46', 'shop_47', 'shop_48', 'shop_49', 'shop_50', 'shop_52', 'shop_53', 'shop_55', 'shop_56', 'shop_57', 'shop_58', 'shop_59', 'month_0', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11'
]

In [13]:
len(Xcols)

167

In [192]:
reg_data['ID'].unique()

array([     0,      1,      2, ..., 214197, 214198, 214199],
      shape=(214200,))

In [14]:
splits=2

mycv = PanelSplit(periods = reg_data.month, n_splits = splits, gap = 0, test_size=1)

hidden_neurons = round((len(Xcols)+1)/3)

nn_model = nn(
    max_iter=1000,
    verbose=True,
    hidden_layer_sizes=(hidden_neurons),
    early_stopping=True,
    validation_fraction=0.2,
    n_iter_no_change=10,
    solver ='adam',
    learning_rate_init=0.001, # default is 0.001
    shuffle=False # important to keep off for time series
    )

pipeline = Pipeline([
     ('scaler', StandardScaler()),
     ('pca', PCA(n_components=0.95)),
     ('model', nn_model)
 ], verbose=True)

fit = cross_val_fit(pipeline, reg_data[Xcols], reg_data['qty'], mycv, n_jobs=1)

[Pipeline] ............ (step 1 of 3) Processing scaler, total=  15.8s
[Pipeline] ............... (step 2 of 3) Processing pca, total=   5.9s
Iteration 1, loss = 0.25531122
Validation score: 0.596868
Iteration 2, loss = 0.23025106
Validation score: 0.611350
Iteration 3, loss = 0.22429292
Validation score: 0.618615
Iteration 4, loss = 0.22087939
Validation score: 0.622434
Iteration 5, loss = 0.21852037
Validation score: 0.626175
Iteration 6, loss = 0.21650502
Validation score: 0.628763
Iteration 7, loss = 0.21482186
Validation score: 0.631384
Iteration 8, loss = 0.21331665
Validation score: 0.632382
Iteration 9, loss = 0.21214639
Validation score: 0.633177
Iteration 10, loss = 0.21076170
Validation score: 0.634634
Iteration 11, loss = 0.20978585
Validation score: 0.637164
Iteration 12, loss = 0.20879014
Validation score: 0.636741
Iteration 13, loss = 0.20828509
Validation score: 0.637596
Iteration 14, loss = 0.20785542
Validation score: 0.639064
Iteration 15, loss = 0.20753817
Validatio

In [15]:
predictions = extract_cv_fit(fit, test_2015_nov[Xcols])

In [16]:
# Export post-processed prediction
testresults = test_2015_nov
testresults.index = range(len(testresults))
testresults['item_cnt_month'] = predictions['pred_mean']

testresults['item_cnt_month'].clip(0, 20, inplace=True)

to_export = testresults[['ID', 'item_cnt_month']]
to_export.to_csv(r'../output/predictions_nn_202505081730.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testresults['item_cnt_month'] = predictions['pred_mean']
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  testresults['item_cnt_month'].clip(0, 20, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testresults['item_cnt_month'].