<a href="https://colab.research.google.com/github/vence-andersen/M5-Forecasting-Accuracy/blob/main/Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Removables (Colab-only setup: Kaggle credentials and data download)**

In [None]:
# Colab only: open the browser file picker to upload a file into the working directory.
# The next cell expects the uploaded file to be `kaggle.json`.
from google.colab import files
files.upload()

In [None]:
# Copy the uploaded kaggle.json into ~/.kaggle/ and restrict it to owner read/write (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the M5 Forecasting - Accuracy competition files into the working directory.
! kaggle competitions download -c m5-forecasting-accuracy

In [None]:
# Extract the downloaded archives. `-o` overwrites existing files without prompting,
# so re-running this cell (e.g. Restart & Run All) does not stop at unzip's "replace?" question.
!unzip -o sales_train_evaluation.csv.zip
!unzip -o sales_train_validation.csv.zip
!unzip -o sample_submission.csv.zip
!unzip -o sell_prices.csv.zip

# **Importing the required modules and reading the CSV files**

In [None]:
! pip install -q downcast
import pandas as pd
import numpy as np
from tqdm import tqdm
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error as mse
from downcast import reduce
import warnings
warnings.filterwarnings("ignore")
import pickle

In [None]:
# Load the three input tables used by final_1 / final_2 below.
# `sales` is wide: one row per series, six id columns followed by one `d_<n>` column per day.
sales = pd.read_csv("sales_train_evaluation.csv")
# `sell_price` is merged on (store_id, item_id, wm_yr_wk) inside the functions below.
sell_price = pd.read_csv("sell_prices.csv")
# `cal` is merged on its `d` column inside the functions below.
cal = pd.read_csv("calendar.csv")

# **Function_1**

In [None]:
# Pick one series at random (fixed seed, so the same row is drawn on every run)
# to demonstrate the forecast function on it.
test = sales.sample(random_state=13).reset_index(drop=True)
test

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,d_20,d_21,d_22,d_23,d_24,d_25,d_26,d_27,d_28,d_29,d_30,d_31,d_32,d_33,d_34,...,d_1902,d_1903,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_139_WI_1_evaluation,HOBBIES_1_139,HOBBIES_1,HOBBIES,WI_1,WI,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,1,0,1,2,0,0,0,0,1,0,0,0,0,0,1,0,2,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0


In [None]:
def final_1(test, dict_path='/content/drive/MyDrive/all_dict',
            model_path='/content/drive/MyDrive/best_model'):
    """Forecast demand for the last 28 observed days and the 28 days after them.

    The wide sales frame is extended with 28 placeholder day columns, melted to
    one row per (series, day), joined with the notebook-level `cal` and
    `sell_price` tables, label-encoded with the pickled dictionaries, given lag
    features, and passed to the pickled model.

    Parameters
    ----------
    test : pandas.DataFrame
        Wide sales rows: the six id columns ('id', 'item_id', 'dept_id',
        'cat_id', 'store_id', 'state_id') followed by `d_<n>` day columns.
        The frame is not modified.
    dict_path : str
        Pickle holding a list of ten label dictionaries, in the order
        id, item_id, dept_id, cat_id, store_id, state_id, event_name_1,
        event_name_2, event_type_1, event_type_2.
    model_path : str
        Pickle holding the fitted model (anything with a `.predict` method).
        Both default paths point into a mounted Google Drive.

    Returns
    -------
    (vals, tst) : tuple of pandas.DataFrame
        Each has an 'ID' column and 'F1'..'F28'. `vals` covers the last 28
        observed days (ids renamed from 'evaluation' to 'validation'); `tst`
        covers the 28 days after the last observed day. For the evaluation
        file these are days 1914-1941 and 1942-1969 respectively.

    Raises
    ------
    ValueError
        If `test` has no day columns.
    """
    id_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    horizon = 28

    # The forecast window starts right after the last day present in the input
    # (1941 for the evaluation file, so the placeholders are d_1942..d_1969).
    day_cols = [col for col in test.columns if col not in id_cols]
    if not day_cols:
        raise ValueError("`test` must contain at least one 'd_<n>' day column")
    last_day = max(int(col.split('_')[1]) for col in day_cols)

    # Placeholder columns for the days to forecast. `assign` returns a new frame,
    # so the caller's DataFrame is left untouched.
    placeholders = {'d_' + str(day): np.int32(0)
                    for day in range(last_day + 1, last_day + 1 + horizon)}
    test = reduce(test.assign(**placeholders))

    # Turn the time series into a supervised-learning table: one row per (series, day).
    data = pd.melt(test, id_vars=id_cols, var_name='day', value_name='demand')

    # Attach calendar and price information.
    data = data.merge(cal, left_on='day', right_on='d')
    data = data.merge(sell_price, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')

    # Missing prices are replaced with the series' mean price. Assigning the result
    # back (instead of a chained inplace fillna) also works under pandas Copy-on-Write.
    data['sell_price'] = data['sell_price'].fillna(
        data.groupby('id')['sell_price'].transform('mean'))

    # 'd_123' -> 123 so the day can be used as an integer feature.
    data['day'] = data['day'].str.split('_').str[1].astype(np.int16)

    # `wday` already encodes the weekday and `d` duplicates `day`.
    data = data.drop(columns=['d', 'weekday', 'date'])

    # The model was trained with a custom label encoding, so the same dictionaries
    # have to be applied here.
    # NOTE: pickle.load can execute arbitrary code - only load files you trust.
    with open(dict_path, 'rb') as fh:
        all_dicts = pickle.load(fh)

    encoded_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
                    'event_name_1', 'event_name_2', 'event_type_1', 'event_type_2']
    for position, col in enumerate(encoded_cols):
        data[col] = data[col].apply(all_dicts[position].get)

    # Lag features are built per series. Grouping uses the un-narrowed id labels:
    # after the int8 cast below, two different series could share a wrapped label.
    series_key = data['id'].astype('int64')
    for lag in [28, 30, 35, 42, 49, 56, 63, 70]:
        data['lag_' + str(lag)] = data.groupby(series_key)['demand'].shift(lag).astype(np.float16)

    # Days without an event get -1, the label used for missing events during training.
    for col in ['event_name_1', 'event_name_2', 'event_type_1', 'event_type_2']:
        data[col] = data[col].fillna(-1).astype('int8')

    # NOTE(review): int8 holds only -128..127; an `id` label above that wraps silently.
    # Kept as-is to match the existing pipeline - confirm this is what training did.
    for col in ['id', 'dept_id', 'cat_id', 'state_id']:
        data[col] = data[col].astype('int8')

    # Only days after 1000 are kept (EDA showed no proper seasonality before that).
    data = data[data['day'] > 1000].reset_index(drop=True)

    # Load the best model found during training (see the pickle note above).
    with open(model_path, 'rb') as fh:
        lgb = pickle.load(fh)

    # `demand` is the target, everything else is a feature (column order is preserved).
    features = data.drop(columns='demand')

    in_validation = (features['day'] > last_day - horizon) & (features['day'] <= last_day)
    pred_val_array = lgb.predict(features[in_validation])
    pred_test_array = lgb.predict(features[features['day'] > last_day])

    # Rows are ordered day by day (all series of day 1, then day 2, ...), so a
    # column-major reshape yields one row per series and one column per day.
    pred_val_array = np.reshape(pred_val_array, (-1, horizon), order='F')
    pred_test_array = np.reshape(pred_test_array, (-1, horizon), order='F')

    cols = ['F' + str(i) for i in range(1, horizon + 1)]

    # One ID per input row, positionally aligned with the prediction rows.
    ids = test['id'].astype(str).reset_index(drop=True).rename('ID')

    vals = pd.concat([ids.str.replace('evaluation', 'validation', regex=False),
                      pd.DataFrame(pred_val_array, columns=cols)], axis=1)
    tst = pd.concat([ids, pd.DataFrame(pred_test_array, columns=cols)], axis=1)

    return vals, tst

In [None]:
# Run the forecast for the sampled series and show both 28-day windows
# (`vals`: validation window, `tst`: evaluation window).
vals, tst = final_1(test)
print('Forecast sales from days 1914 till 1941 is:')
display(vals)
print('\nForecast sales from days 1942 till 1969 is:')
display(tst)

Forecast sales from days 1914 till 1941 is:


Unnamed: 0,ID,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_139_WI_1_validation,0.427085,0.71657,0.343393,0.568409,0.330454,0.794805,0.489243,0.459894,0.750902,0.420226,0.525707,0.425969,0.607013,0.455384,0.291552,0.531099,0.458824,0.51533,0.363455,0.476559,0.402097,0.343486,0.520414,0.489011,0.513274,0.542959,0.416199,0.636495



Forecast sales from days 1942 till 1969 is:


Unnamed: 0,ID,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_139_WI_1_evaluation,0.313194,0.474262,0.416199,0.388617,0.564641,0.280988,0.33888,0.285426,0.546403,0.404203,0.669596,0.444401,0.554913,0.34684,0.267322,0.473971,0.461642,0.651972,0.469221,0.401177,0.291985,0.335359,0.410047,0.435015,0.487305,0.343876,0.350149,0.238236


# **Function_2**

In [None]:
# Draw a different series (fixed seed) to score the model with final_2.
test = sales.sample(random_state=33).reset_index(drop=True)
test

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,d_20,d_21,d_22,d_23,d_24,d_25,d_26,d_27,d_28,d_29,d_30,d_31,d_32,d_33,d_34,...,d_1902,d_1903,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOUSEHOLD_2_484_CA_2_evaluation,HOUSEHOLD_2_484,HOUSEHOLD_2,HOUSEHOLD,CA_2,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,3,0,0,0,0,0,0,1,0,1,2,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,2,0,1,0,0,0,1,0


In [None]:
def final_2(test, y_true, dict_path='/content/drive/MyDrive/all_dict',
            model_path='/content/drive/MyDrive/best_model'):
    """Forecast the 28 days after the last day in `test` and return the RMSE against `y_true`.

    The wide sales frame is extended with 28 placeholder day columns that follow
    its last day column, melted to one row per (series, day), joined with the
    notebook-level `cal` and `sell_price` tables, label-encoded with the pickled
    dictionaries, given lag features, and passed to the pickled model.

    Parameters
    ----------
    test : pandas.DataFrame
        Wide sales history: the six id columns ('id', 'item_id', 'dept_id',
        'cat_id', 'store_id', 'state_id') followed by `d_<n>` day columns.
        The frame is not modified.
    y_true : array-like of shape (n_series, 28)
        Actual demand for the 28 days following the last day column of `test`,
        one row per row of `test`.
    dict_path : str
        Pickle holding a list of ten label dictionaries, in the order
        id, item_id, dept_id, cat_id, store_id, state_id, event_name_1,
        event_name_2, event_type_1, event_type_2.
    model_path : str
        Pickle holding the fitted model (anything with a `.predict` method).
        Both default paths point into a mounted Google Drive.

    Returns
    -------
    numpy.float64
        Root mean squared error over all (series, day) cells.

    Raises
    ------
    ValueError
        If `test` has no day columns, or `y_true` does not match the shape of
        the predictions.
    """
    id_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    horizon = 28

    # The days to forecast are the ones right after the history that was passed in.
    # Deriving them from the input keeps the calendar/price features of the forecast
    # rows aligned with the days `y_true` refers to.
    day_cols = [col for col in test.columns if col not in id_cols]
    if not day_cols:
        raise ValueError("`test` must contain at least one 'd_<n>' day column")
    last_day = max(int(col.split('_')[1]) for col in day_cols)

    # Placeholder columns for the days to forecast. `assign` returns a new frame,
    # so the caller's DataFrame (often a slice) is left untouched.
    placeholders = {'d_' + str(day): np.int32(0)
                    for day in range(last_day + 1, last_day + 1 + horizon)}
    test = reduce(test.assign(**placeholders))

    # Turn the time series into a supervised-learning table: one row per (series, day).
    data = pd.melt(test, id_vars=id_cols, var_name='day', value_name='demand')

    # Attach calendar and price information.
    data = data.merge(cal, left_on='day', right_on='d')
    data = data.merge(sell_price, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')

    # Missing prices are replaced with the series' mean price. Assigning the result
    # back (instead of a chained inplace fillna) also works under pandas Copy-on-Write.
    data['sell_price'] = data['sell_price'].fillna(
        data.groupby('id')['sell_price'].transform('mean'))

    # 'd_123' -> 123 so the day can be used as an integer feature.
    data['day'] = data['day'].str.split('_').str[1].astype(np.int16)

    # `wday` already encodes the weekday and `d` duplicates `day`.
    data = data.drop(columns=['d', 'weekday', 'date'])

    # The model was trained with a custom label encoding, so the same dictionaries
    # have to be applied here.
    # NOTE: pickle.load can execute arbitrary code - only load files you trust.
    with open(dict_path, 'rb') as fh:
        all_dicts = pickle.load(fh)

    encoded_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
                    'event_name_1', 'event_name_2', 'event_type_1', 'event_type_2']
    for position, col in enumerate(encoded_cols):
        data[col] = data[col].apply(all_dicts[position].get)

    # Lag features are built per series. Grouping uses the un-narrowed id labels:
    # after the int8 cast below, two different series could share a wrapped label.
    series_key = data['id'].astype('int64')
    for lag in [28, 30, 35, 42, 49, 56, 63, 70]:
        data['lag_' + str(lag)] = data.groupby(series_key)['demand'].shift(lag).astype(np.float16)

    # Days without an event get -1, the label used for missing events during training.
    for col in ['event_name_1', 'event_name_2', 'event_type_1', 'event_type_2']:
        data[col] = data[col].fillna(-1).astype('int8')

    # NOTE(review): int8 holds only -128..127; an `id` label above that wraps silently.
    # Kept as-is to match the existing pipeline - confirm this is what training did.
    for col in ['id', 'dept_id', 'cat_id', 'state_id']:
        data[col] = data[col].astype('int8')

    # Only days after 1000 are kept (EDA showed no proper seasonality before that).
    data = data[data['day'] > 1000].reset_index(drop=True)

    # Load the best model found during training (see the pickle note above).
    with open(model_path, 'rb') as fh:
        lgb = pickle.load(fh)

    # `demand` is the target, everything else is a feature (column order is preserved).
    features = data.drop(columns='demand')

    # Predict only the placeholder days, i.e. the ones after the supplied history.
    y_pred = lgb.predict(features[features['day'] > last_day])

    # Rows are ordered day by day (all series of day 1, then day 2, ...), so a
    # column-major reshape yields one row per series and one column per day.
    y_pred = np.reshape(y_pred, (-1, horizon), order='F')

    actual = np.asarray(y_true, dtype=np.float64)
    if actual.shape != y_pred.shape:
        raise ValueError(
            f"y_true has shape {actual.shape}, expected {y_pred.shape}")

    # RMSE over every (series, day) cell. Computed directly because
    # mean_squared_error(..., squared=False) averages per-column roots - for a single
    # series that is the mean absolute error - and the `squared` argument has been
    # deprecated in scikit-learn.
    r_m_s_e = np.sqrt(np.mean((actual - y_pred) ** 2))

    return r_m_s_e

In [None]:
# Hold out the last 28 day columns of the sampled row as ground truth and
# pass everything before them to final_2 as the history.
r_m_s_e = final_2(test.iloc[:,:-28],test.iloc[:,-28:])
print(f"The RMSE score is {r_m_s_e}")

The RMSE score is 0.6650955041390069
