In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
!pip install plotly_express

Collecting plotly_express
  Downloading https://files.pythonhosted.org/packages/d4/d6/8a2906f51e073a4be80cab35cfa10e7a34853e60f3ed5304ac470852a08d/plotly_express-0.4.1-py2.py3-none-any.whl
Installing collected packages: plotly-express
Successfully installed plotly-express-0.4.1


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import plotly_express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [4]:
!cp /content/drive/My\ Drive/m5/m5-forecasting-accuracy.zip /content

In [5]:
!unzip /content/m5-forecasting-accuracy.zip

Archive:  /content/m5-forecasting-accuracy.zip
  inflating: calendar.csv            
  inflating: sales_train_evaluation.csv  
  inflating: sales_train_validation.csv  
  inflating: sample_submission.csv   
  inflating: sell_prices.csv         


In [31]:
sales_df = pd.read_csv('sales_train_validation.csv')
calendar_df = pd.read_csv('calendar.csv')
prices_df = pd.read_csv('sell_prices.csv')

# Memory Saver and Label Encoding

In [32]:
sales_bd = np.round(sales_df.memory_usage().sum()/(1024*1024),1)
calendar_bd = np.round(calendar_df.memory_usage().sum()/(1024*1024),1)
prices_bd = np.round(prices_df.memory_usage().sum()/(1024*1024),1)

In [33]:
#Downcast in order to save memory
def downcast(df):
    cols = df.dtypes.index.tolist()
    types = df.dtypes.values.tolist()
    for i,t in enumerate(types):
        if 'int' in str(t):
            if df[cols[i]].min() > np.iinfo(np.int8).min and df[cols[i]].max() < np.iinfo(np.int8).max:
                df[cols[i]] = df[cols[i]].astype(np.int8)
            elif df[cols[i]].min() > np.iinfo(np.int16).min and df[cols[i]].max() < np.iinfo(np.int16).max:
                df[cols[i]] = df[cols[i]].astype(np.int16)
            elif df[cols[i]].min() > np.iinfo(np.int32).min and df[cols[i]].max() < np.iinfo(np.int32).max:
                df[cols[i]] = df[cols[i]].astype(np.int32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.int64)
        elif 'float' in str(t):
            if df[cols[i]].min() > np.finfo(np.float16).min and df[cols[i]].max() < np.finfo(np.float16).max:
                df[cols[i]] = df[cols[i]].astype(np.float16)
            elif df[cols[i]].min() > np.finfo(np.float32).min and df[cols[i]].max() < np.finfo(np.float32).max:
                df[cols[i]] = df[cols[i]].astype(np.float32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.float64)
        elif t == np.object:
            if cols[i] == 'date':
                df[cols[i]] = pd.to_datetime(df[cols[i]], format='%Y-%m-%d')
            else:
                df[cols[i]] = df[cols[i]].astype('category')
    return df  

In [34]:
def encode_categorical(df, cols):
    
    for col in cols:
        # Leave NaN as it is.
        le = LabelEncoder()
        #not_null = df[col][df[col].notnull()]
        df[col] = df[col].fillna('nan')
        df[col] = pd.Series(le.fit_transform(df[col]), index=df.index)

    return df

In [11]:
calendar_df = encode_categorical(
    calendar_df, ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]).pipe(downcast)

sales_df = encode_categorical(
    sales_df, ["item_id", "dept_id", "cat_id", "store_id", "state_id"]).pipe(downcast)

prices_df = encode_categorical(
    prices_df, ["item_id", "store_id"]).pipe(downcast)

In [12]:
sales_ad = np.round(sales_df.memory_usage().sum()/(1024*1024),1)
calendar_ad = np.round(calendar_df.memory_usage().sum()/(1024*1024),1)
prices_ad = np.round(prices_df.memory_usage().sum()/(1024*1024),1)

In [13]:
dic = {'DataFrame':['sales','calendar','prices'],
       'Before downcasting':[sales_bd,calendar_bd,prices_bd],
       'After downcasting':[sales_ad,calendar_ad,prices_ad]}

memory = pd.DataFrame(dic)
memory = pd.melt(memory, id_vars='DataFrame', var_name='Status', value_name='Memory (MB)')
memory.sort_values('Memory (MB)',inplace=True)
fig = px.bar(memory, x='DataFrame', y='Memory (MB)', color='Status', barmode='group', text='Memory (MB)')
fig.update_traces(texttemplate='%{text} MB', textposition='outside')
fig.update_layout(template='seaborn', title='Effect of Downcasting')
fig.show()

In [14]:
prediction_df = pd.DataFrame(columns=[f'd_{day_num}' for day_num in range(1914, (1914+28+28))])

In [15]:
sales_df = pd.concat([sales_df, prediction_df], axis=1)

In [16]:
sales_df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,d_20,d_21,d_22,d_23,d_24,d_25,d_26,d_27,d_28,d_29,d_30,d_31,d_32,d_33,d_34,...,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941,d_1942,d_1943,d_1944,d_1945,d_1946,d_1947,d_1948,d_1949,d_1950,d_1951,d_1952,d_1953,d_1954,d_1955,d_1956,d_1957,d_1958,d_1959,d_1960,d_1961,d_1962,d_1963,d_1964,d_1965,d_1966,d_1967,d_1968,d_1969
0,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,HOBBIES_1_002_CA_1_validation,1438,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,HOBBIES_1_003_CA_1_validation,1439,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,HOBBIES_1_004_CA_1_validation,1440,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,HOBBIES_1_005_CA_1_validation,1441,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,1432,2,0,9,2,0,0,2,2,0,3,1,4,1,0,0,3,4,4,0,0,1,0,1,1,7,7,3,6,3,3,7,12,4,2,7,5,12,5,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
30486,FOODS_3_824_WI_3_validation,1433,2,0,9,2,0,0,0,0,0,5,0,1,1,3,1,1,0,4,2,0,1,2,1,1,0,0,0,0,3,1,1,1,2,0,1,0,1,1,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
30487,FOODS_3_825_WI_3_validation,1434,2,0,9,2,0,6,0,2,2,4,1,8,5,2,7,5,3,5,20,8,10,3,3,4,7,2,3,5,6,3,4,1,2,5,1,2,2,2,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
30488,FOODS_3_826_WI_3_validation,1435,2,0,9,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Melting and Merging 

In [17]:
melted_df = pd.melt(sales_df, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name='d', value_name='sold')

In [18]:
melted_df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sold
0,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,d_1,0
1,HOBBIES_1_002_CA_1_validation,1438,3,1,0,0,d_1,0
2,HOBBIES_1_003_CA_1_validation,1439,3,1,0,0,d_1,0
3,HOBBIES_1_004_CA_1_validation,1440,3,1,0,0,d_1,0
4,HOBBIES_1_005_CA_1_validation,1441,3,1,0,0,d_1,0
...,...,...,...,...,...,...,...,...
60034805,FOODS_3_823_WI_3_validation,1432,2,0,9,2,d_1969,
60034806,FOODS_3_824_WI_3_validation,1433,2,0,9,2,d_1969,
60034807,FOODS_3_825_WI_3_validation,1434,2,0,9,2,d_1969,
60034808,FOODS_3_826_WI_3_validation,1435,2,0,9,2,d_1969,


In [19]:
calendar_df['d'].nunique()

1969

In [20]:
melted_df['d'].nunique()

1969

In [21]:
melted_df = pd.merge(calendar_df, melted_df, on='d', how='left')
melted_df = pd.merge(melted_df, prices_df, on=['store_id','item_id','wm_yr_wk'], how='left') 

In [22]:
melted_df['d'].nunique()

1969

In [23]:
melted_df['date'] = pd.to_datetime(melted_df['date'])
melted_df.set_index('date', inplace=True)

In [24]:
melted_df.drop(['d', 'weekday'], axis=1, inplace=True)

In [25]:
melted_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 60034810 entries, 2011-01-29 to 2016-06-19
Data columns (total 19 columns):
 #   Column        Dtype   
---  ------        -----   
 0   wm_yr_wk      int16   
 1   wday          int8    
 2   month         int8    
 3   year          int16   
 4   event_name_1  int8    
 5   event_type_1  int8    
 6   event_name_2  int8    
 7   event_type_2  int8    
 8   snap_CA       int8    
 9   snap_TX       int8    
 10  snap_WI       int8    
 11  id            category
 12  item_id       int16   
 13  dept_id       int8    
 14  cat_id        int8    
 15  store_id      int8    
 16  state_id      int8    
 17  sold          object  
 18  sell_price    float16 
dtypes: category(1), float16(1), int16(3), int8(13), object(1)
memory usage: 2.2+ GB


In [26]:
melted_df

Unnamed: 0_level_0,wm_yr_wk,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,id,item_id,dept_id,cat_id,store_id,state_id,sold,sell_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2011-01-29,11101,1,1,2011,30,4,4,2,0,0,0,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,0,
2011-01-29,11101,1,1,2011,30,4,4,2,0,0,0,HOBBIES_1_002_CA_1_validation,1438,3,1,0,0,0,
2011-01-29,11101,1,1,2011,30,4,4,2,0,0,0,HOBBIES_1_003_CA_1_validation,1439,3,1,0,0,0,
2011-01-29,11101,1,1,2011,30,4,4,2,0,0,0,HOBBIES_1_004_CA_1_validation,1440,3,1,0,0,0,
2011-01-29,11101,1,1,2011,30,4,4,2,0,0,0,HOBBIES_1_005_CA_1_validation,1441,3,1,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-06-19,11621,2,6,2016,16,3,2,0,0,0,0,FOODS_3_823_WI_3_validation,1432,2,0,9,2,,2.980469
2016-06-19,11621,2,6,2016,16,3,2,0,0,0,0,FOODS_3_824_WI_3_validation,1433,2,0,9,2,,2.480469
2016-06-19,11621,2,6,2016,16,3,2,0,0,0,0,FOODS_3_825_WI_3_validation,1434,2,0,9,2,,3.980469
2016-06-19,11621,2,6,2016,16,3,2,0,0,0,0,FOODS_3_826_WI_3_validation,1435,2,0,9,2,,1.280273


In [27]:
melted_df['store_id'].unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int8)

In [28]:
melted_df[melted_df['store_id'].isna()]

Unnamed: 0_level_0,wm_yr_wk,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,id,item_id,dept_id,cat_id,store_id,state_id,sold,sell_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1


# Split into Seperate DFs by Store and Pickle

store_0=melted_df.loc[melted_df['store_id']==0]
store_1=melted_df.loc[melted_df['store_id']==1]
store_2=melted_df.loc[melted_df['store_id']==2]
store_3=melted_df.loc[melted_df['store_id']==3]
store_4=melted_df.loc[melted_df['store_id']==4]
store_5=melted_df.loc[melted_df['store_id']==5]
store_6=melted_df.loc[melted_df['store_id']==6]
store_7=melted_df.loc[melted_df['store_id']==7]
store_8=melted_df.loc[melted_df['store_id']==8]
store_9=melted_df.loc[melted_df['store_id']==9]

storeid_df = [x for store_id, x in melted_df.groupby('store_id') ]

In [29]:
import pickle

store_range = melted_df['store_id'].unique()
store_range = store_range.tolist()

for i,value in enumerate(store_range):
    melted_df[melted_df['store_id'] == value].to_pickle(r'StoreID_'+str(value)+r'.pickle')

In [30]:
!mv *.pickle /content/drive/My\ Drive/m5