# Imports

In [2]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from itertools import product

In [2]:
# Load the raw competition tables. Paths are relative to the notebook's working directory.
train = pd.read_csv('Data/Grocery/sales_train.csv.gz')  # daily sales records (date, shop, item, price, count)
shops = pd.read_csv('Data/Grocery/shops.csv')  # shop_id -> shop_name lookup
items = pd.read_csv('Data/Grocery/items.csv')  # item_id -> item_name / item_category_id lookup
test = pd.read_csv('Data/Grocery/test.csv.gz')  # NOTE(review): not used in the cells shown here
items_cats = pd.read_csv('Data/Grocery/item_categories.csv')  # item_category_id -> item_category_name lookup
sub = pd.read_csv('Data/Grocery/sample_submission.csv.gz')  # NOTE(review): not used in the cells shown here

In [3]:
# Work on the tail of the sales log only: every row from position 2,750,000
# onward, copied so later edits never touch `train`.
trn = train.iloc[2750000:].copy()

In [4]:
# Size of the retained slice as (rows, columns).
trn.shape

(185849, 6)

In [5]:
# Column dtypes; `date` is still a plain string (object), not a datetime.
trn.dtypes

date               object
date_block_num      int64
shop_id             int64
item_id             int64
item_price        float64
item_cnt_day      float64
dtype: object

In [6]:
# Preview the first rows of the retained slice.
trn.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
2750000,07.07.2015,30,49,8534,2299.0,1.0
2750001,14.07.2015,30,49,8543,1400.0,1.0
2750002,30.07.2015,30,49,13076,1199.0,1.0
2750003,09.07.2015,30,49,12421,249.0,1.0
2750004,26.07.2015,30,49,12465,229.0,1.0


# Preprocessing data

Since the competition task is to make monthly predictions, we need to aggregate the daily sales data to the monthly level.

In [7]:
# Build the full (month, shop, item) grid: for every month, pair each shop
# that appears in that month's sales with each item that appears in that
# month's sales. Pairs without any sale are filled with zeros after the merge.
grid = []
for month in trn.date_block_num.unique():
    # Filter the month once and reuse it for both lookups
    month_sales = trn[trn.date_block_num == month]
    shop_list = month_sales.shop_id.unique()
    item_list = month_sales.item_id.unique()
    # Cartesian product of this month's shops and items, tagged with the month
    grid.append(np.array(list(product([month], shop_list, item_list))))

In [8]:
# Stack the per-month arrays into one frame: one row per (month, shop, item).
grid_columns = ['date_block_num', 'shop_id', 'item_id']
grid = pd.DataFrame(np.vstack(grid), columns=grid_columns)

#### Add the total number of each item sold per shop per month

In [9]:
# Monthly total of item_cnt_day per (month, shop, item); this is the target.
temp = (
    trn.groupby(['date_block_num', 'shop_id', 'item_id'])[['item_cnt_day']]
    .sum()
    .rename(columns={'item_cnt_day': 'Target'})
)

In [10]:
# Left-join the monthly totals onto the grid; grid rows without sales get 0.
all_data = pd.merge(
    grid,
    temp,
    how='left',
    on=['date_block_num', 'shop_id', 'item_id'],
).fillna(0)

#### Add the mean daily number of each item sold per shop per month

In [11]:
# Mean of the daily item_cnt_day records per (month, shop, item).
temp_mean = (
    trn.groupby(['date_block_num', 'shop_id', 'item_id'])[['item_cnt_day']]
    .mean()
    .rename(columns={'item_cnt_day': 'Target_mean'})
)

In [12]:
# Attach the mean feature the same way; rows without sales get Target_mean = 0.
all_data = pd.merge(
    all_data,
    temp_mean,
    how='left',
    on=['date_block_num', 'shop_id', 'item_id'],
).fillna(0)

In [13]:
# Order rows by month, then shop, then item.
all_data = all_data.sort_values(by=['date_block_num', 'shop_id', 'item_id'])

In [14]:
# Replace the shuffled index left by the sort with a fresh 0..n-1 index.
all_data = all_data.reset_index(drop=True)

#### Merge with shops, items and categories

In [15]:
# Attach shop attributes (inner join, the pandas default).
all_data = pd.merge(all_data, shops, on='shop_id')

In [16]:
# Attach item attributes (inner join, the pandas default).
all_data = pd.merge(all_data, items, on='item_id')

In [17]:
# Attach category attributes via the item_category_id brought in with the items.
all_data = pd.merge(all_data, items_cats, on='item_category_id')

In [18]:
# Preview the grid after joining shop, item and category attributes.
all_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,Target,Target_mean,shop_name,item_name,item_category_id,item_category_name
0,30,2,31,0.0,0.0,"Адыгея ТЦ ""Мега""",007: КООРДИНАТЫ «СКАЙФОЛЛ» (BD),37,Кино - Blu-Ray
1,31,2,31,0.0,0.0,"Адыгея ТЦ ""Мега""",007: КООРДИНАТЫ «СКАЙФОЛЛ» (BD),37,Кино - Blu-Ray
2,32,2,31,0.0,0.0,"Адыгея ТЦ ""Мега""",007: КООРДИНАТЫ «СКАЙФОЛЛ» (BD),37,Кино - Blu-Ray
3,33,2,31,1.0,1.0,"Адыгея ТЦ ""Мега""",007: КООРДИНАТЫ «СКАЙФОЛЛ» (BD),37,Кино - Blu-Ray
4,30,18,31,0.0,0.0,"Красноярск ТЦ ""Июнь""",007: КООРДИНАТЫ «СКАЙФОЛЛ» (BD),37,Кино - Blu-Ray


In [19]:
# Row and column count after the joins.
all_data.shape

(762113, 9)

#### Mean Encoded Feature 

#### Before Regularization

#### After Regularization

#### Add the character length of each text feature

In [20]:
# Engineer features on a copy so `all_data` stays untouched.
df = all_data.copy()

In [21]:
def text_count(df: pd.DataFrame, col: str) -> None:
    """Add a ``<col>_size`` column holding the character length of ``df[col]``.

    The frame is modified in place; nothing is returned.
    """
    lengths = df[col].str.len()
    df[col + '_size'] = lengths

In [22]:
# Add a character-length feature for each text column.
for text_col in ('shop_name', 'item_name', 'item_category_name'):
    text_count(df, text_col)

#### Word count (whitespace-separated tokens)

In [23]:
def word_count(df, col, distinct=False):
    """Add a ``<col>_count`` column with the number of words in ``df[col]``.

    Each value is lower-cased and split on whitespace. The frame is modified
    in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the text column and receives the new column.
    col : str
        Name of the text column to analyse.
    distinct : bool, default False
        If False (the original behaviour), count every token. If True, count
        each distinct lower-cased token once, so ``'Foo foo bar'`` gives 2.
    """
    tokens = df[col].str.lower().str.split()
    if distinct:
        # Missing values are not lists; pass them through so they stay missing
        counts = tokens.map(
            lambda words: len(set(words)) if isinstance(words, list) else words
        )
    else:
        counts = tokens.str.len()
    df[col + '_count'] = counts

In [24]:
# Add a word-count feature for each text column.
for text_col in ('shop_name', 'item_name', 'item_category_name'):
    word_count(df, text_col)

#### Add TF-IDF term features for the text columns

In [25]:
def display_all(df):
    """Display ``df`` with the row and column display limits raised to 1000.

    The pandas display options are changed only inside the ``with`` block and
    are restored when it exits.
    """
    wide_view = pd.option_context('display.max_rows', 1000, 'display.max_columns', 1000)
    with wide_view:
        display(df)

In [34]:
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
# Cap each text column's vocabulary at this many terms.
feat_count = 25
# One vectorizer instance, re-fitted for every text column below.
txt_vect = TfidfVectorizer(max_features=feat_count)

In [36]:
columns = ['shop_name', 'item_name', 'item_category_name']

# For each text column, fit TF-IDF and add one `<col>_tf_<term>` column per
# vocabulary term.
for col in columns:
    tfidf_matrix = txt_vect.fit_transform(df[col])
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # use its replacement when present, fall back on older installations.
    if hasattr(txt_vect, 'get_feature_names_out'):
        feature_names = txt_vect.get_feature_names_out()
    else:
        feature_names = txt_vect.get_feature_names()
    # Reuse df's index so the assignment below lines up row-for-row even when
    # df's index is not a plain 0..n-1 range.
    df_temp = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names, index=df.index)
    for i in df_temp.columns:
        df[col + '_tf_' + str(i)] = df_temp[i]

In [37]:
# Expect 90 columns: 9 base + 3 *_size + 3 *_count + 3 x 25 TF-IDF terms.
df.shape

(762113, 90)

#### Label Encoding

In [38]:
# Scratch copy that gets label-encoded for duplicate detection; `df` keeps its real values.
df_encode = df.copy()

In [39]:
# Replace every column by integer codes numbered in order of first appearance,
# so two columns that split the rows into the same groups become identical.
for name in df_encode.columns:
    codes = df_encode[name].factorize()[0]
    df_encode[name] = codes

In [40]:
# Find redundant columns: columns whose factorized codes equal those of an
# earlier column. `bag` maps a column to a later column that duplicates it, so
# `bag.values()` is exactly the set of columns that can be dropped.
#
# When a column has several duplicates they are chained (A -> B, B -> C), so
# every duplicate appears once among the values. The previous version tested
# `col_2 not in bag` against the keys and overwrote `bag[col_1]`, so only the
# last duplicate of a column ended up in `bag.values()`.
bag = {}
flagged = set()  # columns already recorded as a duplicate of an earlier one
all_cols = list(df_encode.columns)
for num, col_1 in enumerate(all_cols):
    if col_1 in flagged:
        # Anything equal to col_1 was already matched against its original
        continue
    reference = df_encode[col_1].values
    anchor = col_1  # key under which the next duplicate is stored
    for col_2 in all_cols[num + 1:]:
        if col_2 in flagged:
            continue
        if np.array_equal(df_encode[col_2].values, reference):
            bag[anchor] = col_2
            flagged.add(col_2)
            anchor = col_2

In [41]:
# Mapping of column -> later column with identical factorized codes.
bag

{'shop_id': 'shop_name',
 'item_id': 'item_name',
 'item_category_id': 'item_category_name',
 'shop_name_tf_курск': 'shop_name_tf_пушкинский',
 'item_category_name_tf_blu': 'item_category_name_tf_ray',
 'item_category_name_tf_cd': 'item_category_name_tf_производства'}

In [42]:
# The dict values are the columns removed from `df` in the next cell.
bag.values()

dict_values(['shop_name', 'item_name', 'item_category_name', 'shop_name_tf_пушкинский', 'item_category_name_tf_ray', 'item_category_name_tf_производства'])

In [45]:
# Remove the columns flagged as duplicates of an earlier column.
df = df.drop(columns=list(bag.values()))

In [47]:
# 90 columns minus the 6 flagged duplicates leaves 84.
df.shape

(762113, 84)

In [49]:
# df_encode.drop(df_encode.index, inplace=True)

Steps to improve the model:
1. Since the shops are in Russia, add Russian bank and school holidays as features
2. Add features that flag the summer season, Christmas and other important holidays or peak seasons in Russia
3. Check whether specific items were discounted, and how the discounts relate to their sales
4. Are the same products sold at different prices in different stores?
5. Price fluctuations over this period: do they affect sales?
6. How should stores that have closed be handled (for example, shop 23)?
7. Check whether some data can point to a sales/marketing campaign; if so, add it as a feature, and add the number of days between the sale date and the campaign date as a new feature

In [50]:
# Show the first rows with every engineered column visible (no column truncation).
display_all(df.head())

Unnamed: 0,date_block_num,shop_id,item_id,Target,Target_mean,item_category_id,shop_name_size,item_name_size,item_category_name_size,shop_name_count,item_name_count,item_category_name_count,shop_name_tf_ii,shop_name_tf_воронеж,shop_name_tf_к7,shop_name_tf_курск,shop_name_tf_магазин,shop_name_tf_мега,shop_name_tf_молл,shop_name_tf_москва,shop_name_tf_мтрц,shop_name_tf_новгород,shop_name_tf_новосибирск,shop_name_tf_паркхаус,shop_name_tf_рио,shop_name_tf_ростовнадону,shop_name_tf_с21,shop_name_tf_самара,shop_name_tf_тк,shop_name_tf_трк,shop_name_tf_трц,shop_name_tf_тц,shop_name_tf_тюмень,shop_name_tf_уфа,shop_name_tf_центральный,shop_name_tf_якутск,item_name_tf_1с,item_name_tf_360,item_name_tf_3d,item_name_tf_bd,item_name_tf_cd,item_name_tf_edition,item_name_tf_jewel,item_name_tf_mp3,item_name_tf_of,item_name_tf_pc,item_name_tf_ps3,item_name_tf_the,item_name_tf_xbox,item_name_tf_арт,item_name_tf_версия,item_name_tf_для,item_name_tf_игра,item_name_tf_коллекция,item_name_tf_настольная,item_name_tf_регион,item_name_tf_русская,item_name_tf_русские,item_name_tf_субтитры,item_name_tf_фигурка,item_name_tf_цифровая,item_category_name_tf_360,item_category_name_tf_blu,item_category_name_tf_cd,item_category_name_tf_dvd,item_category_name_tf_pc,item_category_name_tf_ps3,item_category_name_tf_xbox,item_category_name_tf_атрибутика,item_category_name_tf_аудиокниги,item_category_name_tf_винил,item_category_name_tf_игры,item_category_name_tf_издания,item_category_name_tf_кино,item_category_name_tf_книги,item_category_name_tf_локального,item_category_name_tf_музыка,item_category_name_tf_настольные,item_category_name_tf_подарки,item_category_name_tf_программы,item_category_name_tf_стандартные,item_category_name_tf_сувениры,item_category_name_tf_фигурки,item_category_name_tf_цифра
0,30,2,31,0.0,0.0,37,16,31,14,3,4,3,0.0,0.0,0.0,0.0,0.0,0.866732,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.498774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.634458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.441504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,31,2,31,0.0,0.0,37,16,31,14,3,4,3,0.0,0.0,0.0,0.0,0.0,0.866732,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.498774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.634458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.441504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,32,2,31,0.0,0.0,37,16,31,14,3,4,3,0.0,0.0,0.0,0.0,0.0,0.866732,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.498774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.634458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.441504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,33,2,31,1.0,1.0,37,16,31,14,3,4,3,0.0,0.0,0.0,0.0,0.0,0.866732,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.498774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.634458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.441504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,30,18,31,0.0,0.0,37,20,31,14,3,4,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.634458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.441504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
