# XGBoost Model

https://www.kaggle.com/scottwied/predict-future-sales-top-11-solution

In [9]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import time

from math import sqrt
from numpy import loadtxt
from itertools import product
from tqdm import tqdm
from sklearn import preprocessing
from xgboost import plot_tree
from matplotlib import pyplot
import gc

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Make sure that we are using the 'tensorflow'
print(f"Curretly running conda env: {os.environ['CONDA_DEFAULT_ENV']}")

Curretly running conda env: tensorflow


### Data Loading

In [3]:
sales_train = pd.read_csv('../data/sales_train.csv.gz', encoding='utf-8', compression='gzip')
items = pd.read_csv('../data/items.csv.gz', encoding='utf-8', compression='gzip')
shops = pd.read_csv('../data/shops.csv.gz', encoding='utf-8', compression='gzip')
item_categories = pd.read_csv('../data/item_categories.csv.gz', encoding='utf-8', compression='gzip')
test = pd.read_csv('../data/test.csv.gz', encoding='utf-8', compression='gzip')
sample_submission = pd.read_csv('../data/sample_submission.csv.gz', encoding='utf-8', compression='gzip')

## Examine data

In [4]:
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [5]:
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [7]:
test.shape

(214200, 3)

In [6]:
sample_submission.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


In [8]:
sample_submission.shape

(214200, 2)

## Data Aggregation

In [24]:
# Blocks are like sequential month numbers.  Let's create a list of all uniquer block numbers
block_nums = sales_train['date_block_num'].unique()
block_nums

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33])

In [22]:
# For every month we create a grid from all shops/items combinations from that month
grid = []
for block_num in block_nums:
    cur_shops = sales_train[sales_train['date_block_num']==block_num]['shop_id'].unique()
    cur_items = sales_train[sales_train['date_block_num']==block_num]['item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))
    
index_cols = ['shop_id', 'item_id', 'date_block_num']

grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

In [41]:
grid.head()

Unnamed: 0,shop_id,item_id,date_block_num
0,59,22154,0
1,59,2552,0
2,59,2554,0
3,59,2555,0
4,59,2564,0


In [35]:
sales_train_agg = (
    sales_train
    .assign(item_cnt_day_clipped = lambda x: x['item_cnt_day'].clip(0,20))
    .groupby(['shop_id', 'item_id', 'date_block_num'])
    .agg({'item_cnt_day_clipped':'sum', 
          'item_price':'mean'})
    .reset_index()
    .rename(columns = {'item_cnt_day_clipped' : 'item_cnt_month'})
    .assign(item_cnt_month = lambda x: x['item_cnt_month'].clip(0,30))
)

In [36]:
trainset = pd.merge(grid, sales_train_agg, how='left', on=index_cols)
trainset.item_cnt_month = trainset.item_cnt_month.fillna(0)
trainset.item_price = trainset.item_price.fillna(0)
trainset.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price
0,59,22154,0,1.0,999.0
1,59,2552,0,0.0,0.0
2,59,2554,0,0.0,0.0
3,59,2555,0,0.0,0.0
4,59,2564,0,0.0,0.0


In [39]:
# Each item has an item_category_id
items[['item_id', 'item_category_id']].head()

Unnamed: 0,item_id,item_category_id
0,0,40
1,1,76
2,2,40
3,3,40
4,4,40


In [40]:
# Add item_category id as a new column in the trainset dataframe
trainset = pd.merge(trainset, items[['item_id', 'item_category_id']], on = 'item_id')
trainset.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,item_category_id
0,59,22154,0,1.0,999.0,37
1,25,22154,0,5.0,999.0,37
2,24,22154,0,1.0,999.0,37
3,23,22154,0,0.0,0.0,37
4,19,22154,0,0.0,0.0,37


In [None]:
# Write the trainset dataframe to a new file
# trainset.to_csv('./trainset_with_grid.csv.gz', commpression='gzip')

In [42]:
trainset.shape

(10913850, 6)

In [43]:
sales_train.shape

(2935849, 6)

## Train a baseline model to use as a benchmark

In [52]:
baseline_features = ['shop_id', 'item_id', 'item_category_id', 'date_block_num', 'item_price', 'item_cnt_month']
train = (
    trainset
    .filter(baseline_features)
    .query("date_block_num >= 0")
    .query("date_block_num <= 33")
)
train.head()

Unnamed: 0,shop_id,item_id,item_category_id,date_block_num,item_price,item_cnt_month
0,59,22154,37,0,999.0,1.0
1,25,22154,37,0,999.0,5.0
2,24,22154,37,0,999.0,1.0
3,23,22154,37,0,0.0,0.0
4,19,22154,37,0,0.0,0.0


In [46]:
# Remove pandas index column
train = train.set_index('shop_id')
train.item_cnt_month = train.item_cnt_month.astype(int)
train['item_cnt_month'] = train.item_cnt_month.fillna(0).clip(0,20)
train.head()

Unnamed: 0_level_0,item_id,item_category_id,date_block_num,item_cnt_month
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
59,22154,37,0,1
25,22154,37,0,5
24,22154,37,0,1
23,22154,37,0,0
19,22154,37,0,0


In [47]:
train.shape

(10913850, 4)

In [66]:
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [67]:
testset = (
    test
    .merge(items[['item_id', 'item_category_id']], on = 'item_id', how = 'left')
    .set_index('shop_id')
    .filter(['item_id', 'item_category_id'])
)

testset.head()

Unnamed: 0_level_0,item_id,item_category_id
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1
5,5037,19
5,5320,55
5,5233,19
5,5232,23
5,5268,20


In [63]:
testset.shape

(214200, 2)

In [64]:
trainset.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,item_category_id
0,59,22154,0,1.0,999.0,37
1,25,22154,0,5.0,999.0,37
2,24,22154,0,1.0,999.0,37
3,23,22154,0,0.0,0.0,37
4,19,22154,0,0.0,0.0,37
