# Features

In [105]:
import numpy as np
import pandas as pd
import time
import sys
import pickle
from xgboost import XGBRegressor
from itertools import product
from sklearn.preprocessing import LabelEncoder
import gc
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

sys.version_info

sys.version_info(major=3, minor=6, micro=3, releaselevel='final', serial=0)

## Data

In [82]:
items = pd.read_csv('C:\\Users\\guidil\\Downloads\\Coursera_FinalProject\\Coursera_FinalProject\\items.csv')
shops = pd.read_csv('C:\\Users\\guidil\\Downloads\\Coursera_FinalProject\\Coursera_FinalProject\\shops.csv')
cats = pd.read_csv('C:\\Users\\guidil\\Downloads\\Coursera_FinalProject\\Coursera_FinalProject\\item_categories.csv')
train = pd.read_csv('C:\\Users\\guidil\\Downloads\\Coursera_FinalProject\\Coursera_FinalProject\\sales_train.csv.gz', compression='gzip')
# I set index to avoid removing ID column later
test  = pd.read_csv('C:\\Users\\guidil\\Downloads\\Coursera_FinalProject\\Coursera_FinalProject\\test.csv.gz', compression='gzip').set_index('ID') 

elimino outlier individuati una fase precedente di EDA

In [83]:
train = train[train.item_price<100000]
train = train[train.item_cnt_day<1001]

Deduplico negozi in base al nome

In [84]:
# Якутск Орджоникидзе, 56
train.loc[train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
train.loc[train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
train.loc[train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

Pulizia su shop name e city name

In [85]:
shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
shops = shops[['shop_id','city_code']]

cats['split'] = cats['item_category_name'].str.split('-')
cats['type'] = cats['split'].map(lambda x: x[0].strip())
cats['type_code'] = LabelEncoder().fit_transform(cats['type'])
# if subtype is nan then type
cats['subtype'] = cats['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
cats['subtype_code'] = LabelEncoder().fit_transform(cats['subtype'])
cats = cats[['item_category_id','type_code', 'subtype_code']]

items.drop(['item_name'], axis=1, inplace=True)

Monthly sales

In [86]:
monthly_sales = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = train[train.date_block_num==i]
    monthly_sales.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))
    
monthly_sales = pd.DataFrame(np.vstack(monthly_sales), columns=cols)
monthly_sales['date_block_num'] = monthly_sales['date_block_num'].astype(np.int8)
monthly_sales['shop_id'] = monthly_sales['shop_id'].astype(np.int8)
monthly_sales['item_id'] = monthly_sales['item_id'].astype(np.int16)

Unisco le vendite con le descrizioni di prodotto e categoria

In [87]:
ts = time.time()
monthly_sales = pd.merge(monthly_sales, shops, on=['shop_id'], how='left')\
                  .merge(items, on=['item_id'], how='left')\
                  .merge(cats, on=['item_category_id'], how='left')
list_ = ['city_code','item_category_id','type_code','subtype_code']
for idx in list_:
    monthly_sales[idx] = monthly_sales[idx].astype(np.int8)

In [88]:
ts = time.time()
group = train.groupby(['date_block_num','shop_id','item_id']).agg({
    'item_cnt_day': ['sum', 'count']
})
group.columns = ['item_cnt_month', 'orders']
group.reset_index(inplace=True)

monthly_sales = pd.merge(monthly_sales, group, on=cols, how='left')
monthly_sales['item_cnt_month'] = monthly_sales['item_cnt_month'].fillna(0).clip(0,20).astype(np.float16)
monthly_sales['orders'] = monthly_sales['orders'].fillna(0).astype(np.float16)

In [89]:
group = train.groupby(['date_block_num','item_id']).agg({'item_price': ['mean']})
group.columns = ['item_price']
group.reset_index(inplace=True)

monthly_sales = pd.merge(monthly_sales, group, on=['date_block_num','item_id'], how='left')
monthly_sales['item_price'] = monthly_sales['item_price'].astype(np.float32) 

## Mean encoded features

By date and item id

In [90]:
means = monthly_sales.groupby(['date_block_num', 'item_id']).agg({
    'item_cnt_month': ['mean'],
    'orders': ['sum'],
    'item_price': ['mean']
})
means.columns = [ 'date_item_avg_item_cnt', 'date_item_sum_orders', 'date_item_avg_item_price' ]
means.reset_index(inplace=True)

monthly_sales = pd.merge(monthly_sales, means, on=['date_block_num','item_id'], how='left')
monthly_sales['date_item_avg_item_cnt'] = monthly_sales['date_item_avg_item_cnt'].astype(np.float16)
monthly_sales['date_item_sum_orders'] = monthly_sales['date_item_sum_orders'].astype(np.float16)
monthly_sales['date_item_avg_item_price'] = monthly_sales['date_item_avg_item_price'].astype(np.float32)

In [91]:
means = monthly_sales.groupby(['shop_id', 'item_id']).agg({
    'item_cnt_month': ['mean'],
    'orders': ['sum'],
    'item_price': ['mean']
})
means.columns = [ 'shop_item_avg_item_cnt', 'shop_item_sum_orders', 'shop_item_avg_item_price' ]
means.reset_index(inplace=True)

monthly_sales = pd.merge(monthly_sales, means, on=['shop_id','item_id'], how='left')
monthly_sales['shop_item_avg_item_cnt'] = monthly_sales['shop_item_avg_item_cnt'].astype(np.float16)
monthly_sales['shop_item_sum_orders'] = monthly_sales['shop_item_sum_orders'].astype(np.float16)
monthly_sales['shop_item_avg_item_price'] = monthly_sales['shop_item_avg_item_price'].astype(np.float32)

In [92]:
means = monthly_sales.groupby(['date_block_num', 'shop_id']).agg({
    'item_cnt_month': ['mean'],
    'orders': ['sum'],
    'item_price': ['mean']
})
means.columns = [ 'date_shop_avg_item_cnt', 'date_shop_sum_orders', 'date_shop_avg_item_price' ]
means.reset_index(inplace=True)

monthly_sales = pd.merge(monthly_sales, means, on=['date_block_num','shop_id'], how='left')
monthly_sales['date_shop_avg_item_cnt'] = monthly_sales['date_shop_avg_item_cnt'].astype(np.float16)
monthly_sales['date_shop_sum_orders'] = monthly_sales['date_shop_sum_orders'].astype(np.float16)
monthly_sales['date_shop_avg_item_price'] = monthly_sales['date_shop_avg_item_price'].astype(np.float32)

In [93]:
means = matrix.groupby(['date_block_num', 'item_category_id']).agg({
    'item_cnt_month': ['mean'],
    'orders': ['sum'],
    'item_price': ['mean']
})
means.columns = [ 'date_cat_avg_item_cnt', 'date_cat_sum_orders', 'date_cat_avg_item_price' ]
means.reset_index(inplace=True)

monthly_sales = pd.merge(monthly_sales, means, on=['date_block_num','item_category_id'], how='left')
monthly_sales['date_cat_avg_item_cnt'] = monthly_sales['date_cat_avg_item_cnt'].astype(np.float16)
monthly_sales['date_cat_sum_orders'] = monthly_sales['date_cat_sum_orders'].astype(np.float16)
monthly_sales['date_cat_avg_item_price'] = monthly_sales['date_cat_avg_item_price'].astype(np.float32)

In [94]:
means = monthly_sales.groupby(['date_block_num', 'item_id', 'city_code']).agg({
    'item_cnt_month': ['mean']
})
means.columns = [ 'date_item_city_avg_item_cnt' ]
means.reset_index(inplace=True)

monthly_sales = pd.merge(monthly_sales, means, on=['date_block_num','item_id','city_code'], how='left')
monthly_sales['date_item_city_avg_item_cnt'] = monthly_sales['date_item_city_avg_item_cnt'].astype(np.float16)

Test set

In [95]:
cats.columns

Index(['item_category_id', 'type_code', 'subtype_code'], dtype='object')

In [96]:
test['date_block_num'] = 34
test = pd.merge(test, shops, on='shop_id', how='left')\
         .merge(items, on='item_id', how='left')\
         .merge(cats, on='item_category_id', how='left')

In [97]:
test['item_id'] = test['item_id'].astype(np.int16)
list_ = ['date_block_num','shop_id','city_code','item_category_id','type_code','subtype_code']
for idx in list_:
    print (idx)
    test[idx] = test[idx].astype(np.int8)

date_block_num
shop_id
city_code
item_category_id
type_code
subtype_code


In [98]:
monthly_sales = pd.concat([monthly_sales, test], ignore_index=True, keys=cols)
monthly_sales.fillna(0, inplace=True)

## Lag features

In [100]:
ts = time.time()
def shift(df, month):
    copy = df.drop([
        'city_code',
        'item_category_id',
        'type_code',
        'subtype_code',
        'item_price'
    ], axis=1)
    copy['date_block_num'] += month
    return copy

data = monthly_sales
lags = [1, 2, 3, 4, 5, 12]
for i in lags:
    data = pd.merge(data, shift(monthly_sales, i), on=['date_block_num','shop_id','item_id'], how='left',  suffixes=['', '_'+str(i)])

time.time() - ts

117.25255274772644

In [102]:
data = data[data.date_block_num > 11]
data.drop([
    'orders', 
    'item_price',
    'date_item_avg_item_cnt', 
    'date_item_sum_orders',
    'date_item_avg_item_price', 
    'shop_item_avg_item_cnt',
    'shop_item_sum_orders', 
    'shop_item_avg_item_price',
    'date_shop_avg_item_cnt', 
    'date_shop_sum_orders',
    'date_shop_avg_item_price', 
    'date_cat_avg_item_cnt', 
    'date_cat_sum_orders', 
    'date_cat_avg_item_price', 
    'date_item_city_avg_item_cnt'
], axis=1, inplace=True)

4.968716144561768

In [103]:
def fill_na(df):
    for col in df.columns:
        if df[col].isnull().any():
            if ('item_cnt' in col):
                df[col].fillna(0, inplace=True)
            if ('orders' in col):
                df[col].fillna(0, inplace=True)
            if ('item_price' in col):
                df[col].fillna(df[col].median(), inplace=True)
    return df

data = fill_na(data)

XGBoost

In [106]:
X_train = data[data.date_block_num < 33].drop(['item_cnt_month'], axis=1)
Y_train = data[data.date_block_num < 33]['item_cnt_month']
X_valid = data[data.date_block_num == 33].drop(['item_cnt_month'], axis=1)
Y_valid = data[data.date_block_num == 33]['item_cnt_month']
X_test = data[data.date_block_num == 34].drop(['item_cnt_month'], axis=1)
del data
gc.collect()

716

In [107]:
model = XGBRegressor(
    max_depth=7,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    colsample_bylevel=0.8, 
    subsample=0.8, 
    eta=0.3, 
    num_round=1000,
    seed=42)

model.fit(
    X_train, 
    Y_train, 
    eval_metric="rmse", 
    eval_set=[(X_valid, Y_valid)], 
    verbose=True, 
    early_stopping_rounds = 10)

[0]	validation_0-rmse:1.12629
Will train until validation_0-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:1.09619
[2]	validation_0-rmse:1.06701
[3]	validation_0-rmse:1.04424
[4]	validation_0-rmse:1.02063
[5]	validation_0-rmse:1.00086
[6]	validation_0-rmse:0.985725
[7]	validation_0-rmse:0.972595
[8]	validation_0-rmse:0.963573
[9]	validation_0-rmse:0.951309
[10]	validation_0-rmse:0.943397
[11]	validation_0-rmse:0.937655
[12]	validation_0-rmse:0.931164
[13]	validation_0-rmse:0.92564
[14]	validation_0-rmse:0.921455
[15]	validation_0-rmse:0.91802
[16]	validation_0-rmse:0.915781
[17]	validation_0-rmse:0.913033
[18]	validation_0-rmse:0.910575
[19]	validation_0-rmse:0.90874
[20]	validation_0-rmse:0.906774
[21]	validation_0-rmse:0.905634
[22]	validation_0-rmse:0.904765
[23]	validation_0-rmse:0.903503
[24]	validation_0-rmse:0.901586
[25]	validation_0-rmse:0.901203
[26]	validation_0-rmse:0.900537
[27]	validation_0-rmse:0.899917
[28]	validation_0-rmse:0.899591
[29]	validation_0-rmse:0.8

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.8,
       colsample_bytree=0.8, eta=0.3, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=7, min_child_weight=300, missing=None,
       n_estimators=100, n_jobs=1, nthread=None, num_round=1000,
       objective='reg:linear', random_state=0, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.8)

In [108]:
Y_pred = model.predict(X_valid).clip(0, 20)
Y_test = model.predict(X_test).clip(0, 20)

In [112]:
test = pd.read_csv('C:\\Users\\guidil\\Downloads\\Coursera_FinalProject\\Coursera_FinalProject\\test.csv.gz', compression='gzip').set_index('ID')
submission = pd.DataFrame({
    "ID": test.index, 
    "item_cnt_month": Y_test
})
submission.to_csv('C:\\Users\\guidil\\Downloads\\Coursera_FinalProject\\Coursera_FinalProject\\xgb_submission.csv', index=False)