In [1]:
import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 
import gc
from itertools import product
from sklearn import preprocessing

def downcast_dtypes(df):
    """Shrink 64-bit numeric columns of ``df`` to their 32-bit counterparts.

    Every ``float64`` column is converted to ``float32`` and every ``int64``
    column to ``int32``; columns of any other dtype are left untouched.
    The frame is modified in place and the same object is returned.
    """
    # Bucket the column labels by their current dtype.
    wide_floats, wide_ints = [], []
    for label in df:
        kind = df[label].dtype
        if kind == "float64":
            wide_floats.append(label)
        if kind == "int64":
            wide_ints.append(label)

    # Overwrite each bucket with its narrowed copy.
    df[wide_floats] = df[wide_floats].astype(np.float32)
    df[wide_ints] = df[wide_ints].astype(np.int32)

    return df

In [2]:
# Load the raw competition tables from the read-only data directory.
# sales: sale records; columns used later: date_block_num, shop_id, item_id, item_price, item_cnt_day
sales = pd.read_csv('../readonly/final_project_data/sales_train.csv.gz')
# shops: shop_id -> shop_name
shops = pd.read_csv('../readonly/final_project_data/shops.csv')
# items: item_name, item_id, item_category_id
items = pd.read_csv('../readonly/final_project_data/items.csv')
# item_cats: item_category_id -> item_category_name
item_cats = pd.read_csv('../readonly/final_project_data/item_categories.csv')
# test: (ID, shop_id, item_id) rows to predict
test = pd.read_csv('../readonly/final_project_data/test.csv.gz')

In [3]:
# Submission template with columns ID and item_cnt_month; predictions are written into it at the end.
samples = pd.read_csv('../readonly/final_project_data/sample_submission.csv.gz')

In [4]:
# Remove outliers: keep only records priced below 100000 whose daily count is below 1001.
price_ok = sales.item_price < 100000
count_ok = sales.item_cnt_day < 1001
sales = sales[price_ok & count_ok]

In [5]:
# Preview the test table: one row per (ID, shop_id, item_id).
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [6]:
# Preview the item catalogue: item_name, item_id, item_category_id.
items.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [7]:
# Preview the submission template: ID plus a placeholder item_cnt_month.
samples.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


In [8]:
# Build the training "grid": for every month, one row for each combination of
# the shops and the items that appear in that month's sales.
index_cols = ['shop_id', 'item_id', 'date_block_num']
grid = []
for block_num in sales['date_block_num'].unique():
    # Filter the month once and reuse it for both the shop and the item lookup
    # (previously the full sales frame was scanned twice per month).
    month_sales = sales[sales['date_block_num'] == block_num]
    cur_shops = month_sales['shop_id'].unique()
    cur_items = month_sales['item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])), dtype='int32'))
grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

# Aggregations: clip daily counts to [0, 20], then per (shop, item, month)
# sum the counts and average the price.
sales['item_cnt_day'] = sales['item_cnt_day'].clip(0, 20)
groups = sales.groupby(['shop_id', 'item_id', 'date_block_num'])
trainset = groups.agg({'item_cnt_day': 'sum', 'item_price': 'mean'}).reset_index()
trainset = trainset.rename(columns={'item_cnt_day': 'item_cnt_month'})
# The monthly target is clipped to [0, 20] as well.
trainset['item_cnt_month'] = trainset['item_cnt_month'].clip(0, 20)

# Grid rows without any sales record get a monthly count of 0
# (their item_price stays NaN).
trainset = pd.merge(grid, trainset, how='left', on=index_cols)
trainset.item_cnt_month = trainset.item_cnt_month.fillna(0)

# Get category id
trainset = pd.merge(trainset, items[['item_id', 'item_category_id']], on='item_id')
trainset.to_csv('trainset_with_grid.csv')

In [9]:
# Stack the training grid and the test rows into a single frame so that the
# feature engineering below is applied to both at the same time.
# Test rows are tagged with date_block_num 34 and a placeholder target of -1.
testset = (
    test
    .merge(items[['item_id', 'item_category_id']], on='item_id', how='left')
    .assign(date_block_num=34, item_cnt_month=-1)
)

# Note: pandas cannot hold NaN and int in the same column, so ID (missing for
# training rows) becomes a float column; after the downcast it is float32.
# The downcast is applied to save memory.
train_test_set = downcast_dtypes(pd.concat([trainset, testset], axis=0))
del trainset, testset

In [10]:
# Rows of train_test_set with a non-null ID come from the test set;
# training rows have ID = NaN.
train_test_set.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,item_category_id,ID
0,59,22154,0,1.0,999.0,37,
1,25,22154,0,5.0,999.0,37,
2,24,22154,0,1.0,999.0,37,
3,23,22154,0,0.0,,37,
4,19,22154,0,0.0,,37,


In [11]:
# Show only the test rows (non-null ID): date_block_num 34, placeholder target -1.
train_test_set[train_test_set['ID'].notnull()].head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,item_category_id,ID
0,5,5037,34,-1.0,,19,0.0
1,5,5320,34,-1.0,,55,1.0
2,5,5233,34,-1.0,,19,2.0
3,5,5232,34,-1.0,,23,3.0
4,5,5268,34,-1.0,,20,4.0


In [12]:
# Process 1: use the first word (prefix) of item_category_name as a feature
category_meta_list = [name.split()[0] for name in item_cats.item_category_name]

# Encode each distinct prefix as an integer id.
meta_encoder = preprocessing.LabelEncoder()
item_cats['item_category_meta_id'] = meta_encoder.fit_transform(category_meta_list)
item_cats['item_category_meta_name'] = category_meta_list
train_test_set = train_test_set.merge(
    item_cats[['item_category_id', 'item_category_meta_id']],
    on='item_category_id',
    how='left'
)
# Names of all engineered feature columns; later cells append to this list.
extra_features = ['item_category_meta_id']
train_test_set.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,item_category_id,ID,item_category_meta_id
0,59,22154,0,1.0,999.0,37,,7
1,25,22154,0,5.0,999.0,37,,7
2,24,22154,0,1.0,999.0,37,,7
3,23,22154,0,0.0,,37,,7
4,19,22154,0,0.0,,37,,7


In [13]:
# Process 2: use the first word (prefix) of shop_name as a feature
shop_prefix_list = [name.split()[0] for name in shops.shop_name]

# Encode each distinct prefix as an integer id.
prefix_encoder = preprocessing.LabelEncoder()
shops['shop_prefix_id'] = prefix_encoder.fit_transform(shop_prefix_list)
shops['shop_prefix_name'] = shop_prefix_list
train_test_set = train_test_set.merge(
    shops[['shop_id', 'shop_prefix_id']],
    on='shop_id',
    how='left'
)
extra_features.append('shop_prefix_id')
train_test_set.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,item_category_id,ID,item_category_meta_id,shop_prefix_id
0,59,22154,0,1.0,999.0,37,,7,31
1,25,22154,0,5.0,999.0,37,,7,14
2,24,22154,0,1.0,999.0,37,,7,14
3,23,22154,0,0.0,,37,,7,14
4,19,22154,0,0.0,,37,,7,13


In [14]:
# Process 3: a year has 12 months, so add the month position within the
# 12-month cycle (date_block_num modulo 12) as a feature
month_in_cycle = train_test_set['date_block_num'].mod(12)
train_test_set['date_block_num_month'] = month_in_cycle
extra_features.append('date_block_num_month')
train_test_set.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,item_category_id,ID,item_category_meta_id,shop_prefix_id,date_block_num_month
0,59,22154,0,1.0,999.0,37,,7,31,0
1,25,22154,0,5.0,999.0,37,,7,14,0
2,24,22154,0,1.0,999.0,37,,7,14,0
3,23,22154,0,0.0,,37,,7,14,0
4,19,22154,0,0.0,,37,,7,13,0


In [15]:
# Process 4: add item-name text features (TF-IDF)
# The vocabulary is capped at FEATURE_COUNT terms, one feature column per term
from sklearn.feature_extraction.text import TfidfVectorizer

FEATURE_COUNT = 32
vectorizer = TfidfVectorizer(max_features=FEATURE_COUNT)
tfidf_sparse = vectorizer.fit_transform(items.item_name)
tfidf_vector = tfidf_sparse.astype(np.float32).toarray()

# One column per vocabulary slot, named tfidf_0 .. tfidf_<FEATURE_COUNT-1>.
tfidf_columns = ['tfidf_' + str(position) for position in range(FEATURE_COUNT)]
for position, column_name in enumerate(tfidf_columns):
    items[column_name] = pd.Series(tfidf_vector[:, position])
extra_features.extend(tfidf_columns)

# Attach the per-item TF-IDF columns to every row via item_id.
column_list = tfidf_columns + ['item_id']
train_test_set = train_test_set.merge(items[column_list], on='item_id', how='left')
train_test_set.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,item_category_id,ID,item_category_meta_id,shop_prefix_id,date_block_num_month,...,tfidf_22,tfidf_23,tfidf_24,tfidf_25,tfidf_26,tfidf_27,tfidf_28,tfidf_29,tfidf_30,tfidf_31
0,59,22154,0,1.0,999.0,37,,7,31,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,25,22154,0,5.0,999.0,37,,7,14,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,24,22154,0,1.0,999.0,37,,7,14,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,23,22154,0,0.0,,37,,7,14,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,19,22154,0,0.0,,37,,7,13,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Process 5: add lagged monthly sales per (shop, item) pair
LAG_RANGE = 12
shopitem_keys = ['shop_id', 'item_id', 'date_block_num']

for lag in range(1, LAG_RANGE + 1):
    feature_name = 'shopitem_sales_pre_' + str(lag)
    # Re-label month m as m + lag, so joining on the current month pulls in
    # the value recorded `lag` months earlier.
    lagged = train_test_set[shopitem_keys + ['item_cnt_month']].copy()
    lagged['date_block_num'] = lagged['date_block_num'] + lag
    lagged = lagged.rename(columns={'item_cnt_month': feature_name})
    train_test_set = train_test_set.merge(
        lagged[shopitem_keys + [feature_name]], on=shopitem_keys, how='left'
    )
    # No matching row for the earlier month -> the lag feature is set to 0.
    train_test_set[feature_name] = train_test_set[feature_name].fillna(0)
    extra_features.append(feature_name)
train_test_set.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,item_category_id,ID,item_category_meta_id,shop_prefix_id,date_block_num_month,...,shopitem_sales_pre_3,shopitem_sales_pre_4,shopitem_sales_pre_5,shopitem_sales_pre_6,shopitem_sales_pre_7,shopitem_sales_pre_8,shopitem_sales_pre_9,shopitem_sales_pre_10,shopitem_sales_pre_11,shopitem_sales_pre_12
0,59,22154,0,1.0,999.0,37,,7,31,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,25,22154,0,5.0,999.0,37,,7,14,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,24,22154,0,1.0,999.0,37,,7,14,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,23,22154,0,0.0,,37,,7,14,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,19,22154,0,0.0,,37,,7,13,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Process 6: add lagged per-item sales, i.e. the mean of item_cnt_month over
# all rows sharing the same (item_id, date_block_num)
LAG_RANGE = 12
item_keys = ['item_id', 'date_block_num']

# The per-(item, month) mean does not depend on the lag, so aggregate once
# instead of re-aggregating the whole frame on every loop iteration.
item_month_mean = (
    train_test_set
    .groupby(by=item_keys)
    .agg({'item_cnt_month': 'mean'})
    .reset_index()
)

for lag in range(1, LAG_RANGE + 1):
    feature_name = 'item_sales_pre_' + str(lag)
    # Re-label month m as m + lag, so joining on the current month pulls in
    # the mean recorded `lag` months earlier. `assign` returns a new frame,
    # leaving item_month_mean untouched for the next iteration.
    lagged = (
        item_month_mean
        .rename(columns={'item_cnt_month': feature_name})
        .assign(date_block_num=item_month_mean['date_block_num'] + lag)
    )
    train_test_set = train_test_set.merge(lagged, on=item_keys, how='left')
    # No matching row for the earlier month -> the lag feature is set to 0.
    train_test_set[feature_name] = train_test_set[feature_name].fillna(0)
    extra_features.append(feature_name)
train_test_set.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,item_category_id,ID,item_category_meta_id,shop_prefix_id,date_block_num_month,...,item_sales_pre_3,item_sales_pre_4,item_sales_pre_5,item_sales_pre_6,item_sales_pre_7,item_sales_pre_8,item_sales_pre_9,item_sales_pre_10,item_sales_pre_11,item_sales_pre_12
0,59,22154,0,1.0,999.0,37,,7,31,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,25,22154,0,5.0,999.0,37,,7,14,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,24,22154,0,1.0,999.0,37,,7,14,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,23,22154,0,0.0,,37,,7,14,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,19,22154,0,0.0,,37,,7,13,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Process 7: add lagged per-shop sales, i.e. the mean of item_cnt_month over
# all rows sharing the same (shop_id, date_block_num)
LAG_RANGE = 12
shop_keys = ['shop_id', 'date_block_num']

# The per-(shop, month) mean does not depend on the lag, so aggregate once
# instead of re-aggregating the whole frame on every loop iteration.
shop_month_mean = (
    train_test_set
    .groupby(by=shop_keys)
    .agg({'item_cnt_month': 'mean'})
    .reset_index()
)

for lag in range(1, LAG_RANGE + 1):
    feature_name = 'shop_sales_pre_' + str(lag)
    # Re-label month m as m + lag, so joining on the current month pulls in
    # the mean recorded `lag` months earlier. `assign` returns a new frame,
    # leaving shop_month_mean untouched for the next iteration.
    lagged = (
        shop_month_mean
        .rename(columns={'item_cnt_month': feature_name})
        .assign(date_block_num=shop_month_mean['date_block_num'] + lag)
    )
    train_test_set = train_test_set.merge(lagged, on=shop_keys, how='left')
    # No matching row for the earlier month -> the lag feature is set to 0.
    train_test_set[feature_name] = train_test_set[feature_name].fillna(0)
    extra_features.append(feature_name)
train_test_set.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,item_category_id,ID,item_category_meta_id,shop_prefix_id,date_block_num_month,...,shop_sales_pre_3,shop_sales_pre_4,shop_sales_pre_5,shop_sales_pre_6,shop_sales_pre_7,shop_sales_pre_8,shop_sales_pre_9,shop_sales_pre_10,shop_sales_pre_11,shop_sales_pre_12
0,59,22154,0,1.0,999.0,37,,7,31,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,25,22154,0,5.0,999.0,37,,7,14,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,24,22154,0,1.0,999.0,37,,7,14,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,23,22154,0,0.0,,37,,7,14,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,19,22154,0,0.0,,37,,7,13,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Process 8: add lagged item price per (shop, item) pair
LAG_RANGE = 12
price_keys = ['shop_id', 'item_id', 'date_block_num']

for lag in range(1, LAG_RANGE + 1):
    feature_name = 'shopitem_price_pre_' + str(lag)
    # Re-label month m as m + lag, so joining on the current month pulls in
    # the price recorded `lag` months earlier.
    lagged = train_test_set[price_keys + ['item_price']].copy()
    lagged['date_block_num'] = lagged['date_block_num'] + lag
    lagged = lagged.rename(columns={'item_price': feature_name})
    train_test_set = train_test_set.merge(
        lagged[price_keys + [feature_name]], on=price_keys, how='left'
    )
    # Missing prices (no matching row, or a NaN price) are filled with 0.
    train_test_set[feature_name] = train_test_set[feature_name].fillna(0)
    extra_features.append(feature_name)
train_test_set.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,item_category_id,ID,item_category_meta_id,shop_prefix_id,date_block_num_month,...,shopitem_price_pre_3,shopitem_price_pre_4,shopitem_price_pre_5,shopitem_price_pre_6,shopitem_price_pre_7,shopitem_price_pre_8,shopitem_price_pre_9,shopitem_price_pre_10,shopitem_price_pre_11,shopitem_price_pre_12
0,59,22154,0,1.0,999.0,37,,7,31,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,25,22154,0,5.0,999.0,37,,7,14,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,24,22154,0,1.0,999.0,37,,7,14,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,23,22154,0,0.0,,37,,7,14,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,19,22154,0,0.0,,37,,7,13,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Split by month: date_block_num <= 32 for training, 33 for validation,
# 34 (the test rows) for prediction.
block_of_row = train_test_set['date_block_num']
train_set = train_test_set[block_of_row <= 32]
val_set = train_test_set[block_of_row == 33]
test_set = train_test_set[block_of_row == 34]

# Model inputs: every engineered feature plus the raw id columns.
features = extra_features + ['shop_id', 'item_id', 'date_block_num', 'item_category_id']

train_x, train_y = train_set[features].values, train_set['item_cnt_month'].values
val_x, val_y = val_set[features].values, val_set['item_cnt_month'].values

# The test target is only a placeholder, so just the feature matrix is extracted.
test_x = test_set[features].values

In [23]:
import xgboost as xgb

# Hyper-parameters of the gradient-boosted tree regressor.
xgb_params = dict(
    max_depth=12,
    n_estimators=1024,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=1,
    eta=0.3,
    seed=1,
    nthread=32,
)
model = xgb.XGBRegressor(**xgb_params)

# RMSE is tracked on both the training and the validation set; training stops
# once the metric has not improved for 10 rounds (the log reports that the
# validation set, listed last, is the one used for early stopping).
model.fit(
    train_x,
    train_y,
    eval_metric='rmse',
    eval_set=[(train_x, train_y), (val_x, val_y)],
    early_stopping_rounds=10
)

[0]	validation_0-rmse:1.10247	validation_1-rmse:1.04470
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:1.01031	validation_1-rmse:0.97562
[2]	validation_0-rmse:0.95699	validation_1-rmse:0.93668
[3]	validation_0-rmse:0.92821	validation_1-rmse:0.91480
[4]	validation_0-rmse:0.90502	validation_1-rmse:0.90779
[5]	validation_0-rmse:0.89053	validation_1-rmse:0.90634
[6]	validation_0-rmse:0.88284	validation_1-rmse:0.90272
[7]	validation_0-rmse:0.87504	validation_1-rmse:0.90133
[8]	validation_0-rmse:0.87123	validation_1-rmse:0.89664
[9]	validation_0-rmse:0.86721	validation_1-rmse:0.89488
[10]	validation_0-rmse:0.86484	validation_1-rmse:0.89407
[11]	validation_0-rmse:0.86099	validation_1-rmse:0.90094
[12]	validation_0-rmse:0.85882	validation_1-rmse:0.89995
[13]	validation_0-rmse:0.85491	validation_1-rmse:0.89933
[14]	validation_0-rmse:0.85317	validation_1-rmse:0.89

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=12,
             min_child_weight=300, missing=nan, monotone_constraints='()',
             n_estimators=1024, n_jobs=32, nthread=32, num_parallel_tree=1,
             random_state=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=1, subsample=1, tree_method='approx', validate_parameters=1,
             verbosity=None)

In [24]:
# Predict the test month and clip predictions to the range [0, 20].
raw_predictions = model.predict(test_x)
test_y = np.clip(raw_predictions, 0, 20)

In [25]:
# test_y follows the row order of test_set (test_x was built from it), so make
# sure that order matches the submission template before assigning by position.
# Wrapping the predictions in pd.Series would align on the index instead and
# silently produce NaN or dropped rows on a mismatch.
if len(test_y) != len(samples):
    raise ValueError(
        'prediction count {} does not match submission row count {}'.format(len(test_y), len(samples))
    )
if not np.array_equal(test_set['ID'].values, samples['ID'].values):
    raise ValueError('test_set ID order does not match the sample submission ID order')
samples['item_cnt_month'] = test_y

In [26]:
# Save the result: write the filled-in submission table to submission.csv
# without the DataFrame index column.
samples.to_csv('submission.csv', index=False)