## Import libraries

In [1]:
# system and performance
import gc
import time
import os
import pickle


# date management
import datetime
import calendar


# data management
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)

from itertools import product

# visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


# machine learning
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [2]:
def create_directory(path):
    """Create the directory `path` if it does not exist yet and report the outcome.

    Missing parent directories are created as well, so nested paths can be
    created in a single call.

    Parameters
    ----------
    path : str
        Directory to create.

    Raises
    ------
    FileExistsError
        If `path` exists but is not a directory.
    """
    if os.path.isdir(path):
        print('directory '+path+' already exists')
        return
    # makedirs (unlike mkdir) also builds missing parents; exist_ok=True keeps the
    # call safe if the directory appears between the check above and this line.
    os.makedirs(path, exist_ok=True)
    print('directory '+path+' created succesfully !')

In [3]:
def downcast_dtypes(df):
    """Shrink the numeric columns of `df` to smaller dtypes to save memory.

    float64 columns are converted to float32. int64/int32 columns are converted
    to the narrowest of int16/int32 that can hold every value of the column;
    a column whose values fit in neither is left unchanged. Columns of any
    other dtype are not touched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to downcast. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, for convenient chaining.
    """
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]

    for col in float_cols:
        df[col] = df[col].astype(np.float32)

    for col in int_cols:
        values = df[col]
        for target in (np.int16, np.int32):
            bounds = np.iinfo(target)
            # A blind cast to int16 silently wraps values outside its range,
            # so only narrow when the whole column fits.
            if values.empty or (values.min() >= bounds.min and values.max() <= bounds.max):
                df[col] = values.astype(target)
                break
    return df

In [4]:
# Data locations, given relative to the notebook's working directory.
# DATA_FOLDER is the root used for the `processed/` and `predictions/` files;
# RAW_DATA_FOLDER is where the raw csv inputs (e.g. test.csv) are read from.
DATA_FOLDER = '../data/'
RAW_DATA_FOLDER = '../data/raw/'

##### Define core variable space and macro to reset variable space

In [5]:
# Snapshot the names currently defined in the interactive namespace (as listed
# by the %who_ls magic). This list is the "core" set of names that the cleanup
# code further down keeps.
loaded=%who_ls
# `loaded` is only bound after %who_ls has returned, so it is missing from its
# own snapshot; add it by hand so it is kept as well.
loaded.append('loaded')

In [6]:
# Delete every interactive variable whose name is not listed in `loaded`.
# NOTE: the text of this cell is what the `reset_variable_space` macro replays.
all_vars=%who_ls
# `all_vars` is bound after %who_ls has returned, so add its own name by hand;
# unless that name is in `loaded`, the loop below deletes `all_vars` too.
all_vars.append('all_vars')
for var in list(set(all_vars)-set(loaded)):
    # `del` needs a literal identifier, hence the statement is built as a string
    exec('del '+var)
# the loop variable was created after the listing, so remove it explicitly
del var

In [7]:
# Store input cell number 6 (the cleanup cell) as a macro named
# `reset_variable_space`, so the cleanup can be replayed by typing that name.
# NOTE(review): the macro is bound to the execution count 6, so the notebook
# has to be run in order from a fresh kernel for it to capture the right cell.
%macro reset_variable_space 6
# register the macro as a core name so that running it does not delete itself
loaded.append('reset_variable_space')

Macro `reset_variable_space` created. To execute, type its name (without quotes).
=== Macro contents: ===
all_vars=get_ipython().run_line_magic('who_ls', '')
all_vars.append('all_vars')
for var in list(set(all_vars)-set(loaded)):
    exec('del '+var)
del var
 

## -------------------------------------------------------------

# 7 - ASSEMBLE PREDICTION

In [8]:
def predict_seniority(seniority,ntree=0):
    """Build the predictions of the model trained for one seniority group.

    Loads the pickled regressor and its test features from
    `predictions/models/xgbreg_seniority<seniority>/` under DATA_FOLDER,
    predicts, clips the predictions to [0, 20] and attaches them to the
    (shop_id, item_id) pairs of the rows with `month_id == 34` in
    `processed/train_<seniority>.pkl` (presumably the month to predict --
    TODO confirm).

    Parameters
    ----------
    seniority : int or str
        Identifier of the seniority group; used to build the file names.
    ntree : int, default 0
        Forwarded unchanged to the model's `predict` as `ntree_limit`.

    Returns
    -------
    pandas.DataFrame
        Columns `shop_id`, `item_id` and `prediction`, one row per pair.

    Raises
    ------
    ValueError
        If the number of predictions does not match the number of selected pairs.
    """
    model_dir = os.path.join(DATA_FOLDER,'predictions/models/xgbreg_seniority'+str(seniority))

    # import model and test set
    # NOTE: pickle executes arbitrary code on load; only load model files you produced yourself.
    with open(os.path.join(model_dir,'model.pickle'), 'rb') as model_file:
        xgbreg = pickle.load(model_file)
    X_test = pd.read_pickle(os.path.join(model_dir,'X_test.pkl'))

    # form prediction
    Y_pred_test = xgbreg.predict(X_test,ntree_limit=ntree).clip(0,20)

    pairs = pd.read_pickle(os.path.join(DATA_FOLDER,'processed/train_'+str(seniority)+'.pkl'))
    # explicit copy: the new column is added to an independent frame, not to a slice of `pairs`
    Y_pred = pairs.loc[pairs['month_id']==34,['shop_id','item_id']].copy()
    Y_pred['prediction'] = Y_pred_test

    return Y_pred

In [9]:
# build global prediction

# number of trees used for predictions for pairs of seniority 0,1,2 respectively
ntrees=[200,400,2000]

Y_0=predict_seniority(seniority=0,ntree=ntrees[0])
Y_1=predict_seniority(seniority=1,ntree=ntrees[1])
Y_2=predict_seniority(seniority=2,ntree=ntrees[2])
Y_pred=pd.concat([Y_0,Y_1,Y_2],axis=0,sort=False)

# a (shop_id, item_id) pair predicted twice would duplicate test rows in the join below
assert not Y_pred.duplicated(['shop_id','item_id']).any(), 'duplicated (shop_id, item_id) pairs in predictions'

# attach each prediction to its test row through the (shop_id, item_id) key
test_df = pd.read_csv(os.path.join(RAW_DATA_FOLDER, 'test.csv'))
test_df=test_df.join(Y_pred.set_index(['shop_id','item_id']),on=['shop_id','item_id'])

# every test row must have received a prediction
assert test_df['prediction'].notna().all(), 'some test rows have no prediction'

# info() prints its report itself and returns None, so it must not be wrapped in print()
# NOTE: newer pandas releases renamed `null_counts` to `show_counts`
test_df.info(null_counts=True)
test_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214200 entries, 0 to 214199
Data columns (total 4 columns):
ID            214200 non-null int64
shop_id       214200 non-null int64
item_id       214200 non-null int64
prediction    214200 non-null float32
dtypes: float32(1), int64(3)
memory usage: 5.7 MB
None


Unnamed: 0,ID,shop_id,item_id,prediction
0,0,5,5037,0.471891
1,1,5,5320,0.397861
2,2,5,5233,0.831199
3,3,5,5232,0.483190
4,4,5,5268,3.773052
...,...,...,...,...
214195,214195,45,18454,0.087514
214196,214196,45,16188,0.036839
214197,214197,45,15757,0.058053
214198,214198,45,19648,0.018960


In [10]:
# Assemble the submission table (test row ID + predicted monthly count)
# and write it as csv, without the index, under DATA_FOLDER/predictions.
submission = test_df[["ID", "prediction"]].rename(columns={"prediction": "item_cnt_month"})

submission.to_csv(
    os.path.join(DATA_FOLDER,'predictions/xgb_prediction.csv'),
    index=False,
)

In [11]:
# Release the frames built for the prediction, then force a garbage collection pass
del submission, test_df, Y_pred, Y_2, Y_1, Y_0

gc.collect()

11

In [12]:
# Replay the `reset_variable_space` macro (created in cell 7) to delete every
# variable that is not part of the core name list `loaded`.
reset_variable_space