In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as tp
import time
import tqdm
from calendar import monthrange
import calendar
from xgboost import XGBRegressor
from xgboost import plot_importance
import sklearn
from itertools import product
#from tqdm import tqdm_notebook as tqdm

In [2]:
sales_train = pd.read_csv('sales_train.csv')
print('Size of dataset before outliers: {}'.format(sales_train.shape))
sales_train = sales_train[sales_train['item_price']<100000]
sales_train = sales_train[sales_train['item_cnt_day']<1000]
sales_train = sales_train[sales_train['item_price']>0]
print('Size of dataset after removing outliers: {}'.format(sales_train.shape))
ts = time.time()
matrix = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = sales_train[sales_train.date_block_num==i]
    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))
    
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix['item_id'] = matrix['item_id'].astype(np.int16)
matrix.sort_values(cols,inplace=True)
time.time() - ts
exp = sales_train.groupby(['date_block_num','item_id','shop_id'])['item_cnt_day'].sum()
exp = pd.DataFrame(exp).reset_index()
exp.rename(columns={'item_cnt_day':'item_cnt_month'}, inplace = True)
#exp.sort_values(by=['date_block_num','item_id','shop_id'], inplace = True)
matrix = pd.merge(matrix, exp, on=cols, how='left')
matrix['item_cnt_month'] = (matrix['item_cnt_month']
                                .fillna(0)#.astype(np.float16))
                                .clip(0,20) # NB clip target here
                                .astype(np.float16))
train = matrix
lags = [1,2,3,6]
train.info()

Size of dataset before outliers: (2935849, 6)
Size of dataset after removing outliers: (2935845, 6)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10913804 entries, 0 to 10913803
Data columns (total 4 columns):
 #   Column          Dtype  
---  ------          -----  
 0   date_block_num  int8   
 1   shop_id         int8   
 2   item_id         int16  
 3   item_cnt_month  float16
dtypes: float16(1), int16(1), int8(2)
memory usage: 145.7 MB


In [3]:
#train = pd.read_csv('train.csv')
#train = train.drop('Unnamed: 0', axis = 1)
#train.sort_values(by=['date_block_num','item_id','shop_id'], inplace = True)
#train.info()
#train.item_cnt_month = train.item_cnt_month.clip(0,20)
#lags = [1,2,3,6]

In [4]:
# prepare the test dataset to merge with the training dataset
test = pd.read_csv('test.csv')
itemcat = pd.read_csv('items.csv')
#itemcat.head()
test = test.drop('ID',axis=1)
test['date_block_num'] = 34
test['item_cnt_month'] = 0
test.sort_values(by=['shop_id','item_id'], inplace = True)
itemcat.head()
train = pd.merge(train,itemcat[['item_id','item_category_id']], on = ['item_id'], how = 'left')
train.rename(columns={'item_category_id':'item_cat'}, inplace= True)
test = pd.merge(test,itemcat[['item_id','item_category_id']], on = ['item_id'], how = 'left')
test.rename(columns={'item_category_id':'item_cat'}, inplace= True)

In [5]:
test.info()
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214200 entries, 0 to 214199
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   shop_id         214200 non-null  int64
 1   item_id         214200 non-null  int64
 2   date_block_num  214200 non-null  int64
 3   item_cnt_month  214200 non-null  int64
 4   item_cat        214200 non-null  int64
dtypes: int64(5)
memory usage: 9.8 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10913804 entries, 0 to 10913803
Data columns (total 5 columns):
 #   Column          Dtype  
---  ------          -----  
 0   date_block_num  int8   
 1   shop_id         int8   
 2   item_id         int16  
 3   item_cnt_month  float16
 4   item_cat        int64  
dtypes: float16(1), int16(1), int64(1), int8(2)
memory usage: 229.0 MB


In [6]:
#test = test.drop('ID',axis=1)
#test.head()
matrix = pd.concat([train,test], keys = ['shop_id','item_id','date_block_num'], ignore_index = True, sort= False)
matrix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11128004 entries, 0 to 11128003
Data columns (total 5 columns):
 #   Column          Dtype  
---  ------          -----  
 0   date_block_num  int64  
 1   shop_id         int64  
 2   item_id         int64  
 3   item_cnt_month  float64
 4   item_cat        int64  
dtypes: float64(1), int64(4)
memory usage: 424.5 MB


In [7]:
def downcast(df):
    #finding floating point columns
    float_cols = [c for c in df if df[c].dtype == 'float64']
    #finding integer columns
    int_cols = [c for c in df if df[c].dtype in ['int32','int64']]
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols] = df[int_cols].astype(np.int16)
    return df

train = downcast(matrix)
del matrix
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11128004 entries, 0 to 11128003
Data columns (total 5 columns):
 #   Column          Dtype  
---  ------          -----  
 0   date_block_num  int16  
 1   shop_id         int16  
 2   item_id         int16  
 3   item_cnt_month  float32
 4   item_cat        int16  
dtypes: float32(1), int16(4)
memory usage: 127.4 MB


In [8]:
train.date_block_num.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34], dtype=int16)

In [9]:
# Function for introducing lag features in the dataset. "COL" is the intended feature
def lag_feature(df, lags, col):
    tmp = df[['date_block_num','shop_id','item_id',col]] #made a temporary dataframe
    for i in tqdm.tqdm(lags):
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id', col+'_lag_'+str(i)]
        shifted['date_block_num'] += i
        df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

ts = time.time()
train = lag_feature(train, lags, 'item_cnt_month')
time.time() - ts

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:27<00:00,  6.89s/it]


27.6822829246521

In [10]:
train.fillna(0, inplace = True)

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11128004 entries, 0 to 11128003
Data columns (total 9 columns):
 #   Column                Dtype  
---  ------                -----  
 0   date_block_num        int16  
 1   shop_id               int16  
 2   item_id               int16  
 3   item_cnt_month        float32
 4   item_cat              int16  
 5   item_cnt_month_lag_1  float32
 6   item_cnt_month_lag_2  float32
 7   item_cnt_month_lag_3  float32
 8   item_cnt_month_lag_6  float32
dtypes: float32(5), int16(4)
memory usage: 382.0 MB


In [18]:
# Price analysis. In this section we will dome some experiment on price analysis will see if there are ups and
# downs in price

In [19]:
train.columns

Index(['date_block_num', 'item_id', 'shop_id', 'item_cnt_month', 'item_cat',
       'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
       'item_cnt_month_lag_6', 'item_price'],
      dtype='object')

In [12]:
# Now we shall do some investigation on item price over both shop and time

In [13]:
exp = sales_train.groupby(['shop_id','item_id'])['item_price'].max()
exp = pd.DataFrame(exp).reset_index()
exp.rename(columns={'item_price':'max'}, inplace = True)
exp.head()

Unnamed: 0,shop_id,item_id,max
0,0,30,265.0
1,0,31,434.0
2,0,32,221.0
3,0,33,347.0
4,0,35,247.0


In [14]:
exp1 = sales_train.groupby(['shop_id','item_id'])['item_price'].min()
exp1 = pd.DataFrame(exp1).reset_index()
exp1.rename(columns={'item_price':'min'}, inplace = True)
exp1.head()

Unnamed: 0,shop_id,item_id,min
0,0,30,265.0
1,0,31,434.0
2,0,32,221.0
3,0,33,347.0
4,0,35,247.0


In [15]:
matrix = pd.merge(exp,exp1, on=['shop_id','item_id'],how = 'left')

In [16]:
matrix.head()

Unnamed: 0,shop_id,item_id,max,min
0,0,30,265.0,265.0
1,0,31,434.0,434.0
2,0,32,221.0,221.0
3,0,33,347.0,347.0
4,0,35,247.0,247.0


In [17]:
matrix['change'] = np.abs(matrix['max'] -matrix['min'])

In [18]:
matrix

Unnamed: 0,shop_id,item_id,max,min,change
0,0,30,265.0,265.0,0.0
1,0,31,434.0,434.0,0.0
2,0,32,221.0,221.0,0.0
3,0,33,347.0,347.0,0.0
4,0,35,247.0,247.0,0.0
...,...,...,...,...,...
424118,59,22154,999.0,999.0,0.0
424119,59,22155,149.0,149.0,0.0
424120,59,22162,399.0,349.0,50.0
424121,59,22164,749.0,699.0,50.0


In [19]:
# This means there is a change of item price over TIME. Now we need to see if there is variation of item price
# over the SHOPS. 

In [20]:
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [21]:
shop_matrix = sales_train.pivot_table(index=['date','item_id'], columns = 'shop_id', values = 'item_price',aggfunc= np.mean, fill_value=0)

In [22]:
shop_matrix = shop_matrix.droplevel('date',axis=0)

In [23]:
shop_matrix_2d = shop_matrix.values

In [24]:
for i in range (0,shop_matrix_2d.shape[0]):
    maxs = np.max(shop_matrix_2d[i])
    shop_matrix_2d[i] = shop_matrix_2d[i]/maxs

In [25]:
shop_matrix_2d.shape

(1541612, 60)

In [26]:
shop_index = np.zeros((60,1)) # shop_index determines the usual practice of selling a product in a shopt
for i in range(0,shop_matrix_2d.shape[1]):
    temp = shop_matrix_2d[:][i]
    shop_index[i] = np.mean(temp[temp>0])
    #shop_index[i] = np.mean(np.unique(shop_matrix_2d[:][i]))

In [27]:
train['shop_index'] = ''

In [28]:
for i in range(0,60):
    train.loc[train.shop_id==i,'shop_index'] = shop_index[i]

In [29]:
np.min(train.shop_index)

0.7208835341365462

In [30]:
def nday_month(x):
    if x<12:
        year = 2013
    if x>=12 and x<22:
        year = 2014
    if x>=22:
        year = 2015
    month = np.mod(x,12) + 1
    temp = monthrange(year,month)[1]
    return temp

In [31]:
nday_month(32)

30

In [32]:
train.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_cat,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_6,shop_index
0,0,0,19,0.0,40,0.0,0.0,0.0,0.0,0.999733
1,0,0,27,0.0,19,0.0,0.0,0.0,0.0,0.999733
2,0,0,28,0.0,30,0.0,0.0,0.0,0.0,0.999733
3,0,0,29,0.0,23,0.0,0.0,0.0,0.0,0.999733
4,0,0,32,6.0,40,0.0,0.0,0.0,0.0,0.999733


In [33]:
train['day_in_month'] = ''

In [34]:
train['day_in_month'] = train.date_block_num.apply(lambda x: nday_month(x))

In [35]:
train.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_cat,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_6,shop_index,day_in_month
0,0,0,19,0.0,40,0.0,0.0,0.0,0.0,0.999733,31
1,0,0,27,0.0,19,0.0,0.0,0.0,0.0,0.999733,31
2,0,0,28,0.0,30,0.0,0.0,0.0,0.0,0.999733,31
3,0,0,29,0.0,23,0.0,0.0,0.0,0.0,0.999733,31
4,0,0,32,6.0,40,0.0,0.0,0.0,0.0,0.999733,31


In [36]:
def wday_month(x):
    if x<12:
        year = 2013
    if x>=12 and x<22:
        year = 2014
    if x>=22:
        year = 2015
    month = np.mod(x,12) + 1
    temp = np.array(calendar.monthcalendar(year,month))[:,calendar.SATURDAY]
    return (temp>0).sum()

In [37]:
train['wday_in_month'] = ''

In [38]:
train['wday_in_month'] = train.date_block_num.apply(lambda x: wday_month(x))

In [39]:
train['month_index'] = ''

In [40]:
train['month_index'] = train.date_block_num.apply(lambda x: np.mod(x,12)+1)

In [41]:
def downcast(df):
    #finding floating point columns
    float_cols = [c for c in df if df[c].dtype == 'float64']
    #finding integer columns
    int_cols = [c for c in df if df[c].dtype in ['int32','int64']]
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols] = df[int_cols].astype(np.int16)
    return df

train = downcast(train)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11128004 entries, 0 to 11128003
Data columns (total 13 columns):
 #   Column                Dtype  
---  ------                -----  
 0   date_block_num        int16  
 1   shop_id               int16  
 2   item_id               int16  
 3   item_cnt_month        float32
 4   item_cat              int16  
 5   item_cnt_month_lag_1  float32
 6   item_cnt_month_lag_2  float32
 7   item_cnt_month_lag_3  float32
 8   item_cnt_month_lag_6  float32
 9   shop_index            object 
 10  day_in_month          int16  
 11  wday_in_month         int16  
 12  month_index           int16  
dtypes: float32(5), int16(7), object(1)
memory usage: 530.6+ MB


In [42]:
exp = train.groupby(['date_block_num','item_id'])['item_cnt_month'].mean()
exp = pd.DataFrame(exp).reset_index()
exp = exp.rename(columns={'item_cnt_month':'avg_item_sale'})
temp = pd.merge(train,exp,on=['item_id','date_block_num'],how = 'left')
temp.avg_item_sale = temp.avg_item_sale.astype(np.float32)
# adding lag feature
temp = lag_feature(temp,lags,'avg_item_sale')
temp.fillna(0,inplace=True)
temp = temp.drop(columns=['avg_item_sale'],axis=1)
train = temp
traincopy = train.copy()

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:31<00:00,  8.02s/it]


In [43]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11128004 entries, 0 to 11128003
Data columns (total 17 columns):
 #   Column                Dtype  
---  ------                -----  
 0   date_block_num        int16  
 1   shop_id               int16  
 2   item_id               int16  
 3   item_cnt_month        float32
 4   item_cat              int16  
 5   item_cnt_month_lag_1  float32
 6   item_cnt_month_lag_2  float32
 7   item_cnt_month_lag_3  float32
 8   item_cnt_month_lag_6  float32
 9   shop_index            float64
 10  day_in_month          int16  
 11  wday_in_month         int16  
 12  month_index           int16  
 13  avg_item_sale_lag_1   float32
 14  avg_item_sale_lag_2   float32
 15  avg_item_sale_lag_3   float32
 16  avg_item_sale_lag_6   float32
dtypes: float32(9), float64(1), int16(7)
memory usage: 700.4 MB


In [44]:
exp = train.groupby(['date_block_num','shop_id'])['item_cnt_month'].mean()
exp = pd.DataFrame(exp).reset_index()
exp = exp.rename(columns={'item_cnt_month':'avg_shop_sale'})
temp = pd.merge(train,exp,on=['shop_id','date_block_num'],how = 'left')
temp.avg_shop_sale = temp.avg_shop_sale.astype(np.float32)
# adding lag feature
temp = lag_feature(temp,lags,'avg_shop_sale')
temp.fillna(0,inplace=True)
temp = temp.drop(columns=['avg_shop_sale'],axis=1)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:32<00:00,  8.21s/it]


In [45]:
train = temp
traincopy = train.copy()

In [46]:
exp = train.groupby(['date_block_num','item_cat'])['item_cnt_month'].mean()
exp = pd.DataFrame(exp).reset_index()
exp = exp.rename(columns={'item_cnt_month':'avg_cat_sale'})
temp = pd.merge(train,exp,on=['item_cat','date_block_num'],how = 'left')
temp.avg_cat_sale = temp.avg_cat_sale.astype(np.float32)
# adding lag feature
temp = lag_feature(temp,lags,'avg_cat_sale')
temp.fillna(0,inplace=True)
temp = temp.drop(columns=['avg_cat_sale'],axis=1)
train = temp
traincopy = train.copy

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:33<00:00,  8.48s/it]


In [47]:
exp = train.groupby(['date_block_num','item_id'])['item_cnt_month'].sum()
exp = pd.DataFrame(exp).reset_index()
exp = exp.rename(columns={'item_cnt_month':'tot_item_sale'})
temp = pd.merge(train,exp,on=['item_id','date_block_num'],how = 'left')
temp.tot_item_sale = temp.tot_item_sale.astype(np.float32)
# adding lag feature
temp = lag_feature(temp,lags,'tot_item_sale')
temp.fillna(0,inplace=True)
temp = temp.drop(columns=['tot_item_sale'],axis=1)
train = temp
traincopy = train.copy()

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:33<00:00,  8.68s/it]


In [48]:
exp = train.groupby(['date_block_num','shop_id'])['item_cnt_month'].sum()
exp = pd.DataFrame(exp).reset_index()
exp = exp.rename(columns={'item_cnt_month':'tot_shop_sale'})
temp = pd.merge(train,exp,on=['shop_id','date_block_num'],how = 'left')
temp.tot_shop_sale = temp.tot_shop_sale.astype(np.float32)
# adding lag feature
temp = lag_feature(temp,lags,'tot_shop_sale')
temp.fillna(0,inplace=True)
temp = temp.drop(columns=['tot_shop_sale'],axis=1)
train = temp
traincopy = train.copy()

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:34<00:00,  9.03s/it]


In [49]:
exp = train.groupby(['date_block_num','item_cat'])['item_cnt_month'].sum()
exp = pd.DataFrame(exp).reset_index()
exp = exp.rename(columns={'item_cnt_month':'tot_cat_sale'})
temp = pd.merge(train,exp,on=['item_cat','date_block_num'],how = 'left')
temp.tot_cat_sale = temp.tot_cat_sale.astype(np.float32)
# adding lag feature
temp = lag_feature(temp,lags,'tot_cat_sale')
temp.fillna(0,inplace=True)
temp = temp.drop(columns=['tot_cat_sale'],axis=1)
train = temp
traincopy = train.copy()

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:35<00:00,  9.27s/it]


In [None]:
# Price Trend Feature
matrix = train.copy()
def strend(row):
    for i in lags:
        if row['pr_trend_lag_'+str(i)]:
            return row['pr_trend_lag_'+str(i)]
    return 0

exp = sales_train.groupby('item_id')['item_price'].mean()
exp = pd.DataFrame(exp)
exp = exp.reset_index()
exp = exp.rename(columns={'item_price':'avg_item_price'})
matrix = pd.merge(matrix,exp,on=['item_id'],how='left')
matrix = lag_feature(matrix,lags,'avg_item_price')
matrix.fillna(0,inplace=True)
for i in tqdm.tqdm(lags):
    matrix['pr_trend_lag_'+str(i)] = ''
    matrix['pr_trend_lag_'+str(i)] = (matrix['avg_item_price_lag_'+str(i)] - matrix['avg_item_price'])/matrix['avg_item_price']
matrix['slope_price'] = matrix.apply(strend, axis=1)
drop_item = ['avg_item_price']
for i in tqdm.tqdm(lags):
    drop_item += ['pr_trend_lag_'+str(i)]
matrix = matrix.drop(columns=drop_item,axis=1)
#train = matrix
#traincopy = train.copy()

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:38<00:00,  9.98s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  4.14it/s]


In [None]:
train = matrix
traincopy = train.copy()
del matrix

In [53]:
############### MODEL PREPARATION ###################
train = traincopy
train.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month', 'item_cat',
       'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
       'item_cnt_month_lag_6', 'shop_index', 'day_in_month', 'wday_in_month',
       'month_index', 'avg_item_sale_lag_1', 'avg_item_sale_lag_2',
       'avg_item_sale_lag_3', 'avg_item_sale_lag_6', 'avg_shop_sale_lag_1',
       'avg_shop_sale_lag_2', 'avg_shop_sale_lag_3', 'avg_shop_sale_lag_6',
       'avg_cat_sale_lag_1', 'avg_cat_sale_lag_2', 'avg_cat_sale_lag_3',
       'avg_cat_sale_lag_6', 'tot_item_sale_lag_1', 'tot_item_sale_lag_2',
       'tot_item_sale_lag_3', 'tot_item_sale_lag_6', 'tot_shop_sale_lag_1',
       'tot_shop_sale_lag_2', 'tot_shop_sale_lag_3', 'tot_shop_sale_lag_6',
       'tot_cat_sale_lag_1', 'tot_cat_sale_lag_2', 'tot_cat_sale_lag_3',
       'tot_cat_sale_lag_6'],
      dtype='object')

In [None]:
#monthly sale
exp = sales_train.groupby('date_block_num')['item_cnt_day'].sum()
monthly_sale = pd.DataFrame(exp).reset_index()
temp = pd.merge(train,monthly_sale,on=['date_block_num'],how= 'left')
temp = temp.rename(columns = {'item_cnt_day':'monthly_sale'})
temp = lag_feature(temp,lags,'monthly_sale')
temp.fillna(0,inplace= True)
temp = temp.drop(columns=['monthly_sale'],axis=1)
train = temp
traincopy = train.copy()

In [None]:
train = traincopy
train = train.drop(columns=['item_price'],axis=1)
#train['item_cnt_month'] = train.item_cnt_month.clip(0,20)
train.fillna(0,inplace=True)
train['shop_index'] = train['shop_index'].astype(np.float32)
train = train[train['date_block_num']>= np.max(lags)]
valid_month = 33
test_month = 34
xtrain = train[train.date_block_num<valid_month].drop(columns='item_cnt_month', axis =1)
ytrain = train[train.date_block_num<valid_month]['item_cnt_month']
xvalid = train[train.date_block_num==valid_month].drop(columns='item_cnt_month', axis =1)
yvalid = train[train.date_block_num==valid_month]['item_cnt_month']
xtest = train[train.date_block_num==test_month].drop(columns='item_cnt_month', axis =1)

In [None]:
ts = time.time()
import sklearn
model = XGBRegressor(
    max_depth=12,
    n_estimators=1000,
    min_child_weight=0.5, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.1,    
    seed=42)

model.fit(
    xtrain, 
    ytrain, 
    eval_metric="rmse", 
    eval_set=[(xtrain, ytrain), (xvalid, yvalid)], 
    verbose=True, 
    early_stopping_rounds = 10)

time.time() - ts

In [None]:
def plot_features(booster, figsize):    
    fig, ax = plt.subplots(1,1,figsize=figsize)
    return plot_importance(booster=booster, ax=ax)
plot_features(model, (10,14))

In [76]:
ypred = model.predict(xtest)

In [77]:
ypred

array([1.0228751, 0.9711656, 1.0630636, ..., 0.9609761, 2.1035473,
       1.4725301], dtype=float32)

In [78]:
## Preparing submission file
test = pd.read_csv('test.csv')

In [79]:
temp = xtest[['shop_id','item_id']]
temp['item_cnt_month'] = ypred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [80]:
submission = pd.concat((remainder[['shop_id','item_id','item_cnt_month']],temp))

In [81]:
exp = pd.merge(test,submission,on=['shop_id','item_id'], how= 'left')
exp = exp.drop(columns=['shop_id','item_id'], axis = 1)
exp.item_cnt_month = exp.item_cnt_month.clip(0,20)

In [82]:
exp.to_csv('submission_xgboost1.csv', index = False)

In [83]:
train.shape

(1369259, 46)