In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

from catboost import CatBoostRegressor

import lightgbm as lgb

from category_encoders import OneHotEncoder, SumEncoder, HelmertEncoder

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
pd.set_option('display.max.columns', None)

In [3]:
stores = pd.read_csv('store.csv')
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
train_data_prepared = pd.read_csv('train_data.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### Prepare train and test

In [4]:
test_data.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1.0,1,0,0
1,2,3,4,2015-09-17,1.0,1,0,0
2,3,7,4,2015-09-17,1.0,1,0,0
3,4,8,4,2015-09-17,1.0,1,0,0
4,5,9,4,2015-09-17,1.0,1,0,0


### test has no Customers column

In [5]:
train_data_prepared.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,month,year,day,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Date_index,CompetitionOpenSinceDay,CompetitionOpenSinceDate,qty_days_with_competitor
0,1,5,2015-07-31,5263,555,1,1,,1,7,2015,31,c,basic,1270.0,9.0,2008.0,0,,,,2015-07-31,1,2008-09-01,2524.0
1,2,5,2015-07-31,6064,625,1,1,,1,7,2015,31,a,basic,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015-07-31,1,2007-11-01,2829.0
2,3,5,2015-07-31,8314,821,1,1,,1,7,2015,31,a,basic,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct",2015-07-31,1,2006-12-01,3164.0
3,4,5,2015-07-31,13995,1498,1,1,,1,7,2015,31,c,extended,620.0,9.0,2009.0,0,,,,2015-07-31,1,2009-09-01,2159.0
4,5,5,2015-07-31,4822,559,1,1,,1,7,2015,31,a,basic,29910.0,4.0,2015.0,0,,,,2015-07-31,1,2015-04-01,121.0


In [6]:
stores['CompetitionDistance'] = stores['CompetitionDistance'].fillna(0)

In [7]:
ASSORTMENT_MAPPING = {'a': 'basic', 'b': 'extra', 'c': 'extended'}
STATE_HOLIDAYS_MAPPING = {'a': 'public holiday', 'b': 'Easter holiday', 'c': 'Christmas', '0': None}

In [8]:
stores['Assortment'] = stores.Assortment.astype(str).map(ASSORTMENT_MAPPING)

In [9]:
test_data.head(1)

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1.0,1,0,0


In [10]:
test_data.shape, test_data.Store.nunique()

((41088, 8), 856)

In [11]:
STORES_IN_BOTH = list(set(train_data.Store) & set(test_data.Store))

In [12]:
len(STORES_IN_BOTH)

856

In [13]:
test_data['Date'] = pd.to_datetime(test_data['Date']).dt.date

test_data['month'] = pd.to_datetime(test_data.Date).dt.month
test_data['year'] = pd.to_datetime(test_data.Date).dt.year
test_data['day'] = pd.to_datetime(test_data.Date).dt.day

test_data['StateHoliday'] = test_data.StateHoliday.astype(str).map(STATE_HOLIDAYS_MAPPING)

In [14]:
test_data = test_data.merge(stores, on='Store', how='left')

In [15]:
test_data['CompetitionOpenSinceDay'] = 1

test_data['CompetitionOpenSinceDate'] = pd.to_datetime(dict(year=test_data.CompetitionOpenSinceYear,
                                                            month=test_data.CompetitionOpenSinceMonth,
                                                            day=test_data.CompetitionOpenSinceDay
                                                            )
                                                       ).dt.date

In [16]:
test_data['qty_days_with_competitor'] = (test_data['Date'] - test_data['CompetitionOpenSinceDate']
                                         ).dt.days.fillna(0)

In [17]:
test_data.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,month,year,day,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,CompetitionOpenSinceDay,CompetitionOpenSinceDate,qty_days_with_competitor
0,1,1,4,2015-09-17,1.0,1,,0,9,2015,17,c,basic,1270.0,9.0,2008.0,0,,,,1,2008-09-01,2572.0
1,2,3,4,2015-09-17,1.0,1,,0,9,2015,17,a,basic,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct",1,2006-12-01,3212.0
2,3,7,4,2015-09-17,1.0,1,,0,9,2015,17,a,extended,24000.0,4.0,2013.0,0,,,,1,2013-04-01,899.0
3,4,8,4,2015-09-17,1.0,1,,0,9,2015,17,a,basic,7520.0,10.0,2014.0,0,,,,1,2014-10-01,351.0
4,5,9,4,2015-09-17,1.0,1,,0,9,2015,17,a,extended,2030.0,8.0,2000.0,0,,,,1,2000-08-01,5525.0


In [18]:
train_data_prepared.columns

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'month', 'year', 'day', 'StoreType',
       'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoInterval', 'Date_index',
       'CompetitionOpenSinceDay', 'CompetitionOpenSinceDate',
       'qty_days_with_competitor'],
      dtype='object')

In [19]:
train_data_prepared.head(2)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,month,year,day,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Date_index,CompetitionOpenSinceDay,CompetitionOpenSinceDate,qty_days_with_competitor
0,1,5,2015-07-31,5263,555,1,1,,1,7,2015,31,c,basic,1270.0,9.0,2008.0,0,,,,2015-07-31,1,2008-09-01,2524.0
1,2,5,2015-07-31,6064,625,1,1,,1,7,2015,31,a,basic,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015-07-31,1,2007-11-01,2829.0


In [20]:
train_data_prepared['StateHoliday'] = train_data_prepared['StateHoliday'].fillna('no_state_holiday')

In [21]:
train_data_prepared['PromoInterval'] = train_data_prepared['PromoInterval'].fillna('no_promo')

In [22]:
for col in train_data_prepared.columns:
    print(f'{col} has nans: {train_data_prepared[col].hasnans}')

Store has nans: False
DayOfWeek has nans: False
Date has nans: False
Sales has nans: False
Customers has nans: False
Open has nans: False
Promo has nans: False
StateHoliday has nans: False
SchoolHoliday has nans: False
month has nans: False
year has nans: False
day has nans: False
StoreType has nans: False
Assortment has nans: False
CompetitionDistance has nans: False
CompetitionOpenSinceMonth has nans: True
CompetitionOpenSinceYear has nans: True
Promo2 has nans: False
Promo2SinceWeek has nans: True
Promo2SinceYear has nans: True
PromoInterval has nans: False
Date_index has nans: False
CompetitionOpenSinceDay has nans: False
CompetitionOpenSinceDate has nans: True
qty_days_with_competitor has nans: False


In [23]:
for col in test_data.columns:
    print(f'{col} has nans: {test_data[col].hasnans}')

Id has nans: False
Store has nans: False
DayOfWeek has nans: False
Date has nans: False
Open has nans: True
Promo has nans: False
StateHoliday has nans: True
SchoolHoliday has nans: False
month has nans: False
year has nans: False
day has nans: False
StoreType has nans: False
Assortment has nans: False
CompetitionDistance has nans: False
CompetitionOpenSinceMonth has nans: True
CompetitionOpenSinceYear has nans: True
Promo2 has nans: False
Promo2SinceWeek has nans: True
Promo2SinceYear has nans: True
PromoInterval has nans: True
CompetitionOpenSinceDay has nans: False
CompetitionOpenSinceDate has nans: True
qty_days_with_competitor has nans: False


In [24]:
test_data['StateHoliday'] = test_data['StateHoliday'].fillna('no_state_holiday')

test_data['PromoInterval'] = test_data['PromoInterval'].fillna('no_promo')

In [25]:
train_data_prepared_dropped = train_data_prepared[train_data_prepared.Store.isin(test_data.Store)]

In [26]:
train_data_prepared_dropped.shape

(773231, 25)

# Models

## Predict quantity of customers for test

### Scale numeric variables

In [27]:
train_data_prepared_dropped.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,month,year,day,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Date_index,CompetitionOpenSinceDay,CompetitionOpenSinceDate,qty_days_with_competitor
0,1,5,2015-07-31,5263,555,1,1,no_state_holiday,1,7,2015,31,c,basic,1270.0,9.0,2008.0,0,,,no_promo,2015-07-31,1,2008-09-01,2524.0
2,3,5,2015-07-31,8314,821,1,1,no_state_holiday,1,7,2015,31,a,basic,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct",2015-07-31,1,2006-12-01,3164.0
6,7,5,2015-07-31,15344,1414,1,1,no_state_holiday,1,7,2015,31,a,extended,24000.0,4.0,2013.0,0,,,no_promo,2015-07-31,1,2013-04-01,851.0
7,8,5,2015-07-31,8492,833,1,1,no_state_holiday,1,7,2015,31,a,basic,7520.0,10.0,2014.0,0,,,no_promo,2015-07-31,1,2014-10-01,303.0
8,9,5,2015-07-31,8565,687,1,1,no_state_holiday,1,7,2015,31,a,extended,2030.0,8.0,2000.0,0,,,no_promo,2015-07-31,1,2000-08-01,5477.0


In [28]:
train_data_prepared_dropped.columns

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'month', 'year', 'day', 'StoreType',
       'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoInterval', 'Date_index',
       'CompetitionOpenSinceDay', 'CompetitionOpenSinceDate',
       'qty_days_with_competitor'],
      dtype='object')

In [30]:
NUMERIC_FEATURES = ['Customers', 'CompetitionDistance', 'qty_days_with_competitor']
NUMERIC_FEATURES_CUSTOMERS = ['CompetitionDistance', 'qty_days_with_competitor']

CATEGORICAL_FEATURES = ['DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'month',
                        'year', 'day', 'StoreType','Assortment', 'Promo2', 'PromoInterval']

In [33]:
standard_scaler_cus = StandardScaler()

train_numeric_transformed_cus = pd.DataFrame(standard_scaler_cus.fit_transform(
    train_data_prepared_dropped[NUMERIC_FEATURES_CUSTOMERS]), columns=NUMERIC_FEATURES_CUSTOMERS,
                                         index=train_data_prepared_dropped.index
                                        )

In [34]:
train_numeric_transformed_cus.head()

Unnamed: 0,CompetitionDistance,qty_days_with_competitor
0,-0.525614,0.637733
2,1.240641,0.938491
6,2.596236,-0.148466
7,0.332791,-0.40599
8,-0.421232,2.025448


In [35]:
sum_encoder_cus = SumEncoder(cols=CATEGORICAL_FEATURES)

In [83]:
train_categorical_encoded_cus = sum_encoder_cus.fit_transform(
    train_data_prepared_dropped[CATEGORICAL_FEATURES], train_data_prepared_dropped['Sales'])

  elif pd.api.types.is_categorical(cols):


In [84]:
train_categorical_encoded.head()

Unnamed: 0,intercept,DayOfWeek_0,DayOfWeek_1,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,DayOfWeek_5,Open_0,Promo_0,StateHoliday_0,...,day_29,StoreType_0,StoreType_1,StoreType_2,Assortment_0,Assortment_1,Promo2_0,PromoInterval_0,PromoInterval_1,PromoInterval_2
0,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
2,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,-1.0,0.0,1.0,0.0
6,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
7,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
8,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0


In [85]:
train_data_preprocessed = train_numeric_transformed.join(train_categorical_encoded)

In [86]:
train_data_preprocessed.shape

(773231, 68)

In [87]:
train_data_preprocessed.head()

Unnamed: 0,Customers,CompetitionDistance,qty_days_with_competitor,intercept,DayOfWeek_0,DayOfWeek_1,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,DayOfWeek_5,...,day_29,StoreType_0,StoreType_1,StoreType_2,Assortment_0,Assortment_1,Promo2_0,PromoInterval_0,PromoInterval_1,PromoInterval_2
0,-0.136562,-0.525614,0.637733,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
2,0.442956,1.240641,0.938491,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,-1.0,0.0,1.0,0.0
6,1.734888,2.596236,-0.148466,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
7,0.469099,0.332791,-0.40599,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
8,0.151018,-0.421232,2.025448,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0


In [89]:
model = CatBoostRegressor(verbose=True)

In [90]:
model.fit(train_data_preprocessed, train_data_prepared_dropped['Sales'])

Learning rate set to 0.127503
0:	learn: 3340.8342539	total: 206ms	remaining: 3m 25s
1:	learn: 3008.6242305	total: 275ms	remaining: 2m 17s
2:	learn: 2720.2363226	total: 346ms	remaining: 1m 55s
3:	learn: 2476.2996266	total: 421ms	remaining: 1m 44s
4:	learn: 2266.5091935	total: 497ms	remaining: 1m 38s
5:	learn: 2083.0808072	total: 569ms	remaining: 1m 34s
6:	learn: 1929.9200042	total: 668ms	remaining: 1m 34s
7:	learn: 1794.9431831	total: 763ms	remaining: 1m 34s
8:	learn: 1683.6344572	total: 853ms	remaining: 1m 33s
9:	learn: 1584.0008585	total: 928ms	remaining: 1m 31s
10:	learn: 1502.9727012	total: 997ms	remaining: 1m 29s
11:	learn: 1432.8210235	total: 1.06s	remaining: 1m 27s
12:	learn: 1374.4186891	total: 1.13s	remaining: 1m 25s
13:	learn: 1324.3481985	total: 1.2s	remaining: 1m 24s
14:	learn: 1282.9465102	total: 1.28s	remaining: 1m 23s
15:	learn: 1246.0916308	total: 1.34s	remaining: 1m 22s
16:	learn: 1213.8185459	total: 1.41s	remaining: 1m 21s
17:	learn: 1185.6604489	total: 1.48s	remaining

155:	learn: 813.3834557	total: 10.8s	remaining: 58.6s
156:	learn: 812.4482965	total: 10.9s	remaining: 58.6s
157:	learn: 811.9729808	total: 11s	remaining: 58.6s
158:	learn: 811.7780112	total: 11.1s	remaining: 58.5s
159:	learn: 811.2486298	total: 11.1s	remaining: 58.4s
160:	learn: 810.9637531	total: 11.2s	remaining: 58.3s
161:	learn: 810.7344763	total: 11.2s	remaining: 58.1s
162:	learn: 810.4180174	total: 11.3s	remaining: 58s
163:	learn: 809.5478102	total: 11.4s	remaining: 57.9s
164:	learn: 808.0691659	total: 11.4s	remaining: 57.9s
165:	learn: 807.6740041	total: 11.5s	remaining: 57.8s
166:	learn: 805.7842530	total: 11.6s	remaining: 57.9s
167:	learn: 805.5377341	total: 11.7s	remaining: 57.8s
168:	learn: 803.9432713	total: 11.8s	remaining: 57.9s
169:	learn: 803.2435341	total: 11.8s	remaining: 57.8s
170:	learn: 802.9790149	total: 11.9s	remaining: 57.7s
171:	learn: 802.0249453	total: 12s	remaining: 57.6s
172:	learn: 800.7400393	total: 12s	remaining: 57.5s
173:	learn: 798.7746173	total: 12.1s

309:	learn: 709.5103285	total: 22s	remaining: 48.9s
310:	learn: 709.0054140	total: 22.1s	remaining: 48.9s
311:	learn: 708.7572462	total: 22.1s	remaining: 48.8s
312:	learn: 708.0705395	total: 22.2s	remaining: 48.7s
313:	learn: 707.9266624	total: 22.3s	remaining: 48.7s
314:	learn: 707.7699162	total: 22.3s	remaining: 48.6s
315:	learn: 707.0283260	total: 22.4s	remaining: 48.5s
316:	learn: 705.8086130	total: 22.5s	remaining: 48.5s
317:	learn: 705.6832750	total: 22.6s	remaining: 48.4s
318:	learn: 705.5336983	total: 22.6s	remaining: 48.4s
319:	learn: 704.7957201	total: 22.7s	remaining: 48.3s
320:	learn: 704.0024053	total: 22.8s	remaining: 48.2s
321:	learn: 703.8246719	total: 22.9s	remaining: 48.2s
322:	learn: 702.6436257	total: 23s	remaining: 48.1s
323:	learn: 701.7656612	total: 23s	remaining: 48.1s
324:	learn: 700.6033383	total: 23.1s	remaining: 48s
325:	learn: 699.8620364	total: 23.2s	remaining: 48s
326:	learn: 698.8763367	total: 23.3s	remaining: 47.9s
327:	learn: 698.2424693	total: 23.3s	r

462:	learn: 638.8457028	total: 33.4s	remaining: 38.7s
463:	learn: 638.5235525	total: 33.5s	remaining: 38.7s
464:	learn: 638.4403238	total: 33.5s	remaining: 38.6s
465:	learn: 638.3304231	total: 33.6s	remaining: 38.5s
466:	learn: 638.0081791	total: 33.7s	remaining: 38.4s
467:	learn: 637.9192687	total: 33.7s	remaining: 38.3s
468:	learn: 637.6909282	total: 33.8s	remaining: 38.3s
469:	learn: 637.0945404	total: 33.9s	remaining: 38.2s
470:	learn: 636.2218306	total: 33.9s	remaining: 38.1s
471:	learn: 635.4710330	total: 34s	remaining: 38.1s
472:	learn: 634.8765239	total: 34.1s	remaining: 38s
473:	learn: 634.6905527	total: 34.2s	remaining: 38s
474:	learn: 634.3632381	total: 34.3s	remaining: 37.9s
475:	learn: 633.8800835	total: 34.4s	remaining: 37.8s
476:	learn: 633.7941506	total: 34.4s	remaining: 37.7s
477:	learn: 633.7083727	total: 34.5s	remaining: 37.7s
478:	learn: 633.1851632	total: 34.6s	remaining: 37.6s
479:	learn: 632.8650473	total: 34.7s	remaining: 37.5s
480:	learn: 632.5204454	total: 34.

618:	learn: 591.7304397	total: 44.9s	remaining: 27.7s
619:	learn: 591.6550928	total: 45s	remaining: 27.6s
620:	learn: 591.1178521	total: 45.1s	remaining: 27.5s
621:	learn: 590.7940703	total: 45.2s	remaining: 27.4s
622:	learn: 590.3292957	total: 45.2s	remaining: 27.4s
623:	learn: 590.0929412	total: 45.3s	remaining: 27.3s
624:	learn: 590.0139048	total: 45.4s	remaining: 27.3s
625:	learn: 589.4521383	total: 45.5s	remaining: 27.2s
626:	learn: 589.2875016	total: 45.6s	remaining: 27.1s
627:	learn: 589.1987357	total: 45.7s	remaining: 27s
628:	learn: 588.8966957	total: 45.7s	remaining: 27s
629:	learn: 588.6796319	total: 45.8s	remaining: 26.9s
630:	learn: 588.6029904	total: 45.9s	remaining: 26.8s
631:	learn: 588.1630389	total: 46s	remaining: 26.8s
632:	learn: 587.7977230	total: 46.1s	remaining: 26.7s
633:	learn: 587.4972724	total: 46.2s	remaining: 26.7s
634:	learn: 587.3402354	total: 46.3s	remaining: 26.6s
635:	learn: 587.2795589	total: 46.4s	remaining: 26.6s
636:	learn: 586.9403327	total: 46.5s

772:	learn: 556.7436492	total: 55.8s	remaining: 16.4s
773:	learn: 556.6359127	total: 55.9s	remaining: 16.3s
774:	learn: 556.5243279	total: 55.9s	remaining: 16.2s
775:	learn: 556.2318637	total: 56s	remaining: 16.2s
776:	learn: 556.0201920	total: 56.1s	remaining: 16.1s
777:	learn: 555.9621450	total: 56.1s	remaining: 16s
778:	learn: 555.8659504	total: 56.2s	remaining: 15.9s
779:	learn: 555.3314471	total: 56.3s	remaining: 15.9s
780:	learn: 555.2226269	total: 56.3s	remaining: 15.8s
781:	learn: 555.1600054	total: 56.4s	remaining: 15.7s
782:	learn: 554.9989306	total: 56.5s	remaining: 15.6s
783:	learn: 554.6067489	total: 56.6s	remaining: 15.6s
784:	learn: 554.2596694	total: 56.6s	remaining: 15.5s
785:	learn: 553.9253688	total: 56.7s	remaining: 15.4s
786:	learn: 553.8939530	total: 56.7s	remaining: 15.4s
787:	learn: 553.4797286	total: 56.8s	remaining: 15.3s
788:	learn: 553.4412329	total: 56.9s	remaining: 15.2s
789:	learn: 553.2994523	total: 57s	remaining: 15.1s
790:	learn: 553.2316479	total: 57s

926:	learn: 528.2057449	total: 1m 6s	remaining: 5.21s
927:	learn: 528.1775658	total: 1m 6s	remaining: 5.14s
928:	learn: 528.1270292	total: 1m 6s	remaining: 5.07s
929:	learn: 528.0284375	total: 1m 6s	remaining: 5s
930:	learn: 527.7454053	total: 1m 6s	remaining: 4.93s
931:	learn: 527.4894723	total: 1m 6s	remaining: 4.86s
932:	learn: 527.1614450	total: 1m 6s	remaining: 4.79s
933:	learn: 527.0101933	total: 1m 6s	remaining: 4.71s
934:	learn: 526.8013205	total: 1m 6s	remaining: 4.64s
935:	learn: 526.7496874	total: 1m 6s	remaining: 4.57s
936:	learn: 526.7029857	total: 1m 6s	remaining: 4.5s
937:	learn: 526.4464398	total: 1m 6s	remaining: 4.42s
938:	learn: 526.3907906	total: 1m 7s	remaining: 4.35s
939:	learn: 526.2465042	total: 1m 7s	remaining: 4.28s
940:	learn: 526.0143837	total: 1m 7s	remaining: 4.21s
941:	learn: 525.9775994	total: 1m 7s	remaining: 4.14s
942:	learn: 525.9468337	total: 1m 7s	remaining: 4.07s
943:	learn: 525.8055873	total: 1m 7s	remaining: 4s
944:	learn: 525.7655419	total: 1m 7

<catboost.core.CatBoostRegressor at 0x7fd1c2fcca30>

In [92]:
r2_score(train_data_prepared_dropped['Sales'], model.predict(train_data_preprocessed))

0.9805588501754268

In [94]:
test_data.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,month,year,...,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,CompetitionOpenSinceDate
0,1,1,4,2015-09-17,1.0,1,no_state_holiday,0,9,2015,...,c,basic,1270.0,9.0,2008.0,0,,,no_promo,2008-09-01
1,2,3,4,2015-09-17,1.0,1,no_state_holiday,0,9,2015,...,a,basic,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct",2006-12-01
2,3,7,4,2015-09-17,1.0,1,no_state_holiday,0,9,2015,...,a,extended,24000.0,4.0,2013.0,0,,,no_promo,2013-04-01
3,4,8,4,2015-09-17,1.0,1,no_state_holiday,0,9,2015,...,a,basic,7520.0,10.0,2014.0,0,,,no_promo,2014-10-01
4,5,9,4,2015-09-17,1.0,1,no_state_holiday,0,9,2015,...,a,extended,2030.0,8.0,2000.0,0,,,no_promo,2000-08-01


In [95]:
train_data_prepared_dropped.head(2)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,month,...,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Date_index,CompetitionOpenSinceDay,CompetitionOpenSinceDate,qty_days_with_competitor
0,1,5,2015-07-31,5263,555,1,1,no_state_holiday,1,7,...,9.0,2008.0,0,,,no_promo,2015-07-31,1,2008-09-01,2524.0
2,3,5,2015-07-31,8314,821,1,1,no_state_holiday,1,7,...,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct",2015-07-31,1,2006-12-01,3164.0


In [93]:
test_data_num = pd.DataFrame(standard_scaler.transform(
    test_data[numeric_features]), columns=numeric_features,
                                         index=test_data.index
                                        )

KeyError: "['qty_days_with_competitor', 'Customers'] not in index"

In [None]:
r2_score(test_data['Sales'], model.predict(test_data))