<a href="https://colab.research.google.com/github/vvivvi/kaggle-c1/blob/master/Kaggle_C1_text_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 
import lightgbm as lgb
from sklearn.metrics import r2_score
import catboost
import gc

from catboost import CatBoostRegressor, Pool

import re
import os

for p in [np, pd, scipy, sklearn, lgb, catboost]:
    print (p.__name__, p.__version__)
    
DATA_FOLDER = 'competitive-data-science-predict-future-sales'
test_spec = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv'))


numpy 1.18.1
pandas 0.25.3
scipy 1.4.1
sklearn 0.22.1
lightgbm 2.3.1
catboost 0.22


In [2]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

In [3]:
!pip install catboost



In [4]:
def write_predictions_by_array(array, filename):
  df=pd.DataFrame(array)
  df.columns=['item_cnt_month']
  df.to_csv(os.path.join(DATA_FOLDER, filename), index_label='ID')

In [5]:
def clipped_rmse(gt, predicted,clip_min=0, clip_max=20):
  target=np.minimum(np.maximum(gt,clip_min), clip_max)
  return np.sqrt((target-predicted)**2).mean()

In [6]:
index_cols=['item_id','shop_id','date_block_num']
category_data=pd.read_csv(DATA_FOLDER + '/category.csv')
lagged_basic=pd.read_csv(DATA_FOLDER + '/lagged_basic.csv') 
targets = pd.read_csv(DATA_FOLDER + '/targets.csv') 
all_data = pd.merge(category_data, lagged_basic, on=index_cols)
all_data = pd.merge(all_data, targets, on=index_cols)

all_data=downcast_dtypes(all_data)
gc.collect()

0

In [7]:
category_data.columns

Index(['Unnamed: 0', 'shop_id', 'item_id', 'date_block_num',
       'item_category_id', 'item_name_category_tfidf_unigram_32',
       'item_name_category_tfidf_unigram_256',
       'item_name_category_tfidf_bigram_32',
       'item_name_category_tfidf_bigram_256', 'item_name_category_frequent_32',
       'item_name_category_frequent_256'],
      dtype='object')

In [8]:
dates = all_data['date_block_num']

date_block_val = 33
date_block_test = 35 # Dec 2015

dates_train = dates[dates <  date_block_val]
dates_val  = dates[dates == date_block_val]
dates_test  = dates[dates == date_block_test]

to_drop_cols=[col for col in all_data.columns.values if ((re.search('^target_',col) and not re.search('lag',col)) or re.search('Unnamed',col)) ]

#X_train = all_data.loc[dates <  date_block_val, to_keep_cols]
#X_val =  all_data.loc[dates == date_block_val, to_keep_cols]
#X_trainval =  all_data.loc[dates < date_block_test, to_keep_cols]
#X_test =  all_data.loc[dates == date_block_test, to_keep_cols]

y_train = np.clip(all_data.loc[dates <  date_block_val, 'target'].values,0,20)
y_trainval = np.clip(all_data.loc[dates <  date_block_test, 'target'].values,0,20)
y_val =  np.clip(all_data.loc[dates == date_block_val, 'target'].values,0,20)

In [9]:
# find out mapping from test data indices to submission ids

shop_item2submissionid={}
for idx, row in test_spec.iterrows():
    shop_item2submissionid[str(row['shop_id'])+'_'+str(row['item_id'])] = row['ID']
    
test_data=all_data.loc[dates == date_block_test, ['shop_id','item_id']]    
    
testidx2submissionidx=np.zeros(test_data.shape[0], dtype=np.int32)
for idx in range(test_data.shape[0]):
    row =test_data.iloc[idx]
    testidx2submissionidx[idx] = shop_item2submissionid[str(row['shop_id'])+'_'+str(row['item_id'])]
    
print(len(testidx2submissionidx))    

#invert the mapping
submissionidx2testidx=np.zeros(test_data.shape[0], dtype=np.int32)
for i in range(test_data.shape[0]):
    submissionidx2testidx[testidx2submissionidx[i]]=i
    
del test_data
gc.collect()


214200


22

In [10]:
to_keep_cols = ['shop_id','item_id','item_category_id']
# to_keep_cols = [col for col in to_keep_cols if not re.search('name',col)]
to_keep_cols = [i for i in to_keep_cols if not i in to_drop_cols]

lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':2
              }
model = lgb.train(lgb_params, lgb.Dataset(all_data.loc[dates <  date_block_val, to_keep_cols], label=y_train), 100)
pred_lgb_val = np.clip(model.predict(all_data.loc[dates ==  date_block_val, to_keep_cols]), 0, 20)
print('Validation R-squared for LightGBM is %f' % r2_score(y_val, pred_lgb_val))
print('Clipped RMSE of lgb predictions is ', clipped_rmse(y_val, pred_lgb_val))
# Validation R-squared for LightGBM is 0.197921
# Clipped RMSE of lgb predictions is  0.40996664568708163
model = lgb.train(lgb_params, lgb.Dataset(all_data.loc[dates <  date_block_test, to_keep_cols], label=y_trainval), 100)
pred_lgb_test = np.clip(model.predict(all_data.loc[dates ==  date_block_test, to_keep_cols]), 0, 20)
write_predictions_by_array(pred_lgb_test[submissionidx2testidx], 'submission-lgb-basic-categries.csv')

Validation R-squared for LightGBM is 0.197921
Clipped RMSE of lgb predictions is  0.40996664568708163


In [11]:
to_keep_cols = [col for col in all_data.columns.values if re.search('name',col)]
to_keep_cols = [i for i in to_keep_cols if not i in to_drop_cols]

lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':2
              }
model = lgb.train(lgb_params, lgb.Dataset(all_data.loc[dates <  date_block_val, to_keep_cols], label=y_train), 100)
pred_lgb_val = np.clip(model.predict(all_data.loc[dates ==  date_block_val, to_keep_cols]), 0, 20)
print('Validation R-squared for LightGBM is %f' % r2_score(y_val, pred_lgb_val))
print('Clipped RMSE of lgb predictions is ', clipped_rmse(y_val, pred_lgb_val))
# Validation R-squared for LightGBM is 0.068788
# Clipped RMSE of lgb predictions is  0.45511637813975203
model = lgb.train(lgb_params, lgb.Dataset(all_data.loc[dates <  date_block_test, to_keep_cols], label=y_trainval), 100)
pred_lgb_test = np.clip(model.predict(all_data.loc[dates ==  date_block_test, to_keep_cols]), 0, 20)
write_predictions_by_array(pred_lgb_test[submissionidx2testidx], 'submission-lgb-text-categories.csv')

Validation R-squared for LightGBM is 0.068788
Clipped RMSE of lgb predictions is  0.45511637813975203


In [12]:
to_keep_cols = [col for col in all_data.columns.values if re.search('name',col)]
to_keep_cols += ['shop_id','item_id','item_category_id']
to_keep_cols = [i for i in to_keep_cols if not i in to_drop_cols]

lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':2
              }
model = lgb.train(lgb_params, lgb.Dataset(all_data.loc[dates <  date_block_val, to_keep_cols], label=y_train), 100)
pred_lgb_val = np.clip(model.predict(all_data.loc[dates ==  date_block_val, to_keep_cols]), 0, 20)
print('Validation R-squared for LightGBM is %f' % r2_score(y_val, pred_lgb_val))
print('Clipped RMSE of lgb predictions is ', clipped_rmse(y_val, pred_lgb_val))

# Validation R-squared for LightGBM is 0.218179
# Clipped RMSE of lgb predictions is  0.40772495453592217

model = lgb.train(lgb_params, lgb.Dataset(all_data.loc[dates <  date_block_test, to_keep_cols], label=y_trainval), 100)
pred_lgb_test = np.clip(model.predict(all_data.loc[dates ==  date_block_test, to_keep_cols]), 0, 20)
write_predictions_by_array(pred_lgb_test[submissionidx2testidx], 'submission-lgb-basic-and-text-categories.csv')

Validation R-squared for LightGBM is 0.218179
Clipped RMSE of lgb predictions is  0.40772495453592217


In [None]:
to_keep_cols = [c for c in all_data.columns.values if re.search('lag',c)]
# to_keep_cols = [col for col in to_keep_cols if not re.search('name',col)]
to_keep_cols = [i for i in to_keep_cols if not i in to_drop_cols]

lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':2
              }
model = lgb.train(lgb_params, lgb.Dataset(all_data.loc[dates <  date_block_val, to_keep_cols], label=y_train), 200)
pred_lgb_val = np.clip(model.predict(all_data.loc[dates ==  date_block_val, to_keep_cols]), 0, 20)
print('Validation R-squared for LightGBM is %f' % r2_score(y_val, pred_lgb_val))
print('Clipped RMSE of lgb predictions is ', clipped_rmse(y_val, pred_lgb_val))
model = lgb.train(lgb_params, lgb.Dataset(all_data.loc[dates <  date_block_test, to_keep_cols], label=y_trainval), 200)
pred_lgb_test = np.clip(model.predict(all_data.loc[dates ==  date_block_test, to_keep_cols]), 0, 20)
write_predictions_by_array(pred_lgb_test[submissionidx2testidx], 'submission-lgb-basic-lagged.csv')

In [15]:
to_keep_cols = [c for c in all_data.columns.values if re.search('lag',c)] + ['shop_id','item_id','item_category_id']
# to_keep_cols = [col for col in to_keep_cols if not re.search('name',col)]
to_keep_cols = [i for i in to_keep_cols if not i in to_drop_cols]

lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':2
              }
model = lgb.train(lgb_params, lgb.Dataset(all_data.loc[dates <  date_block_val, to_keep_cols], label=y_train), 100)
pred_lgb_val = np.clip(model.predict(all_data.loc[dates ==  date_block_val, to_keep_cols]), 0, 20)
print('Validation R-squared for LightGBM is %f' % r2_score(y_val, pred_lgb_val))
print('Clipped RMSE of lgb predictions is ', clipped_rmse(y_val, pred_lgb_val))
# Validation R-squared for LightGBM is 0.335086
#Clipped RMSE of lgb predictions is  0.35535191672828514
model = lgb.train(lgb_params, lgb.Dataset(all_data.loc[dates <  date_block_test, to_keep_cols], label=y_trainval), 100)
pred_lgb_test = np.clip(model.predict(all_data.loc[dates ==  date_block_test, to_keep_cols]), 0, 20)
write_predictions_by_array(pred_lgb_test[submissionidx2testidx], 'submission-lgb-basic-and-lagged.csv')
# LB 1.027 / 1.024

SyntaxError: invalid syntax (<ipython-input-15-f2c6d42c1784>, line 22)

In [13]:
to_keep_cols = [c for c in all_data.columns.values if re.search('lag',c)] + ['shop_id','item_id','item_category_id']
to_keep_cols += [col for col in all_data.columns.values if re.search('name',col)]
# to_keep_cols = [col for col in to_keep_cols if not re.search('name',col)]
to_keep_cols = [i for i in to_keep_cols if not i in to_drop_cols]

lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':2
              }
model = lgb.train(lgb_params, lgb.Dataset(all_data.loc[dates <  date_block_val, to_keep_cols], label=y_train), 100)
pred_lgb_val = np.clip(model.predict(all_data.loc[dates ==  date_block_val, to_keep_cols]), 0, 20)
print('Validation R-squared for LightGBM is %f' % r2_score(y_val, pred_lgb_val))
print('Clipped RMSE of lgb predictions is ', clipped_rmse(y_val, pred_lgb_val))
# Validation R-squared for LightGBM is 0.364015
# Clipped RMSE of lgb predictions is  0.3477432847525624
model = lgb.train(lgb_params, lgb.Dataset(all_data.loc[dates <  date_block_test, to_keep_cols], label=y_trainval), 100)
pred_lgb_test = np.clip(model.predict(all_data.loc[dates ==  date_block_test, to_keep_cols]), 0, 20)
write_predictions_by_array(pred_lgb_test[submissionidx2testidx], 'submission-lgb-basic-and-lagged-and-text-categories.csv')
# LB scores are: 0.956601 and 0.95988

Validation R-squared for LightGBM is 0.364015
Clipped RMSE of lgb predictions is  0.3477432847525624


In [None]:
from sklearn import linear_model
to_keep_cols = [col for col in all_data.columns.values if  re.search('lag',col)]
# to_keep_cols = [col for col in to_keep_cols if not re.search('name',col)]
to_keep_cols = [i for i in to_keep_cols if not i in to_drop_cols]

model=CatBoostRegressor(iterations=1000, task_type='GPU')
model.fit(all_data.loc[dates <  date_block_val, to_keep_cols].values, y_train, verbose=1)
pred_val = np.clip(model.predict(all_data.loc[dates ==  date_block_val, to_keep_cols].values), 0, 20)
print('Validation R-squared for LassoLars model is %f' % r2_score(y_val, pred_val))
print('Clipped RMSE of LassoLars predictions is ', clipped_rmse(y_val, pred_val))