<a href="https://colab.research.google.com/github/vvivvi/kaggle-c1/blob/master/Kaggle_C1_text_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 
import lightgbm 

import re
import os

for p in [np, pd, scipy, sklearn, lightgbm]:
    print (p.__name__, p.__version__)
    
DATA_FOLDER = 'competitive-data-science-predict-future-sales'
test_spec = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv'))


numpy 1.18.1
pandas 0.25.3
scipy 1.4.1
sklearn 0.22.1
lightgbm 2.3.1


In [2]:
def write_predictions_by_array(array, filename):
  df=pd.DataFrame(array)
  df.columns=['item_cnt_month']
  df.to_csv(os.path.join(DATA_FOLDER, filename), index_label='ID')

In [3]:
def clipped_rmse(gt, predicted,clip_min=0, clip_max=20):
  target=np.minimum(np.maximum(gt,clip_min), clip_max)
  return np.sqrt((target-predicted)**2).mean()

In [4]:
all_data=pd.read_csv(DATA_FOLDER + '/all_data_with_lagged_targets.csv')


In [5]:
to_keep_cols =  [col for col in all_data.columns.values if  re.search('target_lag',col)] + [col for col in all_data.columns.values if  re.search('name_category',col)] + ['item_name_cyrillic_fraction', 'shop_id','item_id','item_category_id']
to_keep_cols

['target_lag_2',
 'target_lag_3',
 'target_lag_5',
 'target_lag_12',
 'item_name_category_tfidf_unigram_32',
 'item_name_category_tfidf_unigram_256',
 'item_name_category_tfidf_bigram_32',
 'item_name_category_tfidf_bigram_256',
 'item_name_category_frequent_32',
 'item_name_category_frequent_256',
 'item_name_cyrillic_fraction',
 'shop_id',
 'item_id',
 'item_category_id']

In [6]:
all_data.columns.values

array(['Unnamed: 0', 'Unnamed: 0.1', 'shop_id', 'item_id',
       'date_block_num', 'target', 'target_shop', 'target_item',
       'item_name', 'item_category_id',
       'item_name_category_tfidf_unigram_32',
       'item_name_category_tfidf_unigram_256',
       'item_name_category_tfidf_bigram_32',
       'item_name_category_tfidf_bigram_256',
       'item_name_category_frequent_32',
       'item_name_category_frequent_256', 'item_name_cyrillic_fraction',
       'target_category', 'target_category_tfidf_unigram_32',
       'target_category_tfidf_unigram_256',
       'target_category_tfidf_bigram_32',
       'target_category_tfidf_bigram_256', 'target_category_frequent_32',
       'target_category_frequent_256', 'target_category_within_shop',
       'target_category_tfidf_unigram_32_within_shop',
       'target_category_tfidf_unigram_256_within_shop',
       'target_category_tfidf_bigram_32_within_shop',
       'target_category_tfidf_bigram_256_within_shop',
       'target_category_fr

In [7]:
dates = all_data['date_block_num']

date_block_val = 33
date_block_test = 35 # Dec 2015

dates_train = dates[dates <  date_block_val]
dates_val  = dates[dates == date_block_val]
dates_test  = dates[dates == date_block_test]

to_drop_cols=[col for col in all_data.columns.values if ((not re.search('lag',col)) 
              and (not re.search('_id',col))
              and (not re.search('fraction',col)))] + ['item_name']

#to_keep_cols = ['shop_id','item_id','item_category_id']
# to_keep_cols = ['target_lag_2']
to_keep_cols =  [col for col in all_data.columns.values if  re.search('target_lag',col)]
to_keep_cols += [col for col in all_data.columns.values if re.search('shop_lag',col)] 
to_keep_cols += [col for col in all_data.columns.values if  re.search('category_lag',col)] 
to_keep_cols += [col for col in all_data.columns.values if  re.search('name_category',col)]
to_keep_cols += ['item_name_cyrillic_fraction', 'shop_id','item_id','item_category_id']

#X_train = all_data.loc[dates <  date_block_val, to_keep_cols]
#X_val =  all_data.loc[dates == date_block_val, to_keep_cols]
#X_trainval =  all_data.loc[dates < date_block_test, to_keep_cols]
#X_test =  all_data.loc[dates == date_block_test, to_keep_cols]

y_train = np.clip(all_data.loc[dates <  date_block_val, 'target'].values,0,20)
y_trainval = np.clip(all_data.loc[dates <  date_block_test, 'target'].values,0,20)
y_val =  np.clip(all_data.loc[dates == date_block_val, 'target'].values,0,20)

In [9]:
# find out mapping from test data indices to submission ids

shop_item2submissionid={}
for idx, row in test_spec.iterrows():
    shop_item2submissionid[str(row['shop_id'])+'_'+str(row['item_id'])] = row['ID']
    
test_data=all_data.loc[dates == date_block_test, ['shop_id','item_id']]    
    
testidx2submissionidx=np.zeros(test_data.shape[0], dtype=np.int32)
for idx in range(test_data.shape[0]):
    row =test_data.iloc[idx]
    testidx2submissionidx[idx] = shop_item2submissionid[str(row['shop_id'])+'_'+str(row['item_id'])]
    
print(len(testidx2submissionidx))    

#invert the mapping
submissionidx2testidx=np.zeros(test_data.shape[0], dtype=np.int32)
for i in range(test_data.shape[0]):
    submissionidx2testidx[testidx2submissionidx[i]]=i


214200


In [None]:
len(submissionidx2testidx)

In [10]:
import lightgbm as lgb
from sklearn.metrics import r2_score

lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':2
              }
model = lgb.train(lgb_params, lgb.Dataset(all_data.loc[dates <  date_block_val, to_keep_cols], label=y_train), 100)
pred_lgb_val = np.clip(model.predict(all_data.loc[dates ==  date_block_val, to_keep_cols]), 0, 20)
print('Validation R-squared for LightGBM is %f' % r2_score(y_val, pred_lgb_val))
print('Clipped RMSE of lgb predictions is ', clipped_rmse(y_val, pred_lgb_val))

model = lgb.train(lgb_params, lgb.Dataset(all_data.loc[dates <  date_block_test, to_keep_cols], label=y_trainval), 100)
pred_lgb_test = np.clip(model.predict(all_data.loc[dates ==  date_block_test, to_keep_cols]), 0, 20)
write_predictions_by_array(pred_lgb_test[submissionidx2testidx], 'submission-lgb-misc')
# basic ids: LB score 1.112 / 1.106
# target lag 2: 1.056 / 1.058
# misc 0.958/0.958


Validation R-squared for LightGBM is 0.399198
Clipped RMSE of lgb predictions is  0.3447454704194131


In [None]:
X_test

In [None]:
!pip install catboost

In [None]:
import catboost

from catboost import CatBoostRegressor, Pool

model=CatBoostRegressor(iterations=100)
model.fit(X_train, y_train, verbose=1)
pred_catboost_val = model.predict(X_val.values)

print('Validation R-squared for catboost is %f' % r2_score(y_val, pred_catboost_val))



In [None]:
model=CatBoostRegressor(iterations=100)
model.fit(X_trainval, y_trainval, verbose=1)
pred_catboost_test = model.predict(X_test)
write_predictions_by_array(np.clip(pred_catboost_test,0,20), 'submission-catboost-trainval-ext.csv')

# LB score 0.986/0.996


In [None]:
import lightgbm as lgb

lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':2
              }

model = lgb.train(lgb_params, lgb.Dataset(X_val, label=y_val), 100)
pred_lgb_test = np.clip(model.predict(X_test), 0, 20)
write_predictions_by_array(pred_lgb_test, 'submission-lgb-valonly.csv')
# LB score 1.10/1.10
