<a href="https://colab.research.google.com/github/vvivvi/kaggle-c1/blob/master/Kaggle_C1_text_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 
import lightgbm 
import gc
import re
from tqdm import tqdm_notebook

import matplotlib.pyplot as plt
%matplotlib inline 


# Print the name and version of each core library so the environment that
# produced the results below is recorded alongside them.
for p in [np, pd, scipy, sklearn, lightgbm]:
    print (p.__name__, p.__version__)

numpy 1.18.1
pandas 0.25.3
scipy 1.4.1
sklearn 0.22.1
lightgbm 2.3.1


In [3]:
def write_predictions_by_array(array, filename):
  """Save predictions as a CSV file inside ``DATA_FOLDER``.

  The values in ``array`` become a single column named ``item_cnt_month``;
  the row index is written under the header ``ID``.

  Relies on the notebook-level names ``os`` and ``DATA_FOLDER`` being
  defined by the time the function is called.
  """
  submission = pd.DataFrame(array)
  submission.columns = ['item_cnt_month']
  destination = os.path.join(DATA_FOLDER, filename)
  submission.to_csv(destination, index_label='ID')

In [4]:
def clipped_rmse(gt, predicted,clip_min=0, clip_max=20):
  """Root-mean-squared error of ``predicted`` against a clipped ground truth.

  Args:
    gt: ground-truth values (array-like).
    predicted: predicted values, broadcastable against ``gt``. They are NOT
      clipped here; callers clip beforehand if they want that.
    clip_min: lower bound applied to ``gt`` (default 0).
    clip_max: upper bound applied to ``gt`` (default 20).

  Returns:
    The scalar ``sqrt(mean((clip(gt) - predicted) ** 2))``.
  """
  target = np.minimum(np.maximum(gt, clip_min), clip_max)
  # Average the squared errors first and take the root last. Taking the root
  # of every squared error and then averaging gives the mean absolute error.
  return np.sqrt(np.mean((target - predicted) ** 2))

In [5]:
#from itertools import product
#import gc
#from tqdm import tqdm_notebook


def downcast_dtypes(df):
    """Halve the width of 64-bit numeric columns, modifying ``df`` in place.

    Columns of dtype ``float64`` are converted to ``float32`` and columns of
    dtype ``int64`` to ``int32``; every other column is left untouched.

    Returns the same (mutated) dataframe so the call can be chained.
    """
    # Collect both groups up front, before any column is converted.
    wide_floats = []
    wide_ints = []
    for name in df:
        kind = df[name].dtype
        if kind == "float64":
            wide_floats.append(name)
        if kind == "int64":
            wide_ints.append(name)

    # Replace each group with its narrower counterpart.
    df[wide_floats] = df[wide_floats].astype(np.float32)
    df[wide_ints] = df[wide_ints].astype(np.int32)

    return df

In [6]:
import os

#!wget -O competitive-data-science-predict-future-sales.zip https://github.com/vvivvi/kaggle-c1/blob/master/competitive-data-science-predict-future-sales.zip?raw=true
#!mkdir competitive-data-science-predict-future-sales
#!unzip -o competitive-data-science-predict-future-sales.zip -d competitive-data-science-predict-future-sales

# Directory (relative to the working directory) that holds the competition
# CSV files; every read and write in this notebook goes through it.
DATA_FOLDER = 'competitive-data-science-predict-future-sales'

# Only items.csv and test.csv are loaded in this notebook. The loads for the
# remaining competition tables are kept below, commented out.
#sales    = pd.read_csv(os.path.join(DATA_FOLDER, 'sales_train.csv'))
items           = pd.read_csv(os.path.join(DATA_FOLDER, 'items.csv'))
#item_cats = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories.csv'))
#shops           = pd.read_csv(os.path.join(DATA_FOLDER, 'shops.csv'))
#sample_submission = pd.read_csv(os.path.join(DATA_FOLDER, 'sample_submission.csv'))
test_spec = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv'))


In [7]:
# Distinct shop and item ids that occur in the test specification.
test_shops=test_spec['shop_id'].unique()
test_items=test_spec['item_id'].unique()
# date_block_num values used further down to select the validation rows
# (== date_block_val) and the test rows (== date_block_test).
date_block_val = 33
date_block_test = 35 # Dec 2015
#sales = sales[sales['shop_id'].isin(test_shops)]

In [None]:
# Load the pre-computed feature frame. It was presumably written by an earlier
# run of the (now commented-out) line below -- confirm before regenerating.
#all_data.to_csv('all_data_with_category_targets.csv')
# Build the path with os.path.join, like every other read in this notebook.
all_data = pd.read_csv(os.path.join(DATA_FOLDER, 'all_data_with_category_targets.csv'))
# Shrink 64-bit numeric columns to 32 bits to reduce memory use.
all_data = downcast_dtypes(all_data)
# Trailing ';' keeps the collected-object count out of the cell output.
gc.collect();

In [None]:
# Every column whose name contains the substring 'target'. The bare name on
# the last line displays the resulting list as the cell output.
cols_to_rename = [col for col in all_data.columns.values if re.search('target',col)] 
cols_to_rename

In [None]:
# Key columns that identify one (shop, item, month) row; lags are joined on them.
index_cols = ['shop_id', 'item_id', 'date_block_num']

# Columns that get lagged: every column whose name contains 'target'.
# cols_to_rename = list(all_data.columns.difference(index_cols))
cols_to_rename = [name for name in all_data.columns.values if re.search('target', name)]

# Lags, in months. The original note says there is a one-month gap between
# the training and the test periods -- presumably why no 1-month lag is used.
shift_range = [2, 3, 5, 12]

for month_shift in tqdm_notebook(shift_range):
    # Copy the key and target columns, then move the month index forward by
    # `month_shift`, so that the merge below attaches each row's values to
    # the row of the same shop/item `month_shift` blocks later.
    train_shift = all_data[index_cols + cols_to_rename].copy()
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift

    # Tag every lagged column with its lag, e.g. 'target' -> 'target_lag_2'.
    lagged_names = {
        name: '{}_lag_{}'.format(name, month_shift) for name in cols_to_rename
    }
    train_shift = train_shift.rename(columns=lagged_names)

    # Left join keeps every existing row; all missing values become 0.
    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift
gc.collect()

# Don't use old data from year 2013
all_data = all_data.loc[all_data['date_block_num'] >= 12]

# List of all lagged features: columns whose last '_'-separated token is one
# of the lag values.
lag_suffixes = {str(shift) for shift in shift_range}
fit_cols = [name for name in all_data.columns if name.split('_')[-1] in lag_suffixes]

all_data = downcast_dtypes(all_data)
gc.collect();

In [None]:
# Display the current column names of all_data as a NumPy array.
all_data.columns.values

In [None]:
# Persist the frame next to the other data files, building the path with
# os.path.join like the rest of the notebook. The row index is written as the
# first CSV column because index=False is not passed.
all_data.to_csv(os.path.join(DATA_FOLDER, 'all_data_with_lagged_targets.csv'))

In [None]:
# Display (n_rows, n_columns) of the feature frame. The previous
# `all_data.sha` raised AttributeError because DataFrame has no such attribute.
all_data.shape

In [None]:
dates = all_data['date_block_num']

# Boolean row masks, computed once and reused for features and labels.
is_train = dates < date_block_val
is_val = dates == date_block_val
is_trainval = dates < date_block_test   # every month before the test block
is_test = dates == date_block_test

dates_train = dates[is_train]
dates_val = dates[is_val]
dates_test = dates[is_test]

# Columns to discard: everything whose name contains none of 'lag', '_id' or
# 'fraction', plus 'item_name' explicitly.
to_drop_cols = [
    name
    for name in all_data.columns.values
    if not (
        re.search('lag', name)
        or re.search('_id', name)
        or re.search('fraction', name)
    )
] + ['item_name']

# Feature matrices for each split.
X_train = all_data.loc[is_train].drop(columns=to_drop_cols)
X_val = all_data.loc[is_val].drop(columns=to_drop_cols)
X_trainval = all_data.loc[is_trainval].drop(columns=to_drop_cols)
X_test = all_data.loc[is_test].drop(columns=to_drop_cols)

# Labels: the 'target' column clipped to the range [0, 20].
y_train = np.clip(all_data.loc[is_train, 'target'].values, 0, 20)
y_trainval = np.clip(all_data.loc[is_trainval, 'target'].values, 0, 20)
y_val = np.clip(all_data.loc[is_val, 'target'].values, 0, 20)

In [None]:
# Display the names of the columns kept as model features.
X_train.columns

In [None]:
import lightgbm as lgb
from sklearn.metrics import r2_score

# LightGBM configuration, grouped by purpose.
lgb_params = {
    # objective / evaluation
    'objective': 'mse',
    'metric': 'rmse',
    # tree shape and step size
    'learning_rate': 0.03,
    'num_leaves': 128,
    'min_data_in_leaf': 128,
    # row / column subsampling
    'feature_fraction': 0.75,
    'bagging_fraction': 0.75,
    'bagging_freq': 1,
    'bagging_seed': 128,
    # runtime
    'nthread': 1,
    'verbose': 2,
}

# 1) Fit on the training months and score on the validation month.
model = lgb.train(
    lgb_params,
    lgb.Dataset(X_train, label=y_train),
    num_boost_round=100,
)
pred_lgb_val = np.clip(model.predict(X_val), 0, 20)
print('Validation R-squared for LightGBM is %f' % r2_score(y_val, pred_lgb_val))
print('Clipped RMSE of lgb predictions is ', clipped_rmse(y_val, pred_lgb_val))

# 2) Refit on train + validation months and write the test-month submission.
model = lgb.train(
    lgb_params,
    lgb.Dataset(X_trainval, label=y_trainval),
    num_boost_round=100,
)
pred_lgb_test = np.clip(model.predict(X_test), 0, 20)
write_predictions_by_array(pred_lgb_test, 'submission-lgb-trainval-all-ext-rich-categories.csv')
# LB score 0.949 / 0.953
# ext: 0.943/0.


In [None]:
# Display the training feature matrix as the cell output.
X_train

In [None]:
!pip install catboost

In [None]:
# NOTE(review): the bare `catboost` module and `Pool` are not referenced in
# the cells visible here; only CatBoostRegressor is used.
import catboost

from catboost import CatBoostRegressor, Pool

# Fit a 100-iteration CatBoost regressor on the training months and predict
# the validation month.
model=CatBoostRegressor(iterations=100)
model.fit(X_train, y_train, verbose=1)
# NOTE(review): unlike the LightGBM validation predictions, these are not
# clipped to [0, 20] before scoring -- confirm whether that is intended.
pred_catboost_val = model.predict(X_val.values)

# r2_score is imported in the LightGBM cell above, which must run first.
print('Validation R-squared for catboost is %f' % r2_score(y_val, pred_catboost_val))



In [None]:
# Refit CatBoost on every month before the test block (train + validation),
# predict the test block, clip the predictions to [0, 20] and write them out
# as a submission file.
model=CatBoostRegressor(iterations=100)
model.fit(X_trainval, y_trainval, verbose=1)
pred_catboost_test = model.predict(X_test)
write_predictions_by_array(np.clip(pred_catboost_test,0,20), 'submission-catboost-trainval-ext.csv')

# LB score 0.986/0.996


In [None]:
import lightgbm as lgb

# Same LightGBM configuration as the earlier LightGBM cell, grouped by purpose.
lgb_params = {
    # objective / evaluation
    'objective': 'mse',
    'metric': 'rmse',
    # tree shape and step size
    'learning_rate': 0.03,
    'num_leaves': 128,
    'min_data_in_leaf': 128,
    # row / column subsampling
    'feature_fraction': 0.75,
    'bagging_fraction': 0.75,
    'bagging_freq': 1,
    'bagging_seed': 128,
    # runtime
    'nthread': 1,
    'verbose': 2,
}

# Fit on the validation month only, then predict the test block and write
# the clipped predictions as a submission file.
model = lgb.train(
    lgb_params,
    lgb.Dataset(X_val, label=y_val),
    num_boost_round=100,
)
pred_lgb_test = np.clip(model.predict(X_test), 0, 20)
write_predictions_by_array(pred_lgb_test, 'submission-lgb-valonly.csv')
# LB score 1.10/1.10
