<a href="https://colab.research.google.com/github/vvivvi/kaggle-c1/blob/master/C1_hyperparameter_search_catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/52/39/128fff65072c8327371e3c594f3c826d29c85b21cb6485980353b168e0e4/catboost-0.24.2-cp36-none-manylinux1_x86_64.whl (66.1MB)
[K     |████████████████████████████████| 66.2MB 47kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.2


In [None]:
import sys
import os.path
import json
import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 
from itertools import product
import gc
from tqdm import tqdm_notebook
import re
from catboost import CatBoostRegressor, Pool

# True when the notebook runs inside Google Colab (its module is then already loaded).
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive')
    # Without a SETTINGS.json, fall back to a hard-coded data directory on the
    # mounted Drive and write it to SETTINGS.json, so that the loading code
    # below is the same with or without a pre-existing settings file.
    if not os.path.isfile('SETTINGS.json'):
        config = {'DATA_DIR': '/content/gdrive/My Drive/kaggle-c1'}
        with open('SETTINGS.json', 'w') as outfile:
            json.dump(config, outfile)

# Outside Colab, SETTINGS.json must already exist in the working directory.
with open('SETTINGS.json') as config_file:
    config = json.load(config_file)

DATA_DIR = config['DATA_DIR']
print('Using DATA_DIR ', DATA_DIR)

# The other cells of this notebook refer to the data directory as DATA_FOLDER.
DATA_FOLDER = DATA_DIR

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
<authorization code redacted>


In [None]:
# Submission specification. load_feauture_set reads its 'shop_id', 'item_id'
# and 'ID' columns to order the test predictions for the submission file.
test_spec = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv'))

# Key columns of the data (not referenced by the other cells visible in this notebook).
index_cols=['item_id','shop_id','date_block_num']
# date_block_num of the hold-out (validation) block used by load_feauture_set.
date_block_val = 33
# date_block_num of the block that load_feauture_set returns as the test set.
date_block_test = 35 # Dec 2015

In [None]:
# a wrapper class to use pre-defined division to training and hold-out set
# as a cross-validation object

class HoldOut:
    """
    Single-split ("hold-out") cross-validation iterable.

    The caller decides up front which rows are used for fitting and which
    are held out. Iterating over the object produces that one
    ``(train_indices, test_indices)`` pair and then stops, so it can be
    passed wherever an iterable of train/test splits is accepted.

    Parameters
    ----------
    train_indices, test_indices :
        Stored as given and handed back unchanged when the object is iterated.
    """

    def __init__(self, train_indices, test_indices):
        self.train_indices, self.test_indices = train_indices, test_indices

    def __iter__(self):
        # Exactly one fold: the predefined split.
        yield from ((self.train_indices, self.test_indices),)

In [None]:
def downcast_dtypes(df):
    '''
        Halve the width of 64-bit numeric columns, modifying `df` in place.

                `float64` columns become `float32`
                `int64`   columns become `int32`

        Columns of any other dtype are left untouched. The same dataframe
        object is returned for convenience.
    '''
    narrower = {"float64": np.float32, "int64": np.int32}

    # Decide which columns to convert before anything is modified.
    selected = {
        wide: [name for name in df if df[name].dtype == wide]
        for wide in narrower
    }

    # Floats first, then ints.
    for wide, names in selected.items():
        df[names] = df[names].astype(narrower[wide])

    return df

In [None]:
def write_predictions_by_array(array, filename):
  """
  Write predictions to a CSV file in DATA_FOLDER.

  The values are clipped to [0, 20] and stored in a single column named
  'item_cnt_month'; the dataframe index is written as the 'ID' column.
  """
  clipped = np.clip(array, 0, 20)
  submission = pd.DataFrame(clipped)
  submission.columns = ['item_cnt_month']
  target_path = os.path.join(DATA_FOLDER, filename)
  submission.to_csv(target_path, index_label='ID')

In [None]:
def clipped_rmse(gt, predicted,clip_min=0, clip_max=20):
  """
  Root-mean-square error between the clipped ground truth and the predictions.

  Parameters
  ----------
  gt : array-like
      Ground-truth values; clipped to [clip_min, clip_max] before comparison.
  predicted : array-like
      Predicted values; used as given (not clipped here).
  clip_min, clip_max : number
      Bounds applied to `gt`.

  Returns
  -------
  float
      sqrt(mean((clip(gt) - predicted) ** 2))
  """
  target = np.clip(gt, clip_min, clip_max)
  # The mean has to be taken *before* the square root. Taking the root of each
  # squared error first (as this function used to) yields the mean absolute
  # error instead of the RMSE.
  return np.sqrt(((target - predicted) ** 2).mean())

In [None]:
def load_feauture_set(id_string):
  """
  Load one feature set and split it into training, validation and test parts.

  Reads ``feature_set_<id_string>.csv`` from DATA_FOLDER, clips the ``target``
  column to [0, 20] and splits the rows by ``date_block_num``:

  * train    -- the 12 blocks date_block_val - 13 ... date_block_val - 2
  * trainval -- the 12 blocks date_block_test - 13 ... date_block_test - 2
  * val      -- block date_block_val
  * test     -- block date_block_test

  Parameters
  ----------
  id_string : str
      Identifier of the feature set; selects the CSV file that is read.

  Returns
  -------
  X_train, y_train, X_trainval, y_trainval, X_val, y_val, X_test
      Features (without ``target`` and ``date_block_num``) and clipped targets
      of the respective parts.
  submissionidx2testidx : numpy.ndarray of int32
      ``submissionidx2testidx[k]`` is the row position within ``X_test`` of the
      (shop_id, item_id) pair that ``test_spec`` lists under ``ID`` k.

  Raises
  ------
  KeyError
      If a test row's (shop_id, item_id) pair does not occur in ``test_spec``.
  """
  # Format only the file name, so that braces in DATA_FOLDER cannot interfere.
  filename = os.path.join(DATA_FOLDER, 'feature_set_{}.csv'.format(id_string))
  print('reading from file {}'.format(filename))
  # Read the file that was just announced. This function used to read
  # 'feature_set_basic.csv' regardless of id_string, so every feature set
  # silently produced the same data.
  all_data = pd.read_csv(filename)

  all_data['target'] = np.clip(all_data['target'], 0, 20)

  dates = all_data['date_block_num']

  dates_train = (dates >= date_block_val - 13) & (dates <= date_block_val - 2)
  dates_trainval = (dates >= date_block_test - 13) & (dates <= date_block_test - 2)
  dates_val = dates == date_block_val
  dates_test = dates == date_block_test

  # extract training, validation and test sets (labels and features)

  y_train = all_data.loc[dates_train, 'target']
  y_trainval = all_data.loc[dates_trainval, 'target']
  y_val = all_data.loc[dates_val, 'target']

  to_drop_cols = ['target', 'date_block_num']

  X_train = all_data.loc[dates_train].drop(to_drop_cols, axis=1)
  X_trainval = all_data.loc[dates_trainval].drop(to_drop_cols, axis=1)
  X_val = all_data.loc[dates_val].drop(to_drop_cols, axis=1)
  X_test = all_data.loc[dates_test].drop(to_drop_cols, axis=1)

  # determine how to permute test set predictions for submission generation

  # (shop_id, item_id) -> submission ID. Tuple keys built straight from the
  # columns replace the per-row string concatenation: no Series is created per
  # row, and equal numbers match even if the two files store them with
  # different numeric dtypes.
  shop_item2submissionid = dict(
      zip(zip(test_spec['shop_id'], test_spec['item_id']), test_spec['ID']))

  test_data = all_data.loc[dates_test, ['shop_id', 'item_id']]
  n_test = test_data.shape[0]

  testidx2submissionidx = np.zeros(n_test, dtype=np.int32)
  for idx, shop_item in enumerate(zip(test_data['shop_id'], test_data['item_id'])):
    testidx2submissionidx[idx] = shop_item2submissionid[shop_item]

  # invert the mapping (the submission IDs are expected to be a permutation of
  # 0 .. n_test - 1)
  submissionidx2testidx = np.zeros(n_test, dtype=np.int32)
  submissionidx2testidx[testidx2submissionidx] = np.arange(n_test, dtype=np.int32)

  del test_data
  gc.collect()

  return X_train, y_train, X_trainval, y_trainval, X_val, y_val, X_test, submissionidx2testidx

In [None]:
# Coarse hyper-parameter grid for CatBoost's grid_search. 'task_type' has a
# single value, so it only fixes that setting for every candidate.
grid = {'learning_rate': [0.01, 0.03, 0.1, 0.3],
        'iterations': [30, 100, 300, 1000],
        'depth': [4, 5, 6, 8, 10, 12],
        'l2_leaf_reg': [0, 0.3, 1, 3, 10],
        'task_type': ['GPU']}

# Grid-search result per feature set id.
best_params = {}

# The loop variable is named `feature_set_id` rather than `id`, so that the
# builtin id() is not shadowed for the rest of the notebook session.
for feature_set_id in ['basic', 'text', 'within']:

  X_train, y_train, X_trainval, y_trainval, X_val, y_val, X_test, submissionidx2testidx = load_feauture_set(feature_set_id)

  # grid_search takes a single dataset, so the training and validation rows are
  # stacked; with ignore_index=True the first len(X_train) positions are the
  # training rows and the validation rows follow directly after them.
  X_paramsearch = pd.concat([X_train, X_val], ignore_index=True)
  y_paramsearch = pd.concat([y_train, y_val], ignore_index=True)
  train_indices = np.arange(X_train.shape[0])
  val_indices = np.arange(X_val.shape[0]) + X_train.shape[0]

  model = CatBoostRegressor()

  # Evaluate every candidate on the predefined train/validation split only.
  cv = HoldOut(train_indices=train_indices, test_indices=val_indices)

  grid_search_result = model.grid_search(grid,
                                         X=X_paramsearch,
                                         y=y_paramsearch,
                                         cv=cv,
                                         shuffle=False,
                                         refit=False,
                                         calc_cv_statistics=False,
                                         plot=True)

  print('Best CatBoost parameters for feature set {}:'.format(feature_set_id))
  print(grid_search_result)
  best_params[feature_set_id] = grid_search_result

reading from file /content/gdrive/My Drive/kaggle-c1/feature_set_basic.csv


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

bestTest = 1.453284867
bestIteration = 29
0:	loss: 1.4532849	best: 1.4532849 (0)	total: 56.6s	remaining: 7h 31m 47s
bestTest = 1.414045465
bestIteration = 29
1:	loss: 1.4140455	best: 1.4140455 (1)	total: 58.3s	remaining: 3h 52m 14s
bestTest = 1.370855711
bestIteration = 29
2:	loss: 1.3708557	best: 1.3708557 (2)	total: 1m	remaining: 2h 39m 9s
bestTest = 1.357465711
bestIteration = 27
3:	loss: 1.3574657	best: 1.3574657 (3)	total: 1m 1s	remaining: 2h 2m 31s
bestTest = 1.452470606
bestIteration = 29
4:	loss: 1.4524706	best: 1.3574657 (3)	total: 1m 3s	remaining: 1h 40m 32s
bestTest = 1.411912026
bestIteration = 29
5:	loss: 1.4119120	best: 1.3574657 (3)	total: 1m 5s	remaining: 1h 25m 56s
bestTest = 1.373163056
bestIteration = 29
6:	loss: 1.3731631	best: 1.3574657 (3)	total: 1m 7s	remaining: 1h 15m 30s
bestTest = 1.350813065
bestIteration = 27
7:	loss: 1.3508131	best: 1.3508131 (7)	total: 1m 8s	remaining: 1h 7m 37s
bestTest = 1.453011915
bestIteration = 29
8:	loss: 1.4530119	best: 1.3508131 (

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

bestTest = 1.453284867
bestIteration = 29
0:	loss: 1.4532849	best: 1.4532849 (0)	total: 2.47s	remaining: 19m 44s
bestTest = 1.414045465
bestIteration = 29
1:	loss: 1.4140455	best: 1.4140455 (1)	total: 4.23s	remaining: 16m 50s
bestTest = 1.3706624
bestIteration = 29
2:	loss: 1.3706624	best: 1.3706624 (2)	total: 5.99s	remaining: 15m 51s
bestTest = 1.357476795
bestIteration = 27
3:	loss: 1.3574768	best: 1.3574768 (3)	total: 7.74s	remaining: 15m 21s
bestTest = 1.452470606
bestIteration = 29
4:	loss: 1.4524706	best: 1.3574768 (3)	total: 9.48s	remaining: 15m
bestTest = 1.413380488
bestIteration = 29
5:	loss: 1.4133805	best: 1.3574768 (3)	total: 11.2s	remaining: 14m 47s
bestTest = 1.37317485
bestIteration = 29
6:	loss: 1.3731749	best: 1.3574768 (3)	total: 13s	remaining: 14m 36s
bestTest = 1.350813143
bestIteration = 27
7:	loss: 1.3508131	best: 1.3508131 (7)	total: 14.7s	remaining: 14m 26s
bestTest = 1.452446362
bestIteration = 29
8:	loss: 1.4524464	best: 1.3508131 (7)	total: 16.4s	remaining: 

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

bestTest = 1.453284867
bestIteration = 29
0:	loss: 1.4532849	best: 1.4532849 (0)	total: 2.36s	remaining: 18m 51s
bestTest = 1.414045465
bestIteration = 29
1:	loss: 1.4140455	best: 1.4140455 (1)	total: 4.07s	remaining: 16m 11s
bestTest = 1.370855711
bestIteration = 29
2:	loss: 1.3708557	best: 1.3708557 (2)	total: 5.8s	remaining: 15m 21s
bestTest = 1.350894979
bestIteration = 27
3:	loss: 1.3508950	best: 1.3508950 (3)	total: 7.47s	remaining: 14m 49s
bestTest = 1.452470606
bestIteration = 29
4:	loss: 1.4524706	best: 1.3508950 (3)	total: 9.17s	remaining: 14m 30s
bestTest = 1.416602456
bestIteration = 29
5:	loss: 1.4166025	best: 1.3508950 (3)	total: 10.9s	remaining: 14m 19s
bestTest = 1.372543439
bestIteration = 29
6:	loss: 1.3725434	best: 1.3508950 (3)	total: 12.6s	remaining: 14m 10s
bestTest = 1.350813065
bestIteration = 27
7:	loss: 1.3508131	best: 1.3508131 (7)	total: 14.3s	remaining: 14m 3s
bestTest = 1.453011915
bestIteration = 29
8:	loss: 1.4530119	best: 1.3508131 (7)	total: 16s	remain

In [None]:
# Show the grid-search results collected per feature set by the loop above.
best_params

NameError: ignored

In [None]:
# 40-bin histogram (counts, bin edges) of y_train. y_train is reassigned in every
# iteration of the grid-search loop, so this is the one from the last feature set loaded.
np.histogram(y_train,40)

NameError: ignored

In [None]:
# Second hyper-parameter grid for CatBoost's grid_search: a fixed 1000
# iterations, depths 10-14 and learning rates up to 0.5. 'task_type' has a
# single value, so it only fixes that setting for every candidate.
grid = {'learning_rate': [0.03, 0.1, 0.3, 0.5],
        'iterations': [1000],
        'depth': [10, 11, 12, 13, 14],
        'l2_leaf_reg': [0.3, 1, 3, 10],
        'task_type': ['GPU']}

# Grid-search result per feature set id (starts empty again for this grid).
best_params = {}

# The loop variable is named `feature_set_id` rather than `id`, so that the
# builtin id() is not shadowed for the rest of the notebook session.
for feature_set_id in ['basic', 'text', 'within']:

  X_train, y_train, X_trainval, y_trainval, X_val, y_val, X_test, submissionidx2testidx = load_feauture_set(feature_set_id)

  # grid_search takes a single dataset, so the training and validation rows are
  # stacked; with ignore_index=True the first len(X_train) positions are the
  # training rows and the validation rows follow directly after them.
  X_paramsearch = pd.concat([X_train, X_val], ignore_index=True)
  y_paramsearch = pd.concat([y_train, y_val], ignore_index=True)
  train_indices = np.arange(X_train.shape[0])
  val_indices = np.arange(X_val.shape[0]) + X_train.shape[0]

  model = CatBoostRegressor()

  # Evaluate every candidate on the predefined train/validation split only.
  cv = HoldOut(train_indices=train_indices, test_indices=val_indices)

  grid_search_result = model.grid_search(grid,
                                         X=X_paramsearch,
                                         y=y_paramsearch,
                                         cv=cv,
                                         shuffle=False,
                                         refit=False,
                                         calc_cv_statistics=False,
                                         plot=True)

  print('Best CatBoost parameters for feature set {}:'.format(feature_set_id))
  print(grid_search_result)
  best_params[feature_set_id] = grid_search_result

NameError: ignored

In [None]:
# Show the results of the second grid search, keyed by feature set id.
best_params

{}