In [2]:
# Catboost for Avito Demand Prediction Challenge
# https://www.kaggle.com/c/avito-demand-prediction
# By Nick Brooks, April 2018

import time
notebookstart= time.time()

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc

# Models Packages
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Viz
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt
from sklearn.cross_validation import KFold





In [3]:
class SklearnWrapper(object):
    """Adapter exposing ``fit``/``predict`` on top of an estimator class.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory) that is instantiated as ``clf(**params)``.
    seed : int, default 0
        Value stored under ``'random_state'`` when ``seed_bool`` is truthy.
    params : dict or None, default None
        Keyword arguments for the estimator. The dict is copied, so the
        caller's object is never modified; ``None`` means "no extra arguments".
    seed_bool : bool, default True
        Whether to inject ``seed`` into the estimator parameters.
    """

    def __init__(self, clf, seed=0, params=None, seed_bool = True):
        # Work on a private copy: the original code wrote 'random_state' into
        # the caller's dict and raised TypeError when params was left as None.
        params = dict(params) if params is not None else {}
        if seed_bool:
            params['random_state'] = seed
        self.clf = clf(**params)

    def fit(self, x_train, y_train):
        """Fit the wrapped estimator on ``x_train`` / ``y_train``.

        Reads the module-level ``categorical_features_pos`` at call time and
        forwards it as ``cat_features``, so that name must be defined before
        the first call to ``fit``.
        """
        self.clf.fit(x_train, y_train,
                    cat_features=categorical_features_pos,
                    verbose=True)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)
        
def get_oof(clf, x_train, y, x_test):
    """Compute out-of-fold predictions for the train set and fold-averaged test predictions.

    Parameters
    ----------
    clf : object
        Model exposing ``fit(x, y)`` and ``predict(x)``.
    x_train : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Training target, positionally aligned with ``x_train``.
    x_test : pandas.DataFrame
        Features to score with every fold's model.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof_train, oof_test)`` with shapes ``(len(x_train), 1)`` and
        ``(len(x_test), 1)``. ``oof_test`` is the mean over folds.

    Raises
    ------
    ValueError
        If the fold definition yields no folds.

    Notes
    -----
    The fold definition is read from the module-level ``kf``. It may be an
    iterable of ``(train_index, test_index)`` pairs or an object exposing
    ``split(X)``.
    """
    # Take sizes from the arguments rather than the ntrain/ntest globals so the
    # buffers always match the data actually passed in.
    n_train = len(x_train)
    n_test = len(x_test)

    # Materialise the folds once; this also gives the fold count without
    # relying on the NFOLDS global staying in sync with kf.
    fold_source = kf.split(x_train) if hasattr(kf, 'split') else kf
    folds = list(fold_source)
    if not folds:
        raise ValueError("kf produced no folds")

    oof_train = np.zeros((n_train,))
    oof_test_skf = np.empty((len(folds), n_test))

    for i, (train_index, test_index) in enumerate(folds):
        print('\nFold {}'.format(i))
        x_tr = x_train.iloc[train_index, :]
        y_tr = y.iloc[train_index]
        x_te = x_train.iloc[test_index, :]

        clf.fit(x_tr, y_tr)

        # Held-out rows receive predictions from a model that never saw them.
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

# ---- Load -------------------------------------------------------------------
# NOTE(review): the paths below are absolute and user-specific; consider a
# configurable data directory.
print("\nData Load Stage")
training = pd.read_csv('/home/g492652607/data/train.csv', index_col="item_id", parse_dates=["activation_date"])  # append .sample(1000) for a quick run
traindex = training.index
testing = pd.read_csv('/home/g492652607/data/test.csv', index_col="item_id", parse_dates=["activation_date"])  # append .sample(1000) for a quick run
testdex = testing.index
ntrain = training.shape[0]
ntest = testing.shape[0]

# Split the target off the training features.
y = training.deal_probability.copy()
training = training.drop("deal_probability", axis=1)
print('Train shape: {} Rows, {} Columns'.format(*training.shape))
print('Test shape: {} Rows, {} Columns'.format(*testing.shape))

# Combine Train and Test so both receive identical transformations/encodings.
df = pd.concat([training, testing], axis=0)
dfdex = df.index
del training, testing
gc.collect()
print('\nAll Data shape: {} Rows, {} Columns'.format(*df.shape))

# ---- Feature engineering ----------------------------------------------------
print("Feature Engineering")
# Assign the filled result back explicitly: calling fillna(inplace=True) on a
# column selection does not update the frame under pandas Copy-on-Write.
df["price"] = np.log(df["price"] + 0.001).fillna(-999)
df["image_top_1"] = df["image_top_1"].fillna(-999)

print("\nCreate Time Variables")
df["Weekday"] = df['activation_date'].dt.weekday
#df["Week of Year"] = df['activation_date'].dt.week
df["Day of Month"] = df['activation_date'].dt.day

# Remove Dead Variables
df = df.drop(["activation_date", "image"], axis=1)

# ---- Encoding ---------------------------------------------------------------
print("\nEncode Variables")
categorical = ["user_id","region","city","parent_category_name","category_name","item_seq_number","user_type","image_top_1"]
messy_categorical = ["param_1","param_2","param_3","title","description"] # Need to find better technique for these
print("Encoding :",categorical + messy_categorical)

# Encoder: every listed column is cast to str and replaced by integer labels.
lbl = preprocessing.LabelEncoder()
for col in categorical + messy_categorical:
    df[col] = lbl.fit_transform(df[col].astype(str))

# ---- Split back into train / submission frames ------------------------------
print("\nCatboost Modeling Stage")
# reset_index(drop=True) discards the item_id index directly, replacing the
# reset_index() + drop(['item_id']) round trip.
X = df.loc[traindex, :].reset_index(drop=True)
print("Training Set shape",X.shape)
test = df.loc[testdex, :].reset_index(drop=True)
print("Submission Set Shape: {} Rows, {} Columns".format(*test.shape))
del df
gc.collect()

# Training and Validation Set
#X_train, X_valid, y_train, y_valid = train_test_split(
#    X, y, test_size=0.10, random_state=23)

# Prepare Categorical Variables
def column_index(df, query_cols):
    """Return the positional index of each name in ``query_cols`` within ``df.columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column order defines the positions.
    query_cols : sequence of str
        Column names to locate; the result follows this order.

    Returns
    -------
    numpy.ndarray
        Integer positions, one per entry of ``query_cols``.

    Raises
    ------
    KeyError
        If any requested name is not a column of ``df``. Without this check
        the sorted lookup below would return the position of a different
        column, or fail with an IndexError.
    """
    cols = df.columns.values
    known = set(cols)
    missing = [name for name in query_cols if name not in known]
    if missing:
        raise KeyError("columns not found in frame: {}".format(missing))

    # Binary-search the names in sorted order, then map back to original positions.
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols,query_cols,sorter=sidx)]
# Column positions of every label-encoded feature; SklearnWrapper.fit reads this
# module-level name and passes it on as cat_features.
categorical_features_pos = column_index(X, categorical + messy_categorical)

# Train Model
print("Train CatBoost Decision Tree")
modelstart = time.time()

# CatBoost settings. No seed is listed here: SklearnWrapper adds it as
# 'random_state' from SEED.
cb_param = dict(
    iterations=700,
    learning_rate=0.02,
    depth=12,
    eval_metric='RMSE',
    bagging_temperature=0.2,
    od_type='Iter',
    metric_period=75,
    od_wait=100,
)

NFOLDS = 5
SEED = 42

# NOTE(review): KFold is imported from sklearn.cross_validation in this notebook.
# That legacy API takes the sample count positionally plus n_folds, and get_oof
# iterates the object directly. Moving to sklearn.model_selection.KFold means
# changing this call, the import, and the iteration together.
kf = KFold(ntrain, n_folds=NFOLDS, random_state=SEED)

catboost = SklearnWrapper(clf=CatBoostRegressor, seed=SEED, params=cb_param)
cb_oof_train, cb_oof_test = get_oof(catboost, X, y, test)

# Score the out-of-fold predictions against the true target.
oof_mse = mean_squared_error(y, cb_oof_train)
rms = sqrt(oof_mse)
print('Catboost OOF RMSE: {}'.format(rms))
print("Modeling Stage")

# Stack train OOF predictions above the test predictions, in the same row
# order as the combined train+test index.
cb_preds = np.concatenate([cb_oof_train, cb_oof_test])


Data Load Stage
Train shape: 1503424 Rows, 16 Columns
Test shape: 508438 Rows, 16 Columns

All Data shape: 2011862 Rows, 16 Columns
Feature Engineering

Create Time Variables

Encode Variables
Encoding : ['user_id', 'region', 'city', 'parent_category_name', 'category_name', 'item_seq_number', 'user_type', 'image_top_1', 'param_1', 'param_2', 'param_3', 'title', 'description']

Catboost Modeling Stage
Training Set shape (1503424, 16)
Submission Set Shape: 508438 Rows, 16 Columns
Train CatBoost Decision Tree

Fold 0
0:	learn: 0.2927645	total: 3.58s	remaining: 41m 39s
75:	learn: 0.2348006	total: 4m 44s	remaining: 38m 59s
150:	learn: 0.2283820	total: 9m 30s	remaining: 34m 35s
225:	learn: 0.2262077	total: 14m 37s	remaining: 30m 39s
300:	learn: 0.2251868	total: 20m 7s	remaining: 26m 40s
375:	learn: 0.2243626	total: 25m 36s	remaining: 22m 3s
450:	learn: 0.2236626	total: 31m 4s	remaining: 17m 9s
525:	learn: 0.2229804	total: 36m 37s	remaining: 12m 6s
600:	learn: 0.2224342	total: 42m 7s	remaini

In [4]:
# Label each stacked prediction with the combined train+test index, then turn
# that index back into an ordinary column.
sub = pd.DataFrame(cb_preds, columns=['cb_pred'], index=dfdex).reset_index()


In [5]:
# Preview the first rows of the prediction frame as a quick sanity check.
sub.head()

Unnamed: 0,item_id,cb_pred
0,b912c3c6a6ad,0.099434
1,2dac0150717d,0.140335
2,ba83aefab5dc,0.250393
3,02996f1dd2ea,0.322867
4,7c90be56d2ab,0.411989


In [6]:
# Write the prediction frame to disk; index=False keeps the positional row
# index out of the file.
# NOTE(review): absolute, user-specific output path — consider making it configurable.
sub.to_csv('/home/g492652607/data/catboost.csv', index=False)