In [110]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import datetime
import math
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss
from catboost import CatBoostClassifier, Pool

pd.set_option('display.max_columns', 500)

In [97]:
class Model:
    """Small convenience wrapper around ``CatBoostClassifier``.

    The wrapper remembers which columns are categorical and which
    hyper-parameters to use, and exposes fit / predict / score helpers plus a
    stratified K-fold cross-validation routine.

    Attributes:
        cat_features_indices: positional indices of the categorical columns,
            forwarded to ``Pool(..., cat_features=...)``.
        params: dict of keyword arguments applied to the classifier through
            ``set_params`` on every ``fit``.
        model: the most recently fitted ``CatBoostClassifier`` (``None``
            until ``fit`` has been called).
    """

    # Fallback hyper-parameters used when the caller does not pass any.
    DEFAULT_PARAMS = {'n_estimators': 500, 'learning_rate': 0.07}

    def __init__(self, cat_features_indices, params=None):
        """Store the configuration; no model is built until ``fit``.

        Args:
            cat_features_indices: positional indices of categorical columns.
            params: classifier hyper-parameters. ``None`` selects
                ``DEFAULT_PARAMS``.
        """
        self.cat_features_indices = cat_features_indices
        # Copy so that neither the class-level defaults nor the caller's dict
        # is aliased: mutating ``self.params`` must not leak anywhere else.
        self.params = dict(self.DEFAULT_PARAMS if params is None else params)
        self.model = None

    def fit(self, X, y):
        """Train a fresh classifier on ``X`` / ``y`` and keep it in ``self.model``."""
        pool = Pool(X, y, cat_features=self.cat_features_indices)
        self.model = CatBoostClassifier()
        self.model.set_params(**self.params)

        self.model.fit(pool)

    def predict(self, X):
        """Return the second column of ``predict_proba`` for every row of ``X``.

        For a binary target that column is the positive-class probability.
        """
        pool = Pool(X, cat_features=self.cat_features_indices)

        pred = self.model.predict_proba(pool)[:, 1]
        return pred

    def score(self, X, y):
        """Return ``(roc_auc, log_loss)`` of the current model on ``X`` / ``y``."""
        pred = self.predict(X)

        roc_auc_value = roc_auc_score(y, pred)
        log_loss_value = log_loss(y, pred)
        return roc_auc_value, log_loss_value

    def cross_validate(self, X, y, n_splits=5, random_state=1):
        """Run shuffled stratified K-fold CV, then refit on all of the data.

        Args:
            X: feature table; rows are selected with ``X.iloc``.
            y: target values (array-like). Converted to a NumPy array for the
                fold slicing so that a pandas Series with a non-default index
                is sliced by position rather than by label.
            n_splits: number of folds.
            random_state: seed for the fold shuffling.

        Returns:
            The ``CatBoostClassifier`` fitted on the full ``X`` / ``y``.

        Side effects:
            Prints the mean and per-fold ROC AUC and log loss.
        """
        y_arr = np.asarray(y)
        roc_auc_folds = []
        log_loss_folds = []
        kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

        for train_idx, test_idx in kf.split(X, y_arr):
            X_train, X_test = X.iloc[train_idx, :], X.iloc[test_idx, :]
            y_train, y_test = y_arr[train_idx], y_arr[test_idx]

            self.fit(X_train, y_train)
            ra, ll = self.score(X_test, y_test)

            roc_auc_folds.append(ra)
            log_loss_folds.append(ll)

        # The per-fold models are discarded; the model that is kept (and
        # returned) is trained on every available row.
        self.fit(X, y)

        print('ROC AUC: {}'.format(np.mean(roc_auc_folds)))
        print('LOG LOSS: {}'.format(np.mean(log_loss_folds)))
        print('roc auc folds: {}'.format(roc_auc_folds))
        print('log loss folds: {}'.format(log_loss_folds))
        return self.model

In [222]:
def prep_features(data, reference_date=None, cities_path='data/cities_features.csv'):
    """Build the modelling table from a raw train/test frame.

    Steps:
      * rename ``'Native city'`` to ``'city'`` and left-join the per-city
        features ``f1`` .. ``f8`` read from ``cities_path``;
      * add ``full_years``: whole 365-day years between ``'Birth date'`` and
        ``reference_date``;
      * add ``contract``: 1 where ``'Contract termination date'`` is missing,
        otherwise 0;
      * cast ``'Life status'`` to ``str``.

    Args:
        data: raw frame holding at least ``'Native city'``, ``'Birth date'``,
            ``'Contract termination date'`` and ``'Life status'``.
            Assumes ``'Birth date'`` is already datetime-typed (the notebook
            parses CSV column 1 as dates) -- TODO confirm that column 1 is
            the birth date.
        reference_date: the "today" used for the age computation. ``None``
            means ``datetime.now()``; pass a fixed value so that the training
            and prediction runs compute ages against the same day.
        cities_path: location of the city-feature CSV. It is read with
            explicit ``names``, so its first row is treated as data.

    Returns:
        A new DataFrame carrying the original index of ``data``; ``data``
        itself is not modified.
    """
    if reference_date is None:
        reference_date = datetime.now()
    reference_date = pd.Timestamp(reference_date)

    cities = pd.read_csv(
        cities_path,
        names=['city', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8'],
    )
    # Keep one row per city (the one with the largest f1). Unique keys also
    # guarantee that the left join below cannot change the number of rows.
    cities = cities.sort_values(by='f1', ascending=False).drop_duplicates('city', keep='first')

    df = data.copy().rename(columns={'Native city': 'city'})
    df = df.merge(cities, on='city', how='left')
    # merge() resets the index; restore the caller's row identifiers.
    df.index = data.index

    # Floor division yields the same whole-year count as floor(days / 365)
    # but leaves a missing birth date as NaN instead of raising.
    age_days = (reference_date - df['Birth date']).dt.days
    df['full_years'] = age_days // 365

    df['contract'] = df['Contract termination date'].isna().astype('int64')
    df['Life status'] = df['Life status'].astype(str)
    return df

In [223]:
# Raw training table: the first CSV column becomes the index and the second
# one is parsed as dates.
train = pd.read_csv('data/train.csv', index_col=0, parse_dates=[1])

# Model inputs, assembled group by group. The resulting order is the column
# order of the feature matrix, which the categorical indices depend on.
features = (
    ['full_years', 'Gender', 'Level', 'Life status']
    + ['day_{:02d}'.format(n) for n in range(14)]
    + ['evalexpr', 'match_n_match', 'bsq']
    + ['rush_{:02d}'.format(n) for n in range(3)]
    + ['exam_{:02d}'.format(n) for n in range(3)]
    + ['exam_final', 'contract', 'Memory entrance game']
    + ['f{}'.format(n) for n in range(1, 9)]
    + ['Logic entrance game']
)

# Engineered feature table restricted to the selected columns, and the target
# as a plain array.
data = prep_features(train).loc[:, features]
y = train['contract_status'].values

In [225]:
# Feature matrix handed to the model.
X = data

# Column name -> position in X, used to tell the model which columns are
# categorical.
map_feature_index = dict(zip(X.columns, range(len(X.columns))))
cat_features = ['Gender', 'Life status']
cat_features_indices = [map_feature_index[column] for column in cat_features]

# Hyper-parameters forwarded to the classifier; verbose=500 keeps the training
# log down to the first and last iteration of each 500-tree fit.
model_params = dict(
    random_state=12,
    n_estimators=500,
    learning_rate=0.07,
    verbose=500,
)

model = Model(cat_features_indices=cat_features_indices, params=model_params)
model.cross_validate(X, y)

0:	learn: 0.6079055	total: 14ms	remaining: 6.99s
499:	learn: 0.0068024	total: 2.75s	remaining: 0us
0:	learn: 0.6160149	total: 4.51ms	remaining: 2.25s
499:	learn: 0.0061609	total: 2.52s	remaining: 0us
0:	learn: 0.6068272	total: 2.63ms	remaining: 1.31s
499:	learn: 0.0058749	total: 1.87s	remaining: 0us
0:	learn: 0.6288254	total: 2.63ms	remaining: 1.31s
499:	learn: 0.0065392	total: 1.7s	remaining: 0us
0:	learn: 0.6095084	total: 2.77ms	remaining: 1.38s
499:	learn: 0.0057966	total: 1.68s	remaining: 0us
0:	learn: 0.6183555	total: 2.85ms	remaining: 1.42s
499:	learn: 0.0081450	total: 1.61s	remaining: 0us
ROC AUC: 0.7023253449387719
LOG LOSS: 3.2258532359296543
roc auc folds: [0.6649844720496894, 0.7954192546583851, 0.6828416149068324, 0.6869229319766346, 0.6814584511023176]
log loss folds: [3.7471992916824965, 2.2809040806626504, 3.5842767634167383, 3.0955242653634274, 3.421361778522962]


<catboost.core.CatBoostClassifier at 0x139ef50d0>

In [226]:
def make_prediction_file(model, test_path='data/test.csv', output_path='prediction.csv'):
    """Score the test set with ``model`` and write the result to a CSV file.

    Args:
        model: fitted object whose ``predict`` accepts the feature table and
            returns one score per row.
        test_path: CSV with the raw test rows; the first column is used as
            the index and the second one is parsed as dates.
        output_path: destination CSV. It gets two columns, ``id`` (the test
            index) and ``contract_status`` (the score).

    Returns:
        The array of scores, in the row order of the test file.

    Relies on the notebook-level ``prep_features`` function and ``features``
    list for the feature engineering and the column selection.
    """
    test = pd.read_csv(test_path, index_col=0, parse_dates=[1])

    test_df = prep_features(test)[features]
    pred = model.predict(test_df)

    # Name the columns up front instead of resetting the index and overriding
    # the header at write time; the file contents are the same.
    submission = pd.DataFrame({'id': test_df.index, 'contract_status': pred})
    submission.to_csv(output_path, index=False)
    return pred


In [227]:
# Score the test set with the model fitted above. Besides returning the
# scores, this call writes the prediction CSV as a side effect.
pred = make_prediction_file(model)