In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import datetime
import math
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss
from catboost import CatBoostClassifier, Pool

pd.set_option('display.max_columns', 500)

In [6]:
class Model:
    """Small convenience wrapper around ``CatBoostClassifier``.

    Holds the positions of the categorical columns and the CatBoost
    parameters, and offers fit / predict / score / cross-validate helpers
    that build the CatBoost ``Pool`` objects for the caller.
    """

    def __init__(self, cat_features_indices, params=None):
        """Store the model configuration.

        Parameters
        ----------
        cat_features_indices : list of int
            Positions of the categorical columns in the feature matrix.
        params : dict, optional
            Keyword parameters forwarded to ``CatBoostClassifier.set_params``.
            Defaults to ``{'n_estimators': 500, 'learning_rate': 0.07}``.
        """
        # A dict literal as the default argument would be shared by every
        # instance, so the default is created per call instead.
        if params is None:
            params = {'n_estimators': 500, 'learning_rate': 0.07}
        self.cat_features_indices = cat_features_indices
        # Copy so that later edits to the caller's dict do not silently
        # change this model's configuration (and vice versa).
        self.params = dict(params)
        # Set by fit(); None means "not trained yet".
        self.model = None

    def fit(self, X, y):
        """Train a fresh ``CatBoostClassifier`` on ``X`` / ``y``.

        Any previously trained model held in ``self.model`` is replaced.
        """
        pool = Pool(X, y, cat_features=self.cat_features_indices)
        self.model = CatBoostClassifier()
        self.model.set_params(**self.params)

        self.model.fit(pool)

    def predict(self, X):
        """Return the second column of ``predict_proba`` for every row of ``X``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.model is None:
            raise RuntimeError('Model is not fitted yet; call fit() first.')
        pool = Pool(X, cat_features=self.cat_features_indices)

        return self.model.predict_proba(pool)[:, 1]

    def score(self, X, y):
        """Return ``(roc_auc, log_loss)`` of the predictions for ``X`` against ``y``."""
        pred = self.predict(X)

        roc_auc_value = roc_auc_score(y, pred)
        log_loss_value = log_loss(y, pred)
        return roc_auc_value, log_loss_value

    def cross_validate(self, X, y, n_splits=5, random_state=1):
        """Run stratified K-fold cross-validation, then refit on all data.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix (rows are selected with ``.iloc``).
        y : array-like
            Target labels, aligned positionally with the rows of ``X``.
        n_splits : int, default 5
            Number of stratified folds.
        random_state : int, default 1
            Seed for the shuffled fold assignment.

        Returns
        -------
        The underlying classifier fitted on the full ``X`` / ``y``.
        """
        # Fold indices are positional. Indexing a pandas Series with them is
        # label-based and would pick wrong rows (or raise) for a non-default
        # index, so the target is converted to a plain array first.
        y_values = np.asarray(y)

        roc_auc_list = []
        log_loss_list = []
        kf = StratifiedKFold(n_splits, shuffle=True, random_state=random_state)

        for train_idx, test_idx in kf.split(X, y_values):
            X_train, X_test = X.iloc[train_idx, :], X.iloc[test_idx, :]
            y_train, y_test = y_values[train_idx], y_values[test_idx]

            self.fit(X_train, y_train)
            ra, ll = self.score(X_test, y_test)

            roc_auc_list.append(ra)
            log_loss_list.append(ll)

        # The fold models are discarded; the model that is kept and returned
        # is trained on the complete data set.
        self.fit(X, y_values)

        print('ROC AUC: {}'.format(np.mean(roc_auc_list)))
        print('LOG LOSS: {}'.format(np.mean(log_loss_list)))
        print('roc auc folds: {}'.format(roc_auc_list))
        print('log loss folds: {}'.format(log_loss_list))
        return self.model

In [21]:
# Scratch check of the current calendar year; the saved output for this cell is 2020.
datetime.now().year

2020

In [45]:
def prep_features(data, reference_date=None, cities_path='data/cities_features.csv'):
    """Build the model feature frame from a raw data frame.

    Steps:
      * rename 'Native city' -> 'city' and 'Wave id' -> 'wave_id';
      * left-join the per-city features f1..f8 read from ``cities_path``;
      * add 'age' (whole 365-day years between 'Birth date' and
        ``reference_date``);
      * add 'contract' (1 when 'Contract termination date' is missing, else 0);
      * cast 'Life status' to string.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw rows. 'Birth date' must already be a datetime column.
        The input frame is not modified.
    reference_date : datetime, optional
        Date that ages are measured against. Defaults to ``datetime.now()``;
        pass a fixed date to make the 'age' feature reproducible.
    cities_path : str, optional
        Header-less CSV with the columns city, f1..f8.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the added columns, carrying ``data``'s index.
    """
    if reference_date is None:
        reference_date = datetime.now()

    df = data.copy().rename({'Native city': 'city',
                             'Wave id': 'wave_id'}, axis=1)

    cities = pd.read_csv(cities_path,
                         names=['city', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8'])
    # One row per city (the one with the largest f1), so the left join below
    # cannot multiply rows of ``df``.
    cities = cities.sort_values(by='f1', ascending=False).drop_duplicates('city', keep='first')

    df = df.merge(cities, on='city', how='left')
    # merge() resets the index; restore the caller's row ids.
    df.index = data.index

    # Floor division yields the same whole-year ages as floor(days / 365) but,
    # unlike math.floor, leaves a missing birth date as NaN instead of raising.
    df['age'] = (reference_date - df['Birth date']).dt.days // 365
    df['contract'] = df['Contract termination date'].isna().astype('int64')
    df['Life status'] = df['Life status'].astype(str)
    # Disabled experiment (kept for reference):
#     df['lvl_wave_id'] = df['Level'].astype(str) + df['wave_id'].astype(str)
    return df

In [50]:
# Raw training rows; the first CSV column is the index and column 1 is parsed as dates.
# NOTE(review): prep_features does datetime arithmetic on 'Birth date', so column 1
# is presumably that column — confirm against the CSV header.
train = pd.read_csv('data/train.csv', index_col=0, parse_dates=[1])

# The feature list is assembled from groups; the resulting order is significant
# because categorical columns are later addressed by position.
day_cols = ['day_{:02d}'.format(i) for i in range(14)]
rush_cols = ['rush_{:02d}'.format(i) for i in range(3)]
exam_cols = ['exam_{:02d}'.format(i) for i in range(3)] + ['exam_final']
city_cols = ['f{}'.format(i) for i in range(1, 9)]

# NOTE(review): 'contract' is derived from 'Contract termination date'; confirm it
# does not leak the `contract_status` target.
features = (
    ['age', 'Gender', 'Level', 'Life status']
    + day_cols
    + ['evalexpr', 'match_n_match', 'bsq']
    + rush_cols
    + exam_cols
    + ['contract', 'Memory entrance game']
    + city_cols
    + ['Logic entrance game', 'wave_id']
)

data = prep_features(train)[features]
y = train['contract_status'].values

In [51]:
# Feature matrix used for cross-validation and for the final fit.
X = data

# Categorical columns are passed to the model by position, so translate the
# column names into their positions in X.
cat_features = ['Gender', 'Life status', 'wave_id']
map_feature_index = dict(zip(X.columns, range(len(X.columns))))
cat_features_indices = [map_feature_index[name] for name in cat_features]

model_params = dict(
    random_state=12,
    n_estimators=500,
    learning_rate=0.07,
    verbose=500,
)

model = Model(cat_features_indices, model_params)
# Last expression of the cell: prints the fold metrics and displays the fitted classifier.
model.cross_validate(X, y)

0:	learn: 0.6155728	total: 4.54ms	remaining: 2.26s
499:	learn: 0.0071410	total: 1.36s	remaining: 0us
0:	learn: 0.6310683	total: 3.54ms	remaining: 1.77s
499:	learn: 0.0073879	total: 1.41s	remaining: 0us
0:	learn: 0.6265943	total: 2.64ms	remaining: 1.32s
499:	learn: 0.0071217	total: 1.41s	remaining: 0us
0:	learn: 0.6121372	total: 2.66ms	remaining: 1.33s
499:	learn: 0.0051506	total: 1.48s	remaining: 0us
0:	learn: 0.6136182	total: 2.72ms	remaining: 1.36s
499:	learn: 0.0056933	total: 1.45s	remaining: 0us
0:	learn: 0.6169000	total: 4.27ms	remaining: 2.13s
499:	learn: 0.0079355	total: 1.93s	remaining: 0us
ROC AUC: 0.9513093277717113
LOG LOSS: 0.2012972068688635
roc auc folds: [0.9612068965517241, 0.9703898050974513, 0.9270186335403726, 0.9430132708821234, 0.9549180327868853]
log loss folds: [0.21845127897396088, 0.16501565729581663, 0.21222096392888887, 0.23658124313381104, 0.17421689101184]


<catboost.core.CatBoostClassifier at 0x1a1ec38c50>

In [None]:
def make_prediction_file(model, test_path='data/test.csv', output_path='prediction.csv'):
    """Score the test set and write the predictions to a CSV file.

    The test rows go through the same ``prep_features`` step and the same
    module-level ``features`` column list as the training data.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(X)`` that returns one score per row.
    test_path : str, optional
        CSV with the raw test rows (first column is the row id).
    output_path : str, optional
        Destination CSV; written with the columns ``id`` and
        ``contract_status``.

    Returns
    -------
    The array of predictions returned by ``model.predict``.
    """
    test = pd.read_csv(test_path, index_col=0, parse_dates=[1])

    test_df = prep_features(test)[features]
    pred = model.predict(test_df)

    # Build the two output columns explicitly. The earlier version passed three
    # header aliases for a two-column frame, which pandas rejects with
    # "Writing 2 cols but got 3 aliases".
    submission = pd.DataFrame({'id': test_df.index, 'contract_status': pred})
    submission.to_csv(output_path, index=False)
    return pred


In [None]:
# Score the test set with the wrapper trained above; this also writes prediction.csv.
pred = make_prediction_file(model)

In [52]:
# Importance of every feature for the classifier fitted on the full training set,
# evaluated on the training pool itself.
importance_pool = Pool(X, label=y, cat_features=cat_features_indices)
feature_importance = model.model.get_feature_importance(importance_pool)

# One row per feature, highest score first.
feature_score = (
    pd.DataFrame({'Feature': X.columns, 'Score': feature_importance})
    .sort_values(by='Score', ascending=False)
)

In [53]:
# Display the ranked feature importances.
# NOTE(review): the saved output of this cell lists `lvl_wave_id`, which is not in
# the current `features` list (its creation is commented out in prep_features),
# so the stored table is from an earlier run — re-run to refresh it.
feature_score

Unnamed: 0,Feature,Score
39,wave_id,18.353477
40,lvl_wave_id,13.288441
2,Level,11.146946
28,contract,8.62792
26,exam_02,2.871256
19,match_n_match,2.837856
7,day_03,2.52923
27,exam_final,2.494089
18,evalexpr,2.371652
0,age,2.280639
