In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix
from catboost import CatBoostClassifier, Pool, cv

import matplotlib.pyplot as plt
import seaborn as sns

import gc

In [None]:
# Read the train and test frames from their v16 pickle files.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
train_prod, test_prod = map(
    pd.read_pickle,
    ("../data/train_prod_v16.pickle", "../data/test_prod_v16.pickle"),
)

# Quick sanity check: (rows, columns) of each split.
print(train_prod.shape, test_prod.shape)

In [None]:
# Signed age gap (from_age minus to_age), added as a new column on both splits.
for split_df in (train_prod, test_prod):
    split_df['age_difference'] = split_df['from_age'] - split_df['to_age']

In [None]:
# More Feature Engineering

def add_swipe_ratio_features(df):
    """Add the swipe-ratio feature columns to ``df`` in place and return it.

    Columns created:

    * ``{to,from}_swipe_by_session_percentage``:
      total swipe count divided by total session count of the same side.
    * ``{to,from}_common_users_{left,right}_swipe_percentage``:
      common-user swipes in that direction divided by that side's swipe
      count in the same direction.
    * ``{to,from}_overall_common_users_{left,right}_swipe_percentage``:
      common-user swipes in that direction divided by that side's total
      swipe count.

    Ratios whose denominator is zero are stored as NaN (not +/-inf) so that
    a later ``fillna`` treats every undefined ratio the same way.
    """
    def ratio(numerator, denominator):
        # x/0 yields +/-inf while 0/0 yields NaN; normalise both to NaN so the
        # missing-value sentinel applied afterwards covers all undefined ratios.
        values = df[numerator] / df[denominator]
        return values.replace([np.inf, -np.inf], np.nan)

    df['to_swipe_by_session_percentage'] = ratio('to_total_swipe_counts',
                                                 'to_total_session_count')

    # The "from" ratio must use the "from" user's session count. Dividing by
    # 'to_total_session_count' here mixes the two users and is not the
    # per-session rate the column name promises.
    from_sessions = 'from_total_session_count'
    if from_sessions not in df.columns:
        # NOTE(review): the frame schema is not visible from this notebook; if
        # the "from" session count is genuinely absent, keep the old denominator
        # rather than fail, but make the compromise visible.
        print("WARNING: 'from_total_session_count' not found; "
              "falling back to 'to_total_session_count'")
        from_sessions = 'to_total_session_count'
    df['from_swipe_by_session_percentage'] = ratio('from_total_swipe_counts',
                                                   from_sessions)

    for side in ('to', 'from'):
        for direction in ('left', 'right'):
            common = 'common_users_swiped_{}'.format(direction)
            df['{}_common_users_{}_swipe_percentage'.format(side, direction)] = ratio(
                common, '{}_swipe_{}_count'.format(side, direction))
            df['{}_overall_common_users_{}_swipe_percentage'.format(side, direction)] = ratio(
                common, '{}_total_swipe_counts'.format(side))

    return df


# Apply the identical transformation to both splits so they cannot drift apart.
for split_df in (train_prod, test_prod):
    add_swipe_ratio_features(split_df)

In [None]:
# Replace every missing value with the sentinel -999, in place, on both splits.
for split_df in (train_prod, test_prod):
    split_df.fillna(-999, inplace=True)

In [None]:
# Target column.
dep = 'score'

# Columns excluded from the model inputs: the 'from-to' column and every
# column whose name contains "to_self_intro".
self_intro_mask = train_prod.columns.str.contains("to_self_intro")
drop = ['from-to'] + train_prod.columns[self_intro_mask].tolist()

# Everything that is neither the target nor in the drop list is a feature.
excluded = [dep] + drop
indep = train_prod.columns.difference(excluded)

print("Indep length:", len(indep))

In [None]:
# Stratified 80/20 hold-out split of the training frame for local validation.
# No random_state is passed, so the split draws from numpy's global RNG,
# which is seeded immediately beforehand.
np.random.seed(100)
local_split = train_test_split(
    train_prod[indep],
    train_prod[dep],
    test_size=0.2,
    stratify=train_prod[dep],
)
train_local_X, test_local_X, train_local_Y, test_local_Y = local_split

print(train_local_X.shape, train_local_Y.shape, test_local_X.shape, test_local_Y.shape)

# CatBoost

#### Cross Validation

In [None]:
# Stratified k-fold cross-validation of a CatBoost multi-class model on the
# full training frame.
nrounds = 10000  # upper bound on boosting iterations; early stopping may end sooner
nfold = 5        # number of CV folds (also read by the production-model cell)

cv_dataset = Pool(data=train_prod[indep],
                  label=train_prod[dep])

params = {
    "iterations": nrounds,
    "depth": 10,
    "learning_rate": 0.01,
    "task_type": 'GPU',
    "loss_function": "MultiClass",
    "eval_metric": 'Accuracy',
    "verbose": True,
}

np.random.seed(100)
scores = cv(
    cv_dataset,
    params,
    fold_count=nfold,
    stratified=True,
    early_stopping_rounds=20,
    partition_random_seed=100,
    seed=100,
    # Must be a real boolean: the string "False" is non-empty and therefore
    # truthy, which would request plotting instead of disabling it.
    plot=False,
)

# CatBoost local validation

In [None]:
# Local validation model: fit on the 80% split and monitor accuracy on the
# 20% hold-out, stopping once it has not improved for 20 rounds.
nrounds = 100000  # upper bound only; early stopping is expected to end training first

eval_dataset = Pool(test_local_X, test_local_Y)
np.random.seed(100)

local_model_params = dict(
    iterations=nrounds,
    learning_rate=0.06,
    depth=12,
    task_type="CPU",
    eval_metric='Accuracy',
    early_stopping_rounds=20,
)
cat_local_model = CatBoostClassifier(**local_model_params)

cat_local_model.fit(train_local_X, train_local_Y, eval_set=eval_dataset)

In [None]:
# Predict the hold-out rows and flatten the result to a 1-D array.
cat_local_prediction = cat_local_model.predict(test_local_X).reshape(-1)

# Confusion matrix of actual vs. predicted labels, drawn as an annotated heatmap.
local_confusion = confusion_matrix(test_local_Y, cat_local_prediction)
sns.heatmap(local_confusion, annot=True, fmt='', cmap='Blues')
plt.xlabel('Prediction')
plt.ylabel('Actual')

In [None]:
# Pair each feature name with its importance from the local model and list
# the 60 most important ones, highest first.
importance_columns = {
    'features': cat_local_model.feature_names_,
    'value': cat_local_model.feature_importances_,
}
feature_importance = pd.DataFrame(importance_columns)
feature_importance = feature_importance.sort_values(['value'], ascending=False)
feature_importance.head(60)

# Prod model

In [None]:
# Production model: retrain on the full training frame for a fixed number of
# iterations derived from the local validation run.
#
# The local model was fit on 80% of the rows, so its best iteration is scaled
# up by 1/nfold to account for the extra data. `nfold` comes from the
# cross-validation cell, which must have been run first.
best_round = cat_local_model.best_iteration_
final_round = best_round + int(best_round/nfold)
print("Local best round is:", best_round)
print("The total iteration is going to be:", final_round)

# NOTE(review): learning_rate and depth differ from the local model
# (0.06 / 12), so `best_round` is only an approximate guide for this
# configuration - confirm this is intentional.
np.random.seed(100)
cat_prod_model = CatBoostClassifier(iterations=final_round,
                                    learning_rate=0.08,
                                    depth=13,
                                    task_type="GPU",
                                    eval_metric='Accuracy')

# No eval_set / early stopping here: the only labelled hold-out available
# (test_local_*) is a subset of train_prod, so evaluating on it would score
# rows the model is being trained on. Early stopping on that leaked metric
# could cut training short of `final_round`, so train for exactly
# `final_round` iterations instead.
cat_prod_model.fit(train_prod[indep],
                   train_prod[dep])

In [None]:
# Score the test frame with the production model and flatten to a 1-D array.
prod_features = test_prod[indep]
cat_prod_prediction = cat_prod_model.predict(prod_features).reshape(-1)
cat_prod_prediction

In [None]:
# Assemble the submission: the 'from-to' column from the test frame next to
# the predicted score cast to float, written out without the index.
submission_columns = {
    "from-to": test_prod['from-to'],
    "score": cat_prod_prediction.astype('float'),
}
cat_submission = pd.DataFrame(submission_columns)

cat_submission.to_csv("../submissions/cat_sub_4.csv", index=False)