In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import xgboost as xgb
import lightgbm as lgb

from dask_cuda import LocalCUDACluster
from dask.distributed import Client
from dask import array as da
from dask import dataframe as dd 
import dask
from xgboost.dask import DaskDMatrix

In [None]:
# Load the pickled train and test feature frames, then report their shapes.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
train_prod, test_prod = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("../data/train_prod_v16.pickle", "../data/test_prod_v16.pickle")
)

print(train_prod.shape, test_prod.shape)

In [None]:
def _age_gap(frame):
    """Return the signed age gap (``from_age`` minus ``to_age``) for every row of ``frame``."""
    return frame['from_age'] - frame['to_age']


# Same feature on both frames so train and test keep matching columns.
train_prod['age_difference'] = _age_gap(train_prod)
test_prod['age_difference'] = _age_gap(test_prod)

In [None]:
def _add_swipes_per_session(frame, from_sessions_col):
    """Add swipes-per-session ratios for the 'to' and 'from' side of each pair, in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the ``*_total_swipe_counts`` and session-count columns.
    from_sessions_col : str
        Name of the column used as denominator for the 'from' side ratio.
    """
    frame['to_swipe_by_session_percentage'] = (
        frame['to_total_swipe_counts'] / frame['to_total_session_count']
    )
    frame['from_swipe_by_session_percentage'] = (
        frame['from_total_swipe_counts'] / frame[from_sessions_col]
    )


# The 'from' ratio used to divide the sender's swipes by the *receiver's* session count,
# which looks like a copy-paste slip. Use the sender's own session count when both frames
# provide it; otherwise keep the previous denominator so the notebook still runs.
_from_sessions_col = 'from_total_session_count'
if not all(_from_sessions_col in frame.columns for frame in (train_prod, test_prod)):
    print("Warning: 'from_total_session_count' not found in both frames; "
          "falling back to 'to_total_session_count' for from_swipe_by_session_percentage")
    _from_sessions_col = 'to_total_session_count'

_add_swipes_per_session(train_prod, _from_sessions_col)
_add_swipes_per_session(test_prod, _from_sessions_col)

# (new column, numerator column, denominator column), listed in the order the columns are created.
_COMMON_USER_RATIOS = [
    # Share of common users among swipes in the same direction.
    ('to_common_users_left_swipe_percentage', 'common_users_swiped_left', 'to_swipe_left_count'),
    ('from_common_users_left_swipe_percentage', 'common_users_swiped_left', 'from_swipe_left_count'),
    ('to_common_users_right_swipe_percentage', 'common_users_swiped_right', 'to_swipe_right_count'),
    ('from_common_users_right_swipe_percentage', 'common_users_swiped_right', 'from_swipe_right_count'),
    # Share of common users among all swipes.
    ('to_overall_common_users_left_swipe_percentage', 'common_users_swiped_left', 'to_total_swipe_counts'),
    ('from_overall_common_users_left_swipe_percentage', 'common_users_swiped_left', 'from_total_swipe_counts'),
    ('to_overall_common_users_right_swipe_percentage', 'common_users_swiped_right', 'to_total_swipe_counts'),
    ('from_overall_common_users_right_swipe_percentage', 'common_users_swiped_right', 'from_total_swipe_counts'),
]


def _add_common_user_ratios(frame):
    """Add every ratio listed in ``_COMMON_USER_RATIOS`` to ``frame`` in place."""
    for new_column, numerator, denominator in _COMMON_USER_RATIOS:
        frame[new_column] = frame[numerator] / frame[denominator]


_add_common_user_ratios(train_prod)
_add_common_user_ratios(test_prod)

In [None]:
# Ratio features built by dividing count columns can come out as +/-inf when a denominator
# is zero; fillna() only touches NaN, so map infinities to NaN first and give every
# undefined value the same -999 sentinel.
train_prod.replace([np.inf, -np.inf], np.nan, inplace=True)
test_prod.replace([np.inf, -np.inf], np.nan, inplace=True)

train_prod.fillna(-999, inplace=True)
test_prod.fillna(-999, inplace=True)

In [None]:
def _columns_containing(fragment):
    """Return the names of all ``train_prod`` columns whose name contains ``fragment``."""
    column_names = train_prod.columns
    return column_names[column_names.str.contains(fragment)].tolist()


# Hand-maintained list of feature names (not referenced by the cells visible here).
lgb_bottom_importance = [
    'from_purpose_id_12',
    'to_unique_degree_count',
    'from_purpose_id_3',
    'from_unique_school_count',
    'rev_strength_4',
    'to_unique_school_count',
    'rev_strength_7',
    'rev_strength_8',
    'rev_strength_6',
    'rev_strength_5',
]

# Column groups selected by a name fragment.
self_intro_columns = _columns_containing("_self_intro_")

to_self_intro_columns = _columns_containing("to_self_intro_")
from_self_intro_columns = _columns_containing("from_self_intro_")

purpose_columns = _columns_containing("_purpose_")
rev_strength_columns = _columns_containing("rev_strength")
common_strength_columns = _columns_containing("common_strength")
review_comments = _columns_containing("_review_comments_")

others = ['to_review_comments_count', 'from_review_comments_count', 'to_last_login_year']

In [None]:
# Target column.
dep = 'score'

# Columns kept out of the model: a few individual columns plus whole column groups.
drop = [
    'from-to',
    'user_purpose_cosine_similarity',
    'to_last_swipe_year',
    *review_comments,
    *rev_strength_columns,
    *purpose_columns,
    *to_self_intro_columns,
    *from_self_intro_columns,
    *common_strength_columns,
]

# Model inputs: every training column that is neither the target nor dropped.
indep = train_prod.columns.difference([dep, *drop])

print("Indep length:", len(indep))
print("Columns that are dropped:", drop)

In [None]:
# Hold out 20% of the training rows for local validation, stratified on the target.
# The split draws from NumPy's global RNG, which is seeded just before the call.
np.random.seed(100)
_local_split = train_test_split(
    train_prod[indep],
    train_prod[dep],
    test_size=0.2,
    stratify=train_prod[dep],
)
train_local_X, test_local_X, train_local_Y, test_local_Y = _local_split

print(train_local_X.shape, train_local_Y.shape, test_local_X.shape, test_local_Y.shape)

# XGBOOST

In [None]:
def xgb_eval_accuracy(preds, dtrain):
    """Custom XGBoost evaluation metric: multi-class accuracy.

    Parameters
    ----------
    preds : array-like
        Predictions for the rows of ``dtrain``. Either a 2-D array with one column
        per class (scores or probabilities; the arg-max column is taken as the
        predicted class) or a 1-D array that already holds predicted class labels.
    dtrain : object
        Data container exposing ``get_label()`` (e.g. an ``xgboost.DMatrix``).

    Returns
    -------
    tuple
        ``('Accuracy', value)`` where ``value`` is the fraction of correct predictions.
    """
    labels = dtrain.get_label()
    preds = np.asarray(preds)
    # Depending on how the metric is invoked, predictions may arrive as per-class
    # scores or as final class labels; only the former needs an arg-max.
    if preds.ndim > 1:
        preds = preds.argmax(axis=1)
    acc = accuracy_score(y_pred = preds, y_true = labels)
    return 'Accuracy', acc

In [None]:
# Local validation pair: the 80/20 split of the training frame.
dtrain_local = xgb.DMatrix(train_local_X, label=train_local_Y)
dtest_local = xgb.DMatrix(test_local_X, label=test_local_Y)

# Production pair: the full labelled training frame and the unlabelled test frame.
dtrain_prod = xgb.DMatrix(train_prod[indep], label=train_prod[dep])
dtest_prod = xgb.DMatrix(test_prod[indep])

In [None]:
# Datasets evaluated during training, tagged with the names shown in the log.
eval_set = [
    (dtrain_local, 'train'),
    (dtest_local, 'test'),
]

# Upper bound on boosting rounds.
num_rounds = 100000

# Four-class softmax model trained with the GPU histogram tree method.
params = dict(
    objective='multi:softmax',
    num_class=4,
    max_depth=15,
    eta=0.1,
    subsample=1,
    colsample_bytree=1,
    tree_method='gpu_hist',
)

#### XGB cross validation

In [None]:
start = datetime.now()
print("Started training at...", start)

# 3-fold cross validation on the full training matrix, scored with the custom accuracy
# metric (higher is better) and stopped after 30 rounds without improvement.
_cv_options = dict(
    nfold=3,
    num_boost_round=num_rounds,
    feval=xgb_eval_accuracy,
    maximize=True,
    verbose_eval=True,
    early_stopping_rounds=30,
)
np.random.seed(100)
xgb_model_cv = xgb.cv(params, dtrain_prod, **_cv_options)

end = datetime.now()
print("Started ended at...", end)
print("Total training time:", end - start)

#### XGB local validation

In [None]:
start = datetime.now()
print("Started training at...", start)

# Train on the local training split while watching both local splits; stop once the
# custom accuracy metric (higher is better) has not improved for 50 rounds.
_local_train_options = dict(
    evals=eval_set,
    num_boost_round=num_rounds,
    feval=xgb_eval_accuracy,
    maximize=True,
    verbose_eval=True,
    early_stopping_rounds=50,
)
np.random.seed(100)
xgb_model_local = xgb.train(params, dtrain_local, **_local_train_options)

end = datetime.now()
print("Total training time:", end - start)

#### XGB Prod model

In [None]:
start = datetime.now()
print("Started training at...", start)

# best_iteration is a 0-based round index, so the number of rounds the local model
# actually needed is best_iteration + 1. Scale that count by 1.4 for the production
# model, which is fitted on the full training matrix rather than the local split.
local_validation = xgb_model_local.best_iteration
_best_round_count = local_validation + 1
final_round = _best_round_count + int(0.4 * _best_round_count)

print("Local best iteration:", local_validation)
print("final round:", final_round)

# No early stopping here: the model runs for exactly final_round rounds.
# NOTE: eval_set is only a progress monitor in this cell - the rows of its 'test'
# entry were split off the same frame the production matrix is built from, so the
# accuracy it reports is not a held-out estimate.
np.random.seed(100)
xgb_model_prod = xgb.train(params,
                           dtrain_prod,
                           evals = eval_set,
                           num_boost_round = final_round,
                           feval = xgb_eval_accuracy,
                           maximize = True,
                           verbose_eval = True)

end = datetime.now()
print("Total training time:", end - start)

#### XGB Feature importance

In [None]:
# Per-feature scores reported by the locally validated booster, highest score first.
_local_scores = xgb_model_local.get_score()
_importance_table = pd.DataFrame({
    'columnm_names': list(_local_scores.keys()),
    'score': list(_local_scores.values()),
})
xgb_feature_imp = _importance_table.sort_values(['score'], ascending=False)
xgb_feature_imp

#### XGB submission

In [None]:
# Predict the test set with the production model and pair each prediction with its 'from-to' id.
xgb_prod_prediction = xgb_model_prod.predict(dtest_prod)

_submission_columns = {
    "from-to": test_prod['from-to'],
    "score": xgb_prod_prediction.astype('float'),
}
xgb_submission = pd.DataFrame(_submission_columns)

# Persist without the index column, then display the frame.
xgb_submission.to_csv("../submissions/xgb_sub_5.csv", index=False)
xgb_submission

# XGB on Multi-GPUs

In [None]:
# Settings for the multi-GPU run: same softmax setup as before but with shallower trees.
params = dict(
    objective='multi:softmax',
    num_class=4,
    max_depth=6,
    eta=0.1,
    subsample=1,
    colsample_bytree=1,
    tree_method='gpu_hist',
)

# Upper bound on boosting rounds for the multi-GPU run.
num_rounds = 10000

# Multi-GPU training on a local Dask CUDA cluster.
# NOTE: this cell rebinds dtrain_local / dtest_local / dtrain_prod / dtest_prod and eval_set
# to Dask-backed objects created for the client opened below.
with LocalCUDACluster(n_workers=2, threads_per_worker=8) as cluster:
    with Client(cluster) as client:
        print("forming the dask local set")
        train_local_X_dask = dd.from_pandas(train_local_X, npartitions=4)
        train_local_Y_dask = dd.from_pandas(train_local_Y, npartitions=4)
        test_local_X_dask = dd.from_pandas(test_local_X, npartitions=4)
        test_local_Y_dask = dd.from_pandas(test_local_Y, npartitions=4)

        print("forming the dask prod set")
        train_prod_X_dask = dd.from_pandas(train_prod[indep], npartitions=4)
        train_prod_Y_dask = dd.from_pandas(train_prod[dep], npartitions=4)
        test_prod_X_dask = dd.from_pandas(test_prod[indep], npartitions=4)

        print("Forming the DMatrix to be accepted by XGBoost")
        dtrain_local = DaskDMatrix(client, data = train_local_X_dask, label = train_local_Y_dask)
        dtest_local  = DaskDMatrix(client, data = test_local_X_dask, label = test_local_Y_dask)
        dtrain_prod = DaskDMatrix(client, data = train_prod_X_dask, label = train_prod_Y_dask)
        dtest_prod = DaskDMatrix(client, data = test_prod_X_dask)

        eval_set = [(dtrain_local,'train'), (dtest_local,'test')]

        print("")
        # Timestamps are shifted by +5:30 - presumably converting a UTC host clock to
        # local time; confirm against the machine's timezone.
        start_time = datetime.now() + timedelta(hours=5, minutes=30)
        print("Training started... at:", start_time)

        # ---- Local model: early-stopped on the held-out local split ----
        print("Training the local model")
        np.random.seed(100)
        local_model = xgb.dask.train(client,
                                     params,
                                     dtrain_local,
                                     evals = eval_set,
                                     num_boost_round = num_rounds,
                                     feval = xgb_eval_accuracy,
                                     maximize=True,
                                     verbose_eval = True,
                                     early_stopping_rounds = 20
                                    )

        end_time = datetime.now() + timedelta(hours=5, minutes=30)
        print("Local Training ended at:", end_time)

        total_time = (end_time - start_time)
        print("It took {} mins time to complete".format(total_time))
        print("")

        bst_local = local_model['booster']
        history_local = local_model['history']

        # The history also contains the rounds trained after the best one while waiting
        # for early stopping to trigger, so neither its length nor its last entry
        # describes the best round. Take the first round that reached the top accuracy.
        local_accuracy = list(history_local['test']['Accuracy'])
        best_score = max(local_accuracy)
        best_iteration = local_accuracy.index(best_score) + 1  # number of rounds, 1-based

        print("Best score {} at best iteration {}".format(best_score, best_iteration))

        ###############################################################################
        # ---- Prod model: fixed round budget, 20% above the best local round count ----
        final_iteration = best_iteration + int(0.2*best_iteration)
        print("Training the prod model")
        print("Final iteration:", final_iteration)

        print("")
        start_time = datetime.now() + timedelta(hours=5, minutes=30)
        print("Training started... at:", start_time)

        # No early stopping here: the rows behind eval_set's 'test' entry were split off
        # the same frame the prod matrix is built from, so that score is not a held-out
        # signal and must not be allowed to cut the planned number of rounds short.
        np.random.seed(100)
        prod_model = xgb.dask.train(client,
                                     params,
                                     dtrain_prod,
                                     evals = eval_set,
                                     num_boost_round = final_iteration,
                                     feval = xgb_eval_accuracy,
                                     maximize=True,
                                     verbose_eval = True
                                    )
        end_time = datetime.now() + timedelta(hours=5, minutes=30)
        print("Prod Training ended at:", end_time)

        bst_prod = prod_model['booster']
        history = prod_model['history']

        # Materialise the prod model's test-set predictions as a NumPy array so they
        # stay usable after the cluster is shut down.
        xgb_prod_predict = xgb.dask.predict(client, bst_prod, dtest_prod)
        xgb_prod_predict = np.array(xgb_prod_predict)

        # Left for later inspection: rounds trained by the prod model and its last
        # reported accuracy on the (overlapping) local evaluation split.
        best_iteration = len(history['test']['Accuracy'])
        best_score = history['test']['Accuracy'][-1]
        

In [None]:
# Pair each test 'from-to' id with the multi-GPU production model's prediction and save it.
_submission_columns = {
    "from-to": test_prod['from-to'],
    "score": xgb_prod_predict.astype('float'),
}
xgb_submission = pd.DataFrame(_submission_columns)

xgb_submission.to_csv("../submissions/xgb_sub_4.csv", index=False)