In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb

import seaborn as sns
import matplotlib.pyplot as plt

import gc

In [None]:
print("Reading the input files")

# NOTE(review): unpickling can execute arbitrary code -- only load these
# files when they come from a trusted source.
train_prod, test_prod = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("../data/train_prod_v16.pickle",
                        "../data/test_prod_v16.pickle")
)

print(train_prod.shape, test_prod.shape)

In [None]:
def add_engineered_features(frame, from_session_column):
    """Add the age-gap and swipe-ratio columns to ``frame`` in place.

    The same derived columns are needed on both the train and the test
    frame, so they are computed by one function instead of two copies of
    the same statements.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the raw count columns read below; it is modified
        in place.
    from_session_column : str
        Name of the column used as the denominator of
        ``from_swipe_by_session_percentage``.
    """
    frame['age_difference'] = frame['from_age'] - frame['to_age']

    frame['to_swipe_by_session_percentage'] = (
        frame['to_total_swipe_counts'] / frame['to_total_session_count'])
    frame['from_swipe_by_session_percentage'] = (
        frame['from_total_swipe_counts'] / frame[from_session_column])

    # Shared swipes relative to each side's swipes in the same direction.
    for direction in ('left', 'right'):
        common = frame['common_users_swiped_' + direction]
        for side in ('to', 'from'):
            frame[side + '_common_users_' + direction + '_swipe_percentage'] = (
                common / frame[side + '_swipe_' + direction + '_count'])

    # Shared swipes relative to each side's total swipes.
    for direction in ('left', 'right'):
        common = frame['common_users_swiped_' + direction]
        for side in ('to', 'from'):
            frame[side + '_overall_common_users_' + direction + '_swipe_percentage'] = (
                common / frame[side + '_total_swipe_counts'])


# The original cell divided the "from" swipe count by the *to* session count,
# which looks like a copy-paste slip. Use the "from" session count when both
# frames provide it; otherwise keep the previous denominator so the notebook
# still runs on data without that column.
# NOTE(review): confirm that 'from_total_session_count' exists in the pickles.
if all('from_total_session_count' in frame.columns for frame in (train_prod, test_prod)):
    from_session_column = 'from_total_session_count'
else:
    from_session_column = 'to_total_session_count'

add_engineered_features(train_prod, from_session_column)
add_engineered_features(test_prod, from_session_column)

In [None]:
print("Filling Null values")
# Missing values in both frames are replaced by the sentinel -999.
for _frame in (train_prod, test_prod):
    _frame.fillna(-999, inplace=True)

In [None]:
lgb_bottom_importance = [
    'from_purpose_id_12',
    'to_unique_degree_count',
    'from_purpose_id_3',
    'from_unique_school_count',
    'rev_strength_4',
    'to_unique_school_count',
    'rev_strength_7',
    'rev_strength_8',
    'rev_strength_6',
    'rev_strength_5',
]


def _columns_containing(fragment):
    """Return the train_prod column names matched by ``fragment``.

    Matching is delegated to ``str.contains`` on the column index, exactly
    as the individual statements did before.
    """
    column_names = train_prod.columns
    return column_names[column_names.str.contains(fragment)].tolist()


# Column families, selected by a fragment of the column name.
self_intro_columns = _columns_containing("_self_intro_")

to_self_intro_columns = _columns_containing("to_self_intro_")
from_self_intro_columns = _columns_containing("from_self_intro_")

purpose_columns = _columns_containing("_purpose_")
rev_strength_columns = _columns_containing("rev_strength")
common_strength_columns = _columns_containing("common_strength")
review_comments = _columns_containing("_review_comments_")

others = ['to_review_comments_count', 'from_review_comments_count', 'to_last_login_year']

In [None]:
# Target column.
dep = 'score'

# Columns excluded from the feature set: three individual columns followed
# by the column families collected earlier.
drop = (
    ['from-to', 'user_purpose_cosine_similarity', 'to_last_swipe_year']
    + review_comments
    + rev_strength_columns
    + purpose_columns
    + to_self_intro_columns
    + from_self_intro_columns
    + common_strength_columns
)

# Every remaining column is used as a model input.
indep = train_prod.columns.difference([dep] + drop)

print("Indep length:", len(indep))
print("Columns that are dropped:", drop)

In [None]:
print("Split to train and test local")

# train_test_split is called without random_state, so it draws from the
# global numpy RNG seeded here.
np.random.seed(100)

# 80/20 hold-out split, stratified on the target column.
_holdout_parts = train_test_split(
    train_prod[indep],
    train_prod[dep],
    test_size=0.2,
    stratify=train_prod[dep],
)
train_local_X, test_local_X, train_local_Y, test_local_Y = _holdout_parts

print(train_local_X.shape, train_local_Y.shape, test_local_X.shape, test_local_Y.shape)

# LightGBM

In [None]:
def lgb_eval_accuracy(preds, dtrain):
    """Custom LightGBM evaluation function: multiclass accuracy.

    Parameters
    ----------
    preds : numpy.ndarray
        Raw multiclass scores. Two layouts are accepted: a 2-D array of
        shape (n_samples, n_classes), or a flat 1-D array grouped by class
        first and then by row (value for row i, class j at
        ``preds[j * n_samples + i]``).
    dtrain : object with ``get_label()``
        Dataset being evaluated; only its labels are read.

    Returns
    -------
    tuple
        ``('Accuracy', accuracy, True)`` -- metric name, value, and the
        "higher is better" flag.
    """
    labels = np.asarray(dtrain.get_label())
    scores = np.asarray(preds)
    n_rows = len(labels)

    if n_rows == 0:
        # Nothing to score; avoid reshaping/averaging an empty array.
        return 'Accuracy', 0.0, True

    if scores.ndim == 1:
        # Flat class-major layout. The class count is derived from the
        # array size rather than from the labels present, so a validation
        # set that happens to miss a class is still scored correctly.
        scores = scores.reshape(-1, n_rows).T

    predicted = scores.argmax(axis=1)
    acc = float(np.mean(predicted == labels))
    return 'Accuracy', acc, True

In [None]:
# LightGBM settings shared by every training call in this notebook.
# The evaluation metric is supplied through `feval`, hence metric='custom'.
params = dict(
    # device_type='gpu',
    nthreads=12,
    boosting_type='gbdt',
    objective='multiclass',
    num_class=4,
    metric='custom',
    num_leaves=170,
    # max_depth=10,
    learning_rate=0.04,
    feature_fraction=0.6,
    bagging_fraction=1,
    bagging_freq=1,
    verbose=1,
)

### K-FOLD method

In [None]:
def train_lgbm_model(train_local_X, train_local_Y, test_local_X, test_local_Y, test_prod,
                     num_rounds=100000, early_stopping_rounds=30):
    """Train one LightGBM model with early stopping and predict ``test_prod``.

    Reads the module-level ``params``, ``indep`` and ``lgb_eval_accuracy``.

    Parameters
    ----------
    train_local_X, train_local_Y
        Features and labels used for fitting.
    test_local_X, test_local_Y
        Features and labels of the validation set that drives early stopping.
    test_prod : pandas.DataFrame
        Frame to predict; its ``indep`` columns are passed to the model.
    num_rounds : int, default 100000
        Upper bound on boosting rounds.
    early_stopping_rounds : int, default 30
        Passed through to ``lgb.train``.

    Returns
    -------
    The output of ``Booster.predict`` for ``test_prod[indep]``.
    """
    lgb_train_local = lgb.Dataset(train_local_X, train_local_Y, free_raw_data=False)
    lgb_test_local = lgb.Dataset(test_local_X, test_local_Y,
                                 reference=lgb_train_local, free_raw_data=False)

    print('Starting training...')
    start = datetime.now()

    np.random.seed(100)
    lgb_model_local = lgb.train(params,
                                lgb_train_local,
                                num_boost_round=num_rounds,
                                valid_sets=lgb_test_local,
                                feval=lgb_eval_accuracy,
                                early_stopping_rounds=early_stopping_rounds)

    end = datetime.now()
    print("")
    print("Total training time:", end - start)

    # Predict with the early-stopped iteration explicitly instead of relying
    # on the library default for `num_iteration`.
    lgb_prod_prediction = lgb_model_local.predict(
        test_prod[indep], num_iteration=lgb_model_local.best_iteration)

    return lgb_prod_prediction


In [None]:
np.random.seed(100)
nfolds = 5
kf = KFold(n_splits=nfolds, shuffle=True, random_state=100)

cv_features = train_prod[indep]
cv_target = train_prod[dep]

# One test-set prediction per fold, keyed 'fold_0' ... 'fold_<nfolds-1>'.
all_CV_prediction = {}
for i, (fold_train_index, fold_valid_index) in enumerate(kf.split(cv_features)):

    # KFold.split yields positional indices, so select rows with .iloc;
    # .loc would only be right when the index is a default RangeIndex.
    # Fold-specific names keep the hold-out split (train_local_X, ...) used
    # by the later validation cells intact.
    fold_train_X, fold_train_Y = cv_features.iloc[fold_train_index], cv_target.iloc[fold_train_index]
    fold_valid_X, fold_valid_Y = cv_features.iloc[fold_valid_index], cv_target.iloc[fold_valid_index]

    print("Current Fold:", i)
    fold_prediction = train_lgbm_model(fold_train_X, fold_train_Y, fold_valid_X, fold_valid_Y, test_prod)

    all_CV_prediction['fold_'+str(i)] = fold_prediction
    print("#############################")
    print("")

# K-fold cross-validation: average the predictions of all folds

In [None]:

# The original first line called lgb_model_prod.predict(...) here, but that
# model is only created in the "Prod model" section further down, so the cell
# failed on a fresh kernel; its result was overwritten below anyway.

# Accumulator with one row per test row and one column per class.
final_predictions = np.zeros(shape=(test_prod.shape[0], params['num_class']))
print(final_predictions.shape)

for fold in all_CV_prediction:
    print(fold)
    final_predictions = final_predictions + all_CV_prediction[fold]

# Averaging the output from the CV
print("Averging all the predictions")
final_predictions = final_predictions/len(all_CV_prediction)

# Most likely class per test row.
lgb_prod_prediction = np.argmax(final_predictions, axis=1)


In [None]:
# Submission frame: the `from-to` column of the test frame next to the
# predicted class cast to float.
lgb_submission = pd.DataFrame({
    "from-to": test_prod['from-to'],
    "score": lgb_prod_prediction.astype('float'),
})

lgb_submission.to_csv("../submissions/lgb_sub_11.csv", index=False)


In [None]:
######################################################

# Cross Validation

In [None]:
print("forming the LightGBM dataset")

# Hold-out datasets; the validation set is built against the training set.
lgb_train_local = lgb.Dataset(
    data=train_local_X, label=train_local_Y, free_raw_data=False)
lgb_test_local = lgb.Dataset(
    data=test_local_X, label=test_local_Y,
    reference=lgb_train_local, free_raw_data=False)

# Full training data, and the unlabeled test frame built against it.
lgb_train_prod = lgb.Dataset(data=train_prod[indep], label=train_prod[dep])
lgb_test_prod = lgb.Dataset(data=test_prod[indep], reference=lgb_train_prod)


In [None]:
num_rounds = 10000
print('Starting training...')
start = datetime.now()

np.random.seed(100)
# 5-fold cross-validation on the full training data, scored with the
# custom accuracy function.
lgb_cv = lgb.cv(
    params,
    lgb_train_prod,
    nfold=5,
    num_boost_round=num_rounds,
    # valid_sets=lgb_test_local,
    feval=lgb_eval_accuracy,
    early_stopping_rounds=20,
    verbose_eval=True,
)

end = datetime.now()
print("")
print("Total training time:", end - start)


# Local Validation

In [None]:
gc.collect()
print("Running the hold out valid")
num_rounds = 10000
print('Starting training...')
start = datetime.now()

np.random.seed(100)
# Fit on the hold-out training part; the hold-out validation part drives
# early stopping.
lgb_model_local = lgb.train(
    params,
    lgb_train_local,
    num_boost_round=num_rounds,
    valid_sets=lgb_test_local,
    feval=lgb_eval_accuracy,
    # categorical_feature=['from', 'to'],
    early_stopping_rounds=50,
)

end = datetime.now()
print("")
print("Total training time:", end - start)


In [None]:
# Predicted class per hold-out row: the column with the highest score.
local_prediction = np.argmax(lgb_model_local.predict(test_local_X), axis=1)

print("Accuracy:", accuracy_score(test_local_Y, local_prediction))
print("Confusion matrix")
# Rows/columns follow the sorted unique values of the training target.
confusion_matrix(test_local_Y, local_prediction, labels=np.unique(train_prod.score))

In [None]:
# Feature names paired with the hold-out model's importances, most
# important first.
lgb_feature_importance = (
    pd.DataFrame({
        "features": lgb_model_local.feature_name(),
        "importance": lgb_model_local.feature_importance(),
    })
    .sort_values(['importance'], ascending=False)
    .reset_index(drop=True)
)
print("Feature importance top 60")
lgb_feature_importance.head(60)

# Prod model

In [None]:
# Train on all rows for 40% more rounds than the hold-out model's best
# iteration.
final_round = lgb_model_local.best_iteration + int(lgb_model_local.best_iteration*0.4)

print("Validation rounds:", lgb_model_local.best_iteration)
print("Final round is:", final_round)

print('Starting training...')
start = datetime.now()

np.random.seed(100)
# NOTE(review): lgb_test_local is built from rows split out of train_prod,
# which lgb_train_prod also contains, so the accuracy printed during this
# run is not an out-of-sample estimate. No early stopping is used here.
lgb_model_prod = lgb.train(
    params,
    lgb_train_prod,
    num_boost_round=final_round,
    valid_sets=lgb_test_local,
    feval=lgb_eval_accuracy,
    # early_stopping_rounds=20
)

end = datetime.now()
print("")
print("Total training time:", end - start)


In [None]:
# Predicted class per test row: the column with the highest score.
lgb_prod_prediction = np.argmax(lgb_model_prod.predict(test_prod[indep]), axis=1)
lgb_prod_prediction

In [None]:
# Submission frame: the `from-to` column of the test frame next to the
# predicted class cast to float.
lgb_submission = pd.DataFrame({
    "from-to": test_prod['from-to'],
    "score": lgb_prod_prediction.astype('float'),
})

lgb_submission.to_csv("../submissions/lgb_sub_30.csv", index=False)
