In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool, cv

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

import gc

# from dask_cuda import LocalCUDACluster
# from dask.distributed import Client
# from dask import array as da
# from dask import dataframe as dd 
# import dask
# from xgboost.dask import DaskDMatrix

In [79]:
# Load the pre-built train / test feature frames from disk.
# NOTE(review): unpickling can execute code embedded in the file; only load trusted artefacts.
print("Reading the data...")
train_prod, test_prod = (
    pd.read_pickle("../data/train_prod_v9.pickle"),
    pd.read_pickle("../data/test_prod_v9.pickle"),
)

gc.collect()

# Quick sanity check on the shape of both frames.
frame_shapes = (train_prod.shape, test_prod.shape)
print(*frame_shapes)

(903605, 2) (387975, 1)


In [None]:
# Add the sender-minus-receiver age gap to both frames (computed the same way for train and test).
for prod_frame in (train_prod, test_prod):
    prod_frame['age_difference'] = prod_frame['from_age'] - prod_frame['to_age']

gc.collect()

In [81]:
# Replace every missing value with the sentinel -999, in place, in both frames.
for prod_frame in (train_prod, test_prod):
    prod_frame.fillna(-999, inplace=True)

In [82]:
# Hand-maintained list of feature names (not referenced by the cells visible in this notebook).
lgb_bottom_importance = [
    'from_purpose_id_12',
    'to_unique_degree_count',
    'from_purpose_id_3',
    'from_unique_school_count',
    'rev_strength_4',
    'to_unique_school_count',
    'rev_strength_7',
    'rev_strength_8',
    'rev_strength_6',
    'rev_strength_5',
]


def _columns_matching(pattern):
    """Return the train_prod column names whose name contains `pattern` (pandas `str.contains`)."""
    all_columns = train_prod.columns
    return all_columns[all_columns.str.contains(pattern)].tolist()


# Column groups selected by name pattern.
self_intro_columns = _columns_matching("_self_intro_")

to_self_intro_columns = _columns_matching("to_self_intro_")
from_self_intro_columns = _columns_matching("from_self_intro_")

purpose_columns = _columns_matching("_purpose_id_")
rev_strength_columns = _columns_matching("rev_strength")
review_comments = _columns_matching("_review_comments_")

others = ['to_review_comments_count', 'from_review_comments_count', 'to_last_login_year']

In [83]:
# Target column, columns excluded from modelling, and the resulting feature set.
dep = 'score'

drop = ['from-to', 'user_purpose_cosine_similarity']
drop += review_comments
drop += rev_strength_columns
drop += from_self_intro_columns

# Features = every train column that is neither the target nor in the drop list.
excluded_columns = [dep] + drop
indep = train_prod.columns.difference(excluded_columns)

print("Indep length:", len(indep))
print("Columns that are dropped:", drop)

Indep length: 2
Columns that are dropped: ['from-to']


In [84]:
def LogisticRegression_model(prediction_type, train_X, train_Y, test_X, test_Y, test_prod_X):
    """Fit a logistic regression on one fold and predict the held-out fold and the production set.

    Parameters
    ----------
    prediction_type : str
        'prob' returns class-probability matrices; any other value returns hard class labels.
    train_X, train_Y
        Features and target used for fitting.
    test_X, test_Y
        Held-out features and target; `test_Y` is only used for the printed accuracy.
    test_prod_X
        Production features to score with the fitted model.

    Returns
    -------
    tuple
        (held-out prediction, production prediction), both in the form selected by `prediction_type`.
    """
    LR = LogisticRegression()
    LR.fit(train_X, train_Y)

    if prediction_type != 'prob':
        LR_local_prediction = LR.predict(test_X)
        LR_prod_prediction = LR.predict(test_prod_X)

        print("Accuracy:", accuracy_score(test_Y, LR_local_prediction))
    else:
        LR_local_prediction = LR.predict_proba(test_X)
        LR_prod_prediction = LR.predict_proba(test_prod_X)

        # predict_proba columns follow LR.classes_, so translate the arg-max column
        # back to its class label before comparing against the true labels.
        temp = LR.classes_[LR_local_prediction.argmax(axis=1)]
        print("Accuracy:", accuracy_score(test_Y, temp))

    # Release the fitted estimator before the next model is trained.
    del LR
    gc.collect()

    return LR_local_prediction, LR_prod_prediction

In [None]:
def RandomForest_model(prediction_type, train_X, train_Y, test_X, test_Y, test_prod_X):
    """Fit a 20-tree random forest on one fold and predict the held-out fold and the production set.

    Parameters
    ----------
    prediction_type : str
        'prob' returns class-probability matrices; any other value returns hard class labels.
    train_X, train_Y
        Features and target used for fitting.
    test_X, test_Y
        Held-out features and target; `test_Y` is only used for the printed accuracy.
    test_prod_X
        Production features to score with the fitted model.

    Returns
    -------
    tuple
        (held-out prediction, production prediction), both in the form selected by `prediction_type`.
    """
    RF = RandomForestClassifier(n_estimators=20, n_jobs=-1)
    RF.fit(train_X, train_Y)

    if prediction_type != 'prob':
        RF_local_prediction = RF.predict(test_X)
        RF_prod_prediction = RF.predict(test_prod_X)

        print("Accuracy:", accuracy_score(test_Y, RF_local_prediction))
    else:
        RF_local_prediction = RF.predict_proba(test_X)
        RF_prod_prediction = RF.predict_proba(test_prod_X)

        # predict_proba columns follow RF.classes_, so translate the arg-max column
        # back to its class label before comparing against the true labels.
        temp = RF.classes_[RF_local_prediction.argmax(axis=1)]
        print("Accuracy:", accuracy_score(test_Y, temp))

    # Release the fitted estimator before the next model is trained.
    del RF
    gc.collect()

    return RF_local_prediction, RF_prod_prediction

In [None]:
def GBM_model(prediction_type, train_X, train_Y, test_X, test_Y, test_prod_X):
    """Fit a 20-stage gradient boosting classifier on one fold and predict the held-out and production sets.

    Parameters
    ----------
    prediction_type : str
        'prob' returns class-probability matrices; any other value returns hard class labels.
    train_X, train_Y
        Features and target used for fitting.
    test_X, test_Y
        Held-out features and target; `test_Y` is only used for the printed accuracy.
    test_prod_X
        Production features to score with the fitted model.

    Returns
    -------
    tuple
        (held-out prediction, production prediction), both in the form selected by `prediction_type`.
    """
    GBM = GradientBoostingClassifier(n_estimators=20)
    GBM.fit(train_X, train_Y)

    if prediction_type != 'prob':
        GBM_local_prediction = GBM.predict(test_X)
        GBM_prod_prediction = GBM.predict(test_prod_X)

        print("Accuracy:", accuracy_score(test_Y, GBM_local_prediction))
    else:
        GBM_local_prediction = GBM.predict_proba(test_X)
        GBM_prod_prediction = GBM.predict_proba(test_prod_X)

        # predict_proba columns follow GBM.classes_, so translate the arg-max column
        # back to its class label before comparing against the true labels.
        temp = GBM.classes_[GBM_local_prediction.argmax(axis=1)]
        print("Accuracy:", accuracy_score(test_Y, temp))

    # Release the fitted estimator before the next model is trained.
    del GBM
    gc.collect()

    return GBM_local_prediction, GBM_prod_prediction

In [85]:
def catboost_model(prediction_type, train_X, train_Y, test_X, test_Y, test_prod_X):
    """Train a CatBoost classifier on one fold and predict the held-out fold and the production set.

    The held-out fold is passed as `eval_set`, and the classifier is configured with
    `early_stopping_rounds=20` on the 'Accuracy' metric. The best iteration and its
    validation accuracy are printed.

    Parameters
    ----------
    prediction_type : str
        'prob' returns `predict_proba` output; any other value returns flattened `predict` output.
    train_X, train_Y
        Features and target used for fitting.
    test_X, test_Y
        Held-out features and target, used as the evaluation pool.
    test_prod_X
        Production features to score with the fitted model.

    Returns
    -------
    tuple
        (held-out prediction, production prediction).
    """
    nrounds = 10000
    classifier_options = dict(
        iterations=nrounds,
        learning_rate=0.4,
        depth=13,
        task_type="CPU",
        eval_metric='Accuracy',
        early_stopping_rounds=20,
        verbose=False,
    )

    validation_pool = Pool(test_X, test_Y)
    np.random.seed(100)
    classifier = CatBoostClassifier(**classifier_options)
    classifier.fit(train_X, train_Y, eval_set=validation_pool)

    print("Best Iteration:", classifier.best_iteration_)
    print("Best Accuracy:", classifier.best_score_['validation']['Accuracy'])

    if prediction_type == 'prob':
        cat_local_prediction = classifier.predict_proba(test_X)
        cat_prod_prediction = classifier.predict_proba(test_prod_X)
    else:
        # Hard predictions are flattened to 1-D before being returned.
        cat_local_prediction = classifier.predict(test_X).reshape(-1)
        cat_prod_prediction = classifier.predict(test_prod_X).reshape(-1)

    # Free the pool and the model before the next model is trained.
    del validation_pool, classifier
    gc.collect()

    return cat_local_prediction, cat_prod_prediction

In [8]:
def lgb_eval_accuracy(preds, dtrain):
    """Custom LightGBM evaluation function reporting multiclass accuracy.

    Parameters
    ----------
    preds : array-like
        Per-class scores. Either a flat 1-D array laid out class by class
        (all rows of class 0, then all rows of class 1, ...) or a 2-D array
        of shape (n_rows, n_classes).
    dtrain : object exposing `get_label()`
        Dataset the scores were computed for.

    Returns
    -------
    tuple
        ('Accuracy', accuracy as a float, True) - the final True marks the
        metric as higher-is-better.
    """
    labels = np.asarray(dtrain.get_label())
    scores = np.asarray(preds)

    if scores.ndim == 1:
        # Derive the class count from the number of rows rather than from the
        # distinct labels, so a fold that lacks a class is still reshaped correctly.
        predicted = scores.reshape(-1, len(labels)).argmax(axis=0)
    else:
        predicted = scores.argmax(axis=1)

    acc = float(np.mean(predicted == labels))
    return 'Accuracy', acc, True

def lgb_model(prediction_type, train_X, train_Y, test_X, test_Y, test_prod_X):
    """Train a LightGBM multiclass booster on one fold and predict the held-out and production sets.

    The held-out fold is the only validation set; training is called with
    `early_stopping_rounds=20` and the custom `lgb_eval_accuracy` metric.
    The best iteration and its validation accuracy are printed.

    Parameters
    ----------
    prediction_type : str
        'prob' returns the raw `predict` output; any other value returns its arg-max over axis 1.
    train_X, train_Y
        Features and target used for fitting.
    test_X, test_Y
        Held-out features and target, used as the validation set.
    test_prod_X
        Production features to score with the fitted booster.

    Returns
    -------
    tuple
        (held-out prediction, production prediction).
    """
    num_rounds = 10000

    booster_params = {
        'nthreads': 12,
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': 4,
        'metric': 'custom',
        'num_leaves': 150,
        'learning_rate': 0.03,
        'feature_fraction': 0.6,
        'bagging_fraction': 1,
        'bagging_freq': 1,
        'verbose': 1,
    }

    fit_set = lgb.Dataset(train_X, train_Y, free_raw_data=False)
    holdout_set = lgb.Dataset(test_X, test_Y, reference=fit_set, free_raw_data=False)

    np.random.seed(100)
    booster = lgb.train(
        booster_params,
        fit_set,
        num_boost_round=num_rounds,
        valid_sets=holdout_set,
        feval=lgb_eval_accuracy,
        early_stopping_rounds=20,
        verbose_eval=False,
    )

    print("LGB Best iteration:", booster.best_iteration)
    print("LGB Best Accuracy:", booster.best_score['valid_0']['Accuracy'])

    lgb_local_prediction = booster.predict(test_X)
    lgb_prod_prediction = booster.predict(test_prod_X)
    if prediction_type != 'prob':
        # Collapse the per-class scores to the index of the highest-scoring class.
        lgb_local_prediction = lgb_local_prediction.argmax(axis=1)
        lgb_prod_prediction = lgb_prod_prediction.argmax(axis=1)

    # Free the datasets and the booster before the next model is trained.
    del fit_set, holdout_set, booster
    gc.collect()

    return lgb_local_prediction, lgb_prod_prediction


In [98]:
def xgb_eval_accuracy(preds, dtrain):
    """Custom XGBoost evaluation function reporting accuracy.

    Parameters
    ----------
    preds : array-like
        Either 1-D predicted class labels or a 2-D (n_rows, n_classes) score
        matrix; a 2-D input is reduced to the arg-max class per row.
    dtrain : object exposing `get_label()`
        Matrix the predictions were computed for.

    Returns
    -------
    tuple
        ('Accuracy', accuracy as a float).
    """
    labels = np.asarray(dtrain.get_label())
    predicted = np.asarray(preds)

    # A per-class score matrix cannot be compared to labels directly;
    # pick the highest-scoring class for each row first.
    if predicted.ndim > 1:
        predicted = predicted.argmax(axis=1)

    acc = float(np.mean(predicted == labels))
    return 'Accuracy', acc

def XGB_model(prediction_type, train_X, train_Y, test_X, test_Y, test_prod_X):
    """Train an XGBoost 'multi:softprob' booster on one fold and predict the held-out and production sets.

    Training is called with both the training and the held-out matrix in `evals`
    and `early_stopping_rounds=30`. The held-out accuracy of the returned booster
    and its `best_iteration` are printed.

    Parameters
    ----------
    prediction_type : str
        'prob' returns the raw `predict` output; any other value returns its arg-max over axis 1.
    train_X, train_Y
        Features and target used for fitting.
    test_X, test_Y
        Held-out features and target.
    test_prod_X
        Production features to score with the fitted booster.

    Returns
    -------
    tuple
        (held-out prediction, production prediction).
    """
    num_rounds = 10000
    booster_params = {
        'objective': 'multi:softprob',
        'num_class': 4,
        'max_depth': 6,
        'eta': 0.2,
        'subsample': 1,
        'colsample_bytree': 1,
    }

    fit_matrix = xgb.DMatrix(data=train_X, label=train_Y)
    holdout_matrix = xgb.DMatrix(data=test_X, label=test_Y)
    prod_matrix = xgb.DMatrix(data=test_prod_X)
    watchlist = [(fit_matrix, 'train'), (holdout_matrix, 'test')]

    np.random.seed(100)
    booster = xgb.train(
        booster_params,
        fit_matrix,
        num_boost_round=num_rounds,
        evals=watchlist,
        early_stopping_rounds=30,
        verbose_eval=False,
    )

    # NOTE(review): predict is called without an explicit iteration range; whether the
    # early-stopped best iteration is applied here depends on the xgboost version - confirm.
    holdout_scores = booster.predict(holdout_matrix)
    prod_scores = booster.predict(prod_matrix)

    holdout_labels = holdout_scores.argmax(axis=1)
    print("XGB Best accuracy:", accuracy_score(holdout_labels, test_Y))
    print("XGB Best iteration:", booster.best_iteration)

    if prediction_type == 'prob':
        xgb_local_prediction = holdout_scores
        xgb_prod_prediction = prod_scores
    else:
        xgb_local_prediction = holdout_labels
        xgb_prod_prediction = prod_scores.argmax(axis=1)

    # Free the matrices and the booster before the next model is trained.
    del fit_matrix, holdout_matrix, prod_matrix, watchlist, booster
    gc.collect()

    return xgb_local_prediction, xgb_prod_prediction

In [None]:
# K-fold stacking: every enabled base model is trained once per fold. Its predictions
# on the held-out fold are collected row by row (out-of-fold training features for the
# meta model), and its predictions on the production set are averaged over the folds.
np.random.seed(100)
nfolds = 5
uniq_classes = train_prod['score'].nunique()
kf = KFold(n_splits=nfolds, shuffle=True, random_state=100)

# 'prob' stacks class probabilities; any other value stacks hard labels.
prediction_type = 'prob'

# Switches selecting which base models take part in the stack.
LR_chosen = True
RF_chosen = False
GBM_chosen = False
CAT_chosen = True
LGB_chosen = True
XGB_chosen = True

print(f"Running {nfolds} validation")
print(f"The prediction type chosen is {prediction_type}")

# (column prefix, name used in the progress message, enabled flag, training function)
stack_models = [
    ("LR", "Logistic Regression", LR_chosen, LogisticRegression_model),
    ("RF", "RandomForest", RF_chosen, RandomForest_model),
    ("GBM", "GBM", GBM_chosen, GBM_model),
    ("CAT", "CAT", CAT_chosen, catboost_model),
    ("LGB", "LGB", LGB_chosen, lgb_model),
    ("XGB", "XGB", XGB_chosen, XGB_model),
]

# One column per class for probabilities, a single column for hard labels.
stack_width = uniq_classes if prediction_type == 'prob' else 1

# Per-model accumulators. The production sum must start from zeros: it is added to
# on every fold, so uninitialised memory (np.empty) would leak into the average.
fold_parts = {prefix: [] for prefix, _, chosen, _ in stack_models if chosen}
prod_sums = {prefix: np.zeros((test_prod.shape[0], stack_width))
             for prefix, _, chosen, _ in stack_models if chosen}

# KFold yields positional row numbers, so rows are selected with .iloc; the feature
# columns are resolved to positions once to keep the per-fold selection a single copy.
indep_positions = train_prod.columns.get_indexer(indep)
# The production features are identical for every fold and model: build them once.
test_prod_X = test_prod[indep]

fold_banner = "##################################################################################################################"

train_prod_stacked_score = []
train_prod_stack_from_to = []
for i, (train_local_index, test_local_index) in enumerate(kf.split(train_prod)):

    train_local_X = train_prod.iloc[train_local_index, indep_positions]
    train_local_Y = train_prod[dep].iloc[train_local_index]
    test_local_X = train_prod.iloc[test_local_index, indep_positions]
    test_local_Y = train_prod[dep].iloc[test_local_index]

    # Keep target and id in held-out order so they line up with the stacked predictions.
    train_prod_stacked_score.extend(test_local_Y.values.tolist())
    train_prod_stack_from_to.extend(train_prod['from-to'].iloc[test_local_index].values.tolist())

    print(fold_banner)
    print("Current Fold:", i)
    print("")

    for model_position, (prefix, label, chosen, train_fn) in enumerate(stack_models):
        # A blank line separates consecutive model sections, enabled or not.
        if model_position > 0:
            print("")
        if not chosen:
            continue

        print(f"Training the {label} model")
        fold_prediction, prod_prediction = train_fn(prediction_type,
                                                    train_X=train_local_X, train_Y=train_local_Y,
                                                    test_X=test_local_X, test_Y=test_local_Y,
                                                    test_prod_X=test_prod_X)

        if prediction_type != 'prob':
            # Hard labels come back 1-D; store them as a single column.
            fold_prediction = fold_prediction.reshape(-1, 1)
            prod_prediction = prod_prediction.reshape(-1, 1)

        fold_parts[prefix].append(fold_prediction)
        prod_sums[prefix] += prod_prediction

        del fold_prediction, prod_prediction
        gc.collect()

    print(fold_banner)

# Turn the accumulators into DataFrames: out-of-fold rows for training, fold-averaged
# predictions for production. Column names match the actual width of the arrays.
stack_frames = {}
for prefix in fold_parts:
    col_names = [f"{prefix}_class_{k}" for k in range(stack_width)]
    # Seeding with an empty float array keeps the float64 dtype of the stacked result.
    out_of_fold = np.concatenate([np.empty((0, stack_width))] + fold_parts[prefix], axis=0)
    stack_frames[prefix] = (pd.DataFrame(out_of_fold, columns=col_names),
                            pd.DataFrame(prod_sums[prefix] / nfolds, columns=col_names))

del fold_parts, prod_sums, test_prod_X
gc.collect()

# Publish the per-model frames under the names later cells refer to.
if LR_chosen:
    LR_train_stack, LR_test_stack = stack_frames.pop("LR")

if RF_chosen:
    RF_train_stack, RF_test_stack = stack_frames.pop("RF")

if GBM_chosen:
    GBM_train_stack, GBM_test_stack = stack_frames.pop("GBM")

if CAT_chosen:
    CAT_train_stack, CAT_test_stack = stack_frames.pop("CAT")

if LGB_chosen:
    LGB_train_stack, LGB_test_stack = stack_frames.pop("LGB")

if XGB_chosen:
    XGB_train_stack, XGB_test_stack = stack_frames.pop("XGB")

    

In [101]:
def get_final_stacked_df(stack_list):
    """Join the given DataFrames side by side (column-wise) into one frame.

    Parameters
    ----------
    stack_list : iterable of pandas.DataFrame
        Frames to place next to each other, in order.

    Returns
    -------
    pandas.DataFrame
        The column-wise concatenation; an empty DataFrame when `stack_list` is empty.
    """
    frames = list(stack_list)
    if not frames:
        # pd.concat refuses an empty sequence; keep returning an empty frame instead.
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated frame for every input.
    return pd.concat(frames, axis=1)

# Base-model frames that feed the meta model (the LR / RF / GBM entries are commented out here).
train_prod_list = [#LR_train_stack, RF_train_stack, GBM_train_stack, 
                   CAT_train_stack, LGB_train_stack, XGB_train_stack]
test_prod_list = [#LR_test_stack, #RF_test_stack, GBM_test_stack, 
                  CAT_test_stack, LGB_test_stack, XGB_test_stack]

train_prod_stack = get_final_stacked_df(stack_list=train_prod_list)
test_prod_stack = get_final_stacked_df(stack_list=test_prod_list)

# The per-model frames are now part of the combined frames; drop the separate references.
del CAT_train_stack, LGB_train_stack, XGB_train_stack
del CAT_test_stack, LGB_test_stack, XGB_test_stack
gc.collect()

# Attach ids and target by position. `.values` strips the source index so the ids
# cannot be misaligned (and turned into NaN) if test_prod's index differs from
# the index of the stacked frame.
train_prod_stack['from-to'] = train_prod_stack_from_to
test_prod_stack['from-to'] = test_prod['from-to'].values

train_prod_stack['score'] = train_prod_stacked_score


In [None]:
# Progress marker: the stacked train / test frames have been assembled.
print("Forming the stacked dataset complete")
print("#############################################################################################")

In [103]:
# Inspect the stacked training frame (base-model prediction columns plus 'from-to' and 'score').
# NOTE(review): the saved output under this cell lists LR_class_* columns and no 'score'
# column, which does not match the frame assembled above - it looks like output from an
# earlier run; re-run the notebook top to bottom to confirm.
print(train_prod_stack)

Unnamed: 0,LR_class_0,LR_class_1,LR_class_2,LR_class_3,CAT_class_0,CAT_class_1,CAT_class_2,CAT_class_3,LGB_class_0,LGB_class_1,LGB_class_2,LGB_class_3,XGB_class_0,XGB_class_1,XGB_class_2,XGB_class_3,from-to
0,0.352273,0.507783,0.009373,0.130571,5.382390e-01,4.205111e-01,1.234213e-02,2.890775e-02,0.448086,0.507931,0.007936,0.036046,0.474764,0.375599,0.069657,0.079980,217075-4685538
1,0.548561,0.406225,0.001360,0.043854,3.134227e+06,3.136086e+06,3.136508e+06,1.117196e+05,0.912853,0.522936,0.013222,0.050989,2.203321,1.274404,0.241674,0.280601,6244786-1445883
2,0.252783,0.252753,0.244735,0.249729,3.134535e+06,1.609551e+04,3.146760e+06,4.076247e+05,0.865267,0.555013,0.019608,0.060112,2.009685,1.433352,0.243509,0.313453,23911-18259
3,0.255452,0.256785,0.237771,0.249992,3.135066e+06,7.287505e+05,3.157483e+06,3.158061e+06,0.662424,0.737433,0.025373,0.074770,0.830553,2.648030,0.219623,0.301794,17459-78547
4,0.279815,0.280140,0.195095,0.244950,2.478498e+06,3.110418e+06,1.669032e+06,2.717058e+06,0.798428,0.627707,0.017496,0.056369,1.933084,1.586854,0.206946,0.273116,250404-220746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387970,0.255418,0.257040,0.237430,0.250112,5.435568e-01,1.958267e+00,2.164610e-01,2.817151e-01,0.638127,0.766966,0.023228,0.071678,0.759899,2.779913,0.196460,0.263728,11081-87146
387971,0.551897,0.401093,0.001557,0.045454,2.237595e+00,5.572369e-01,6.211370e-02,1.430545e-01,0.993614,0.439979,0.010651,0.055756,2.886819,0.711750,0.157910,0.243521,6274366-1238849
387972,0.424073,0.395840,0.030757,0.149330,2.199391e+00,5.255323e-01,7.942211e-02,1.956547e-01,0.942218,0.486644,0.013615,0.057524,2.952625,0.635011,0.158758,0.253606,2324651-1067329
387973,0.334416,0.405805,0.054335,0.205444,2.246880e+00,4.588826e-01,9.617525e-02,1.980619e-01,1.027588,0.401354,0.018629,0.052429,3.105410,0.530016,0.162889,0.201685,41869-2419226


In [None]:
# Persist both stacked frames so the meta-model cells can start from disk.
print("Writing the stacked train and test dataset")

for stacked_frame, target_path in (
    (train_prod_stack, "../data/train_prod_stack.pickle"),
    (test_prod_stack, "../data/test_prod_stack.pickle"),
):
    stacked_frame.to_pickle(target_path)

In [19]:
# Reload the stacked frames written by the previous cell.
# NOTE(review): unpickling can execute code embedded in the file; only load trusted artefacts.
train_prod_stack, test_prod_stack = (
    pd.read_pickle("../data/train_prod_stack.pickle"),
    pd.read_pickle("../data/test_prod_stack.pickle"),
)

In [20]:
# Hold out 20% of the stacked training rows (stratified on the target) to validate the meta model.
print("Creating the dataset for the stacked model")
dep = 'score'
indep = train_prod_stack.columns.difference(['from-to', dep])

stack_features = train_prod_stack[indep]
stack_target = train_prod_stack[dep]

# The split draws from NumPy's global random state, which is seeded right here.
np.random.seed(100)
(train_stack_local_X, test_stack_local_X,
 train_stack_local_Y, test_stack_local_Y) = train_test_split(stack_features,
                                                             stack_target,
                                                             test_size=0.2,
                                                             stratify=stack_target)

split_shapes = (train_stack_local_X.shape, train_stack_local_Y.shape,
                test_stack_local_X.shape, test_stack_local_Y.shape)
print(*split_shapes)

del stack_features, stack_target
gc.collect()

Creating the dataset for the stacked model
(722884, 12) (722884,) (180721, 12) (180721,)


126

# LightGBM meta model

In [21]:
# LightGBM datasets for the meta model: a local train / validation pair, and the full
# stacked training set for the final refit.
print("Creating data for the light gbm meta model")
lgb_train_stack_local = lgb.Dataset(data=train_stack_local_X,
                                    label=train_stack_local_Y,
                                    free_raw_data=False)
lgb_test_stack_local = lgb.Dataset(data=test_stack_local_X,
                                   label=test_stack_local_Y,
                                   reference=lgb_train_stack_local,
                                   free_raw_data=False)

lgb_train_stack_prod = lgb.Dataset(data=train_prod_stack[indep], label=train_prod_stack[dep])


Creating data for the light gbm meta model


In [None]:
# LightGBM settings for the meta model; the same dict is reused by the later prod refit.
# 'metric' is 'custom' because evaluation is done through the feval passed to lgb.train.
params = {
#     'device_type':'gpu',
    'nthreads':8,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class':4,
    'metric': 'custom',
    'num_leaves': 150,
    #'max_depth': 10,
    'learning_rate': 0.02,
    'feature_fraction': 1,
    'bagging_fraction': 0.6,
    'bagging_freq': 1,
    'verbose': 1
}

# Upper bound on boosting rounds for this call, which also passes early_stopping_rounds=50.
num_rounds = 10000
print('Starting training...')
start = datetime.now()

# Train on the local training split and validate on the held-out split with the
# custom accuracy metric.
np.random.seed(100)
lgb_model_stack_local = lgb.train(params,
                                  lgb_train_stack_local,
                                  num_boost_round=num_rounds,
                                  valid_sets=lgb_test_stack_local,
                                  feval=lgb_eval_accuracy,
                                  early_stopping_rounds=50)

end = datetime.now()
print("")
print("Total training time:", end - start)

# Prod model

In [None]:
# Refit on the full stacked training set for a fixed number of rounds:
# the validated best iteration plus 30% (truncated to an integer).
final_round = lgb_model_stack_local.best_iteration + int(lgb_model_stack_local.best_iteration*0.3)

print("Validation rounds:", lgb_model_stack_local.best_iteration)
print("Final round is:", final_round)

print('Starting training...')
start = datetime.now()

# NOTE(review): lgb_test_stack_local is a split of train_prod_stack, the same frame
# lgb_train_stack_prod is built from, so the accuracy reported during this run is
# measured on rows the model is trained on. Early stopping is commented out for this run.
np.random.seed(100)
lgb_model_prod = lgb.train(params,
                            lgb_train_stack_prod,
                            num_boost_round=final_round ,
                            valid_sets=lgb_test_stack_local,
                            feval=lgb_eval_accuracy,
#                             early_stopping_rounds=20
                          )

end = datetime.now()
print("")
print("Total training time:", end - start)


In [63]:
print("Running the prediction")
# Score the production stack with the model refit on the full stacked training set
# (lgb_model_prod); the local model was trained on only 80% of the rows and is used
# solely to choose the number of boosting rounds.
lgb_stack_prod_prediction = lgb_model_prod.predict(test_prod_stack[indep])
# Collapse the per-class scores to the index of the highest-scoring class.
lgb_stack_prod_prediction = lgb_stack_prod_prediction.argmax(axis=1)
lgb_stack_prod_prediction

Running the prediction


array([0, 0, 0, ..., 0, 0, 1])

In [65]:
# Build the submission table (pair id + predicted class as float) and write it as CSV.
print("Writing the prediction to the folder")

submission_columns = {
    "from-to": test_prod_stack['from-to'],
    "score": lgb_stack_prod_prediction.astype('float'),
}
lgb_submission = pd.DataFrame(submission_columns)

lgb_submission.to_csv("../submissions/lgb_stack_sub_2.csv", index=False)


Writing the prediction to the folder
