In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split

import mlflow
import mlflow.xgboost



In [2]:
DATASET_NAME = 'dataset_v2'
# Fixed seed so the train/validation partition (and therefore every metric
# logged further down) is the same on each Restart & Run All.
SPLIT_SEED = 42

# Keep the complete frame under its own name instead of overwriting it with
# its own training subset; `train_df` / `valid_df` keep their original roles.
full_df = pd.read_csv(f'../output/train_{DATASET_NAME}.csv')
train_df, valid_df = train_test_split(full_df, test_size=0.2, random_state=SPLIT_SEED)
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma,simple_ratio,partial_ratio,...,question1_degree,question2_degree,question1_degree_deviation,question2_degree_deviation,question1_second_order_total_degree,question2_second_order_total_degree,question1_max_kcore,question2_max_kcore,match_share,tfidf_word_match_share
348693,348695,477279,477280,How do I focus on studies without getting dist...,I'm in class 12. I have a very good understand...,0,focus study without get distract feel someone,class 12 good understand physic never get goo...,30,39,...,1,1,-0.264051,-0.264051,1,1,0,0,0.0,0.0
153068,153069,209100,240362,Where can I get best photo booth hire in and a...,Where can I find best photo booth Company in S...,1,get best photo booth hire around sydney,find best photo booth company sydney,59,76,...,3,2,0.78228,0.259114,7,5,0,0,0.615385,0.648491
10558,10558,20442,20443,If I delete a contact on whatsapp and my main ...,If I deactivate my WhatsApp account and then r...,0,delete contact whatsapp main contact list con...,deactivate whatsapp account reactivate later ...,51,65,...,4,2,1.305445,0.259114,12,6,0,0,0.416667,0.372969
60936,60936,106487,106488,Who are some of greatest people in the world t...,"What are some good ""social experiments"" for ge...",0,greatest people world step comfort zones,good social experiments get comfort zone,44,54,...,1,1,-0.264051,-0.264051,1,1,0,0,0.166667,0.221396
25021,25021,46671,46672,How do you get free checks from Wells Fargo?,How much does it cost to get a safe deposit bo...,0,get free check wells fargo,much doe cost get safe deposit box wells fargo,40,63,...,1,1,-0.264051,-0.264051,1,1,0,0,0.461538,0.435291


In [12]:
# Names of the model input columns. The order of this list is the column
# order of the feature frames built below.
features = [
    'simple_ratio', 'partial_ratio',
    'token_sort_ratio', 'token_set_ratio',
    'question1_type', 'question2_type',
    'question1_punctuation_count', 'question2_punctuation_count',
    'match_share', 'tfidf_word_match_share',
    'question1_degree', 'question2_degree',
    'question1_degree_deviation', 'question2_degree_deviation',
    'question1_second_order_total_degree', 'question2_second_order_total_degree',
    'question1_max_kcore', 'question2_max_kcore',
]

# Independent copies of the inputs and of the single-column label frame,
# first for the training split and then for the validation split.
train_features_df = train_df.loc[:, features].copy()
train_is_duplicate_df = train_df.loc[:, ['is_duplicate']].copy()

valid_features_df = valid_df.loc[:, features].copy()
valid_is_duplicate_df = valid_df.loc[:, ['is_duplicate']].copy()

In [13]:
# Booster parameters passed to both xgb.train and xgb.cv.
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 4

num_boost_round = 400
n_fold = 5
early_stopping_rounds = 50

with mlflow.start_run():
    d_train = xgb.DMatrix(train_features_df, label=train_is_duplicate_df)
    d_valid = xgb.DMatrix(valid_features_df, label=valid_is_duplicate_df)

    # The last entry of the watchlist ('valid') is the one used for early stopping.
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    bst = xgb.train(
        params,
        d_train,
        num_boost_round,
        watchlist,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=10,
    )

    # Cross-validation on the training split only, with the same parameters.
    cv_results = xgb.cv(
        params,
        d_train,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=n_fold,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=10,
    )

    mlflow.log_param('features', ','.join(features))
    mlflow.log_param('objective', params['objective'])
    mlflow.log_param('eval_metric', params['eval_metric'])
    mlflow.log_param('eta', params['eta'])
    mlflow.log_param('max_depth', params['max_depth'])

    mlflow.log_param('num_boost_round', num_boost_round)
    mlflow.log_param('n_fold', n_fold)
    mlflow.log_param('early_stopping_rounds', early_stopping_rounds)

    # Read the final CV row by position rather than by the label
    # `num_boost_round - 1`: when early stopping ends cross-validation early,
    # the returned history has fewer rows and that label does not exist,
    # which would raise KeyError after all the training work is done.
    final_cv_row = cv_results.iloc[-1]
    mlflow.log_metric('train_logloss', float(final_cv_row['train-logloss-mean']))
    mlflow.log_metric('valid_logloss', float(final_cv_row['test-logloss-mean']))

    mlflow.xgboost.log_model(bst, 'quora_xgb')

[0]	train-logloss:0.68331	valid-logloss:0.68329
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.60204	valid-logloss:0.60167
[20]	train-logloss:0.54389	valid-logloss:0.54343
[30]	train-logloss:0.50123	valid-logloss:0.50049
[40]	train-logloss:0.46833	valid-logloss:0.46732
[50]	train-logloss:0.44270	valid-logloss:0.44152
[60]	train-logloss:0.42224	valid-logloss:0.42085
[70]	train-logloss:0.40587	valid-logloss:0.40437
[80]	train-logloss:0.39306	valid-logloss:0.39159
[90]	train-logloss:0.38248	valid-logloss:0.38098
[100]	train-logloss:0.37387	valid-logloss:0.37233
[110]	train-logloss:0.36702	valid-logloss:0.36556
[120]	train-logloss:0.36121	valid-logloss:0.35978
[130]	train-logloss:0.35670	valid-logloss:0.35531
[140]	train-logloss:0.35218	valid-logloss:0.35082
[150]	train-logloss:0.34837	valid-logloss:0.34696
[160]	train-logloss:0.34541	valid-logloss:0.34393
[170]	trai

In [14]:
# Per-feature importance as the number of splits that use the feature
# ('weight'); this is what the legacy get_fscore() alias returns.
bst.get_score(importance_type='weight')

{'match_share': 984,
 'question1_second_order_total_degree': 718,
 'question2_second_order_total_degree': 638,
 'simple_ratio': 127,
 'token_sort_ratio': 633,
 'question2_degree': 1026,
 'question1_degree': 717,
 'token_set_ratio': 237,
 'tfidf_word_match_share': 192,
 'partial_ratio': 365,
 'question1_type': 142,
 'question2_type': 91,
 'question2_punctuation_count': 56,
 'question1_punctuation_count': 54}