In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split

import mlflow
import mlflow.xgboost



In [30]:
# Name of the pre-computed feature set; selects ../output/train_<DATASET_NAME>.csv.
DATASET_NAME = 'dataset_v2'
# Fixed seed so the hold-out split (and every metric derived from it) is
# reproducible under Restart Kernel -> Run All.
SPLIT_SEED = 42

# Keep the full frame under its own name instead of rebinding `train_df`, so
# it is always clear which frame is the complete data and which is the 80% part.
dataset_df = pd.read_csv(f'../output/train_{DATASET_NAME}.csv')

# 80/20 train / hold-out validation split.
train_df, valid_df = train_test_split(
    dataset_df,
    test_size=0.2,
    random_state=SPLIT_SEED,
)
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma,simple_ratio,partial_ratio,...,question1_punctuation_count,question2_punctuation_count,question1_hash,question2_hash,question1_degree,question2_degree,question1_degree_deviation,question2_degree_deviation,match_share,tfidf_word_match_share
393363,393366,526154,526155,Suggest a best app to download videos or songs...,Why the fall of China worry us?,0,suggest best app download video song window lap,fall china worry us,33,35,...,1,1,Suggest a best app to download videos or songs...,Why the fall of China worry us?,1,1,-0.504718,-0.504718,0.0,0.0
165293,165294,256693,256694,What are the merchant marine requirements?,How can the effects of osmosis on cells be exp...,0,merchant marine requirements,effect osmosis cell explained,32,41,...,1,1,What are the merchant marine requirements?,How can the effects of osmosis on cells be exp...,2,1,0.495282,-0.504718,0.0,0.0
259051,259053,374759,374760,How do I club multiple personal loans?,Can you have multiple personal loans?,0,club multiple personal loans,multiple personal loans,76,100,...,1,1,How do I club multiple personal loans?,Can you have multiple personal loans?,1,1,-0.504718,-0.504718,0.857143,0.85276
151975,151976,21507,87432,What should one do to find purpose of one's life?,What is the purpose of your life?,1,one find purpose one life,purpose life,58,77,...,2,1,What should one do to find purpose of one's life?,What is the purpose of your life?,12,41,10.495282,39.495282,0.571429,0.608611
126744,126745,143926,204202,Does pure love really exist?,Does true and pure love exists?,1,pure love really exist,true pure love exists,56,68,...,1,1,Does pure love really exist?,Does true and pure love exists?,3,5,1.495282,3.495282,0.5,0.494909


In [31]:
# Model input columns. The order is preserved because the list is later joined
# into a single MLflow parameter string.
features = [
    'simple_ratio', 'partial_ratio',
    'token_sort_ratio', 'token_set_ratio',
    'question1_type', 'question2_type',
    'question1_punctuation_count', 'question2_punctuation_count',
    'match_share', 'tfidf_word_match_share',
]

# Inputs: independent copies of the selected feature columns for each split.
train_features_df = train_df.loc[:, features].copy()
valid_features_df = valid_df.loc[:, features].copy()

# Labels: kept as single-column DataFrames (not Series) for each split.
train_is_duplicate_df = train_df.loc[:, ['is_duplicate']].copy()
valid_is_duplicate_df = valid_df.loc[:, ['is_duplicate']].copy()

In [36]:
# Booster parameters shared by the hold-out fit and the cross-validation run.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.02,
    'max_depth': 4,
}

num_boost_round = 400        # upper bound on boosting rounds
n_fold = 5                   # number of CV folds
early_stopping_rounds = 50   # patience, in rounds, for both xgb.train and xgb.cv

with mlflow.start_run():
    d_train = xgb.DMatrix(train_features_df, label=train_is_duplicate_df)
    d_valid = xgb.DMatrix(valid_features_df, label=valid_is_duplicate_df)

    # The last entry of the watchlist ('valid') is the one used for early stopping.
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    # Model that gets persisted: fit on the training split, early-stopped on
    # the hold-out split.
    bst = xgb.train(
        params,
        d_train,
        num_boost_round,
        watchlist,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=10,
    )

    # Independent k-fold CV on the training split; its final-round means are
    # what get logged as metrics below.
    cv_results = xgb.cv(
        params,
        d_train,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=n_fold,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=10,
    )

    # Record the configuration of this run.
    run_config = {
        'features': ','.join(features),
        'objective': params['objective'],
        'eval_metric': params['eval_metric'],
        'eta': params['eta'],
        'max_depth': params['max_depth'],
        'num_boost_round': num_boost_round,
        'n_fold': n_fold,
        'early_stopping_rounds': early_stopping_rounds,
    }
    for param_name, param_value in run_config.items():
        mlflow.log_param(param_name, param_value)

    # When early stopping triggers, xgb.cv returns fewer than num_boost_round
    # rows, so indexing row `num_boost_round - 1` would raise KeyError.
    # Always read the last available row instead.
    last_cv_round = cv_results.iloc[-1]
    # NOTE: 'valid_logloss' is the CV test-fold mean, not the hold-out
    # `valid` score printed by xgb.train above.
    mlflow.log_metric('train_logloss', float(last_cv_round['train-logloss-mean']))
    mlflow.log_metric('valid_logloss', float(last_cv_round['test-logloss-mean']))

    mlflow.xgboost.log_model(bst, 'quora_xgb')

[0]	train-logloss:0.68590	valid-logloss:0.68584
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.62650	valid-logloss:0.62635
[20]	train-logloss:0.58488	valid-logloss:0.58463
[30]	train-logloss:0.55435	valid-logloss:0.55407
[40]	train-logloss:0.53135	valid-logloss:0.53103
[50]	train-logloss:0.51379	valid-logloss:0.51345
[60]	train-logloss:0.50025	valid-logloss:0.49987
[70]	train-logloss:0.48952	valid-logloss:0.48916
[80]	train-logloss:0.48106	valid-logloss:0.48073
[90]	train-logloss:0.47452	valid-logloss:0.47419
[100]	train-logloss:0.46937	valid-logloss:0.46908
[110]	train-logloss:0.46530	valid-logloss:0.46504
[120]	train-logloss:0.46197	valid-logloss:0.46178
[130]	train-logloss:0.45931	valid-logloss:0.45916
[140]	train-logloss:0.45719	valid-logloss:0.45707
[150]	train-logloss:0.45544	valid-logloss:0.45535
[160]	train-logloss:0.45393	valid-logloss:0.45390
[170]	trai

In [37]:
# Per-feature split counts ("weight" importance) of the trained booster.
bst.get_score(importance_type='weight')

{'match_share': 1532,
 'partial_ratio': 965,
 'token_sort_ratio': 923,
 'token_set_ratio': 372,
 'simple_ratio': 342,
 'question1_type': 411,
 'tfidf_word_match_share': 627,
 'question2_punctuation_count': 293,
 'question1_punctuation_count': 181,
 'question2_type': 281}