# Model Tuning

## Libraries/Packages + Data Loading

In [2]:
# Data manipulation/loading/modification
import pandas as pd
import numpy as np

# To save model
import pickle

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Preprocessing and modeling
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, ConfusionMatrixDisplay,\
                            confusion_matrix, RocCurveDisplay, roc_curve,\
                            roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# Helpers
# Append the parent of the current working directory to sys.path; the `src`
# imports below rely on that entry (presumably `src` lives there -- confirm).
import os
import sys
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(module_path)

# NOTE(review): `cross_validate` and `pretty_cv` are called later in this
# notebook but are never imported by name in this cell -- presumably they
# arrive through these wildcard imports; confirm, and prefer explicit imports.
from src.parse_it import *
from src.modeling import *
from src.EDA import *

In [3]:
# Read only the text and label columns from the CSV; the trailing .loc
# selection pins the column order to (text_lem, label).
df = pd.read_csv(
    '../../data/lemmed_combined.csv',
    usecols=['text_lem', 'label'],
).loc[:, ['text_lem', 'label']]

# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,text_lem,label
0,ryan steven lochte lkti lok tee bear august am...,0
1,cam ship world war ii era british merchant shi...,0
2,politics vietnam define single party socialist...,0
3,pennsylvania route pa state highway locate mon...,0
4,clubland tv british free air dance music chann...,1
5,william j binder american facial plastic recon...,1
6,startex power texas base retail electricity pr...,1
7,miodrag tomic serbian cyrillic may may februar...,0
8,pietro annigoni complete number portrait queen...,0
9,fuso fusang central battery ironclad build imp...,0


## Data Separation

In [4]:
# Features come from the `text_lem` column, the target from `label`.
X, y = df['text_lem'], df['label']

## Train/Test Split #1

In [5]:
# First split: hold out a test set, stratified on the label. The fixed seed
# makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=138
)

## Train/Test Split #2

In [6]:
# Second split: divide the training portion again into a fitting set and a
# validation set, stratified on the training labels and using the same seed.
X_tr_val, X_val, y_tr_val, y_val = train_test_split(
    X_train, y_train, stratify=y_train, random_state=138
)

## Model/Pipeline Creation

In [7]:
# Baseline pipeline: TF-IDF features (max_features capped at 100000) feeding
# an XGBClassifier constructed with no arguments.
xgb_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=100000)),
    ('xgb', XGBClassifier()),
])

In [26]:
# Fit the baseline on the fitting portion of the second split; this fitted
# `xgb_pipe` object is what the final cell of the notebook pickles.
xgb_pipe.fit(X=X_tr_val, y=y_tr_val)

Pipeline(steps=[('tfidf', TfidfVectorizer(max_features=100000)),
                ('xgb',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=0, num_parallel_tree=1, random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               subsample=1, tree_method='exact',
                               validate_parameters=1, verbosity=None))])

In [28]:
# 5-fold cross-validation of the baseline pipeline, scoring accuracy and
# macro F1 and also returning the training-fold scores.
# NOTE(review): the folds are drawn from X_val / y_val rather than
# X_tr_val / y_tr_val -- confirm that cross-validating on the validation
# split is intended.
cv_res_1 = cross_validate(
    xgb_pipe, X_val, y_val,
    cv=5,
    scoring=['accuracy', 'f1_macro'],
    return_train_score=True,
    n_jobs=-2,
    verbose=1,
)

# Print a formatted summary of the cross-validation results.
pretty_cv(cv_res_1)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 1.000
Test accuracy:     0.945
F-1 Score
--------------------------------
Training F1 score: 1.000
Test F1 score:     0.944


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:  4.3min finished


## Tuning `XGBClassifier`

In [8]:
# Variant of the baseline pipeline: identical TF-IDF step, but the
# XGBClassifier is built with max_depth=5.
xgb_depth5 = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=100000)),
    ('xgb', XGBClassifier(max_depth=5)),
])

# Fit on the same fitting portion used for the baseline.
xgb_depth5.fit(X=X_tr_val, y=y_tr_val)

Pipeline(steps=[('tfidf', TfidfVectorizer(max_features=100000)),
                ('xgb',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=5, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=0, num_parallel_tree=1, random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               subsample=1, tree_method='exact',
                               validate_parameters=1, verbosity=None))])

In [10]:
# 5-fold cross-validation of the max_depth=5 pipeline, scoring accuracy and
# macro F1 and also returning the training-fold scores.
# NOTE(review): the folds are drawn from X_val / y_val rather than
# X_tr_val / y_tr_val -- confirm that cross-validating on the validation
# split is intended.
cv_res_2 = cross_validate(
    xgb_depth5, X_val, y_val,
    cv=5,
    scoring=['accuracy', 'f1_macro'],
    return_train_score=True,
    n_jobs=-2,
    verbose=1,
)

# Print a formatted summary of the cross-validation results.
pretty_cv(cv_res_2)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 1.000
Test accuracy:     0.944
F-1 Score
--------------------------------
Training F1 score: 1.000
Test F1 score:     0.943


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:  4.1min finished


## Saving Model for Deployment

In [30]:
# Persist the fitted baseline pipeline for deployment. The context manager
# closes the file handle (flushing the pickle to disk) even if dump() raises;
# the previous inline open() call never closed it.
# NOTE: only unpickle this file from a trusted location -- loading a pickle
# can execute arbitrary code.
with open('../models/xgb_model.sav', 'wb') as model_file:
    pickle.dump(xgb_pipe, model_file)