In [8]:
import xarray as xr
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn import svm

from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from imblearn import pipeline
import xgboost as xgb

from imblearn.under_sampling import RandomUnderSampler

from tqdm.notebook import tqdm

sns.set_theme('notebook')

In [2]:
data = pd.read_csv("data/csv_w_labels.csv")
labels = data.pop("mhw_label")
data = data.fillna(-1)

In [3]:
X_train, X_test, y_train, y_test = \
    train_test_split(data, labels, test_size=0.3, shuffle=True, stratify=labels, random_state=1)

In [4]:
len(X_train), len(y_train)

(12225276, 12225276)

In [5]:
# set up model pipeline and cross validate.
scoring = ['accuracy', 'precision', 'recall']

pipe = pipeline.Pipeline(
    [('pca', PCA(n_components=5)),
     ('scale', StandardScaler()),
     ('sample', RandomUnderSampler(random_state=1)),   # tried other strategies, but performed poorly or took too long. 
     ('clf', SGDClassifier(loss='hinge'))], #hinge- linear svm, log - log regression
    verbose=True
)

results = cross_validate(pipe, X_train, y_train, 
                         scoring=scoring, verbose=10, 
                         n_jobs=8, return_estimator=True)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   5 | elapsed:  1.2min remaining:  1.8min
[Parallel(n_jobs=8)]: Done   3 out of   5 | elapsed:  1.2min remaining:   48.0s
[Parallel(n_jobs=8)]: Done   5 out of   5 | elapsed:  1.2min remaining:    0.0s
[Parallel(n_jobs=8)]: Done   5 out of   5 | elapsed:  1.2min finished


In [6]:
results

{'fit_time': array([64.01957417, 64.8337338 , 64.91606331, 64.6774509 , 64.78137851]),
 'score_time': array([3.63818169, 3.70700312, 3.71199822, 3.67741299, 3.66196465]),
 'estimator': [Pipeline(steps=[('pca', PCA(n_components=5)), ('scale', StandardScaler()),
                  ('sample', RandomUnderSampler(random_state=1)),
                  ('clf', SGDClassifier())],
           verbose=True),
  Pipeline(steps=[('pca', PCA(n_components=5)), ('scale', StandardScaler()),
                  ('sample', RandomUnderSampler(random_state=1)),
                  ('clf', SGDClassifier())],
           verbose=True),
  Pipeline(steps=[('pca', PCA(n_components=5)), ('scale', StandardScaler()),
                  ('sample', RandomUnderSampler(random_state=1)),
                  ('clf', SGDClassifier())],
           verbose=True),
  Pipeline(steps=[('pca', PCA(n_components=5)), ('scale', StandardScaler()),
                  ('sample', RandomUnderSampler(random_state=1)),
                  ('clf', SGDCl

In [7]:
best_model = results["estimator"][np.argmax(results["test_recall"])]
y_pred = best_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.93      0.50      0.65   4798619
         1.0       0.10      0.62      0.17    440785

    accuracy                           0.51   5239404
   macro avg       0.52      0.56      0.41   5239404
weighted avg       0.86      0.51      0.61   5239404



In [8]:


xgbc = xgb.XGBClassifier(n_estimators=10, use_label_encoder=False)

pipe = pipeline.Pipeline(
    [('sample', RandomUnderSampler(random_state=1)),   # tried other strategies, but performed poorly or took too long. 
     ('clf', xgbc)], 
    verbose=True
)

results = cross_validate(pipe, X_train, y_train, 
                         scoring=scoring, verbose=10, 
                         n_jobs=8, return_estimator=True)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   5 | elapsed:   29.5s remaining:   44.3s
[Parallel(n_jobs=8)]: Done   3 out of   5 | elapsed:   30.2s remaining:   20.1s
[Parallel(n_jobs=8)]: Done   5 out of   5 | elapsed:   30.3s remaining:    0.0s
[Parallel(n_jobs=8)]: Done   5 out of   5 | elapsed:   30.3s finished


In [9]:
best_model = results["estimator"][np.argmax(results["test_recall"])]
y_pred = best_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.95      0.65      0.77   4798619
         1.0       0.14      0.63      0.23    440785

    accuracy                           0.64   5239404
   macro avg       0.55      0.64      0.50   5239404
weighted avg       0.88      0.64      0.72   5239404



In [10]:
pos_weight = (labels == 0).sum() / (labels == 1).sum()

xgbc = xgb.XGBClassifier(n_estimators=10, use_label_encoder=False, scale_pos_weight=pos_weight)

pipe = pipeline.Pipeline(
    [('sample', RandomUnderSampler(random_state=1)),   # tried other strategies, but performed poorly or took too long. 
     ('clf', xgbc)], 
    verbose=True
)

results = cross_validate(pipe, X_train, y_train, 
                         scoring=scoring, verbose=10, 
                         n_jobs=8, return_estimator=True)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   5 | elapsed:   29.3s remaining:   44.0s
[Parallel(n_jobs=8)]: Done   3 out of   5 | elapsed:   29.4s remaining:   19.5s
[Parallel(n_jobs=8)]: Done   5 out of   5 | elapsed:   29.7s remaining:    0.0s
[Parallel(n_jobs=8)]: Done   5 out of   5 | elapsed:   29.7s finished


In [11]:
best_model = results["estimator"][np.argmax(results["test_recall"])]
y_pred = best_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00   4798619
         1.0       0.08      1.00      0.16    440785

    accuracy                           0.08   5239404
   macro avg       0.54      0.50      0.08   5239404
weighted avg       0.92      0.08      0.01   5239404



In [None]:
from sklearn.model_selection import GridSearchCV

pca = PCA()
xgbc = xgb.XGBClassifier()

pipe = pipeline.Pipeline(
    [('pca', pca), 
     ('sample', RandomUnderSampler(random_state=1)),   # tried other strategies, but performed poorly or took too long. 
     ('clf', xgbc)], 
    verbose=True
)

param_grid = {
    "clf__scale_pos_weight" : np.arange(1, 6), 
    "pca__n_components" : np.arange(1, len(X_test.columns) + 1, 3)
}

search = GridSearchCV(pipe, param_grid, n_jobs=4, verbose=4)
search.fit(X_train, y_train)
                                             

Fitting 5 folds for each of 45 candidates, totalling 225 fits
