In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import matthews_corrcoef, precision_score, recall_score, make_scorer
from imblearn.over_sampling import SMOTE
from joblib import dump

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [8]:
# the classifiers we'll try
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

In [4]:
import sys
# Put the parent directory on the module search path so the import below can
# be resolved from there (presumably a project-local `pipeline` module lives
# one level up — confirm against the repo layout).
sys.path.append("..")
# NOTE(review): `pre_pipeline` is not referenced in the cells visible here.
from pipeline import pre_pipeline

In [2]:
# Read the preprocessed training array and the training metadata table
# (indexed by signal_id) from disk.
train_wavelets = np.load(file="../data/preprocessed/train.npy")
train_meta = pd.read_csv("../data/vsb-power-line-fault-detection/metadata_train.csv", index_col="signal_id")

In [3]:
# Feature matrix: the loaded array transposed; labels: the `target` column.
x = np.transpose(train_wavelets)
y = train_meta["target"].values

In [35]:
# Baseline: plain logistic regression on scaled, PCA-reduced features
# (no resampling of either class).
logreg_pipe = make_pipeline(StandardScaler(), PCA(n_components=117), LogisticRegression(max_iter=1000))

# 5-fold cross-validation reporting MCC, precision and recall on both the
# training and the held-out part of each fold.
scores = cross_validate(
    estimator=logreg_pipe,
    X=x,
    y=y,
    cv=5,
    scoring=dict(
        mcc=make_scorer(matthews_corrcoef),
        precision=make_scorer(precision_score),
        recall=make_scorer(recall_score),
    ),
    return_train_score=True,
    n_jobs=-1,
)
scores

{'fit_time': array([122.24217224, 121.94515896, 122.94566202, 122.93026805,
        122.97139025]),
 'score_time': array([1.45846677, 1.75756097, 1.42823792, 1.41776299, 1.431777  ]),
 'test_mcc': array([0.16400168, 0.        , 0.        , 0.        , 0.        ]),
 'train_mcc': array([0.        , 0.11591517, 0.06690475, 0.06690475, 0.1159157 ]),
 'test_precision': array([1., 0., 0., 0., 0.]),
 'train_precision': array([0., 1., 1., 1., 1.]),
 'test_recall': array([0.02857143, 0.        , 0.        , 0.        , 0.        ]),
 'train_recall': array([0.        , 0.01428571, 0.0047619 , 0.0047619 , 0.01428571])}

In [34]:
# Logistic regression with SMOTE oversampling of the minority class.
# The features are standardised once, reduced with PCA, and then resampled;
# the imblearn pipeline applies the sampler only while fitting, so the
# held-out fold of each split is scored on its original class balance.
# (A second, accidental StandardScaler step was removed: re-standardising
# already standardised data is an identity transform and only added work.)
logreg_smote_pipe = make_pipeline(
    StandardScaler(),
    PCA(n_components=117),
    SMOTE(),
    LogisticRegression(max_iter=1000),
)
# 5-fold cross-validation with MCC, precision and recall, on train and test.
scores = cross_validate(
    logreg_smote_pipe,
    x,
    y,
    cv=5,
    scoring={
        "mcc": make_scorer(matthews_corrcoef),
        "precision": make_scorer(precision_score),
        "recall": make_scorer(recall_score),
    },
    return_train_score=True,
    n_jobs=-1,
)
scores

{'fit_time': array([109.65180993, 112.22011209, 114.78355002, 116.50059891,
        109.98276973]),
 'score_time': array([2.58175516, 1.74244189, 1.59548402, 1.30442715, 2.11000299]),
 'test_mcc': array([ 0.00624308, -0.02956371,  0.02489893,  0.00477819,  0.00992236]),
 'train_mcc': array([0.03179565, 0.04279495, 0.02562474, 0.04220668, 0.03469203]),
 'test_precision': array([0.06186869, 0.05228758, 0.06675063, 0.06153846, 0.06290116]),
 'train_precision': array([0.0685624 , 0.07154264, 0.06694032, 0.07124842, 0.06932574]),
 'test_recall': array([0.46666667, 0.38095238, 0.5047619 , 0.45714286, 0.46666667]),
 'train_recall': array([0.51666667, 0.53333333, 0.5047619 , 0.53809524, 0.52142857])}

In [37]:
# Logistic regression with random undersampling of the majority class.
# The features are standardised once, reduced with PCA, and then resampled;
# the imblearn pipeline applies the sampler only while fitting, so the
# held-out fold of each split is scored on its original class balance.
# (A second, accidental StandardScaler step was removed: re-standardising
# already standardised data is an identity transform and only added work.)
logreg_under_pipe = make_pipeline(
    StandardScaler(),
    PCA(n_components=117),
    RandomUnderSampler(),
    LogisticRegression(max_iter=1000),
)
# 5-fold cross-validation with MCC, precision and recall, on train and test.
scores = cross_validate(
    logreg_under_pipe,
    x,
    y,
    cv=5,
    scoring={
        "mcc": make_scorer(matthews_corrcoef),
        "precision": make_scorer(precision_score),
        "recall": make_scorer(recall_score),
    },
    return_train_score=True,
    n_jobs=-1,
)
scores


{'fit_time': array([121.28158903, 120.66528726, 123.55399609, 123.60228705,
        123.04901814]),
 'score_time': array([2.46837187, 2.96796179, 1.62382293, 1.65613389, 2.08885789]),
 'test_mcc': array([-0.00797612,  0.02552003,  0.0004757 ,  0.01715315,  0.03106988]),
 'train_mcc': array([0.02388203, 0.04910987, 0.03206789, 0.02160188, 0.02594391]),
 'test_precision': array([0.05858586, 0.06552419, 0.06037001, 0.06391753, 0.0666004 ]),
 'train_precision': array([0.06517924, 0.07045455, 0.06665039, 0.06477013, 0.06542969]),
 'test_recall': array([0.55238095, 0.61904762, 0.59047619, 0.59047619, 0.63809524]),
 'train_recall': array([0.61904762, 0.66428571, 0.65      , 0.60714286, 0.63809524])}

In [39]:
# Support vector classifier (default settings) on scaled, PCA-reduced
# features, without any resampling.
svc_pipe = make_pipeline(StandardScaler(), PCA(n_components=117), SVC())

# 5-fold cross-validation with MCC, precision and recall, on train and test.
scores = cross_validate(
    estimator=svc_pipe,
    X=x,
    y=y,
    cv=5,
    scoring=dict(
        mcc=make_scorer(matthews_corrcoef),
        precision=make_scorer(precision_score),
        recall=make_scorer(recall_score),
    ),
    return_train_score=True,
    n_jobs=-1,
)
scores


{'fit_time': array([128.21110916, 128.07923698, 128.10684395, 127.81513596,
        126.87180185]),
 'score_time': array([1.87585902, 1.96322513, 1.97696185, 2.23807001, 2.68675113]),
 'test_mcc': array([0., 0., 0., 0., 0.]),
 'train_mcc': array([0.09463075, 0.08194675, 0.09463118, 0.09463118, 0.12521225]),
 'test_precision': array([0., 0., 0., 0., 0.]),
 'train_precision': array([1., 1., 1., 1., 1.]),
 'test_recall': array([0., 0., 0., 0., 0.]),
 'train_recall': array([0.00952381, 0.00714286, 0.00952381, 0.00952381, 0.01666667])}

In [40]:
# Support vector classifier trained on SMOTE-oversampled data; scaling and
# PCA run before the sampler.
svc_smote_pipe = make_pipeline(StandardScaler(), PCA(n_components=117), SMOTE(), SVC())

# 5-fold cross-validation with MCC, precision and recall, on train and test.
scores = cross_validate(
    estimator=svc_smote_pipe,
    X=x,
    y=y,
    cv=5,
    scoring=dict(
        mcc=make_scorer(matthews_corrcoef),
        precision=make_scorer(precision_score),
        recall=make_scorer(recall_score),
    ),
    return_train_score=True,
    n_jobs=-1,
)
scores


{'fit_time': array([136.17364001, 131.87273717, 134.59288907, 136.99522495,
        133.17808414]),
 'score_time': array([2.80311489, 4.17259502, 3.47928119, 2.54791689, 4.29121494]),
 'test_mcc': array([0.18363512, 0.19855285, 0.14140722, 0.15732941, 0.22975782]),
 'train_mcc': array([0.5728796 , 0.5795602 , 0.58276881, 0.56586819, 0.57855454]),
 'test_precision': array([0.16412214, 0.16845878, 0.14173228, 0.16113744, 0.19512195]),
 'train_precision': array([0.37671861, 0.38411215, 0.38862559, 0.37628385, 0.39050388]),
 'test_recall': array([0.40952381, 0.44761905, 0.34285714, 0.32380952, 0.45714286]),
 'train_recall': array([0.97857143, 0.97857143, 0.97619048, 0.95952381, 0.95952381])}

In [None]:
# Support vector classifier trained on randomly undersampled data; scaling
# and PCA run before the sampler.
svc_under_pipe = make_pipeline(StandardScaler(), PCA(n_components=117), RandomUnderSampler(), SVC())

# 5-fold cross-validation with MCC, precision and recall, on train and test.
scores = cross_validate(
    estimator=svc_under_pipe,
    X=x,
    y=y,
    cv=5,
    scoring=dict(
        mcc=make_scorer(matthews_corrcoef),
        precision=make_scorer(precision_score),
        recall=make_scorer(recall_score),
    ),
    return_train_score=True,
    n_jobs=-1,
)
scores


In [42]:
# Gradient boosting (default settings) on scaled, PCA-reduced features,
# without any resampling.
gb_pipe = make_pipeline(StandardScaler(), PCA(n_components=117), GradientBoostingClassifier())

# 5-fold cross-validation with MCC, precision and recall, on train and test.
scores = cross_validate(
    estimator=gb_pipe,
    X=x,
    y=y,
    cv=5,
    scoring=dict(
        mcc=make_scorer(matthews_corrcoef),
        precision=make_scorer(precision_score),
        recall=make_scorer(recall_score),
    ),
    return_train_score=True,
    n_jobs=-1,
)
scores


{'fit_time': array([184.09811091, 184.66878223, 184.9705379 , 183.92526817,
        183.93814492]),
 'score_time': array([1.86334205, 1.2965138 , 1.0988903 , 2.09017897, 1.56113505]),
 'test_mcc': array([0.09463204, 0.06017486, 0.09463032, 0.16117074, 0.01539284]),
 'train_mcc': array([0.35540504, 0.37412262, 0.34237922, 0.36488126, 0.3801655 ]),
 'test_precision': array([1.        , 0.28571429, 1.        , 0.625     , 0.11111111]),
 'train_precision': array([1., 1., 1., 1., 1.]),
 'test_recall': array([0.00952381, 0.01904762, 0.00952381, 0.04761905, 0.00952381]),
 'train_recall': array([0.13333333, 0.14761905, 0.12380952, 0.14047619, 0.15238095])}

In [44]:
# Gradient boosting trained on SMOTE-oversampled data; scaling and PCA run
# before the sampler.
gb_smote_pipe = make_pipeline(
    StandardScaler(), PCA(n_components=117), SMOTE(), GradientBoostingClassifier()
)

# 5-fold cross-validation with MCC, precision and recall, on train and test.
scores = cross_validate(
    estimator=gb_smote_pipe,
    X=x,
    y=y,
    cv=5,
    scoring=dict(
        mcc=make_scorer(matthews_corrcoef),
        precision=make_scorer(precision_score),
        recall=make_scorer(recall_score),
    ),
    return_train_score=True,
    n_jobs=-1,
)
scores


{'fit_time': array([190.48775196, 190.37495232, 190.33022285, 190.44678092,
        190.63299727]),
 'score_time': array([1.50466299, 1.53098488, 1.62283325, 1.49462819, 1.32593775]),
 'test_mcc': array([0.17771473, 0.19584146, 0.2379054 , 0.16477304, 0.27436131]),
 'train_mcc': array([0.40708659, 0.40109835, 0.40694417, 0.41327984, 0.38715822]),
 'test_precision': array([0.14906832, 0.15254237, 0.1744186 , 0.14423077, 0.1898017 ]),
 'train_precision': array([0.26143293, 0.26131045, 0.26060606, 0.26941363, 0.24635569]),
 'test_recall': array([0.45714286, 0.51428571, 0.57142857, 0.42857143, 0.63809524]),
 'train_recall': array([0.81666667, 0.79761905, 0.81904762, 0.80952381, 0.8047619 ])}

In [45]:
# Gradient boosting trained on randomly undersampled data; scaling and PCA
# run before the sampler.
gb_under_pipe = make_pipeline(
    StandardScaler(), PCA(n_components=117), RandomUnderSampler(), GradientBoostingClassifier()
)

# 5-fold cross-validation with MCC, precision and recall, on train and test.
scores = cross_validate(
    estimator=gb_under_pipe,
    X=x,
    y=y,
    cv=5,
    scoring=dict(
        mcc=make_scorer(matthews_corrcoef),
        precision=make_scorer(precision_score),
        recall=make_scorer(recall_score),
    ),
    return_train_score=True,
    n_jobs=-1,
)
scores


{'fit_time': array([134.103266  , 132.45036006, 118.85817099, 132.71368909,
        134.27906418]),
 'score_time': array([1.21686697, 1.50433683, 6.08345699, 1.22847199, 1.00822091]),
 'test_mcc': array([0.13414558, 0.17279703, 0.2016784 , 0.13920492, 0.18399306]),
 'train_mcc': array([0.331693  , 0.34318783, 0.33378645, 0.34710796, 0.33809786]),
 'test_precision': array([0.1039604 , 0.11620295, 0.12420382, 0.10815603, 0.11710324]),
 'train_precision': array([0.16443745, 0.17217176, 0.16535122, 0.17348203, 0.16848045]),
 'test_recall': array([0.6       , 0.67619048, 0.74285714, 0.58095238, 0.72380952]),
 'train_recall': array([0.9952381 , 0.99285714, 0.99761905, 1.        , 0.9952381 ])}

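Of the configurations that were run, Gradient Boosting with SMOTE achieved the highest mean test MCC across the five folds (about 0.21, versus about 0.18 for SVC with SMOTE and about 0.17 for Gradient Boosting with undersampling), so that is the model trained on the full data set below.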

In [48]:
# Fit the chosen pipeline on the complete training set, then persist it.
# The dump call stays last so the cell displays the list of written files.
gb_smote_pipe.fit(X=x, y=y)
dump(gb_smote_pipe, filename="../data/models/gb_smote.joblib")

['../data/models/gb_smote.joblib']