In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
import xgboost as xgb
from sklearn.svm import SVC

from model_utils import *
loc_df, X_loc_train, y_loc_train, X_loc_valid, y_loc_valid, X_loc_test, y_loc_test = load_loc_data('../')
vec_df, X_vec_train, y_vec_train, X_vec_valid, y_vec_valid, X_vec_test, y_vec_test = load_vec_data('../')

In [2]:
def try_thresholds(model, X, y, rang):
    precisions, recalls, specificities = [], [], []
    
    for th in rang:
        test_probs = model.predict_proba(X)[:, 1]
        test_pred = (test_probs >= th).astype(int)
        prec, recal, spec, _ = apply_metric(y, test_pred, False)
        precisions.append(prec)
        recalls.append(recal)
        specificities.append(spec)
        
    return precisions, recalls, specificities
        

def find_highest_spec(precisions, recalls, specficities, threshold, enable_print=False):
    highest_value = 0
    highest_idx = 0
    
    for idx, (val1, val2, val3) in enumerate(zip(precisions, recalls, specficities)):
        if val1 >= threshold and val2 >= threshold:
            if val3 > highest_value:
                highest_value = val3
                highest_idx = idx
    if enable_print:
        print("TEST DATASET")
        print(f"id = {highest_idx}")
        print(f'Precision: {precisions[highest_idx]:.2f}')
        print(f'Recall: {recalls[highest_idx]:.2f}')
        print(f'Specificity: {specficities[highest_idx]:.2f}')
    
    return precisions[highest_idx], recalls[highest_idx], specficities[highest_idx]

def valid_and_test(model, rang, threshold=0.8, type='loc', enable_print=False):
    if type == 'loc':
        X_test = X_loc_test
        X_valid = X_loc_valid
        y_test = y_loc_test
        y_valid = y_loc_valid
    else:
        X_test = X_vec_test
        X_valid = X_vec_valid
        y_test = y_vec_test
        y_valid = y_vec_valid
    test_prec, test_reca, test_spec = try_thresholds(model, X_test, y_test, rang)
    if enable_print:
        print("\nTEST DATASET")
    test_performance = find_highest_spec(test_prec, test_reca, test_spec, threshold, enable_print=enable_print)
    valid_prec, valid_reca, valid_spec = try_thresholds(model, X_valid, y_valid, rang)
    if enable_print:
        print("\nVALID DATASET")
    valid_performance = find_highest_spec(valid_prec, valid_reca, valid_spec, threshold, enable_print=enable_print)
    return test_performance, valid_performance


In [3]:
loc_rfc = RandomForestClassifier(n_estimators=60)
loc_rfc.fit(X_loc_train, y_loc_train)
loc_lrm = LogisticRegression(max_iter=10000)
loc_lrm.fit(X_loc_train, y_loc_train)
loc_xgb = xgb.XGBClassifier(objective='binary:logistic', eval_metric="logloss")
loc_xgb.fit(X_loc_train, y_loc_train)
loc_svm = SVC(kernel='linear', probability=True, random_state=42)
loc_svm.fit(X_loc_train, y_loc_train);

In [4]:
vec_rfc = RandomForestClassifier(n_estimators=60)
vec_rfc.fit(X_vec_train, y_vec_train)
vec_lrm = LogisticRegression(max_iter=10000)
vec_lrm.fit(X_vec_train, y_vec_train)
vec_xgb = xgb.XGBClassifier(objective='binary:logistic', eval_metric="logloss")
vec_xgb.fit(X_vec_train, y_vec_train)
vec_svm = SVC(kernel='linear', probability=True, random_state=42)
vec_svm.fit(X_vec_train, y_vec_train);

In [5]:
iter_rang = np.arange(0.1, 0.99, 0.01)

In [6]:
def load_to_df(dict):
    columns = pd.MultiIndex.from_product(
        [['Test', 'Validation'], ['Precision', 'Recall', 'Specificity']],
        names=['Dataset', 'Metric']
    )

    data = []
    for model, metrics in dict.items():
        row = list(metrics[0] + metrics[1])  # Flatten the tuple of tuples
        data.append(row)
    
    df = pd.DataFrame(data, index=dict.keys(), columns=columns)
    return df

In [7]:
performance_dict = {}

performance_dict['Random Forest'] = valid_and_test(loc_rfc, iter_rang)
performance_dict['Logistic Regression'] = valid_and_test(loc_lrm, iter_rang)
performance_dict['XGBoost'] = valid_and_test(loc_xgb, iter_rang)
performance_dict['SVM'] = valid_and_test(loc_svm, iter_rang)

loc_08 = load_to_df(performance_dict)

performance_dict = {}

performance_dict['Random Forest'] = valid_and_test(vec_rfc, iter_rang, type='vec')
performance_dict['Logistic Regression'] = valid_and_test(vec_lrm, iter_rang, type='vec')
performance_dict['XGBoost'] = valid_and_test(vec_xgb, iter_rang, type='vec')
performance_dict['SVM'] = valid_and_test(vec_svm, iter_rang, type='vec')

vec_08 = load_to_df(performance_dict)

In [8]:
loc_08

Dataset,Test,Test,Test,Validation,Validation,Validation
Metric,Precision,Recall,Specificity,Precision,Recall,Specificity
Random Forest,0.884615,0.832579,0.625,0.917949,0.813636,0.75
Logistic Regression,0.883495,0.823529,0.625,0.907692,0.804545,0.71875
XGBoost,0.902913,0.841629,0.6875,0.889447,0.804545,0.65625
SVM,0.8867,0.81448,0.640625,0.897959,0.8,0.6875


In [9]:
vec_08

Dataset,Test,Test,Test,Validation,Validation,Validation
Metric,Precision,Recall,Specificity,Precision,Recall,Specificity
Random Forest,0.868932,0.809955,0.578125,0.774194,0.981818,0.015625
Logistic Regression,0.832685,0.968326,0.328125,0.817829,0.959091,0.265625
XGBoost,0.89899,0.80543,0.6875,0.815385,0.963636,0.25
SVM,0.806084,0.959276,0.203125,0.8,0.963636,0.171875


In [10]:
th = 0.9
performance_dict = {}

performance_dict['Random Forest'] = valid_and_test(loc_rfc, iter_rang, threshold=th)
performance_dict['Logistic Regression'] = valid_and_test(loc_lrm, iter_rang, threshold=th)
performance_dict['XGBoost'] = valid_and_test(loc_xgb, iter_rang, threshold=th)
performance_dict['SVM'] = valid_and_test(loc_svm, iter_rang, threshold=th)

loc_09 = load_to_df(performance_dict)

performance_dict = {}

performance_dict['Random Forest'] = valid_and_test(vec_rfc, iter_rang, type='vec', threshold=th)
performance_dict['Logistic Regression'] = valid_and_test(vec_lrm, iter_rang, type='vec', threshold=th)
performance_dict['XGBoost'] = valid_and_test(vec_xgb, iter_rang, type='vec', threshold=th)
performance_dict['SVM'] = valid_and_test(vec_svm, iter_rang, type='vec', threshold=th)

vec_09 = load_to_df(performance_dict)

In [11]:
loc_09

Dataset,Test,Test,Test,Validation,Validation,Validation
Metric,Precision,Recall,Specificity,Precision,Recall,Specificity
Random Forest,0.803636,1.0,0.15625,0.80292,1.0,0.15625
Logistic Regression,0.838462,0.986425,0.34375,0.816794,0.972727,0.25
XGBoost,0.844358,0.9819,0.375,0.856,0.972727,0.4375
SVM,0.785714,0.995475,0.0625,0.814126,0.995455,0.21875


In [12]:
vec_09

Dataset,Test,Test,Test,Validation,Validation,Validation
Metric,Precision,Recall,Specificity,Precision,Recall,Specificity
Random Forest,0.790614,0.99095,0.09375,0.774194,0.981818,0.015625
Logistic Regression,0.811111,0.99095,0.203125,0.788321,0.981818,0.09375
XGBoost,0.801471,0.986425,0.15625,0.796296,0.977273,0.140625
SVM,0.79562,0.986425,0.125,0.783088,0.968182,0.078125


In [13]:
th = 0.75
performance_dict = {}

performance_dict['Random Forest'] = valid_and_test(loc_rfc, iter_rang, threshold=th)
performance_dict['Logistic Regression'] = valid_and_test(loc_lrm, iter_rang, threshold=th)
performance_dict['XGBoost'] = valid_and_test(loc_xgb, iter_rang, threshold=th)
performance_dict['SVM'] = valid_and_test(loc_svm, iter_rang, threshold=th)

loc_075 = load_to_df(performance_dict)

performance_dict = {}

performance_dict['Random Forest'] = valid_and_test(vec_rfc, iter_rang, type='vec', threshold=th)
performance_dict['Logistic Regression'] = valid_and_test(vec_lrm, iter_rang, type='vec', threshold=th)
performance_dict['XGBoost'] = valid_and_test(vec_xgb, iter_rang, type='vec', threshold=th)
performance_dict['SVM'] = valid_and_test(vec_svm, iter_rang, type='vec', threshold=th)

vec_075 = load_to_df(performance_dict)

In [14]:
loc_075

Dataset,Test,Test,Test,Validation,Validation,Validation
Metric,Precision,Recall,Specificity,Precision,Recall,Specificity
Random Forest,0.889474,0.764706,0.671875,0.923077,0.763636,0.78125
Logistic Regression,0.882653,0.782805,0.640625,0.918919,0.772727,0.765625
XGBoost,0.917582,0.755656,0.765625,0.893617,0.763636,0.6875
SVM,0.891753,0.782805,0.671875,0.902174,0.754545,0.71875


In [15]:
vec_075

Dataset,Test,Test,Test,Validation,Validation,Validation
Metric,Precision,Recall,Specificity,Precision,Recall,Specificity
Random Forest,0.933702,0.764706,0.8125,0.922222,0.754545,0.78125
Logistic Regression,0.922222,0.751131,0.78125,0.897849,0.759091,0.703125
XGBoost,0.943182,0.751131,0.84375,0.906593,0.75,0.734375
SVM,0.922652,0.755656,0.78125,0.887097,0.75,0.671875


In [16]:
th = 0.85
performance_dict = {}

performance_dict['Random Forest'] = valid_and_test(loc_rfc, iter_rang, threshold=th)
performance_dict['Logistic Regression'] = valid_and_test(loc_lrm, iter_rang, threshold=th)
performance_dict['XGBoost'] = valid_and_test(loc_xgb, iter_rang, threshold=th)
performance_dict['SVM'] = valid_and_test(loc_svm, iter_rang, threshold=th)

loc_085 = load_to_df(performance_dict)

performance_dict = {}

performance_dict['Random Forest'] = valid_and_test(vec_rfc, iter_rang, type='vec', threshold=th)
performance_dict['Logistic Regression'] = valid_and_test(vec_lrm, iter_rang, type='vec', threshold=th)
performance_dict['XGBoost'] = valid_and_test(vec_xgb, iter_rang, type='vec', threshold=th)
performance_dict['SVM'] = valid_and_test(vec_svm, iter_rang, type='vec', threshold=th)

vec_085 = load_to_df(performance_dict)

In [17]:
loc_085

Dataset,Test,Test,Test,Validation,Validation,Validation
Metric,Precision,Recall,Specificity,Precision,Recall,Specificity
Random Forest,0.882353,0.882353,0.59375,0.885845,0.881818,0.609375
Logistic Regression,0.883721,0.859729,0.609375,0.896714,0.868182,0.65625
XGBoost,0.899083,0.886878,0.65625,0.891509,0.859091,0.640625
SVM,0.880734,0.868778,0.59375,0.880531,0.904545,0.578125


In [18]:
vec_085

Dataset,Test,Test,Test,Validation,Validation,Validation
Metric,Precision,Recall,Specificity,Precision,Recall,Specificity
Random Forest,0.790614,0.99095,0.09375,0.774194,0.981818,0.015625
Logistic Regression,0.811111,0.99095,0.203125,0.788321,0.981818,0.09375
XGBoost,0.801471,0.986425,0.15625,0.796296,0.977273,0.140625
SVM,0.79562,0.986425,0.125,0.783088,0.968182,0.078125


In [31]:
val_preds1 = loc_lrm.predict_proba(X_loc_train)[:, 1]
val_preds2 = vec_xgb.predict_proba(X_vec_train)[:, 1]
val_preds3 = loc_xgb.predict_proba(X_loc_train)[:, 1]
val_preds4 = vec_lrm.predict_proba(X_vec_train)[:, 1]

stacked_features = np.column_stack((val_preds1, val_preds2, val_preds3, val_preds4))

meta_model = LogisticRegression()
meta_model.fit(stacked_features, y_loc_train);

test_preds1 = loc_lrm.predict_proba(X_loc_test)[:, 1]
test_preds2 = vec_xgb.predict_proba(X_vec_test)[:, 1]
test_preds3 = loc_xgb.predict_proba(X_loc_test)[:, 1]
test_preds4 = vec_lrm.predict_proba(X_vec_test)[:, 1]
test_x  = np.column_stack((test_preds1, test_preds2, test_preds3, test_preds4))

test_pred = meta_model.predict(test_x)

apply_metric(y_loc_test, test_pred)

valid_preds1 = loc_lrm.predict_proba(X_loc_valid)[:, 1]
valid_preds2 = vec_xgb.predict_proba(X_vec_valid)[:, 1]
valid_preds3 = loc_xgb.predict_proba(X_loc_valid)[:, 1]
valid_preds4 = vec_lrm.predict_proba(X_vec_valid)[:, 1]
valid_x  = np.column_stack((valid_preds1, valid_preds2, valid_preds3, valid_preds4))

valid_pred = meta_model.predict(valid_x)

apply_metric(y_loc_valid, valid_pred)

CM: 
[[ 34  30]
 [ 12 209]]
Precision: 0.87
Recall: 0.95
Specificity: 0.53
Accuracy: 0.85
CM: 
[[ 36  28]
 [ 12 208]]
Precision: 0.88
Recall: 0.95
Specificity: 0.56
Accuracy: 0.86


(0.8813559322033898,
 0.9454545454545454,
 0.5625,
 array([[ 36,  28],
        [ 12, 208]], dtype=int64))

In [32]:
val_preds1 = loc_lrm.predict_proba(X_loc_train)[:, 1]
val_preds2 = vec_lrm.predict_proba(X_vec_train)[:, 1]

stacked_features = np.column_stack((val_preds1, val_preds2))

meta_model = LogisticRegression()
meta_model.fit(stacked_features, y_loc_train);

test_preds1 = loc_lrm.predict_proba(X_loc_test)[:, 1]
test_preds2 = vec_lrm.predict_proba(X_vec_test)[:, 1]
test_x  = np.column_stack((test_preds1, test_preds2))

test_pred = meta_model.predict(test_x)

apply_metric(y_loc_test, test_pred)

valid_preds1 = loc_lrm.predict_proba(X_loc_valid)[:, 1]
valid_preds2 = vec_lrm.predict_proba(X_vec_valid)[:, 1]
valid_x  = np.column_stack((valid_preds1, valid_preds2))

valid_pred = meta_model.predict(valid_x)

apply_metric(y_loc_valid, valid_pred)

CM: 
[[ 46  18]
 [ 34 187]]
Precision: 0.91
Recall: 0.85
Specificity: 0.72
Accuracy: 0.82
CM: 
[[ 47  17]
 [ 45 175]]
Precision: 0.91
Recall: 0.80
Specificity: 0.73
Accuracy: 0.78


(0.9114583333333334,
 0.7954545454545454,
 0.734375,
 array([[ 47,  17],
        [ 45, 175]], dtype=int64))

In [33]:
val_preds1 = loc_xgb.predict_proba(X_loc_train)[:, 1]
val_preds2 = vec_xgb.predict_proba(X_vec_train)[:, 1]

stacked_features = np.column_stack((val_preds1, val_preds2))

meta_model = LogisticRegression()
meta_model.fit(stacked_features, y_loc_train);

test_preds1 = loc_xgb.predict_proba(X_loc_test)[:, 1]
test_preds2 = vec_xgb.predict_proba(X_vec_test)[:, 1]
test_x  = np.column_stack((test_preds1, test_preds2))

test_pred = meta_model.predict(test_x)

apply_metric(y_loc_test, test_pred)

valid_preds1 = loc_xgb.predict_proba(X_loc_valid)[:, 1]
valid_preds2 = vec_xgb.predict_proba(X_vec_valid)[:, 1]
valid_x  = np.column_stack((valid_preds1, valid_preds2))

valid_pred = meta_model.predict(valid_x)

apply_metric(y_loc_valid, valid_pred)

CM: 
[[ 35  29]
 [ 12 209]]
Precision: 0.88
Recall: 0.95
Specificity: 0.55
Accuracy: 0.86
CM: 
[[ 35  29]
 [ 12 208]]
Precision: 0.88
Recall: 0.95
Specificity: 0.55
Accuracy: 0.86


(0.8776371308016878,
 0.9454545454545454,
 0.546875,
 array([[ 35,  29],
        [ 12, 208]], dtype=int64))

In [36]:

val_preds1 = loc_lrm.predict_proba(X_loc_train)[:, 1]
val_preds2 = vec_xgb.predict_proba(X_vec_train)[:, 1]

stacked_features = np.column_stack((val_preds1, val_preds2))

meta_model = LogisticRegression()
meta_model.fit(stacked_features, y_loc_train)

test_preds1 = loc_lrm.predict_proba(X_loc_test)[:, 1]
test_preds2 = vec_xgb.predict_proba(X_vec_test)[:, 1]
test_x = np.column_stack((test_preds1, test_preds2))

test_pred = meta_model.predict(test_x)

apply_metric(y_loc_test, test_pred)

valid_preds1 = loc_lrm.predict_proba(X_loc_valid)[:, 1]
valid_preds2 = vec_xgb.predict_proba(X_vec_valid)[:, 1]
valid_x = np.column_stack((valid_preds1, valid_preds2))

valid_pred = meta_model.predict(valid_x)

apply_metric(y_loc_valid, valid_pred)

CM: 
[[ 37  27]
 [ 26 195]]
Precision: 0.88
Recall: 0.88
Specificity: 0.58
Accuracy: 0.81
CM: 
[[ 42  22]
 [ 35 185]]
Precision: 0.89
Recall: 0.84
Specificity: 0.66
Accuracy: 0.80


(0.893719806763285,
 0.8409090909090909,
 0.65625,
 array([[ 42,  22],
        [ 35, 185]], dtype=int64))

In [37]:

val_preds1 = loc_xgb.predict_proba(X_loc_train)[:, 1]
val_preds2 = vec_lrm.predict_proba(X_vec_train)[:, 1]

stacked_features = np.column_stack((val_preds1, val_preds2))

meta_model = LogisticRegression()
meta_model.fit(stacked_features, y_loc_train)

test_preds1 = loc_xgb.predict_proba(X_loc_test)[:, 1]
test_preds2 = vec_lrm.predict_proba(X_vec_test)[:, 1]
test_x = np.column_stack((test_preds1, test_preds2))

test_pred = meta_model.predict(test_x)

apply_metric(y_loc_test, test_pred)

valid_preds1 = loc_xgb.predict_proba(X_loc_valid)[:, 1]
valid_preds2 = vec_lrm.predict_proba(X_vec_valid)[:, 1]
valid_x = np.column_stack((valid_preds1, valid_preds2))

valid_pred = meta_model.predict(valid_x)

apply_metric(y_loc_valid, valid_pred)

CM: 
[[ 36  28]
 [ 14 207]]
Precision: 0.88
Recall: 0.94
Specificity: 0.56
Accuracy: 0.85
CM: 
[[ 36  28]
 [ 17 203]]
Precision: 0.88
Recall: 0.92
Specificity: 0.56
Accuracy: 0.84


(0.8787878787878788,
 0.9227272727272727,
 0.5625,
 array([[ 36,  28],
        [ 17, 203]], dtype=int64))