In [1]:
import sklearn
import numpy as np
import pandas as pd
from dev import DEVALGO 
from smote import SMOTEBoost
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from imblearn.metrics import geometric_mean_score

In [2]:
MINORITY = 1
MAJORITY = 0
CLASS_NAME = "class"

In [3]:
def pprint_dict(d):
    for key, value in d.items():
        print(key + ": " + "{0:.2f}".format(value/100.0))

In [4]:
df = pd.read_csv("data/Wisconsin/wisconsin.csv").sample(frac=1, random_state=0).reset_index(drop=True).iloc[:, 1:]
df.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f22,f23,f24,f25,f26,f27,f28,f29,f30,class
0,13.4,20.52,88.64,556.7,0.1106,0.1469,0.1445,0.08172,0.2116,0.07325,...,29.66,113.3,844.4,0.1574,0.3856,0.5106,0.2051,0.3585,0.1109,M
1,13.21,25.25,84.1,537.9,0.08791,0.05205,0.02772,0.02068,0.1619,0.05584,...,34.23,91.29,632.9,0.1289,0.1063,0.139,0.06005,0.2444,0.06788,B
2,14.02,15.66,89.59,606.5,0.07966,0.05581,0.02087,0.02652,0.1589,0.05586,...,19.31,96.53,688.9,0.1034,0.1017,0.0626,0.08216,0.2136,0.0671,B
3,14.26,18.17,91.22,633.1,0.06576,0.0522,0.02475,0.01374,0.1635,0.05586,...,25.26,105.8,819.7,0.09445,0.2167,0.1565,0.0753,0.2636,0.07676,B
4,13.03,18.42,82.61,523.8,0.08983,0.03766,0.02562,0.02923,0.1467,0.05863,...,22.81,84.46,545.9,0.09701,0.04619,0.04833,0.05013,0.1987,0.06169,B


In [5]:
# Uncomment this when testing on vowel
df[CLASS_NAME].replace({"M":1, "B":0}, inplace=True)
df.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f22,f23,f24,f25,f26,f27,f28,f29,f30,class
0,13.4,20.52,88.64,556.7,0.1106,0.1469,0.1445,0.08172,0.2116,0.07325,...,29.66,113.3,844.4,0.1574,0.3856,0.5106,0.2051,0.3585,0.1109,1
1,13.21,25.25,84.1,537.9,0.08791,0.05205,0.02772,0.02068,0.1619,0.05584,...,34.23,91.29,632.9,0.1289,0.1063,0.139,0.06005,0.2444,0.06788,0
2,14.02,15.66,89.59,606.5,0.07966,0.05581,0.02087,0.02652,0.1589,0.05586,...,19.31,96.53,688.9,0.1034,0.1017,0.0626,0.08216,0.2136,0.0671,0
3,14.26,18.17,91.22,633.1,0.06576,0.0522,0.02475,0.01374,0.1635,0.05586,...,25.26,105.8,819.7,0.09445,0.2167,0.1565,0.0753,0.2636,0.07676,0
4,13.03,18.42,82.61,523.8,0.08983,0.03766,0.02562,0.02923,0.1467,0.05863,...,22.81,84.46,545.9,0.09701,0.04619,0.04833,0.05013,0.1987,0.06169,0


In [6]:
# Outcomes distribution
df[CLASS_NAME].value_counts()

0    357
1    212
Name: class, dtype: int64

In [7]:
# Calculate imbalanced ratio
frq = df[CLASS_NAME].value_counts()
ratio = max(frq)/min(frq)
"{0:.1f}".format(ratio)

'1.7'

# Use the following for testing Decision tree

- run 100 iterations
- 50/50 training/testing
- Decision tree

In [8]:
res = dict()
X, y= df.iloc[:,:-1].values, df[CLASS_NAME].values
for i in range(100):
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.5, random_state=0)
    clf_tree = DecisionTreeClassifier(random_state=0)
    clf_tree.fit(X_train, y_train)
    y_pred = clf_tree.predict(X_test)
    tmp_res = classification_report(y_test, y_pred, output_dict=True)
    res["precision"] = res.get("precision", 0) + tmp_res["weighted avg"]["precision"]
    res["recall"] = res.get("recall", 0) + tmp_res["weighted avg"]["recall"]
    res["f1-score"] = res.get("f1-score", 0) + tmp_res["weighted avg"]["f1-score"]
    res["specificity"] = res.get("specificity", 0) + tmp_res[str(MAJORITY)]["recall"]
    res["sensitivity"] = res.get("sensitivity", 0) + tmp_res[str(MINORITY)]["recall"]
    res["overall accuracy"] = res.get("overall accuracy", 0) + accuracy_score(y_test, y_pred,)
    res["auc"] = res.get("auc", 0) + roc_auc_score(y_test, y_pred)
    res["g_mean"] = res.get("g_mean", 0) + geometric_mean_score(y_test, y_pred)

In [9]:
pprint_dict(res)

precision: 0.95
recall: 0.95
f1-score: 0.95
specificity: 0.96
sensitivity: 0.93
overall accuracy: 0.95
auc: 0.95
g_mean: 0.95


# Use the following for testing SMOTE

- run 100 iterations
- 50/50 training/testing
- Decision tree
- N = 200

In [10]:
res = dict()
X, y= df.iloc[:,:-1].values, df[CLASS_NAME].values
for i in range(100):
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.5, random_state=0)
    if i == 0:
        print("Shape of X_train before oversampling: " + str(X_train.shape))
        print("Outcome distribution of X_train before oversampling: " + str(np.bincount(y_train)))
    # Oversample training data
    sm = SMOTE(random_state=0)
    sm.fit(X_train, y_train)
    X_train_r, y_train_r = sm.sample(X_train, y_train)
    if i == 0:
        print("Shape of X_train after oversampling: " + str(X_train_r.shape))
        print("Outcome distribution of X_train after oversampling: " + str(np.bincount(y_train_r)))
    # Build classifier on resampled data
    clf_tree = DecisionTreeClassifier(random_state=0)
    clf_tree.fit(X_train_r, y_train_r)
    y_pred = clf_tree.predict(X_test)
    tmp_res = classification_report(y_test, y_pred, output_dict=True)
    res["precision"] = res.get("precision", 0) + tmp_res["weighted avg"]["precision"]
    res["recall"] = res.get("recall", 0) + tmp_res["weighted avg"]["recall"]
    res["f1-score"] = res.get("f1-score", 0) + tmp_res["weighted avg"]["f1-score"]
    res["specificity"] = res.get("specificity", 0) + tmp_res[str(MAJORITY)]["recall"]
    res["sensitivity"] = res.get("sensitivity", 0) + tmp_res[str(MINORITY)]["recall"]
    res["overall accuracy"] = res.get("overall accuracy", 0) + accuracy_score(y_test, y_pred,)
    res["auc"] = res.get("auc", 0) + roc_auc_score(y_test, y_pred)
    res["g_mean"] = res.get("g_mean", 0) + geometric_mean_score(y_test, y_pred)

Shape of X_train before oversampling: (284, 30)
Outcome distribution of X_train before oversampling: [178 106]
Shape of X_train after oversampling: (356, 30)
Outcome distribution of X_train after oversampling: [178 178]


In [11]:
pprint_dict(res)

precision: 0.92
recall: 0.92
f1-score: 0.92
specificity: 0.93
sensitivity: 0.90
overall accuracy: 0.92
auc: 0.91
g_mean: 0.91


# Use the following for testing ADASYN

- run 100 iterations
- 50/50 training/testing
- Decision tree
- A fully balanced dataset after synthesizing
- Dth = 0.75 (Dth is a preset threshold for the maximum tolerated degree of class imbalance ratio)

In [12]:
res = dict()
X, y= df.iloc[:,:-1].values, df[CLASS_NAME].values
for i in range(100):
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.5, random_state=0)
    if i == 0:
        print("Shape of X_train before oversampling: " + str(X_train.shape))
        print("Outcome distribution of X_train before oversampling: " + str(np.bincount(y_train)))
    # Oversample training data
    ada = ADASYN(random_state=0)
    ada.fit(X_train, y_train)
    X_train_r, y_train_r = ada.sample(X_train, y_train)
    if i == 0:
        print("Shape of X_train after oversampling: " + str(X_train_r.shape))
        print("Outcome distribution of X_train after oversampling: " + str(np.bincount(y_train_r)))
    # Build classifier on resampled data
    clf_tree = DecisionTreeClassifier(random_state=0)
    clf_tree.fit(X_train_r, y_train_r)
    y_pred = clf_tree.predict(X_test)
    tmp_res = classification_report(y_test, y_pred, output_dict=True)
    res["precision"] = res.get("precision", 0) + tmp_res["weighted avg"]["precision"]
    res["recall"] = res.get("recall", 0) + tmp_res["weighted avg"]["recall"]
    res["f1-score"] = res.get("f1-score", 0) + tmp_res["weighted avg"]["f1-score"]
    res["specificity"] = res.get("specificity", 0) + tmp_res[str(MAJORITY)]["recall"]
    res["sensitivity"] = res.get("sensitivity", 0) + tmp_res[str(MINORITY)]["recall"]
    res["overall accuracy"] = res.get("overall accuracy", 0) + accuracy_score(y_test, y_pred,)
    res["auc"] = res.get("auc", 0) + roc_auc_score(y_test, y_pred)
    res["g_mean"] = res.get("g_mean", 0) + geometric_mean_score(y_test, y_pred)

Shape of X_train before oversampling: (284, 30)
Outcome distribution of X_train before oversampling: [178 106]
Shape of X_train after oversampling: (355, 30)
Outcome distribution of X_train after oversampling: [178 177]


In [13]:
pprint_dict(res)

precision: 0.95
recall: 0.95
f1-score: 0.95
specificity: 0.96
sensitivity: 0.94
overall accuracy: 0.95
auc: 0.95
g_mean: 0.95


# Use the following for testing SMOTEBoost

- run 100 iterations
- 50/50 training/testing
- Decision tree

In [14]:
res = dict()
X, y= df.iloc[:,:-1].values, df[CLASS_NAME].values
for i in range(100):
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.5, random_state=0)
    clf1 = SMOTEBoost(random_state=0)
    clf1.fit(X_train, y_train)
    y_pred = clf1.predict(X_test)
    tmp_res = classification_report(y_test, y_pred, output_dict=True)
    res["precision"] = res.get("precision", 0) + tmp_res["weighted avg"]["precision"]
    res["recall"] = res.get("recall", 0) + tmp_res["weighted avg"]["recall"]
    res["f1-score"] = res.get("f1-score", 0) + tmp_res["weighted avg"]["f1-score"]
    res["specificity"] = res.get("specificity", 0) + tmp_res[str(MAJORITY)]["recall"]
    res["sensitivity"] = res.get("sensitivity", 0) + tmp_res[str(MINORITY)]["recall"]
    res["overall accuracy"] = res.get("overall accuracy", 0) + accuracy_score(y_test, y_pred,)
    res["auc"] = res.get("auc", 0) + roc_auc_score(y_test, y_pred)
    res["g_mean"] = res.get("g_mean", 0) + geometric_mean_score(y_test, y_pred)

In [15]:
pprint_dict(res)

precision: 0.98
recall: 0.98
f1-score: 0.98
specificity: 0.99
sensitivity: 0.95
overall accuracy: 0.98
auc: 0.97
g_mean: 0.97


# Use the following for testing Dev_algo

- run 100 iterations
- 50/50 training/testing
- Decision tree

In [16]:
res = dict()
X, y= df.iloc[:,:-1].values, df[CLASS_NAME].values
for i in range(100):
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.5, random_state=0)
    clf1 = DEVALGO(random_state=0, n_samples=144)
    clf1.fit(X_train, y_train)
    y_pred = clf1.predict(X_test)
    tmp_res = classification_report(y_test, y_pred, output_dict=True)
    res["precision"] = res.get("precision", 0) + tmp_res["weighted avg"]["precision"]
    res["recall"] = res.get("recall", 0) + tmp_res["weighted avg"]["recall"]
    res["f1-score"] = res.get("f1-score", 0) + tmp_res["weighted avg"]["f1-score"]
    res["specificity"] = res.get("specificity", 0) + tmp_res[str(MAJORITY)]["recall"]
    res["sensitivity"] = res.get("sensitivity", 0) + tmp_res[str(MINORITY)]["recall"]
    res["overall accuracy"] = res.get("overall accuracy", 0) + accuracy_score(y_test, y_pred,)
    res["auc"] = res.get("auc", 0) + roc_auc_score(y_test, y_pred)
    res["g_mean"] = res.get("g_mean", 0) + geometric_mean_score(y_test, y_pred)

In [17]:
 pprint_dict(res)

precision: 0.96
recall: 0.96
f1-score: 0.96
specificity: 0.97
sensitivity: 0.94
overall accuracy: 0.96
auc: 0.95
g_mean: 0.95
