In [1]:
import sklearn
import numpy as np
import pandas as pd
from dev import DEVALGO 
from smote import SMOTEBoost
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from imblearn.metrics import geometric_mean_score

In [2]:
MINORITY = 1
MAJORITY = 0
CLASS_NAME = "class"

In [3]:
def pprint_dict(d):
    for key, value in d.items():
        print(key + ": " + "{0:.2f}".format(value/100.0))

In [4]:
df = pd.read_csv("data/segment/segment.csv").sample(frac=1, random_state=0).reset_index(drop=True)
df.head()

Unnamed: 0,region-centroid-col,region-centroid-row,region-pixel-count,short-line-density-5,short-line-density-2,vedge-mean,vegde-sd,hedge-mean,hedge-sd,intensity-mean,rawred-mean,rawblue-mean,rawgreen-mean,exred-mean,exblue-mean,exgreen-mean,value-mean,saturation-mean,hue-mean,class
0,28,131,9,0.0,0.0,0.222222,0.074074,0.777778,0.207407,5.18518,6.66667,6.0,2.88889,4.44444,2.44444,-6.88889,6.66667,0.566138,-0.874641,brickface
1,242,164,9,0.111111,0.0,0.388889,0.534027,3.22222,1.04704,19.8148,15.1111,18.7778,25.5556,-14.1111,-3.11111,17.2222,25.5556,0.409529,2.45981,grass
2,66,101,9,0.111111,0.0,0.722223,0.329629,2.22222,0.651852,19.037,19.6667,23.5556,13.8889,1.88889,13.5556,-15.4444,23.5556,0.410267,-1.45759,brickface
3,94,215,9,0.0,0.0,3.11111,5.0963,1.61111,1.12963,17.1852,14.3333,14.3333,22.8889,-8.55556,-8.55556,17.1111,22.8889,0.404392,2.1052,grass
4,186,12,9,0.0,0.0,0.444444,0.272165,2.33333,1.96638,6.25926,3.88889,11.3333,3.55556,-7.11111,15.2222,-8.11111,11.3333,0.687302,-2.05798,window


In [5]:
# Outcomes distribution
df[CLASS_NAME].value_counts()

sky          330
path         330
foliage      330
cement       330
brickface    330
grass        330
window       330
Name: class, dtype: int64

In [6]:
# Uncomment this when testing on vowel
df[CLASS_NAME].replace({"brickface":0, "cement":0, "foliage":0, "path":0, "sky":0, "window":1, "grass":0,}, inplace=True)
df.head()

Unnamed: 0,region-centroid-col,region-centroid-row,region-pixel-count,short-line-density-5,short-line-density-2,vedge-mean,vegde-sd,hedge-mean,hedge-sd,intensity-mean,rawred-mean,rawblue-mean,rawgreen-mean,exred-mean,exblue-mean,exgreen-mean,value-mean,saturation-mean,hue-mean,class
0,28,131,9,0.0,0.0,0.222222,0.074074,0.777778,0.207407,5.18518,6.66667,6.0,2.88889,4.44444,2.44444,-6.88889,6.66667,0.566138,-0.874641,0
1,242,164,9,0.111111,0.0,0.388889,0.534027,3.22222,1.04704,19.8148,15.1111,18.7778,25.5556,-14.1111,-3.11111,17.2222,25.5556,0.409529,2.45981,0
2,66,101,9,0.111111,0.0,0.722223,0.329629,2.22222,0.651852,19.037,19.6667,23.5556,13.8889,1.88889,13.5556,-15.4444,23.5556,0.410267,-1.45759,0
3,94,215,9,0.0,0.0,3.11111,5.0963,1.61111,1.12963,17.1852,14.3333,14.3333,22.8889,-8.55556,-8.55556,17.1111,22.8889,0.404392,2.1052,0
4,186,12,9,0.0,0.0,0.444444,0.272165,2.33333,1.96638,6.25926,3.88889,11.3333,3.55556,-7.11111,15.2222,-8.11111,11.3333,0.687302,-2.05798,1


In [7]:
# Outcomes distribution
df[CLASS_NAME].value_counts()

0    1980
1     330
Name: class, dtype: int64

In [8]:
# Calculate imbalanced ratio
frq = df[CLASS_NAME].value_counts()
ratio = max(frq)/min(frq)
"{0:.1f}".format(ratio)

'6.0'

# Use the following for testing Decision tree

- run 100 iterations
- 50/50 training/testing
- Decision tree

In [9]:
res = dict()
X, y= df.iloc[:,:-1].values, df[CLASS_NAME].values
for i in range(100):
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.5, random_state=0)
    clf_tree = DecisionTreeClassifier(random_state=0)
    clf_tree.fit(X_train, y_train)
    y_pred = clf_tree.predict(X_test)
    tmp_res = classification_report(y_test, y_pred, output_dict=True)
    res["precision"] = res.get("precision", 0) + tmp_res["1"]["precision"]
    res["recall"] = res.get("recall", 0) + tmp_res["1"]["recall"]
    res["f1-score"] = res.get("f1-score", 0) + tmp_res["1"]["f1-score"]
    res["specificity"] = res.get("specificity", 0) + tmp_res[str(MAJORITY)]["recall"]
    res["sensitivity"] = res.get("sensitivity", 0) + tmp_res[str(MINORITY)]["recall"]
    res["overall accuracy"] = res.get("overall accuracy", 0) + accuracy_score(y_test, y_pred,)
    res["auc"] = res.get("auc", 0) + roc_auc_score(y_test, y_pred)
    res["g_mean"] = res.get("g_mean", 0) + geometric_mean_score(y_test, y_pred)

In [10]:
pprint_dict(res)

precision: 0.88
recall: 0.88
f1-score: 0.88
specificity: 0.98
sensitivity: 0.88
overall accuracy: 0.96
auc: 0.93
g_mean: 0.93


# Use the following for testing SMOTE

- run 100 iterations
- 50/50 training/testing
- Decision tree
- N = 200

In [11]:
res = dict()
X, y= df.iloc[:,:-1].values, df[CLASS_NAME].values
for i in range(100):
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.5, random_state=0)
    if i == 0:
        print("Shape of X_train before oversampling: " + str(X_train.shape))
        print("Outcome distribution of X_train before oversampling: " + str(np.bincount(y_train)))
    # Oversample training data
    sm = SMOTE(random_state=0)
    sm.fit(X_train, y_train)
    X_train_r, y_train_r = sm.fit_resample(X_train, y_train)
    if i == 0:
        print("Shape of X_train after oversampling: " + str(X_train_r.shape))
        print("Outcome distribution of X_train after oversampling: " + str(np.bincount(y_train_r)))
    # Build classifier on resampled data
    clf_tree = DecisionTreeClassifier(random_state=0)
    clf_tree.fit(X_train_r, y_train_r)
    y_pred = clf_tree.predict(X_test)
    tmp_res = classification_report(y_test, y_pred, output_dict=True)
    res["precision"] = res.get("precision", 0) + tmp_res["1"]["precision"]
    res["recall"] = res.get("recall", 0) + tmp_res["1"]["recall"]
    res["f1-score"] = res.get("f1-score", 0) + tmp_res["1"]["f1-score"]
    res["specificity"] = res.get("specificity", 0) + tmp_res[str(MAJORITY)]["recall"]
    res["sensitivity"] = res.get("sensitivity", 0) + tmp_res[str(MINORITY)]["recall"]
    res["overall accuracy"] = res.get("overall accuracy", 0) + accuracy_score(y_test, y_pred,)
    res["auc"] = res.get("auc", 0) + roc_auc_score(y_test, y_pred)
    res["g_mean"] = res.get("g_mean", 0) + geometric_mean_score(y_test, y_pred)

Shape of X_train before oversampling: (1155, 19)
Outcome distribution of X_train before oversampling: [996 159]
Shape of X_train after oversampling: (1992, 19)
Outcome distribution of X_train after oversampling: [996 996]


In [12]:
pprint_dict(res)

precision: 0.87
recall: 0.85
f1-score: 0.86
specificity: 0.98
sensitivity: 0.85
overall accuracy: 0.96
auc: 0.91
g_mean: 0.91


# Use the following for testing ADASYN

- run 100 iterations
- 50/50 training/testing
- Decision tree
- A fully balanced dataset after synthesizing
- Dth = 0.75 (Dth is a preset threshold for the maximum tolerated degree of class imbalance ratio)

In [13]:
res = dict()
X, y= df.iloc[:,:-1].values, df[CLASS_NAME].values
for i in range(100):
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.5, random_state=0)
    if i == 0:
        print("Shape of X_train before oversampling: " + str(X_train.shape))
        print("Outcome distribution of X_train before oversampling: " + str(np.bincount(y_train)))
    # Oversample training data
    ada = ADASYN(random_state=0)
    ada.fit(X_train, y_train)
    X_train_r, y_train_r = ada.fit_resample(X_train, y_train)
    if i == 0:
        print("Shape of X_train after oversampling: " + str(X_train_r.shape))
        print("Outcome distribution of X_train after oversampling: " + str(np.bincount(y_train_r)))
    # Build classifier on resampled data
    clf_tree = DecisionTreeClassifier(random_state=0)
    clf_tree.fit(X_train_r, y_train_r)
    y_pred = clf_tree.predict(X_test)
    tmp_res = classification_report(y_test, y_pred, output_dict=True)
    res["precision"] = res.get("precision", 0) + tmp_res["1"]["precision"]
    res["recall"] = res.get("recall", 0) + tmp_res["1"]["recall"]
    res["f1-score"] = res.get("f1-score", 0) + tmp_res["1"]["f1-score"]
    res["specificity"] = res.get("specificity", 0) + tmp_res[str(MAJORITY)]["recall"]
    res["sensitivity"] = res.get("sensitivity", 0) + tmp_res[str(MINORITY)]["recall"]
    res["overall accuracy"] = res.get("overall accuracy", 0) + accuracy_score(y_test, y_pred,)
    res["auc"] = res.get("auc", 0) + roc_auc_score(y_test, y_pred)
    res["g_mean"] = res.get("g_mean", 0) + geometric_mean_score(y_test, y_pred)

Shape of X_train before oversampling: (1155, 19)
Outcome distribution of X_train before oversampling: [996 159]
Shape of X_train after oversampling: (1999, 19)
Outcome distribution of X_train after oversampling: [ 996 1003]


In [14]:
pprint_dict(res)

precision: 0.88
recall: 0.87
f1-score: 0.87
specificity: 0.98
sensitivity: 0.87
overall accuracy: 0.96
auc: 0.92
g_mean: 0.92


# Use the following for testing SMOTEBoost

- run 100 iterations
- 50/50 training/testing
- Decision tree

In [15]:
res = dict()
X, y= df.iloc[:,:-1].values, df[CLASS_NAME].values
for i in range(100):
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.5, random_state=0)
    clf1 = SMOTEBoost(random_state=0)
    clf1.fit(X_train, y_train)
    y_pred = clf1.predict(X_test)
    tmp_res = classification_report(y_test, y_pred, output_dict=True)
    res["precision"] = res.get("precision", 0) + tmp_res["1"]["precision"]
    res["recall"] = res.get("recall", 0) + tmp_res["1"]["recall"]
    res["f1-score"] = res.get("f1-score", 0) + tmp_res["1"]["f1-score"]
    res["specificity"] = res.get("specificity", 0) + tmp_res[str(MAJORITY)]["recall"]
    res["sensitivity"] = res.get("sensitivity", 0) + tmp_res[str(MINORITY)]["recall"]
    res["overall accuracy"] = res.get("overall accuracy", 0) + accuracy_score(y_test, y_pred,)
    res["auc"] = res.get("auc", 0) + roc_auc_score(y_test, y_pred)
    res["g_mean"] = res.get("g_mean", 0) + geometric_mean_score(y_test, y_pred)

In [16]:
pprint_dict(res)

precision: 0.80
recall: 0.87
f1-score: 0.83
specificity: 0.96
sensitivity: 0.87
overall accuracy: 0.95
auc: 0.92
g_mean: 0.92


# Use the following for testing Dev_algo

- run 100 iterations
- 50/50 training/testing
- Decision tree

In [17]:
res = dict()
X, y= df.iloc[:,:-1].values, df[CLASS_NAME].values
for i in range(100):
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.5, random_state=0)
    unique, counts = np.unique(y_train, return_counts=True)
    frequency = dict(zip(unique, counts))
    clf1 = DEVALGO(random_state=0, n_samples=frequency[MAJORITY])
    clf1.fit(X_train, y_train)
    y_pred = clf1.predict(X_test)
    tmp_res = classification_report(y_test, y_pred, output_dict=True)
    res["precision"] = res.get("precision", 0) + tmp_res["1"]["precision"]
    res["recall"] = res.get("recall", 0) + tmp_res["1"]["recall"]
    res["f1-score"] = res.get("f1-score", 0) + tmp_res["1"]["f1-score"]
    res["specificity"] = res.get("specificity", 0) + tmp_res[str(MAJORITY)]["recall"]
    res["sensitivity"] = res.get("sensitivity", 0) + tmp_res[str(MINORITY)]["recall"]
    res["overall accuracy"] = res.get("overall accuracy", 0) + accuracy_score(y_test, y_pred,)
    res["auc"] = res.get("auc", 0) + roc_auc_score(y_test, y_pred)
    res["g_mean"] = res.get("g_mean", 0) + geometric_mean_score(y_test, y_pred)

In [18]:
 pprint_dict(res)

precision: 0.34
recall: 0.97
f1-score: 0.51
specificity: 0.68
sensitivity: 0.97
overall accuracy: 0.72
auc: 0.82
g_mean: 0.81
