In [23]:
# evaluates a decision tree model on the imbalanced dataset
import math
from collections import Counter

import pandas as pd
from numpy import mean
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Read the dataset CSV (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer='final_dataset_new.csv')

In [3]:
# Target vector is the 'Label' column; the feature matrix is every column
# except 'Domain', 'Label' and 'Subdomain levels'.
y = data['Label']
X = data.drop(columns=['Domain', 'Label', 'Subdomain levels'])

In [4]:
# define model
# Seed the tree so repeated runs build the same splits; without it the scores
# can differ between runs even though the CV splitter itself is seeded with 42.
model = DecisionTreeClassifier(random_state=42)

In [5]:
# Number of samples per class label in the full dataset.
Counter(y)

Counter({0: 37610, 1: 3904})

'''When `sampling_strategy` is a callable, it is a function that takes y and returns a dict.
The keys correspond to the targeted classes.
The values correspond to the desired number of samples for each class.'''

In [6]:
def call(y, majority_ratio=4):
    """Return the desired number of samples per class for labels ``y``.

    Parameters
    ----------
    y : iterable
        Class labels; label 1 is counted, label 0 is sized relative to it.
    majority_ratio : int or float, default 4
        How many class-0 samples to request per class-1 sample.

    Returns
    -------
    dict
        ``{0: int(majority_ratio * n1), 1: n1}`` where ``n1`` is the number
        of occurrences of label 1 in ``y`` (0 if the label is absent).
    """
    # Count once; Counter returns 0 for a label that never occurs.
    minority_count = Counter(y)[1]
    # int() truncates when a fractional ratio is supplied.
    return {0: int(majority_ratio * minority_count), 1: minority_count}

In [7]:
%%time
from imblearn.under_sampling import NearMiss

# sampling_strategy=0.25 requests a minority:majority ratio of 1:4 after resampling.
# fit_resample is the supported name; the old fit_sample alias no longer exists
# in current imbalanced-learn releases.
X_nm, y_nm = NearMiss(sampling_strategy=0.25, n_neighbors=3).fit_resample(X, y)

CPU times: user 338 ms, sys: 53.3 ms, total: 391 ms
Wall time: 641 ms


In [8]:
%%time
from imblearn.under_sampling import CondensedNearestNeighbour

# fit_resample is the supported name; the old fit_sample alias no longer exists
# in current imbalanced-learn releases.
# NOTE(review): no random_state is passed, so the condensed subset may differ
# between runs - confirm whether a fixed seed is wanted here.
X_cnn, y_cnn = CondensedNearestNeighbour(n_neighbors=3).fit_resample(X, y)

CPU times: user 36min 25s, sys: 16.1 s, total: 36min 41s
Wall time: 37min 24s


In [9]:
%%time
from imblearn.under_sampling import EditedNearestNeighbours

# fit_resample is the supported name; the old fit_sample alias no longer exists
# in current imbalanced-learn releases.
X_enn, y_enn = EditedNearestNeighbours(n_neighbors=3).fit_resample(X, y)

CPU times: user 941 ms, sys: 20.4 ms, total: 961 ms
Wall time: 969 ms


In [10]:
%%time
from imblearn.under_sampling import TomekLinks

# fit_resample is the supported name; the old fit_sample alias no longer exists
# in current imbalanced-learn releases.
X_tl, y_tl = TomekLinks().fit_resample(X, y)

CPU times: user 887 ms, sys: 9.69 ms, total: 897 ms
Wall time: 902 ms


In [11]:
# Class counts after each under-sampling method, one line per method.
print(
    *(
        f"{tag} {Counter(resampled_labels)}"
        for tag, resampled_labels in (
            ("NM", y_nm),
            ("CNN", y_cnn),
            ("ENN", y_enn),
            ("TL", y_tl),
        )
    ),
    sep="\n",
)

NM Counter({0: 15616, 1: 3904})
CNN Counter({1: 3904, 0: 2065})
ENN Counter({0: 36255, 1: 3904})
TL Counter({0: 37348, 1: 3904})


In [28]:
def g_mean(y_true, y_pred):
    """Geometric mean of specificity and sensitivity.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.

    Returns
    -------
    float
        ``sqrt(specificity * sensitivity)``, where index 0 of the confusion
        matrix is treated as the negative class and index 1 as the positive
        class. Returns 0.0 when either class has no true samples, because the
        corresponding rate is undefined.
    """
    # Build the confusion matrix once; rows are true labels, columns predictions.
    cm = confusion_matrix(y_true, y_pred)
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]
    neg = tn + fp
    pos = tp + fn
    # Guard against a class with no true samples (would divide by zero).
    if neg == 0 or pos == 0:
        return 0.0
    specificity = tn / neg
    sensitivity = tp / pos
    return math.sqrt(specificity * sensitivity)

In [56]:
# Repeated stratified 10-fold splitter with a fixed seed (n_repeats left at its default).
cv = RepeatedStratifiedKFold(n_splits=10, random_state=42)
# evaluate model: ROC AUC per fold for each resampled dataset, in the order NM, CNN, ENN, TL
# NOTE(review): each dataset was resampled before cross-validation, so the
# held-out folds are drawn from resampled data as well.
scores_nm, scores_cnn, scores_enn, scores_tl = (
    cross_val_score(model, features, labels, scoring='roc_auc', cv=cv, n_jobs=-1)
    for features, labels in (
        (X_nm, y_nm),
        (X_cnn, y_cnn),
        (X_enn, y_enn),
        (X_tl, y_tl),
    )
)

In [57]:
# summarize performance: mean ROC AUC over all folds, one line per method
print(
    *(
        template % mean(fold_scores)
        for template, fold_scores in (
            ('Mean ROC AUC NM: %.3f', scores_nm),
            ('Mean ROC AUC CNN: %.3f', scores_cnn),
            ('Mean ROC AUC ENN: %.3f', scores_enn),
            ('Mean ROC AUC TL: %.3f', scores_tl),
        )
    ),
    sep='\n',
)

Mean ROC AUC NM: 0.934
Mean ROC AUC CNN: 0.837
Mean ROC AUC ENN: 0.957
Mean ROC AUC TL: 0.937


In [44]:
# Share of class-0 samples among the NearMiss-resampled labels.
Counter(y_nm)[0]/len(y_nm)

0.8

In [49]:
# Share of class-0 samples among the CondensedNearestNeighbour-resampled labels.
Counter(y_cnn)[0]/len(y_cnn)

0.34595409616351147

In [50]:
# Share of class-0 samples among the EditedNearestNeighbours-resampled labels.
Counter(y_enn)[0]/len(y_enn)

0.9027864239647402

In [51]:
# Share of class-0 samples among the TomekLinks-resampled labels.
Counter(y_tl)[0]/len(y_tl)

0.9053621642587026