In [20]:
import warnings

warnings.filterwarnings("ignore")
import numpy as np

np.random.seed(42)
import functools
from tabulate import tabulate
import joblib
import matplotlib.pyplot as plt
import pandas as pd
from imblearn.over_sampling import SMOTENC
from sklearn.metrics import (ConfusionMatrixDisplay, accuracy_score,
                             classification_report, confusion_matrix)
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from TMGWO import TMGWO


def load_dataset(dataset):
    data = pd.read_csv(dataset)
    return data


def preprocess(data):
    objectList = data.select_dtypes(include="object").columns
    intList = data.select_dtypes(include="int64").columns
    le = LabelEncoder()
    scaler = MinMaxScaler()
    for feature in objectList:
        data[feature] = le.fit_transform(data[feature])
    for feature in intList:
        data[[feature]] = scaler.fit_transform(data[[feature]])
    # joblib.dump(scaler, "models/scaler.pkl")
    return data


def save_preprocessed(preprocessed):
    preprocessed.to_csv("datasets/diabetes_preprocessed.csv", index=False)


def split_preprocessed(preprocessed):
    X = preprocessed[preprocessed.columns[:-1]].values
    y = preprocessed[preprocessed.columns[-1]].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, test_size=0.2, shuffle=True, stratify=y
    )
    return X, y, X_train, X_test, y_train, y_test


def split_for_crossval(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, test_size=0.2, shuffle=True, stratify=y
    )
    return X_train,y_train


def fitness(x, X_train, y_train, X_test, y_test):
    if x.ndim == 1:
        x = x.reshape(1, -1)
    loss = np.zeros(x.shape[0])
    model = KNeighborsClassifier(metric="jaccard", n_neighbors=2)
    for i in range(x.shape[0]):
        if np.sum(x[i, :]) > 0:
            model.fit(X_train[:, x[i, :].astype(bool)], y_train)
            score = accuracy_score(
                model.predict(X_test[:, x[i, :].astype(bool)]), y_test
            )
            loss[i] = 0.99 * (1 - score) + 0.01 * (np.sum(x[i, :]) / X_train.shape[1])
        else:
            loss[i] = np.inf
    return loss


def data_antar_kelas(y_train):
    positive = list(y_train).count(1)
    negative = list(y_train).count(0)
    fig = plt.figure()
    plt.title("Data antar kelas pada training data")
    plt.bar("Positive", positive)
    plt.annotate(positive, (0, positive / 2), ha="center")
    plt.bar("Negative", negative)
    plt.annotate(negative, (1, negative / 2), ha="center")
    plt.xlabel("Kelas")
    plt.ylabel("Jumlah data")
    plt.show()
    return fig


def feature_selection(X_train, X_test, y_train, y_test):
    lossfunc = functools.partial(
        fitness, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
    )
    optimizer = TMGWO(fitness=lossfunc, D=X_train.shape[1], P=8, G=70)
    optimizer.optimize()
    selected_features = optimizer.gBest_X > 0
    return selected_features


def plot_selected_features(X, selected_features):
    fig = plt.figure()
    plt.title("Jumlah Fitur Sebelum dan Sesudah Seleksi Fitur")
    plt.bar("before", X.shape[1])
    plt.annotate(X.shape[1], (0, X.shape[1] / 2), ha="center")
    plt.bar("after", X[:, selected_features].shape[1])
    plt.annotate(
        X[:, selected_features].shape[1],
        (1, X[:, selected_features].shape[1] / 2),
        ha="center",
    )
    plt.ylabel("Num of features")
    return fig


def do_smote(X_train, y_train):
    smote = SMOTENC(
        random_state=42,
        categorical_features=[
            False,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
            True,
        ],
    )
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    return X_train_smote, y_train_smote


def plot_smote(y_train_smote):
    positive = list(y_train_smote).count(1)
    negative = list(y_train_smote).count(0)
    fig = plt.figure()
    plt.title("Data antar kelas pada training data setelah SMOTE")
    plt.bar("Positive", height=positive)
    plt.annotate(positive, (0, positive / 2), ha="center", fontsize=20)
    plt.bar("Negative", height=negative)
    plt.annotate(negative, (1, negative / 2), ha="center", fontsize=20)
    plt.xlabel("Kelas")
    plt.ylabel("Jumlah data")
    return fig


hasil = {}
hasil["cross_val"] = {}
hasil["cross_val"]["num_sf"] = []
hasil["cross_val"]["akurasi"] = {}
hasil["cross_val"]["cm"] = {}
hasil["cross_val"]["cr"] = {}
hasil["akurasi"] = {}
hasil["cm"] = {}
hasil["cr"] = {}


def cross_val_knn(X, y):
    X_train, y_train = split_for_crossval(X, y)
    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
    for train_index, test_index in sss.split(X_train, y_train):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = KNeighborsClassifier(metric="jaccard", n_neighbors=5)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        hasil["cross_val"]["akurasi"]["KNN"].append(
            float("{:.2f}".format(accuracy_score(y_test, y_pred) * 100))
        )
        hasil["cross_val"]["cm"]["KNN"].append(confusion_matrix(y_test, y_pred))
        hasil["cross_val"]["cr"]["KNN"].append(
            classification_report(
                y_test, y_pred, output_dict=True, target_names=["Negative", "Positive"]
            )
        )


def cross_val_knn_smote(X, y):
    X_train, y_train = split_for_crossval(X, y)
    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
    for train_index, test_index in sss.split(X_train, y_train):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        smote = SMOTENC(
            random_state=42,
            categorical_features=[
                False,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
            ],
        )
        X_train, y_train = smote.fit_resample(X_train, y_train)
        model = KNeighborsClassifier(metric="jaccard", n_neighbors=5)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        hasil["cross_val"]["akurasi"]["KNN+SMOTE"].append(
            float("{:.2f}".format(accuracy_score(y_test, y_pred) * 100))
        )
        hasil["cross_val"]["cm"]["KNN+SMOTE"].append(confusion_matrix(y_test, y_pred))
        hasil["cross_val"]["cr"]["KNN+SMOTE"].append(
            classification_report(
                y_test, y_pred, output_dict=True, target_names=["Negative", "Positive"]
            )
        )


def cross_val_knn_smote_tmgwo(X, y):
    X_train, y_train = split_for_crossval(X, y)
    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
    for train_index, test_index in sss.split(X_train, y_train):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        smote = SMOTENC(
            random_state=42,
            categorical_features=[
                False,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
            ],
        )
        selected_features = feature_selection(X_train, X_test, y_train, y_test)
        X_train, y_train = smote.fit_resample(X_train, y_train)
        model = KNeighborsClassifier(metric="jaccard", n_neighbors=5)
        model.fit(X_train[:, selected_features], y_train)
        y_pred = model.predict(X_test[:, selected_features])
        hasil["cross_val"]["akurasi"]["KNN+SMOTE+TMGWO"].append(
            float("{:.2f}".format(accuracy_score(y_test, y_pred) * 100))
        )
        hasil["cross_val"]["cm"]["KNN+SMOTE+TMGWO"].append(
            confusion_matrix(y_test, y_pred)
        )
        hasil["cross_val"]["cr"]["KNN+SMOTE+TMGWO"].append(
            classification_report(
                y_test, y_pred, output_dict=True, target_names=["Negative", "Positive"]
            )
        )


def train_model(
    X, y, X_train, X_test, y_train, y_test, algoritma, selected_features=None
):
    hasil["cross_val"]["akurasi"][algoritma] = []
    hasil["cross_val"]["cm"][algoritma] = []
    hasil["cross_val"]["cr"][algoritma] = []
    if algoritma == "KNN":
        cross_val_knn(X, y)
    if algoritma == "KNN+SMOTE":
        cross_val_knn_smote(X, y)
    if algoritma == "KNN+SMOTE+TMGWO":
        cross_val_knn_smote_tmgwo(X, y)
    model = KNeighborsClassifier(metric="jaccard", n_neighbors=5)
    model.fit(X_train, y_train)
    # if algoritma == "KNN+SMOTE+TMGWO":
    #     y_pred = model.predict(X_test[:, selected_features])
    # else:
    y_pred = model.predict(X_test)
    hasil["akurasi"][algoritma] = float(
        "{:.2f}".format(accuracy_score(y_test, y_pred) * 100)
    )
    hasil["cm"][algoritma] = confusion_matrix(y_test, y_pred)
    hasil["cr"][algoritma] = classification_report(
        y_test, y_pred, output_dict=True, target_names=["Negative", "Positive"]
    )
    score = hasil["akurasi"][algoritma]
    cm = hasil["cm"][algoritma]
    cr = hasil["cr"][algoritma]
    joblib.dump(model, f"models/{algoritma}.pkl")
    return model, score, cm, cr


def get_highest_acc_index(hasil):
    index = {}
    for key, val in hasil["akurasi"].items():
        index[key] = val.index(max(val))
    return index


def plot_cm(cm, algo):
    fig, ax = plt.subplots()
    cm = ConfusionMatrixDisplay(cm, display_labels=["Negative", "Positive"])
    cm.plot(ax=ax)
    plt.title(f"Confusion Matrix {algo}")
    return fig


def plot_highest_accuracy(hasil):
    fig = plt.figure()
    acc = [acc for acc in hasil["akurasi"].values()]
    label = [algoritma for algoritma in hasil["akurasi"].keys()]
    juml = len(acc)
    plt.title("Hasil Model")
    for i in range(juml):
        plt.bar(i, acc[i], label="Ori")
        plt.annotate(f"{acc[i]} %", (i, acc[i] / 2), ha="center")
    plt.ylabel("Akurasi")
    plt.xticks([i for i in range(juml)], label)
    plt.xlabel("Algoritma")
    plt.ylim(0, 100)
    return fig


In [21]:
data = load_dataset("diabetes.csv")

In [22]:
data = preprocess(data)

In [23]:
X, y, X_train, X_test, y_train, y_test = split_preprocessed(data)

In [24]:
for algoritma in ["KNN", "KNN+SMOTE", "KNN+SMOTE+TMGWO"]:
    hasil["cross_val"]["akurasi"][algoritma] = []
    hasil["cross_val"]["cr"][algoritma] = []
    hasil["cross_val"]["cm"][algoritma] = []

In [25]:
cross_val_knn(X, y)

In [26]:
cross_val_knn_smote(X, y)

In [31]:
hasil["cross_val"]["akurasi"]["KNN+SMOTE+TMGWO"] = []
hasil["cross_val"]["cm"]["KNN+SMOTE+TMGWO"] = []
hasil["cross_val"]["cr"]["KNN+SMOTE+TMGWO"] = []
hasil['cross_val']['num_sf'] = []
X_train, y_train = split_for_crossval(X, y)
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
for train, test in sss.split(X_train, y_train):
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    smote = SMOTENC(
            random_state=42,
            categorical_features=[
                False,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
            ],
        )
    selected_features = feature_selection(X_train, X_test, y_train, y_test)
    X_train, y_train = smote.fit_resample(X_train, y_train)
    model = KNeighborsClassifier(metric='jaccard', n_neighbors=5)
    model.fit(X_train[:, selected_features], y_train)
    y_pred = model.predict(X_test[:, selected_features])
    hasil["cross_val"]["akurasi"]["KNN+SMOTE+TMGWO"].append(
            float("{:.2f}".format(accuracy_score(y_test, y_pred) * 100))
        )
    hasil["cross_val"]["cm"]["KNN+SMOTE+TMGWO"].append(
            confusion_matrix(y_test, y_pred)
        )
    hasil["cross_val"]["cr"]["KNN+SMOTE+TMGWO"].append(
            classification_report(
                y_test, y_pred, output_dict=True, target_names=["Negative", "Positive"]
            )
        )
    hasil['cross_val']['num_sf'].append(list(selected_features).count(True))

In [63]:
for key, val in hasil['cross_val']['akurasi'].items():
    print(key.center(70, '='))
    print(tabulate([val], headers=[str(i) for i in range(1,11)]))
    print()
    print("rata-rata: ", np.mean(val))
    print("+"*70)

    1      2      3      4      5      6      7      8      9     10
-----  -----  -----  -----  -----  -----  -----  -----  -----  -----
90.48  91.67  92.86  90.48  94.05  96.43  90.48  95.24  89.29  91.67

rata-rata:  92.265
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
   1      2      3      4      5      6     7      8      9     10
----  -----  -----  -----  -----  -----  ----  -----  -----  -----
88.1  90.48  96.43  90.48  94.05  96.43  86.9  94.05  89.29  89.29

rata-rata:  91.55
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    1      2      3      4      5      6      7      8      9     10
-----  -----  -----  -----  -----  -----  -----  -----  -----  -----
91.67  95.24  96.43  95.24  98.81  95.24  92.86  94.05  94.05  91.67

rata-rata:  94.526
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


In [76]:
print("junlah fitur pada tiap fold")
print(tabulate([hasil['cross_val']['num_sf']], headers=[str(i) for i in range(1,11)]))

junlah fitur pada tiap fold
  1    2    3    4    5    6    7    8    9    10
---  ---  ---  ---  ---  ---  ---  ---  ---  ----
  8   12   10   11   10    9   12   10   12    12


In [65]:
data = pd.DataFrame(hasil['cross_val']['akurasi'])

In [72]:
num_sf = pd.DataFrame(hasil['cross_val']['num_sf'])

In [71]:
data.to_csv("hasil_cross_val_akurasi.csv", index=False)

In [73]:
num_sf.to_csv("hasil_cross_val_num_features.csv", index=False)