# Steps

1. Import required libraries
2. Load data
3. Exploratory data analysis
4. Scale data
5. Sampling
6. Train base models
7. Select Features with SelectKBest
8. Hyperparameter Tuning with Optuna
9. Model Calibration
10. Model Explanation with LIME

# Import Required Libraries

In [None]:
#!pip install -q scikit-plot catboost optuna lime

In [None]:
import numpy as np
np.random.seed(42)

import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.float_format", lambda x: "%.2f" % x)
pd.options.mode.chained_assignment = None

import datetime
from collections import Counter
import os
import pickle
import io
import time

import seaborn as sns
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_confusion_matrix
sns.set_style("whitegrid")

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

import lime
import shap

import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

from sklearn.base import clone
from sklearn.preprocessing import RobustScaler, OrdinalEncoder, LabelEncoder, label_binarize
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score

from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE

from sklearn.metrics import brier_score_loss
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

# Output locations used by the rest of the notebook (figures, models, scores).
PROJECT_ROOT_DIR = './outputs'
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, 'images')

# exist_ok=True makes re-running the cell a no-op and avoids the
# check-then-create race; the parent './outputs' directory is created as well.
os.makedirs(IMAGES_PATH, exist_ok=True)
    
def save_fig(title):
    """Write the current matplotlib figure to ``IMAGES_PATH/<title>.png``.

    The layout is tightened before saving and the image is stored as a
    300-dpi PNG.
    """
    file_name = title + '.png'
    destination = os.path.join(IMAGES_PATH, file_name)
    plt.tight_layout()
    plt.savefig(destination, format='png', dpi=300)

from IPython.display import Markdown

def bold(string):
    """Display ``string`` in the notebook output as bold Markdown text."""
    markdown_text = "**" + string + "**"
    display(Markdown(markdown_text))

import warnings
from sklearn.exceptions import ConvergenceWarning
# Silence the warning categories below for the rest of the session.
for _ignored_category in (UserWarning, FutureWarning, ConvergenceWarning, RuntimeWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Load Data

In [None]:
# Load the class-level MapDB CSV from the BugHunter dataset directory.
# NOTE(review): the path is absolute and machine-specific — adjust it for your environment.
data = pd.read_csv('/mnt/d/Datasets/bug_hunter/BugHunterDataset-1.0/single/MapDB/class.csv')
# Work on a copy so the frame as loaded stays untouched.
df = data.copy()
df.head()

In [None]:
def df_stats(data):
    """Print a summary report of a DataFrame to the notebook output.

    Sections, in order: shape, column dtypes, missing values per column,
    number of duplicated rows, memory usage (parsed from the text written by
    ``DataFrame.info``) and the transposed ``describe()`` table.
    """
    def section(label):
        # Section banner: the label centred in a 50-character row of '#'.
        bold(label.center(50, "#"))

    n_rows, n_columns = data.shape[0], data.shape[1]
    section(" SHAPE ")
    print("ROWS: {}".format(n_rows))
    print("COLUMNS: {}".format(n_columns))

    section(" TYPES ")
    print(data.dtypes)

    section(" MISSING VALUES ")
    print(data.isnull().sum())

    section(" DUPLICATED VALUES ")
    duplicated_count = data.duplicated().sum()
    print("NUMBER OF DUPLICATED VALUES: {}".format(duplicated_count))

    section(" MEMORY USAGE ")
    # info() only writes to a stream, so capture its text and take the
    # second-to-last line, which is expected to read "memory usage: <size>".
    buf = io.StringIO()
    data.info(buf=buf)
    info_lines = buf.getvalue().split("\n")
    memory_line = info_lines[-2]
    info = memory_line.split(":")[1].strip()
    print("Memory Usage: {}".format(info))

    section(" DESCRIBE ")
    display(data.describe().T)

In [None]:
# Print the summary report for the loaded dataset.
df_stats(df)

In [None]:
# Turkish-language descriptions of the dataset columns, keyed by column name.
# NOTE(review): several descriptions do not agree with the acronym expansion
# given in the same string (e.g. 'NA' is described as a method count but
# expanded as "Number of Attributes") — verify against the dataset's own
# metric definitions before relying on them.
desc_dict = {
    'Hash': "Dosyanın benzersiz bir tanımlayıcısı.",
    'LongName': "Dosyanın uzun adı.",
    'CC': "Karmaşıklık Skoru (Cyclomatic Complexity Score).",
    'CCL': "Karmaşıklık Skoru Limiti (Cyclomatic Complexity Limit).",
    'CCO': "Karmaşıklık Skoru Aşım Oranı (Cyclomatic Complexity Overflow).",
    'CI': "Kalite İndeksi.",
    'CLC': "Kod Kalitesi Limiti.",
    'CLLC': "Kod Kalitesi Limiti Aşım Oranı.",
    'LDC': "Lokal Değişken Sayısı (Local Variable Count).",
    'LLDC': "Lokal Uzunluk Değişken Sayısı (Local Long Variable Count).",
    'LCOM5': "LCOM5 (Lack of Cohesion in Methods) Metrik Değeri.",
    'NL': "Nesne Sayısı.",
    'NLE': "Nesne Limiti.",
    'WMC': "Ağırlıklı Metot Sayısı (Weighted Method Count).",
    'CBO': "Bağlılık Sayısı (Coupling Between Objects).",
    'CBOI': "Bağlılık Sayısı Limiti (Coupling Between Objects Limit).",
    'NII': "Nesne İçerme İndeksi (Number of Inherited Methods).",
    'NOI': "Nesne İçerme Limiti (Number of Inherited Limit).",
    'RFC': "Fonksiyon Çağrısı Sayısı (Response For a Class).",
    'AD': "Asgari Mesafe.",
    'CD': "Çocuk Sayısı (Child Count).",
    'CLOC': "Yürütülen Kod Satırı Sayısı (Count of Lines of Code).",
    'DLOC': "Silinen Kod Satırı Sayısı (Deleted Lines of Code).",
    'PDA': "Genel Çeşitlilik.",
    'PUA': "Kullanılmayan Parametre Sayısı (Unused Parameters Count).",
    'TCD': "Çalışma Süresi Bağlılığı (Temporal Coupling Degree).",
    'TCLOC': "Toplam Yürütülen Kod Satırı Sayısı (Total Count of Lines of Code).",
    'DIT': "Derinlik İkilik Ağacı (Depth of Inheritance Tree).",
    'NOA': "İlgili Nesne Sayısı (Number of Aggregated Objects).",
    'NOC': "İlgili Çocuk Nesne Sayısı (Number of Children).",
    'NOD': "Çıkış Nesnesi Sayısı (Number of Descendants).",
    'NOP': "Çıkış Parametre Sayısı (Number of Parameters).",
    'LLOC': "Lokal Kod Satırı Sayısı (Local Lines of Code).",
    'LOC': "Toplam Kod Satırı Sayısı (Lines of Code).",
    'NA': "İlgili Metot Sayısı (Number of Attributes).",
    'NG': "Metot Grup Sayısı (Number of Groups).",
    'NLA': "Uygulanan Lokal Arayüz Sayısı (Number of Local Accessors).",
    'NLG': "Uygulanan Lokal Getter Sayısı (Number of Local Getters).",
    'NLM': "Uygulanan Lokal Metot Sayısı (Number of Local Methods).",
    'NLPA': "Uygulanan Lokal Parametre Sayısı (Number of Local Parameters).",
    'NLPM': "Uygulanan Lokal Parametre Metot Sayısı (Number of Local Parameter Methods).",
    'NLS': "Uygulanan Lokal Setter Sayısı (Number of Local Setters).",
    'NM': "Uygulanan Metot Sayısı (Number of Methods).",
    'NOS': "Çıkış Üzerindeki Nesne Sayısı (Number of Objects).",
    'NPA': "Uygulanan Parametre Sayısı (Number of Parameters).",
    'NPM': "Uygulanan Parametre Metot Sayısı (Number of Parameter Methods).",
    'NS': "Uygulanan Setter Sayısı (Number of Setters).",
    'TLLOC': "Toplam Lokal Kod Satırı Sayısı (Total Lines of Local Code).",
    'TLOC': "Toplam Kod Satırı Sayısı (Total Lines of Code).",
    'TNA': "Toplam Nesne Sayısı (Total Number of Objects).",
    'TNG': "Toplam Metot Grup Sayısı (Total Number of Groups).",
    'TNLA': "Toplam Uygulanan Lokal Arayüz Sayısı (Total Number of Local Accessors).",
    'TNLG': "Toplam Uygulanan Lokal Getter Sayısı (Total Number of Local Getters).",
    'TNLM': "Toplam Uygulanan Lokal Metot Sayısı (Total Number of Local Methods).",
    'TNLPA': "Toplam Uygulanan Lokal Parametre Sayısı (Total Number of Local Parameters).",
    'TNLPM': "Toplam Uygulanan Lokal Parametre Metot Sayısı (Total Number of Local Parameter Methods).",
    'TNLS': "Toplam Uygulanan Lokal Setter Sayısı (Total Number of Local Setters).",
    'TNM': "Toplam Uygulanan Metot Sayısı (Total Number of Methods).",
    'TNOS': "Toplam Çıkış Üzerindeki Nesne Sayısı (Total Number of Objects).",
    'TNPA': "Toplam Uygulanan Parametre Sayısı (Total Number of Parameters).",
    'TNPM': "Toplam Uygulanan Parametre Metot Sayısı (Total Number of Parameter Methods).",
    'TNS': "Toplam Uygulanan Setter Sayısı (Total Number of Setters).",
    'WarningBlocker': "Bloke Edici Uyarı Sayısı.",
    'WarningCritical': "Kritik Uyarı Sayısı.",
    'WarningInfo': "Bilgi Uyarısı Sayısı.",
    'WarningMajor': "Önemli Uyarı Sayısı.",
    'WarningMinor': "Küçük Uyarı Sayısı.",
    'Android Rules': "Android uygulama geliştirme ile ilgili kurallar.",
    'Basic Rules': "Temel kod yazma kuralları.",
    'Brace Rules': "Süslü parantez kullanım kuralları.",
    'Clone Implementation Rules': "Klonlanmış kodların uygulama kuralları.",
    'Code Size Rules': "Kod boyutu ile ilgili kurallar.",
    'Comment Rules': "Yorum satırları ile ilgili kurallar.",
    'Controversial Rules': "Tartışmalı kod yazma kuralları.",
    'Coupling Rules': "Modül bağlantıları ile ilgili kurallar.",
    'Design Rules': "Yazılım tasarımı ile ilgili kurallar.",
    'Empty Code Rules': "Boş kod blokları ile ilgili kurallar.",
    'Finalizer Rules': "Finalizer metotları ile ilgili kurallar.",
    'Import Statement Rules': "İmport ifadeleri ile ilgili kurallar.",
    'J2EE Rules': "Java 2 Enterprise Edition ile ilgili kurallar.",
    'JUnit Rules': "JUnit test kapsamı ile ilgili kurallar.",
    'Jakarta Commons Logging Rules': "Jakarta Commons Logging kütüphanesi ile ilgili kurallar.",
    'Java Logging Rules': "Java loglama ile ilgili kurallar.",
    'JavaBean Rules': "JavaBean uygulama kuralları.",
    'MigratingToJUnit4 Rules': "JUnit 4'e geçiş kuralları.",
    'Migration Rules': "Geçiş kuralları.",
    'Migration13 Rules': "Geçiş versiyon 1.3 kuralları.",
    'Migration14 Rules': "Geçiş versiyon 1.4 kuralları.",
    'Migration15 Rules': "Geçiş versiyon 1.5 kuralları.",
    'Naming Rules': "İsimlendirme kuralları.",
    'Optimization Rules': "Kod optimizasyonu ile ilgili kurallar.",
    'Security Code Guideline Rules': "Güvenlik kodu rehberi kuralları.",
    'Strict Exception Rules': "Katı istisna kuralları.",
    'String and StringBuffer Rules': "String ve StringBuffer kullanım kuralları.",
    'Type Resolution Rules': "Tür çözümleme kuralları.",
    'Unnecessary and Unused Code Rules': "Gereksiz ve kullanılmayan kod kuralları.",
    'Vulnerability Rules': "Güvenlik açığı ile ilgili kurallar.",
    'Number of Bugs': "Hata Sayısı."
}

In [None]:
# Two-column lookup table: one row per feature name with its description.
desc_df = pd.DataFrame(list(desc_dict.items()), columns=["Feature", "Description"])
desc_df

# EDA

In [None]:
def grab_cols(df):
    """Classify the columns of ``df`` and report each group in the notebook.

    Prints five groups: categorical columns, numerical columns, numerical
    columns with fewer than 10 distinct values, categorical columns with more
    than 20 distinct values, and columns holding a single distinct value.

    Args:
        df: DataFrame whose columns are classified.

    Returns:
        tuple[list, list]: ``(cat_cols, num_cols)`` with the single-valued
        columns removed from both lists.
    """
    cat_cols = [col for col in df.columns if str(df[col].dtypes) in ["bool", "category", "object"]]
    bold(f"Categorical Variables ({len(cat_cols)})")
    print(cat_cols)

    # Use pandas' dtype predicates rather than `dtype in [int, float]`: that
    # comparison only matches the platform's default int/float width and
    # silently skips int32/float32/unsigned/nullable numeric columns.
    # Booleans stay categorical, as before.
    num_cols = [
        col for col in df.columns
        if pd.api.types.is_numeric_dtype(df[col]) and not pd.api.types.is_bool_dtype(df[col])
    ]
    bold(f"Numerical Variables ({len(num_cols)})")
    print(num_cols)

    numerical_but_categorical_cols = [col for col in num_cols if df[col].nunique() < 10]
    bold(f"Numerical but Categorical Variables ({len(numerical_but_categorical_cols)})")
    print(numerical_but_categorical_cols)

    categorical_but_cardinal_cols = [col for col in cat_cols if df[col].nunique() > 20]
    bold(f"Categorical but Cardinal Variables ({len(categorical_but_cardinal_cols)})")
    print(categorical_but_cardinal_cols)

    same_value_cols = [col for col in df.columns if df[col].nunique() == 1]
    bold(f"Same Value Variables ({len(same_value_cols)})")
    print(same_value_cols)

    # Constant columns carry no information, so drop them from both results.
    constant = set(same_value_cols)
    cat_cols = [col for col in cat_cols if col not in constant]
    num_cols = [col for col in num_cols if col not in constant]

    return cat_cols, num_cols

In [None]:
# Split the columns into categorical / numerical name lists (single-valued columns excluded).
categorical_variables, numerical_variables = grab_cols(df)

In [None]:
def plot_num(df, columns):
    """Plot a histogram with KDE for each column on a two-column grid.

    Every panel also marks the column mean (red dashed line) and median
    (green solid line).

    Args:
        df: DataFrame holding the data.
        columns: Names of the numerical columns to plot.
    """
    n_plots = len(columns)
    if n_plots == 0:
        # Nothing to draw; a zero-sized figure is rejected by matplotlib.
        return

    # ceil(n / 2) rows: the previous int(n / 2) + 1 reserved an empty extra
    # row whenever the number of columns was even.
    n_rows = (n_plots + 1) // 2

    plt.figure(figsize=(n_plots / 4, n_plots))
    for i, column in enumerate(columns):
        plt.subplot(n_rows, 2, i + 1)
        sns.histplot(x=column, data=df, bins=30, kde=True)
        plt.axvline(df[column].mean(), color="r", linestyle="--", label="Mean")
        plt.axvline(df[column].median(), color="g", linestyle="-", label="Median")
        plt.grid()
        plt.title(f"{column} Distribution")
        plt.legend()

    # Lay the figure out once, after all panels exist, instead of per panel.
    plt.tight_layout()
    plt.show()

In [None]:
# Draw the distribution of every numerical column.
plot_num(df, numerical_variables)

In [None]:
def plot_correlation_heatmap(df: pd.DataFrame, title_name: str = 'Correlation Map') -> None:
    """Draw the lower triangle of the correlation matrix of ``df`` as a heatmap.

    Only numeric and boolean columns are correlated. Recent pandas versions
    raise on string columns instead of silently skipping them, so they are
    filtered out explicitly.

    Args:
        df: DataFrame to analyse.
        title_name: Title shown above the heatmap.
    """
    corr = df.select_dtypes(include=["number", "bool"]).corr()
    if corr.empty:
        print("No numeric columns to correlate.")
        return

    # Size the figure by the number of columns actually plotted.
    side = corr.shape[1]
    plt.subplots(figsize=(side, side))

    # Hide the upper triangle and the diagonal: they duplicate the lower half.
    mask = np.triu(np.ones(corr.shape, dtype=bool))
    sns.heatmap(corr, mask=mask, linewidths=.5, cmap='viridis', annot=True, fmt='.2f')
    plt.title(title_name)
    plt.show()

In [None]:
# Correlation heatmap of the dataset.
plot_correlation_heatmap(df, 'Dataset Correlation')

# Preprocess

In [None]:
def load_models():
    """Return fresh, untrained instances of every candidate classifier.

    The order of the list is the order in which the models are trained.
    """
    estimators = [
        RandomForestClassifier(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        DecisionTreeClassifier(),
        ExtraTreesClassifier(),
        LogisticRegression(),
        SGDClassifier(),
        XGBClassifier(),
        LGBMClassifier(verbose=-100),
        CatBoostClassifier(verbose=0),
        MLPClassifier(),
        GaussianNB(),
        SVC(),
        KNeighborsClassifier(),
    ]
    return estimators

In [None]:
def preprocess_and_train(df, drop_cols, target):
    """Split, scale and resample the data, then train every candidate model.

    Steps:
      1. Build ``X`` (all columns except ``drop_cols`` and ``target``) and ``y``.
      2. Hold out a class-balanced test set: ``min_class_size // 2`` random
         rows from each class; every remaining row goes to the training set.
      3. Fit a ``RobustScaler`` on the training numerical columns, apply it to
         both splits and pickle it to ``./outputs/process/rs.pkl``.
      4. For each sampler (none, ``RandomOverSampler``) resample the training
         set and call ``train_model`` for every model from ``load_models``.
      5. Write all scores to ``./outputs/scores.csv`` and plot them.

    Args:
        df: Full dataset including the target column.
        drop_cols: Column name or list of column names to exclude from the features.
        target: Name of the label column.

    Raises:
        ValueError: If the smallest class has fewer than two rows, which
            would leave the test set empty.
    """
    # Drop the unwanted columns AND the target in a single step. Previously
    # the first drop was overwritten by the second, so drop_cols was ignored.
    drop_cols = [drop_cols] if isinstance(drop_cols, str) else list(drop_cols)
    X = df.drop(columns=drop_cols + [target])
    y = df[target]

    os.makedirs('./outputs/process', exist_ok=True)

    scores_df = pd.DataFrame(columns=["Model Name", "Selected Features", "Parameters", "Train Time", "Test Time", "Train Accuracy", "Test Accuracy", "F1", "Precision", "Recall"])
    # Other imbalanced-learn samplers (SMOTE, ADASYN, BorderlineSMOTE) can be
    # added to this list to compare them.
    samplers = ["No Sampler", RandomOverSampler(random_state=42)]

    # Row POSITIONS of each class; positions (not index labels) are used
    # throughout so the split also works when df has a non-default index.
    y_values = y.to_numpy()
    class_positions = {c: np.flatnonzero(y_values == c) for c in np.unique(y_values)}

    min_class_len = min(len(positions) for positions in class_positions.values())
    if min_class_len < 2:
        raise ValueError("Every class needs at least two rows to build the test set.")

    test_index = []
    for positions in class_positions.values():
        test_index.extend(np.random.choice(positions, size=min_class_len // 2, replace=False))
    train_index = np.setdiff1d(np.arange(len(X)), test_index)

    # Copies, so the scaled values below are not written into views of X.
    X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Scale only the numerical columns that are still part of X: the global
    # list may contain the target or columns removed through drop_cols.
    scale_cols = [col for col in numerical_variables if col in X_train.columns]
    rs = RobustScaler()
    if scale_cols:
        X_train[scale_cols] = rs.fit_transform(X_train[scale_cols])
        X_test[scale_cols] = rs.transform(X_test[scale_cols])
    with open('./outputs/process/rs.pkl', 'wb') as f:
        pickle.dump(rs, f)

    class_names = df[target].unique().tolist()

    for sampler in samplers:
        if isinstance(sampler, str):
            # The "No Sampler" placeholder: train on the data as-is.
            sampler_name = sampler
            X_train_resampled, y_train_resampled = X_train, y_train
        else:
            sampler_name = type(sampler).__name__
            X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)

        for i, model in enumerate(load_models()):
            train_model(i, model, scores_df, X_train_resampled, X_test, y_train_resampled, y_test, class_names, sampler_name)

    scores_df.to_csv('./outputs/scores.csv')
    plot_results(scores_df, 'Base Models Results')

# Train

In [None]:
def train_model(i, model, scores_df, X_train, X_test, y_train, y_test, class_names, sampler_name):
    """Select features, tune, calibrate and evaluate one model; log its scores.

    Pipeline:
      1. Pick ``k`` for ``SelectKBest(f_classif)`` by fitting a clone of
         ``model`` for every candidate ``k`` and keeping the most accurate one.
      2. Tune hyperparameters with Optuna (25 trials) when an ``objective_*``
         function exists for the model type; otherwise keep the defaults.
      3. Choose the calibration method ('sigmoid' or 'isotonic') with the
         lowest mean per-class Brier score.
      4. Fit the calibrated model, time fit/predict, plot and save the
         confusion matrix, pickle the model to ``./outputs/models`` and append
         one row to ``scores_df`` (modified in place).

    Args:
        i: Position of the model in the model list (currently unused).
        model: Unfitted estimator to train.
        scores_df: DataFrame collecting one result row per trained model.
        X_train, X_test: Feature matrices.
        y_train, y_test: Label vectors.
        class_names: Class labels (currently unused).
        sampler_name: Name of the resampling strategy, used in the model label.

    NOTE(review): steps 1-3 all score their candidates on ``X_test``/``y_test``,
    so the reported test metrics are optimistically biased. A separate
    validation split (or cross-validation) would remove the leakage.
    """
    os.makedirs('./outputs/models', exist_ok=True)

    # ---- 1. feature selection -------------------------------------------
    n_features = X_train.shape[1]
    # Start at 10 features, or at all of them when fewer than 10 exist
    # (SelectKBest rejects k larger than the number of features).
    best_k = min(10, n_features)
    best_score = 0

    for k in range(best_k, n_features + 1):
        selector = SelectKBest(score_func=f_classif, k=k)
        X_train_selected = selector.fit_transform(X_train, y_train)
        X_test_selected = selector.transform(X_test)

        selected_model = clone(model)
        selected_model.fit(X_train_selected, y_train)
        accuracy = accuracy_score(y_test, selected_model.predict(X_test_selected))

        if accuracy > best_score:
            best_score = accuracy
            best_k = k

    selector = SelectKBest(score_func=f_classif, k=best_k)
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)
    selected_features = selector.get_feature_names_out()

    # ---- 2. hyperparameter tuning ---------------------------------------
    # Class name without the "Classifier" suffix, e.g. "RandomForest".
    model_name = type(model).__name__.replace("Classifier", "")

    # ExtraTreesClassifier shortens to "ExtraTrees"; the former "ExtraTree"
    # key never matched, so that model was silently left untuned.
    objectives = {
        "LGBM": objective_lgbm,
        "AdaBoost": objective_ada,
        "DecisionTree": objective_dt,
        "ExtraTrees": objective_et,
        "GradientBoosting": objective_gb,
        "MLP": objective_mlp,
        "RandomForest": objective_rf,
        "SGD": objective_sgd,
        "XGB": objective_xgb,
        "SVC": objective_svc,
        "KNeighbors": objective_knn,
    }
    objective = objectives.get(model_name)
    if objective is None:
        best_params = {}
    else:
        study = optuna.create_study(direction='maximize')
        study.optimize(lambda trial: objective(trial, X_train, X_test, y_train, y_test), n_trials=25)
        best_params = study.best_params

    model = model.set_params(**best_params)

    # ---- 3. calibration method ------------------------------------------
    best_calibration_method = None
    best_brier = float("inf")
    y_test_values = np.asarray(y_test)
    for calibration in ('sigmoid', 'isotonic'):
        calibrated_classifier = CalibratedClassifierCV(clone(model), method=calibration, cv=None)
        calibrated_classifier.fit(X_train, y_train)
        proba = calibrated_classifier.predict_proba(X_test)

        # One-hot targets aligned with the predict_proba columns (classes_),
        # which is also correct for binary targets and for classes that do
        # not occur in y_test.
        one_hot = (y_test_values[:, None] == calibrated_classifier.classes_[None, :]).astype(float)
        # Mean squared error over all cells == mean of the per-class Brier scores.
        brier = np.mean((proba - one_hot) ** 2)

        if brier < best_brier:
            # Track the best score too; otherwise the last method always wins.
            best_brier = brier
            best_calibration_method = calibration

    # ---- 4. final fit and evaluation ------------------------------------
    model_name = f'{model_name} + {sampler_name} + k={best_k} + {best_calibration_method}'
    calibrated_model = CalibratedClassifierCV(model, method=best_calibration_method, cv=None)

    start = time.perf_counter()
    calibrated_model.fit(X_train, y_train)
    train_time = time.perf_counter() - start

    start = time.perf_counter()
    y_pred = calibrated_model.predict(X_test)
    test_time = time.perf_counter() - start

    train_acc = accuracy_score(y_train, calibrated_model.predict(X_train))
    test_acc = accuracy_score(y_test, y_pred)

    f1 = f1_score(y_test, y_pred, average="weighted")
    precision = precision_score(y_test, y_pred, average="weighted")
    # Was computed with precision_score, which duplicated the precision column.
    recall = recall_score(y_test, y_pred, average="weighted")

    bold(f" {model_name} ".center(100, "#"))

    cm = confusion_matrix(y_test, y_pred)
    _, ax = plot_confusion_matrix(conf_mat=cm, show_absolute=True, show_normed=True, colorbar=True, figsize=(8, 8))
    # Set the title before saving so it is part of the stored image.
    ax.set_title(model_name)
    save_fig(f"{model_name}_confusion_matrix")
    plt.show()

    print(classification_report(y_test, y_pred))
    print()

    bold("#" * 100)

    with open(f'./outputs/models/{model_name}.pkl', 'wb') as file:
        pickle.dump(calibrated_model, file)

    scores_df.loc[len(scores_df)] = [model_name, selected_features, best_params, train_time, test_time, train_acc, test_acc, f1, precision, recall]

# Hyperparameters

In [None]:
def objective_lgbm(trial, X_train, X_test, y_train, y_test):
    """Optuna objective: hold-out accuracy of an ``LGBMClassifier``.

    Samples hyperparameters from ``trial``, fits on ``(X_train, y_train)`` and
    returns the accuracy on ``(X_test, y_test)``.
    """
    # suggest_float supersedes the deprecated suggest_uniform /
    # suggest_loguniform; the sampled ranges are unchanged.
    param = {
        'objective': 'multiclass',
        # NOTE(review): hard-coded for a 3-class target — confirm it matches the data.
        'num_class': 3,
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.5, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10.0, log=True),
    }

    model = LGBMClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_xgb(trial, X_train, X_test, y_train, y_test):
    """Optuna objective: hold-out accuracy of an ``XGBClassifier``.

    Samples hyperparameters from ``trial``, fits on ``(X_train, y_train)`` and
    returns the accuracy on ``(X_test, y_test)``.
    """
    # suggest_float supersedes the deprecated suggest_uniform /
    # suggest_loguniform; the sampled ranges are unchanged.
    param = {
        'objective': 'multi:softmax',
        # NOTE(review): hard-coded for a 3-class target — confirm it matches the data.
        'num_class': 3,
        'eval_metric': 'mlogloss',
        'verbosity': 0,
        'booster': 'gbtree',
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.5, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10.0, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-5, 10.0, log=True),
    }

    model = XGBClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_rf(trial, X_train, X_test, y_train, y_test):
    """Optuna objective: hold-out accuracy of a ``RandomForestClassifier``.

    Samples hyperparameters from ``trial``, fits on ``(X_train, y_train)`` and
    returns the accuracy on ``(X_test, y_test)``.
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        # suggest_float supersedes the deprecated suggest_uniform (same range).
        'max_features': trial.suggest_float('max_features', 0.6, 1.0),
    }

    model = RandomForestClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_ada(trial, X_train, X_test, y_train, y_test):
    """Optuna objective: hold-out accuracy of an ``AdaBoostClassifier``.

    Samples hyperparameters from ``trial``, fits on ``(X_train, y_train)`` and
    returns the accuracy on ``(X_test, y_test)``.
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        # suggest_float(log=True) supersedes the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
    }

    model = AdaBoostClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_gb(trial, X_train, X_test, y_train, y_test):
    """Optuna objective: hold-out accuracy of a ``GradientBoostingClassifier``.

    Samples hyperparameters from ``trial``, fits on ``(X_train, y_train)`` and
    returns the accuracy on ``(X_test, y_test)``.
    """
    # suggest_float supersedes the deprecated suggest_uniform /
    # suggest_loguniform; the sampled ranges are unchanged.
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
    }

    model = GradientBoostingClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_dt(trial, X_train, X_test, y_train, y_test):
    """Optuna objective: hold-out accuracy of a ``DecisionTreeClassifier``.

    Samples hyperparameters from ``trial``, fits on ``(X_train, y_train)`` and
    returns the accuracy on ``(X_test, y_test)``.
    """
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        # suggest_float supersedes the deprecated suggest_uniform (same range).
        'max_features': trial.suggest_float('max_features', 0.6, 1.0),
    }

    model = DecisionTreeClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_et(trial, X_train, X_test, y_train, y_test):
    """Optuna objective: hold-out accuracy of an ``ExtraTreesClassifier``.

    Samples hyperparameters from ``trial``, fits on ``(X_train, y_train)`` and
    returns the accuracy on ``(X_test, y_test)``.
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        # suggest_float supersedes the deprecated suggest_uniform (same range).
        'max_features': trial.suggest_float('max_features', 0.6, 1.0),
    }

    model = ExtraTreesClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_sgd(trial, X_train, X_test, y_train, y_test):
    """Optuna objective: hold-out accuracy of an ``SGDClassifier``.

    Samples hyperparameters from ``trial``, fits on ``(X_train, y_train)`` and
    returns the accuracy on ``(X_test, y_test)``.
    """
    param = {
        # suggest_float(log=True) supersedes the deprecated suggest_loguniform.
        'alpha': trial.suggest_float('alpha', 1e-5, 1e-1, log=True),
        'loss': trial.suggest_categorical('loss', ['hinge', 'log_loss', 'modified_huber', 'squared_hinge']),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet']),
    }

    model = SGDClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_mlp(trial, X_train, X_test, y_train, y_test):
    """Optuna objective: hold-out accuracy of an ``MLPClassifier``.

    Samples hyperparameters from ``trial``, fits on ``(X_train, y_train)`` and
    returns the accuracy on ``(X_test, y_test)``.
    """
    param = {
        'activation': trial.suggest_categorical('activation', ['identity', 'logistic', 'tanh', 'relu']),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'sgd', 'adam']),
        # suggest_float(log=True) supersedes the deprecated suggest_loguniform.
        'alpha': trial.suggest_float('alpha', 1e-5, 1e-1, log=True),
    }

    model = MLPClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_svc(trial, X_train, X_test, y_train, y_test):
    """Optuna objective: hold-out accuracy of an RBF-kernel ``SVC``.

    Samples hyperparameters from ``trial``, fits on ``(X_train, y_train)`` and
    returns the accuracy on ``(X_test, y_test)``.
    """
    # suggest_float(log=True) supersedes the deprecated suggest_loguniform;
    # the sampled ranges are unchanged.
    param = {
        'kernel': trial.suggest_categorical('kernel', ['rbf']),
        'C': trial.suggest_float('C', 1e-5, 100, log=True),
        'gamma': trial.suggest_float('gamma', 1e-5, 100, log=True),
    }

    model = SVC(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_knn(trial, X_train, X_test, y_train, y_test):
    """Optuna objective: hold-out accuracy of a ``KNeighborsClassifier``.

    Samples the neighbour count, distance metric and weighting scheme from
    ``trial``, fits on ``(X_train, y_train)`` and returns the accuracy on
    ``(X_test, y_test)``.
    """
    n_neighbors = trial.suggest_int('n_neighbors', 1, 30)
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'chebyshev'])
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])

    classifier = KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric, weights=weights)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_test, predictions)

# Scores

In [None]:
def plot_results(df, title):
    """Draw one horizontal bar chart per score column and save the figure.

    Seven stacked panels compare the models by train time, test time, train
    accuracy, test accuracy, F1, precision and recall. The figure is saved
    through ``save_fig("base_results")`` and then shown.

    Args:
        df: Scores table with a "Model Name" column and the seven metric columns.
        title: Accepted for the caller's convenience; not used by the plot.
    """
    # (metric column, sort ascending?) in the order the panels are drawn.
    panels = [
        ("Train Time", False),
        ("Test Time", False),
        ("Train Accuracy", True),
        ("Test Accuracy", True),
        ("F1", True),
        ("Precision", True),
        ("Recall", True),
    ]

    plt.figure(figsize=(10, 50))
    for position, (metric, ascending) in enumerate(panels, start=1):
        plt.subplot(len(panels), 1, position)
        ordered = df.sort_values(by=metric, ascending=ascending)
        ax = sns.barplot(data=ordered, y="Model Name", x=metric, palette='viridis')
        # Print the numeric value next to every bar.
        for container in ax.containers:
            ax.bar_label(container)
        plt.title("Model / " + metric)
        plt.xlabel("")

    save_fig("base_results")
    plt.show()

# Run Functions

In [None]:
# Columns handed to preprocess_and_train as `drop_cols`, i.e. meant to be
# excluded from the feature matrix.
drop_list = ["Hash", "LongName", 'WarningBlocker', 'WarningInfo', 'Android Rules', 'Code Size Rules', 'Comment Rules', 'Coupling Rules', 
             'MigratingToJUnit4 Rules', 'Migration13 Rules', 'Migration14 Rules', 'Migration15 Rules', 'Vulnerability Rules']

In [None]:
# Name of the label column to predict.
target = "Number of Bugs"

In [None]:
# Run the whole pipeline: split, scale, resample, then train and score every model.
preprocess_and_train(df, drop_list, target)

# Results

In [None]:
# Reload the scores written by preprocess_and_train; the first CSV column is the saved row index.
best_scores_df = pd.read_csv("./outputs/scores.csv", index_col=0)
best_scores_df

# Model Explanation

In [None]:
# Load the chosen model; the context manager closes the file handle.
# NOTE: only unpickle files produced by this notebook — pickle can execute
# arbitrary code from an untrusted file.
with open('./outputs/models/SVC + RandomOverSampler + k=10 + isotonic.pkl', 'rb') as model_file:
    best_model = pickle.load(model_file)

In [None]:
def predict_fn(x):
    """Return the loaded model's class probabilities for the rows in ``x``."""
    return best_model.predict_proba(x)

In [None]:
# Columns passed to the LIME explainer and, through it, to the model.
# NOTE(review): the list contains the target column 'Number of Bugs', which
# preprocess_and_train removes from X before feature selection, so it cannot
# be one of the model's selected features. Confirm the list against the
# "Selected Features" column of scores.csv for the loaded model.
# NOTE(review): the model was trained on RobustScaler-transformed values while
# `df` holds the raw values — confirm whether scaling must be applied here too.
features = ['NG','NLG','NPM','TNG','TNLG','TNLPM','TNPM','Import Statement Rules','Java Logging Rules','Number of Bugs']

In [None]:
# `import lime` alone does not load the `lime_tabular` submodule, so import it
# explicitly before use.
import lime.lime_tabular

# Class names are sorted so they line up with the columns of predict_proba,
# which follow the model's sorted `classes_`; unique() returns them in order
# of first appearance instead.
explainer = lime.lime_tabular.LimeTabularExplainer(df[features].astype(int).values,
                                                   mode='classification',
                                                   class_names=[str(c) for c in sorted(df[target].unique().tolist())],
                                                   training_labels=df[target],
                                                   feature_names=features)

In [None]:
def explain_instances(df, features, predict_fn, explainer, idx_arr):
    """Show a LIME explanation in the notebook for each requested row.

    Args:
        df: DataFrame holding the instances.
        features: Column names passed to the explainer, in order.
        predict_fn: Callable returning class probabilities for an array of rows.
        explainer: Object exposing ``explain_instance`` (a LIME tabular explainer).
        idx_arr: Row labels of ``df`` to explain.
    """
    n_features = len(features)
    for row_label in idx_arr:
        # The explainer receives the row as a plain integer array.
        instance = df.loc[row_label, features].astype(int).values
        explanation = explainer.explain_instance(instance, predict_fn, num_features=n_features)
        explanation.show_in_notebook(show_table=True)

In [None]:
# Row labels of the instances to explain.
idx_arr = [0, 7, 10, 125, 450]

In [None]:
# Render a LIME explanation for each chosen row.
explain_instances(df, features, predict_fn, explainer, idx_arr)

# Conclusion

In [None]:
# Final ranking: models ordered by test accuracy, without the feature and parameter columns.
best_scores_df.drop(columns=['Selected Features', 'Parameters']).sort_values(by='Test Accuracy', ascending=False).reset_index(drop=True)