

This project is designed for seperating fluoroscence and non fluoroscence retinal deposits (presumed amyloid deposits). The main goal is to predict the existance of fluoroscence signal using statistics from polarimetry microscope, allowing amyloid deposits to be detected without the use of a dye.

14 polarization metrics values are considered as input features, 3 machine learning classifiers: LDA, SVM and Random forest have been tested.

We used 3 methods to deal with data imblance:
1. Adding Fluo_Negative_Cross_Negative data by selecting retinal area near deposits
2. ADASYN algorithm
3. BoardlineSMOTE


Author: Yunyi Qiu
Co-author: Tao Jin
Data provided by: Dr.Melanie Campbell's Lab

# Import Data and Basic preparation 

In [170]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat
import os
from math import pi

# Set precision
pd.options.display.max_columns = None
pd.options.display.max_rows = None
np.set_printoptions(precision=5)

pd.options.display.precision = 5
pd.set_option('precision',5)

In [171]:
datapath = os.getcwd() + "\\data\\dbt_m.csv"
df = pd.read_csv(datapath)

In [172]:
# Adjust some values in the table
# The Q metric
df[["Q_metric_Background_Mean", "Q_metric_Background_Std", "Q_metric_Deposit_Mean", "Q_metric_Deposit_Std", 
    "Q_metric_Full_Mean", "Q_metric_Full_Std"]] = \
df[["Q_metric_Background_Mean", "Q_metric_Background_Std", "Q_metric_Deposit_Mean", "Q_metric_Deposit_Std", 
    "Q_metric_Full_Mean", "Q_metric_Full_Std"]].divide(3) 

# The Linear retardance
df[["Retardance_Lin_Background_Mean","Retardance_Lin_Background_Std", 
    "Retardance_Lin_Deposit_Mean", "Retardance_Lin_Deposit_Std", 
    "Retardance_Lin_Full_Mean", "Retardance_Lin_Full_Std"]] = \
df[["Retardance_Lin_Background_Mean","Retardance_Lin_Background_Std", 
    "Retardance_Lin_Deposit_Mean", "Retardance_Lin_Deposit_Std", 
    "Retardance_Lin_Full_Mean", "Retardance_Lin_Full_Std"]].divide(180)

# The Circular retardance, Circular diattenuation , Circular polarizance
df[["Retardance_Circ_Background_Mean", "Retardance_Circ_Background_Std", 
    "Retardance_Circ_Deposit_Mean", "Retardance_Circ_Deposit_Std",
    "Retardance_Circ_Full_Mean", "Retardance_Circ_Full_Std"]] = \
(df[["Retardance_Circ_Background_Mean", "Retardance_Circ_Background_Std", 
    "Retardance_Circ_Deposit_Mean", "Retardance_Circ_Deposit_Std",
    "Retardance_Circ_Full_Mean", "Retardance_Circ_Full_Std"]]+ 180).divide(360)

# The Circular diattenuation
df[["Diattenuation_Circ_Background_Mean", "Diattenuation_Circ_Background_Std", 
    "Diattenuation_Circ_Deposit_Mean", "Diattenuation_Circ_Deposit_Std",
    "Diattenuation_Circ_Full_Mean", "Diattenuation_Circ_Full_Std"]] = \
(df[["Diattenuation_Circ_Background_Mean", "Diattenuation_Circ_Background_Std", 
    "Diattenuation_Circ_Deposit_Mean", "Diattenuation_Circ_Deposit_Std",
    "Diattenuation_Circ_Full_Mean", "Diattenuation_Circ_Full_Std"]]+ 1).divide(2)

# The Circular polarizance
df[["Polarizance_Circ_Background_Mean", "Polarizance_Circ_Background_Std", 
    "Polarizance_Circ_Deposit_Mean", "Polarizance_Circ_Deposit_Std",
    "Polarizance_Circ_Full_Mean", "Polarizance_Circ_Full_Std"]] = \
(df[["Polarizance_Circ_Background_Mean", "Polarizance_Circ_Background_Std", 
     "Polarizance_Circ_Deposit_Mean", "Polarizance_Circ_Deposit_Std",
     "Polarizance_Circ_Full_Mean", "Polarizance_Circ_Full_Std"]]+ 1).divide(2)

# The MMT parameters
df[["A_metric_Background_Mean", "A_metric_Background_Std",
    "A_metric_Deposit_Mean", "A_metric_Deposit_Std"]] = \
(df[["A_metric_Background_Mean", "A_metric_Background_Std",
    "A_metric_Deposit_Mean", "A_metric_Deposit_Std"]] + 1).divide(2)

df[["t_metric_Background_Mean", "t_metric_Background_Std",
    "t_metric_Deposit_Mean", "t_metric_Deposit_Std"]] = \
(df[["t_metric_Background_Mean", "t_metric_Background_Std",
    "t_metric_Deposit_Mean", "t_metric_Deposit_Std"]] + 1).divide(2)

df[["x_metric_Background_Mean", "x_metric_Background_Std",
    "x_metric_Deposit_Mean", "x_metric_Deposit_Std"]] = \
(df[["x_metric_Background_Mean", "x_metric_Background_Std",
    "x_metric_Deposit_Mean", "x_metric_Deposit_Std"]] + pi/4).divide(pi/2)

## Prepare data and labels

In [173]:
df_label = df[["RegionFolder", "Subject", "FluoroSignal", "CrossedSignal"]]

In [174]:
# Statistics of the number 
print("  Number of each class \n"
      "  Fluo_Positive_Cross_Positive: " + str(sum(np.multiply(df_label["FluoroSignal"], df_label["CrossedSignal"]))) + "\n" + 
      "  Fluo_Positive_Cross_Negative: " + str(sum(np.multiply(df_label["FluoroSignal"] == 1, df_label["CrossedSignal"] == 0))) + "\n"+
      "  Fluo_Negative_Cross_Positive: " + str(sum(np.multiply(df_label["FluoroSignal"] == 0, df_label["CrossedSignal"] == 1))) + "\n"+
      "  Fluo_Negative_Cross_Negative: " + str(sum(np.multiply(df_label["FluoroSignal"] == 0, df_label["CrossedSignal"] == 0))) + "\n"
     )

  Number of each class 
  Fluo_Positive_Cross_Positive: 789
  Fluo_Positive_Cross_Negative: 20
  Fluo_Negative_Cross_Positive: 131
  Fluo_Negative_Cross_Negative: 7



In [175]:
# df for background

dfb = df[["RegionFolder", "Subject",
          "Depolarization_Power_Background_Mean", "Depolarization_Power_Background_Std", 
          "Q_metric_Background_Mean", "Q_metric_Background_Std",
          "Anisotropy_Lin_Background_Mean", "Anisotropy_Lin_Background_Std",
          "Anisotropy_Circ_Background_Mean", "Anisotropy_Circ_Background_Std",
          "Polarizance_Lin_Background_Mean", "Polarizance_Lin_Background_Std",
          "Polarizance_Circ_Background_Mean", "Polarizance_Circ_Background_Std",
          "Diattenuation_Lin_Background_Mean", "Diattenuation_Lin_Background_Std",
          "Diattenuation_Circ_Background_Mean", "Diattenuation_Circ_Background_Std",
          "Retardance_Lin_Background_Mean", "Retardance_Lin_Background_Std",
          "Retardance_Circ_Background_Mean", "Retardance_Circ_Background_Std",
          "A_metric_Background_Mean", "A_metric_Background_Std",
          "b_metric_Background_Mean", "b_metric_Background_Std",
          "t_metric_Background_Mean", "t_metric_Background_Std",
          "x_metric_Background_Mean", "x_metric_Background_Std",
          "FluoroSignal", "CrossedSignal"
        ]]

dfb.set_index(["RegionFolder", "Subject"], inplace=True)

In [176]:
# df for deposits

dfd = df[["RegionFolder", "Subject",
          "Depolarization_Power_Deposit_Mean", "Depolarization_Power_Deposit_Std", 
          "Q_metric_Deposit_Mean", "Q_metric_Deposit_Std",
          "Anisotropy_Lin_Deposit_Mean", "Anisotropy_Lin_Deposit_Std",
          "Anisotropy_Circ_Deposit_Mean", "Anisotropy_Circ_Deposit_Std",
          "Polarizance_Lin_Deposit_Mean", "Polarizance_Lin_Deposit_Std",
          "Polarizance_Circ_Deposit_Mean", "Polarizance_Circ_Deposit_Std",
          "Diattenuation_Lin_Deposit_Mean", "Diattenuation_Lin_Deposit_Std",
          "Diattenuation_Circ_Deposit_Mean", "Diattenuation_Circ_Deposit_Std",
          "Retardance_Lin_Deposit_Mean", "Retardance_Lin_Deposit_Std",
          "Retardance_Circ_Deposit_Mean", "Retardance_Circ_Deposit_Std",
          "A_metric_Deposit_Mean", "A_metric_Deposit_Std",
          "b_metric_Deposit_Mean", "b_metric_Deposit_Std",
          "t_metric_Deposit_Mean", "t_metric_Deposit_Std",
          "x_metric_Deposit_Mean", "x_metric_Deposit_Std",
          "FluoroSignal", "CrossedSignal"
        ]]
dfd.set_index(["RegionFolder", "Subject"], inplace=True)

# Training using Scikit learn 

In [177]:
# import basic fucntions
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import BorderlineSMOTE
# Train lda, support vector machine, random forest
from sklearn import svm
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as lda

## Prepare data

In [178]:
# select data_preparation scheme
def data_preparation_scheme(s_num):
    if s_num == 1: # train with background data
        dfct = dfd[dfd["CrossedSignal"]==1].copy()
        
        df_b = dfb.sample(n=658, random_state=42)
        
        oldname = ["Depolarization_Power_Background_Mean", "Depolarization_Power_Background_Std", 
                   "Q_metric_Background_Mean", "Q_metric_Background_Std",
                   "Anisotropy_Lin_Background_Mean", "Anisotropy_Lin_Background_Std",
                   "Anisotropy_Circ_Background_Mean", "Anisotropy_Circ_Background_Std",
                   "Polarizance_Lin_Background_Mean", "Polarizance_Lin_Background_Std",
                   "Polarizance_Circ_Background_Mean", "Polarizance_Circ_Background_Std",
                   "Diattenuation_Lin_Background_Mean", "Diattenuation_Lin_Background_Std",
                   "Diattenuation_Circ_Background_Mean", "Diattenuation_Circ_Background_Std",
                   "Retardance_Lin_Background_Mean", "Retardance_Lin_Background_Std",
                   "Retardance_Circ_Background_Mean", "Retardance_Circ_Background_Std",
                   "A_metric_Background_Mean", "A_metric_Background_Std",
                   "b_metric_Background_Mean", "b_metric_Background_Std",
                   "t_metric_Background_Mean", "t_metric_Background_Std",
                   "x_metric_Background_Mean", "x_metric_Background_Std",
                   "FluoroSignal", "CrossedSignal"]
        
        newname = ["Depolarization_Power_Deposit_Mean", "Depolarization_Power_Deposit_Std", 
                   "Q_metric_Deposit_Mean", "Q_metric_Deposit_Std",
                   "Anisotropy_Lin_Deposit_Mean", "Anisotropy_Lin_Deposit_Std",
                   "Anisotropy_Circ_Deposit_Mean", "Anisotropy_Circ_Deposit_Std",
                   "Polarizance_Lin_Deposit_Mean", "Polarizance_Lin_Deposit_Std",
                   "Polarizance_Circ_Deposit_Mean", "Polarizance_Circ_Deposit_Std",
                   "Diattenuation_Lin_Deposit_Mean", "Diattenuation_Lin_Deposit_Std",
                   "Diattenuation_Circ_Deposit_Mean", "Diattenuation_Circ_Deposit_Std",
                   "Retardance_Lin_Deposit_Mean", "Retardance_Lin_Deposit_Std",
                   "Retardance_Circ_Deposit_Mean", "Retardance_Circ_Deposit_Std",
                   "A_metric_Deposit_Mean", "A_metric_Deposit_Std",
                   "b_metric_Deposit_Mean", "b_metric_Deposit_Std",
                   "t_metric_Deposit_Mean", "t_metric_Deposit_Std",
                   "x_metric_Deposit_Mean", "x_metric_Deposit_Std",
                   "FluoroSignal", "CrossedSignal"]
        
        namedict = {oldname[i]: newname[i] for i in range(len(oldname))}
        
        df_b.rename(columns = namedict, inplace=True)
        df_b["FluoroSignal"] = np.zeros(df_b.shape[0], dtype=np.int32)
        df_b["CrossedSignal"] = np.zeros(df_b.shape[0], dtype=np.int32)
        df_r = pd.concat([dfct,df_b])
        
        # prepare data and the labels
        X_r = df_r.values[:,0:(df_r.shape[1]-2)]
        y = df_r["FluoroSignal"].values
        
        # Standardized X
        scaler = preprocessing.StandardScaler().fit(X_r)
        X = scaler.transform(X_r)
        # Split train set and test set
        X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
        return X_train, X_test, y_train, y_test
    
    if s_num == 2: # train with ADASYN algorithm
        dfct = dfd[dfd["CrossedSignal"]==1].copy()
        metric_data = dfct.values[:,0:(dfct.shape[1]-2)]
        metric_label = dfct["FluoroSignal"].values
        
        ada = ADASYN(random_state=42)
        [ada_data, ada_label] = ada.fit_resample(metric_data,metric_label)
        
        scaler = preprocessing.StandardScaler().fit(ada_data)
        ada_data_ = scaler.transform(ada_data)
        
        X_ada_train, X_ada_test, y_ada_train, y_ada_test = train_test_split(ada_data_, ada_label, test_size=0.2, random_state=42)
        return X_ada_train, X_ada_test, y_ada_train, y_ada_test
    
    if s_num == 3: # train using borderlineSMOTE
        dfct = dfd[dfd["CrossedSignal"]==1].copy()
        metric_data = dfct.values[:,0:(dfct.shape[1]-2)]
        metric_label = dfct["FluoroSignal"].values
        
        BS = BorderlineSMOTE(random_state=42)
        [bs_data, bs_label] = BS.fit_resample(metric_data, metric_label)
        
        scaler = preprocessing.StandardScaler().fit(bs_data)
        bs_data_ = scaler.transform(bs_data)
        
        X_bs_train, X_bs_test, y_bs_train, y_bs_test = train_test_split(bs_data_, bs_label, test_size=0.2, random_state=42)
        return X_bs_train, X_bs_test, y_bs_train, y_bs_test

## Support vector machine 

In [179]:
def train_svm(data_train, label_train):
    wdict_svm = {0: 1, 1: 1}
    clf_svm = svm.SVC(class_weight=wdict_svm)
    # optimize the parameters
    param_dist = {"kernel": ["rbf", "poly"], 
                  "degree": [1, 2, 3],
                  "gamma": sp_randint(0, 10), 
                  "shrinking": [True, False]
                  }
    n_iter_search = 30
    rs_svm = RandomizedSearchCV(clf_svm, param_distributions=param_dist,
                                      n_iter=n_iter_search, cv=10, n_jobs=-1)
    # train
    rs_svm.fit(data_train, label_train)
    return rs_svm

## Linear discriminant analysis

In [180]:
def train_lda(data_train,label_train):
    clf_lda = lda(store_covariance=True)
    
    # train
    clf_lda.fit(data_train,label_train)
    lda_cv_score = cross_val_score(clf_lda, data_train, label_train, cv=10, scoring="accuracy")
    return clf_lda, lda_cv_score

## Random forest 

In [181]:
def train_rf(data_train, label_train, std=False):
    wdict_rf = {0: 1, 1: 1}
    clf_rf = RandomForestClassifier(n_jobs=-1)
    param_dist = {"max_depth": sp_randint(4,10),
                  "n_estimators": sp_randint(100, 1000),
                  "bootstrap": [True, False],
                  "max_features": sp_randint(1, 6) if std else sp_randint(1,16),
                  "min_samples_split": sp_randint(2,15),
                  "criterion": ["gini", "entropy"]}
    n_iter_search = 100
    rs_rf = RandomizedSearchCV(clf_rf, param_distributions=param_dist,
                                      n_iter=n_iter_search, cv=10, n_jobs=-1)
    # train
    rs_rf.fit(data_train, label_train)
    return rs_rf

## Other function

In [182]:
# Define a function for inputing stats from classifier to dataframe
def metric_scores(m_clf,mname, truevalt, predictvalt,LDA=False,lda_cv_score=None):
    
    trueval = truevalt.copy()
    predictval = predictvalt.copy()
    accuracy = accuracy_score(trueval, predictval)
    precision = precision_score(trueval, predictval)
    recall = recall_score(trueval, predictval)
    specificity = recall_score(trueval, predictval, pos_label=0)
    # CV score of the best_estimator
    cvscore = np.mean(lda_cv_score) if LDA else m_clf.best_score_
        
    df_scores = pd.DataFrame({"Method": mname, "Accuracy": [accuracy], "Precision": [precision], "Sensitivity (Recall)": [recall], 
                              "Specificity": [specificity], "Mean accuracy": [cvscore]})
    return df_scores

def train_models(X_train_in, X_test_in, y_train_in, y_test_in,std=False):  
    df_result = pd.DataFrame({"Method": [], "Accuracy": [], "Precision": [], "Sensitivity (Recall)": [], 
                              "Specificity": [], "Mean accuracy": []})
    # lda
    rs_lda_t,lda_cv_score = train_lda(X_train_in, y_train_in)
    y_lda_predict = rs_lda_t.predict(X_test_in)
    df_temp1 = metric_scores(rs_lda_t, "LDA", y_test_in, y_lda_predict,LDA=True,lda_cv_score=lda_cv_score)
    df_result = df_result.append(df_temp1)
    
    # svm
    rs_svm_t = train_svm(X_train_in, y_train_in)
    y_svm_predict = rs_svm_t.predict(X_test_in)
    df_temp2 = metric_scores(rs_svm_t, "SVM", y_test_in, y_svm_predict)
    df_result = df_result.append(df_temp2)
    # rf
    rs_rf_t = train_rf(X_train_in, y_train_in,std=std)
    y_rf_predict = rs_rf_t.predict(X_test_in)    
    df_temp3 = metric_scores(rs_rf_t, "RF", y_test_in, y_rf_predict)
    df_result = df_result.append(df_temp3)
    return df_result, rs_lda_t, y_lda_predict, rs_svm_t, y_svm_predict, rs_rf_t, y_rf_predict

def fimportance_dataframe(name, score):
    feature_importance_table = pd.DataFrame({"Metric": [], "Importance in percentage":[]})
    for name, score in zip(name,score):
        feature_importance_add = pd.DataFrame({"Metric": name, "Importance in percentage":[score]})
        feature_importance_table = feature_importance_table.append(feature_importance_add)
    
    feature_importance_table = feature_importance_table[['Metric','Importance in percentage']]
    feature_importance_table = feature_importance_table.sort_values('Importance in percentage', ascending=False)
    
    return feature_importance_table

# Results 

## Using background data 

In [65]:
X_train_1, X_test_1, y_train_1, y_test_1 = data_preparation_scheme(s_num=1)

b_result_1, b_lda_model_1, b_lda_pred_val_1, b_svm_model_1, b_svm_pred_val_1, b_rf_model_1, b_rf_pred_val_1 = \
                                                train_models(X_train_1, X_test_1, y_train_1,y_test_1)

In [54]:
b_result_1.set_index(["Method"], inplace=True)
b_result_1 

Unnamed: 0_level_0,Accuracy,Mean accuracy,Precision,Sensitivity (Recall),Specificity
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LDA,0.93038,0.90964,0.93605,0.93605,0.92361
SVM,0.93671,0.92235,0.93182,0.95349,0.91667
RF,0.93987,0.93502,0.92737,0.96512,0.90972


In [59]:
b_rf_model_1.best_params_

{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 9,
 'max_features': 4,
 'min_samples_split': 5,
 'n_estimators': 161}

In [84]:
feature_names = list(dfd.columns[0:-2].values)
feature_names_1 = feature_names[:28]

feature_importance_1 = b_rf_model_1.best_estimator_.feature_importances_
importance_table_1 = fimportance_dataframe(feature_names_1,feature_importance_1)


importance_table_1.set_index(["Metric"])

Unnamed: 0_level_0,Importance in percentage
Metric,Unnamed: 1_level_1
Retardance_Lin_Deposit_Std,0.35419
Anisotropy_Lin_Deposit_Std,0.22956
Retardance_Lin_Deposit_Mean,0.1146
Anisotropy_Lin_Deposit_Mean,0.0497
Q_metric_Deposit_Mean,0.02464
x_metric_Deposit_Mean,0.0175
b_metric_Deposit_Std,0.01487
Diattenuation_Circ_Deposit_Std,0.01384
Polarizance_Circ_Deposit_Std,0.01295
Q_metric_Deposit_Std,0.01294


### After random forest selection 

In [213]:
selected_metrics = importance_table_1[importance_table_1.loc[:,'Importance in percentage'] > 0.01]["Metric"].values

In [214]:
selected_metrics

array(['Retardance_Lin_Deposit_Std', 'Anisotropy_Lin_Deposit_Std',
       'Retardance_Lin_Deposit_Mean', 'Anisotropy_Lin_Deposit_Mean',
       'Q_metric_Deposit_Mean', 'x_metric_Deposit_Mean',
       'b_metric_Deposit_Std', 'Diattenuation_Circ_Deposit_Std',
       'Polarizance_Circ_Deposit_Std', 'Q_metric_Deposit_Std',
       'Depolarization_Power_Deposit_Mean',
       'Depolarization_Power_Deposit_Std', 't_metric_Deposit_Std',
       'Diattenuation_Circ_Deposit_Mean', 'Diattenuation_Lin_Deposit_Std',
       'Polarizance_Circ_Deposit_Mean'], dtype=object)

In [215]:
len(selected_metrics)

16

In [216]:
dfd_1_selected = df[selected_metrics.tolist() + ["FluoroSignal", "CrossedSignal"]]

selected_metrics_background = [metric.replace('Deposit','Background') for metric in selected_metrics]
dfb_1_selected = df[selected_metrics_background + ["FluoroSignal", "CrossedSignal"]]

In [217]:
def new_preparation_background():
    dfct = dfd_1_selected[dfd_1_selected["CrossedSignal"]==1].copy()  
    
    df_b = dfb_1_selected.sample(n=658, random_state=42)
    oldname = df_b.columns.tolist()
    newname = selected_metrics.tolist() + ["FluoroSignal", "CrossedSignal"]
    namedict = {oldname[i]: newname[i] for i in range(len(oldname))}
    df_b.rename(columns=namedict, inplace=True)
    
    df_b["FluoroSignal"] = np.zeros(df_b.shape[0], dtype=np.int32)
    df_b["CrossedSignal"] = np.zeros(df_b.shape[0], dtype=np.int32)
    
    df_r = pd.concat([dfct,df_b])
        
    # prepare data and the labels
    X_r = df_r.values[:,0:(df_r.shape[1]-2)]
    y = df_r["FluoroSignal"].values
        
    # Standardized X
    scaler = preprocessing.StandardScaler().fit(X_r)
    X = scaler.transform(X_r)
    # Split train set and test set
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

In [218]:
X_train_selected, X_test_selected, y_train_selected, y_test_selected = new_preparation_background()

b_result_selected, b_lda_model_selected, b_lda_pred_val_selected, b_svm_model_selected, b_svm_pred_val_selected, b_rf_model_selected, b_rf_pred_val_selected = \
                                                train_models(X_train_selected, X_test_selected, y_train_selected,y_test_selected)

In [219]:
b_result_selected.set_index(["Method"], inplace=True)
b_result_selected 

Unnamed: 0_level_0,Accuracy,Mean accuracy,Precision,Sensitivity (Recall),Specificity
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LDA,0.92405,0.90173,0.93023,0.93023,0.91667
SVM,0.93038,0.91838,0.92614,0.94767,0.90972
RF,0.93671,0.93582,0.91758,0.97093,0.89583


## Using ADASYN

In [71]:
X_train_ada, X_test_ada, y_train_ada, y_test_ada = data_preparation_scheme(s_num=2)

ada_result_1, ada_lda_model_1, ada_lda_pred_val_1, ada_svm_model_1, ada_svm_pred_val_1, ada_rf_model_1, ada_rf_pred_val_1 = \
                                                train_models(X_train_ada, X_test_ada, y_train_ada,y_test_ada)

In [76]:
ada_result_1.set_index(["Method"], inplace=True)
ada_result_1

Unnamed: 0_level_0,Accuracy,Mean accuracy,Precision,Sensitivity (Recall),Specificity
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LDA,0.76972,0.7673,0.81818,0.6879,0.85
SVM,0.92744,0.93829,0.88953,0.97452,0.88125
RF,0.90221,0.93592,0.91447,0.88535,0.91875


In [167]:
ada_rf_model_1.best_params_

{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': 9,
 'max_features': 6,
 'min_samples_split': 3,
 'n_estimators': 727}

In [220]:
feature_names = list(dfd.columns[0:-2].values)
feature_names_2 = feature_names[:28]

feature_importance_2 = ada_rf_model_1.best_estimator_.feature_importances_
importance_table_2 = fimportance_dataframe(feature_names_2,feature_importance_2)

importance_table_2.set_index(["Metric"])

Unnamed: 0_level_0,Importance in percentage
Metric,Unnamed: 1_level_1
x_metric_Deposit_Mean,0.08745
Retardance_Lin_Deposit_Std,0.08552
Diattenuation_Circ_Deposit_Std,0.08035
Retardance_Lin_Deposit_Mean,0.06285
Polarizance_Circ_Deposit_Std,0.05558
Anisotropy_Lin_Deposit_Std,0.04126
Anisotropy_Lin_Deposit_Mean,0.03851
Retardance_Circ_Deposit_Std,0.03825
Q_metric_Deposit_Mean,0.03432
Polarizance_Lin_Deposit_Std,0.03398


## Using borderlineSMOTE 

In [75]:
X_train_BS, X_test_BS, y_train_BS, y_test_BS = data_preparation_scheme(s_num=3)

BS_result_1, BS_lda_model_1, BS_lda_pred_val_1, BS_svm_model_1, BS_svm_pred_val_1, BS_rf_model_1, BS_rf_pred_val_1 = \
                                                train_models(X_train_BS, X_test_BS, y_train_BS,y_test_BS)

In [77]:
BS_result_1.set_index(["Method"], inplace=True)
BS_result_1

Unnamed: 0_level_0,Accuracy,Mean accuracy,Precision,Sensitivity (Recall),Specificity
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LDA,0.80696,0.81542,0.85806,0.77326,0.84722
SVM,0.96203,0.93423,0.94444,0.98837,0.93056
RF,0.9462,0.92631,0.94798,0.95349,0.9375


In [221]:
BS_rf_model_1.best_params_

{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': 9,
 'max_features': 3,
 'min_samples_split': 15,
 'n_estimators': 406}

In [222]:
feature_names = list(dfd.columns[0:-2].values)
feature_names_3 = feature_names[:28]

feature_importance_3 = BS_rf_model_1.best_estimator_.feature_importances_
importance_table_3 = fimportance_dataframe(feature_names_3,feature_importance_3)

importance_table_3.set_index(["Metric"])

Unnamed: 0_level_0,Importance in percentage
Metric,Unnamed: 1_level_1
x_metric_Deposit_Mean,0.06604
Retardance_Lin_Deposit_Std,0.06568
Diattenuation_Circ_Deposit_Std,0.06113
Retardance_Lin_Deposit_Mean,0.05774
Anisotropy_Lin_Deposit_Mean,0.04664
Retardance_Circ_Deposit_Mean,0.04228
Polarizance_Lin_Deposit_Std,0.03933
Q_metric_Deposit_Std,0.03911
Anisotropy_Lin_Deposit_Std,0.03793
Depolarization_Power_Deposit_Mean,0.03647
