In [None]:
import numpy as np
import os
import cv2
import pandas as pd
from tqdm import tqdm
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [None]:
# Random seed; passed as `random_state` to the scikit-learn estimators further down.
# NOTE(review): no cell visible here seeds NumPy or TensorFlow with this value,
# so the CNN training is presumably not reproducible from it alone -- confirm.
SEED = 42 # random seed

## Loading the frames

In [None]:
# Table with Patient, Video ID, DiseaseID, and the link to the Video File.
# The two renamed columns get the names used throughout the rest of the notebook
# (note that 'Paciente ' carries a trailing space).
table_info = pd.read_excel("Lung_level_final_table.xlsx").rename(
    columns={
        'ID': 'Paciente ',
        "Diseased (Normal : 0 Abnormal : 1)": "DiseaseID",
    }
)
# "<patient>_<video>" identifier of each clip.
table_info['patient_video'] = table_info['Paciente '] + "_" + table_info['Video ID']

# Candidate frame folders -- pick one of these as `frame_folder` further down.
# MAX BRIGHTNESS FRAMES
frames_50_folder = "Final_Frames_from_Videos_cons50/"
frames_100_folder = "Final_Frames_from_Videos_cons100/"
# MIDDLE 50% FRAMES
frames_middle_50_skip_0_folder = "Frames_from_Videos_middle_50_skip_0/"
frames_middle_50_skip_1_folder = "Frames_from_Videos_middle_50_skip_1/"
frames_middle_50_skip_2_folder = "Frames_from_Videos_middle_50_skip_2/"

table_info

In [None]:
# Number of table rows per DiseaseID value (0 = normal, 1 = abnormal per the source column name).
table_info['DiseaseID'].value_counts()

In [None]:
import os
frame_folder = frames_middle_50_skip_0_folder # use the required frame folder

# One sub-folder per clip, named "<patient>_<video>". Anything that is not a
# directory, or whose name has no "_" to split on, is ignored: such entries
# cannot be clip folders and would otherwise crash the listing/splitting below.
# NOTE: os.listdir returns entries in arbitrary, filesystem-dependent order, and
# the row order of `train` (hence the train/validation split) follows that order.
patient_video_list = [
    entry
    for entry in os.listdir(frame_folder)
    if "_" in entry and os.path.isdir(os.path.join(frame_folder, entry))
]

# Create a mapping of values to their corresponding order
mapping = {value: index for index, value in enumerate(patient_video_list)}

# Temporarily replace the clip name by its position in the folder listing, sort
# the table by that position (clips without a folder map to NaN and sort last),
# and reset the index to keep a clean index order.
table_info['patient_video'] = table_info['patient_video'].map(mapping)
table_info = table_info.sort_values(by='patient_video').reset_index(drop=True)

# Restore the readable "<patient>_<video>" value that the mapping overwrote.
table_info['patient_video'] = table_info['Paciente '] + "_" + table_info['Video ID']

# One output row per frame file: patient ID, video ID and path of the frame.
file_path = []
paciente = []
video_id = []

for patient_video in patient_video_list:
    name_parts = patient_video.split("_")
    clip_folder = frame_folder + patient_video + '/'
    frame_list = os.listdir(clip_folder)
    for frame in frame_list:
        paciente.append(name_parts[0])
        video_id.append(name_parts[1])
        file_path.append(clip_folder + frame)


train = pd.DataFrame({"Paciente ":paciente, "Video ID": video_id, "File": file_path})

In [None]:
# Attach the clip-level DiseaseID to every frame row. The join is an inner join,
# so frames whose (patient, video) pair is absent from table_info are dropped.
# 'key' is the patient ID immediately followed by the video ID (no separator).
# NOTE(review): re-running this cell merges again on an already-merged `train`.
train = train.merge(
    table_info[['Paciente ', 'Video ID', 'DiseaseID']],
    how='inner',
    on=['Paciente ', 'Video ID'],
).assign(key=lambda frame: frame['Paciente '] + frame['Video ID'])
train

In [None]:
# Count of normal and abnormal frames (one row of `train` per frame file).
train['DiseaseID'].value_counts()

## Train test splitting

In [None]:
# Side length (pixels) every frame is resized to; frames are stored as IMAGE_SIZE x IMAGE_SIZE x 1.
IMAGE_SIZE =128 # fixing image size to 128 * 128 * 1
# Base directory that the relative frame paths in train['File'] are joined onto.
data_dir = os.getcwd()

def read_image(filepath):
    """Read the image at ``filepath`` (joined onto ``data_dir``) as grayscale.

    Grayscale loading keeps a single channel. The value returned is whatever
    ``cv2.imread`` returns, which is ``None`` when the file cannot be read.
    """
    full_path = os.path.join(data_dir, filepath)
    return cv2.imread(full_path, cv2.IMREAD_GRAYSCALE)

# Resize image to target size
def resize_image(image, image_size):
    """Resize a copy of ``image`` to ``image_size`` and append a channel axis.

    ``image_size`` is handed to ``cv2.resize`` unchanged and INTER_AREA
    interpolation is used. The input array itself is not modified.
    """
    resized = cv2.resize(image.copy(), image_size, interpolation=cv2.INTER_AREA)
    with_channel_axis = resized[:, :, np.newaxis]
    return with_channel_axis

# Pre-allocated frame tensor, one IMAGE_SIZE x IMAGE_SIZE x 1 slot per row of `train`.
# Slots of frames that cannot be read stay all-zero.
X_train = np.zeros((train.shape[0], IMAGE_SIZE, IMAGE_SIZE, 1))
unreadable_files = []

for i, file in tqdm(enumerate(train['File'].values), total=train.shape[0]):
    image = read_image(file)

    # read_image returns None for unreadable files; this must be checked BEFORE
    # indexing, otherwise the indexing itself raises TypeError.
    if image is None:
        unreadable_files.append(file)
        continue

    image = image[:, :, np.newaxis]
    X_train[i] = resize_image(image, (IMAGE_SIZE, IMAGE_SIZE))

if unreadable_files:
    print('Warning: {} frame(s) could not be read and were left as all-zero images'.format(len(unreadable_files)))

# Scale pixel values to [0, 1]. X_Train (capital T) holds ALL frames; the
# lower-case X_train name is re-bound to the training slice in a later cell.
X_Train = X_train / 255.
print('Train Shape: {}'.format(X_Train.shape))

Y_Train = train['DiseaseID'].values
print('Total target labels:',len(Y_Train))

# One-hot encode the binary label for the 2-unit softmax output.
Y_Train = to_categorical(Y_Train, num_classes=2) # num_classes: number of classes in target

In [None]:
# Clip keys in the order their frames appear in `train`.
# The split position below is a ROW position, so the clip list must follow row
# order. A groupby would sort the keys alphabetically, which only matches the
# row order by coincidence, because rows follow the folder listing order.
patient_videoids = train[['key']].drop_duplicates().reset_index(drop=True)

# Last clip belonging to the first 70% of clips; every frame up to and including
# its last frame goes to training, the rest to validation. This relies on the
# frames of one clip being contiguous in `train` and on `train` having a 0..N-1 index.
# NOTE(review): the split is per clip, not per patient, so clips of one patient
# can presumably end up on both sides -- confirm this is intended.
key_split = patient_videoids.iloc[int(np.floor(len(patient_videoids)*0.7)) - 1]['key']
train_len = max(train[(train['key'] == key_split)].index) + 1

X_train = X_Train[:train_len]
X_val = X_Train[train_len:]
Y_train = Y_Train[:train_len]
Y_val = Y_Train[train_len:]

In [None]:
# Copy of `train` with a 'Split' column: "train" for the first `train_len` rows,
# "test" for the remainder.
split_table = train.copy()
split_table['Split'] = "test"
# Single .loc assignment instead of chained `[...].iloc[...] =`, which raises
# SettingWithCopyWarning and does not write through under pandas Copy-on-Write.
split_table.loc[split_table.index[:train_len], 'Split'] = "train"

split_table

##### Note: the "train_len" variable (the number of leading rows of `train` assigned to the training split) is used in many calculations later!

## Modeling

In [None]:
# importing the required functions
from keras.layers import Input, Conv2D, concatenate, MaxPool2D, Dropout, Dense, Activation, Flatten, GlobalAveragePooling2D, BatchNormalization
from keras.models import Model
import tensorflow as tf
from keras.applications.densenet import DenseNet201

In [None]:
# image input initialization: single-channel 128 x 128 frames

IMG_HEIGHT = 128
IMG_WIDTH = 128
IMG_CHANNELS = 1

# Symbolic input tensor that every branch of the network starts from.
inputs = Input(shape=(IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS))

In [None]:
# neural network architecture
# Two custom convolutional branches that are cross-connected by concatenation,
# plus a DenseNet201 branch; the flattened custom features and the DenseNet
# features are concatenated and passed through a dense classification head.
# All convolutions use 3x3 kernels with 'same' padding, so only the 2x2
# max-pools change the spatial size.
s=inputs

# 1st branch
c1 = Conv2D(256, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(s)
p1=  MaxPool2D(pool_size=(2,2))(c1)
p1=  Dropout(0.2)(p1)

# keep the pooled 1st-branch tensor; it is refined separately further down
mid1 = p1

c1_1= Conv2D(128, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(p1)

# 2nd branch
c2=  Conv2D(128, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(s)
p2=   MaxPool2D(pool_size=(2,2))(c2)

# side path of the 2nd branch: two convolutions, dropout, then a second pool
# (P1_R is therefore at a quarter of the input resolution)
mid2 = p2
mid2 = Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(mid2)
mid2 = Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(mid2)
mid2 =  Dropout(0.2)(mid2)
P1_R = MaxPool2D(pool_size=(2,2))(mid2)

# concatenating from 1st and 2nd branch (both tensors are at half resolution)
R1=concatenate([c1_1,p2])

C1_R=Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(R1)
mid1 = Conv2D(128, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(mid1)
mid1 = Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(mid1)
mid1 =  Dropout(0.2)(mid1)
            
# merge the fused tensor with the refined 1st-branch side path
mid1_1 = concatenate([C1_R,mid1])

C11_R=Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(mid1_1)
C11_R=Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(C11_R)
C11_R = MaxPool2D(pool_size=(2,2))(C11_R)
C11_R =  Dropout(0.2)(C11_R)

# both inputs are now at a quarter of the input resolution
mid2_1 = concatenate([C11_R,P1_R])

mid2_1 = Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(mid2_1)
x = Conv2D(16, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(mid2_1)

# DenseNet201 backbone with ImageNet weights and without its classification top.
# Nothing in this cell freezes its layers.
densenet = DenseNet201(weights='imagenet', include_top=False)

# flattened output of the custom convolutional branches
x = (Flatten())(x)

# densenet branch
# the convolution maps the single-channel input to 3 channels before it is fed to DenseNet
c_b_1=Conv2D(3, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(s)

branch_2 = densenet(c_b_1)
    
branch_2 = GlobalAveragePooling2D()(branch_2)
branch_2= BatchNormalization()(branch_2)
branch_2 = Dropout(0.5)(branch_2)
branch_2= Dense(256, activation='relu')(branch_2)

# classification head on the concatenated features: 1024 -> 512 -> 128 -> 64 units.
# The Dropout that follows the 64-unit layer is model.layers[-2], which the
# feature-extraction cells later read to obtain the 64 features per frame.
final=concatenate([x,branch_2])
final = BatchNormalization()(final)
final = Dropout(0.2)(final)
final = Dense(1024, activation='relu')(final)
final= Dropout(0.2)(final)
final= Dense(512, activation='relu')(final)
final= Dropout(0.2)(final) 
final= Dense(128, activation='relu')(final)
final= Dropout(0.5)(final) 
final= Dense(64, activation='relu')(final)
final= Dropout(0.5)(final) 

# two-way softmax matching the one-hot labels produced by to_categorical
output = Dense(2,activation = 'softmax', name='root')(final)
      
model = Model(inputs,output)

# NOTE(review): epsilon=0.1 is far larger than typical Adam epsilon values -- confirm it is intentional.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.002, beta_1=0.9, beta_2=0.999, epsilon=0.1, weight_decay=0.0)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()

In [None]:
import time
start = time.time()

EPOCHS=15
BATCH_SIZE=16

# Create the output folder BEFORE training: model.save cannot create missing
# directories, and failing at that point would discard the whole training run.
os.makedirs("model_weights", exist_ok=True)

# steps_per_epoch uses floor division, so each epoch runs only full batches.
# No validation data is passed; evaluation happens in the cells below.
model.fit(X_train,Y_train,batch_size=BATCH_SIZE,steps_per_epoch=X_train.shape[0] // BATCH_SIZE, epochs=EPOCHS)
model.save("model_weights/model_CNN_16batch_middle50skip0_grey.h5") # saved model file name (HDF5)

# elapsed wall-clock time in minutes
print((time.time() - start)/60)

In [None]:
from tensorflow.keras.models import load_model

## Load the saved model file (HDF5 ".h5", not a pickle) that matches the frame set in use.
## The trailing notes are the author's recorded figures for each variant
## (presumably training time and training accuracy -- confirm).
# model = load_model('model_weights/model_CNN_16batch_middle50skip2_grey.h5') # middle 50% skip 2 - 3.41 hours, accuracy: 0.9978
# model = load_model('model_weights/model_CNN_16batch_middle50skip1_grey.h5') # middle 50% skip 1 - 4.61 hours, accuracy: 0.9965
model = load_model('model_weights/model_CNN_16batch_middle50skip0_grey.h5') # middle 50% skip 0 - 11.48 hours, accuracy: 0.9996

model.summary()

In [None]:
import time
start = time.time()

# Testing: class probabilities for every held-out frame (rows from train_len onward)
Y_pred = model.predict(X_val)

# Predicted label = rounded probability of class 1 (second softmax column);
# ground truth = DiseaseID of the same held-out rows.
Y_pred_labels = np.array([round(class_one_prob) for class_one_prob in Y_pred[:, 1]])
Y_val_labels = train['DiseaseID'].iloc[train_len:].values

# elapsed wall-clock time in minutes
print((time.time() - start)/60)

## Calculating metrics at Frame level for CNN + DenseNet model

In [None]:
# Accumulators for the frame-level metric cells below (one entry per cell run).
# Re-running this cell empties all of them.
Model_Frames_list, test_accuracy_list = [], []
test_precision_score_list, test_recall_score_list = [], []
test_f1_score_list = []

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 

# Frame-level metrics of the CNN on the held-out frames, recorded under the
# "skip 0" label. The scores come from whatever Y_val_labels / Y_pred_labels
# currently hold, so run this cell only when the skip-0 model and frames are loaded.
Model_Frames = "CNN Model with middle 50% frames skip 0 grey"

test_accuracy = accuracy_score(Y_val_labels, Y_pred_labels)
test_precision_score = precision_score(Y_val_labels, Y_pred_labels)
test_recall_score = recall_score(Y_val_labels, Y_pred_labels)
test_f1_score = f1_score(Y_val_labels, Y_pred_labels)

# record the label and scores in the shared accumulators
Model_Frames_list.append(Model_Frames)
test_accuracy_list.append(test_accuracy)
test_precision_score_list.append(test_precision_score)
test_recall_score_list.append(test_recall_score)
test_f1_score_list.append(test_f1_score)

print(f"Test Accuracy from CNN using whole data as test: {test_accuracy*100}%")
print(f"Test Precision from CNN using whole data as test: {test_precision_score*100}%")
print(f"Test Recall from CNN using whole data as test: {test_recall_score*100}%")
print(f"Test F1 Score from CNN using whole data as test: {test_f1_score*100}%")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 

# Frame-level metrics of the CNN on the held-out frames, recorded under the
# "skip 1" label. The scores come from whatever Y_val_labels / Y_pred_labels
# currently hold, so run this cell only when the skip-1 model and frames are loaded.
Model_Frames = "CNN Model with middle 50% frames skip 1 grey"

test_accuracy = accuracy_score(Y_val_labels, Y_pred_labels)
test_precision_score = precision_score(Y_val_labels, Y_pred_labels)
test_recall_score = recall_score(Y_val_labels, Y_pred_labels)
test_f1_score = f1_score(Y_val_labels, Y_pred_labels)

# record the label and scores in the shared accumulators
Model_Frames_list.append(Model_Frames)
test_accuracy_list.append(test_accuracy)
test_precision_score_list.append(test_precision_score)
test_recall_score_list.append(test_recall_score)
test_f1_score_list.append(test_f1_score)

print(f"Test Accuracy from CNN using whole data as test: {test_accuracy*100}%")
print(f"Test Precision from CNN using whole data as test: {test_precision_score*100}%")
print(f"Test Recall from CNN using whole data as test: {test_recall_score*100}%")
print(f"Test F1 Score from CNN using whole data as test: {test_f1_score*100}%")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 

# Frame-level metrics of the CNN on the held-out frames, recorded under the
# "skip 2" label. The scores come from whatever Y_val_labels / Y_pred_labels
# currently hold, so run this cell only when the skip-2 model and frames are loaded.
Model_Frames = "CNN Model with middle 50% frames skip 2 grey"

test_accuracy = accuracy_score(Y_val_labels, Y_pred_labels)
test_precision_score = precision_score(Y_val_labels, Y_pred_labels)
test_recall_score = recall_score(Y_val_labels, Y_pred_labels)
test_f1_score = f1_score(Y_val_labels, Y_pred_labels)

# record the label and scores in the shared accumulators
Model_Frames_list.append(Model_Frames)
test_accuracy_list.append(test_accuracy)
test_precision_score_list.append(test_precision_score)
test_recall_score_list.append(test_recall_score)
test_f1_score_list.append(test_f1_score)

print(f"Test Accuracy from CNN using whole data as test: {test_accuracy*100}%")
print(f"Test Precision from CNN using whole data as test: {test_precision_score*100}%")
print(f"Test Recall from CNN using whole data as test: {test_recall_score*100}%")
print(f"Test F1 Score from CNN using whole data as test: {test_f1_score*100}%")

## Calculating metrics at Clip level for CNN + DenseNet model

In [None]:
# Per-frame prediction table for the held-out rows: predicted label, true label,
# absolute error (0 = correct, 1 = wrong) and the clip the frame belongs to.
output_val = pd.DataFrame({
    'Predicted_CNN': Y_pred_labels,
    'DiseaseID': Y_val_labels,
})
output_val['Error_val'] = (output_val['DiseaseID'] - output_val['Predicted_CNN']).abs()

held_out_rows = slice(train_len, None)
output_val['Paciente '] = train['Paciente '].values[held_out_rows]
output_val['Video ID'] = train['Video ID'].values[held_out_rows]

In [None]:
# aggregating at Clip level
# Per clip: DiseaseID = sum of the frame labels (so > 0 means an abnormal clip),
# Error_val = number of misclassified frames, Count = number of frames,
# Error_percent = fraction of misclassified frames.
error_table = (
    output_val
    .groupby(['Paciente ', 'Video ID'])
    .agg(
        DiseaseID=('DiseaseID', 'sum'),
        Error_val=('Error_val', 'sum'),
        Count=('DiseaseID', 'count'),
    )
    .reset_index()
)
error_table['Error_percent'] = error_table['Error_val'] / error_table['Count']

error_table

In [None]:
Model_Frames_tag_list = []
accuracy_list = []
precision_score_list = []
recall_score_list = []
f1_score_list = []

# middle 50% frames skip 2 grey
# Clip-level confusion counts from error_table. DiseaseID there is the per-clip
# sum of frame labels, so "> 0" selects abnormal clips and "== 0" normal ones.
# A clip counts as misclassified when at least half (abnormal) / more than half
# (normal) of its frames are wrong, i.e. an exact 50% tie is read as "normal".
FN = len(error_table[(error_table['Error_percent'] >= 0.5)&(error_table['DiseaseID'] > 0)])
FP = len(error_table[(error_table['Error_percent'] > 0.5)&(error_table['DiseaseID'] == 0)])
TN = len(error_table[(error_table['Error_percent'] <= 0.5)&(error_table['DiseaseID'] == 0)])
TP = len(error_table[(error_table['Error_percent'] < 0.5)&(error_table['DiseaseID'] > 0)])

Model_Frames = "CNN Model with middle 50% frames skip 2 grey"
Model_Frames_tag_list.append(Model_Frames)

# Every ratio falls back to 0.0 when its denominator is zero (e.g. no clip is
# predicted abnormal) instead of raising ZeroDivisionError.
Accuracy = (TP+TN)/(TP+FP+FN+TN) if (TP+FP+FN+TN) else 0.0
accuracy_list.append(Accuracy)

Precision = TP/(TP+FP) if (TP+FP) else 0.0
precision_score_list.append(Precision)

Recall = TP/(TP+FN) if (TP+FN) else 0.0
recall_score_list.append(Recall)

F1_Score = 2*(Recall * Precision) / (Recall + Precision) if (Recall + Precision) else 0.0
f1_score_list.append(F1_Score)

In [None]:
# Summary table built from the lists filled by the clip-level metric cell run just before this one.
pd.DataFrame({'Model info':Model_Frames_tag_list, 'Accuracy':accuracy_list, 'Precision':precision_score_list, 'Recall':recall_score_list, 'F1 Score':f1_score_list})

In [None]:
Model_Frames_tag_list = []
accuracy_list = []
precision_score_list = []
recall_score_list = []
f1_score_list = []

# middle 50% frames skip 1 grey
# Clip-level confusion counts from error_table. DiseaseID there is the per-clip
# sum of frame labels, so "> 0" selects abnormal clips and "== 0" normal ones.
# A clip counts as misclassified when at least half (abnormal) / more than half
# (normal) of its frames are wrong, i.e. an exact 50% tie is read as "normal".
FN = len(error_table[(error_table['Error_percent'] >= 0.5)&(error_table['DiseaseID'] > 0)])
FP = len(error_table[(error_table['Error_percent'] > 0.5)&(error_table['DiseaseID'] == 0)])
TN = len(error_table[(error_table['Error_percent'] <= 0.5)&(error_table['DiseaseID'] == 0)])
TP = len(error_table[(error_table['Error_percent'] < 0.5)&(error_table['DiseaseID'] > 0)])

Model_Frames = "CNN Model with middle 50% frames skip 1 grey"
Model_Frames_tag_list.append(Model_Frames)

# Every ratio falls back to 0.0 when its denominator is zero (e.g. no clip is
# predicted abnormal) instead of raising ZeroDivisionError.
Accuracy = (TP+TN)/(TP+FP+FN+TN) if (TP+FP+FN+TN) else 0.0
accuracy_list.append(Accuracy)

Precision = TP/(TP+FP) if (TP+FP) else 0.0
precision_score_list.append(Precision)

Recall = TP/(TP+FN) if (TP+FN) else 0.0
recall_score_list.append(Recall)

F1_Score = 2*(Recall * Precision) / (Recall + Precision) if (Recall + Precision) else 0.0
f1_score_list.append(F1_Score)

In [None]:
# Summary table built from the lists filled by the clip-level metric cell run just before this one.
pd.DataFrame({'Model info':Model_Frames_tag_list, 'Accuracy':accuracy_list, 'Precision':precision_score_list, 'Recall':recall_score_list, 'F1 Score':f1_score_list})

In [None]:
Model_Frames_tag_list = []
accuracy_list = []
precision_score_list = []
recall_score_list = []
f1_score_list = []

# middle 50% frames skip 0 grey
# Clip-level confusion counts from error_table. DiseaseID there is the per-clip
# sum of frame labels, so "> 0" selects abnormal clips and "== 0" normal ones.
# A clip counts as misclassified when at least half (abnormal) / more than half
# (normal) of its frames are wrong, i.e. an exact 50% tie is read as "normal".
FN = len(error_table[(error_table['Error_percent'] >= 0.5)&(error_table['DiseaseID'] > 0)])
FP = len(error_table[(error_table['Error_percent'] > 0.5)&(error_table['DiseaseID'] == 0)])
TN = len(error_table[(error_table['Error_percent'] <= 0.5)&(error_table['DiseaseID'] == 0)])
TP = len(error_table[(error_table['Error_percent'] < 0.5)&(error_table['DiseaseID'] > 0)])

Model_Frames = "CNN Model with middle 50% frames skip 0 grey"
Model_Frames_tag_list.append(Model_Frames)

# Every ratio falls back to 0.0 when its denominator is zero (e.g. no clip is
# predicted abnormal) instead of raising ZeroDivisionError.
Accuracy = (TP+TN)/(TP+FP+FN+TN) if (TP+FP+FN+TN) else 0.0
accuracy_list.append(Accuracy)

Precision = TP/(TP+FP) if (TP+FP) else 0.0
precision_score_list.append(Precision)

Recall = TP/(TP+FN) if (TP+FN) else 0.0
recall_score_list.append(Recall)

F1_Score = 2*(Recall * Precision) / (Recall + Precision) if (Recall + Precision) else 0.0
f1_score_list.append(F1_Score)

In [None]:
# Summary table built from the lists filled by the clip-level metric cell run just before this one.
pd.DataFrame({'Model info':Model_Frames_tag_list, 'Accuracy':accuracy_list, 'Precision':precision_score_list, 'Recall':recall_score_list, 'F1 Score':f1_score_list})

## Extracting the 64-element feature vector from the second-to-last layer

In [None]:
# IMPORTANT NOTE: try this code, if this gives error or takes too long try the code below
# Runs every frame through the network, one patient at a time, and saves the
# output of the second-to-last layer (64 values per frame) as a CSV whose rows
# are in the SAME order as the rows of `train`.
import time
start = time.time()
from keras import backend

feature_df_file = "Feature_dataframes/Feature_CNN_16batch_middle50skip0_grey.csv"

# Create the output folder up front so the final write cannot fail on a missing directory.
os.makedirs(os.path.dirname(feature_df_file), exist_ok=True)

# creating column name list
colnames = ["col_" + str(i) for i in range(64)]

layer_name = model.layers[-2].name #for getting second last layer name

# Built once: the function does not depend on the patient, and re-creating it
# inside the loop repeats the same set-up work for every batch.
specific_layer_output = backend.function([model.layers[0].input], [model.get_layer(layer_name).output])

index_for_classif = []
count = 0
features = None  # allocated when the first batch reveals the output width and dtype

# the extraction happens for each Patient at a time.
for patient in train['Paciente '].unique():
    count+=1
    index_for_classif = list(train[train['Paciente ']==patient].index)

    print(len(index_for_classif))

    X_New = X_Train[index_for_classif]
    layer_output = np.asarray(specific_layer_output([X_New])[0])

    if features is None:
        features = np.zeros((train.shape[0], layer_output.shape[1]), dtype=layer_output.dtype)

    # Store each frame's features at that frame's own row position. A patient's
    # frames need not be contiguous in `train`; writing batches one after the
    # other would then shuffle the rows relative to `train`, while later cells
    # align the feature table with `train` purely by position.
    features[index_for_classif] = layer_output

if features is None:
    # `train` is empty: still produce a well-formed, header-only file
    features = np.zeros((0, len(colnames)))

# Single write, always with a header row.
X = pd.DataFrame(features, columns = colnames)
X.to_csv(feature_df_file, index = False)

# elapsed wall-clock time in minutes
print((time.time() - start)/60)

In [None]:
# IMPORTANT NOTE: if the code above doesn't work, run this      
import time
start = time.time()
from keras import backend

feature_df_file = "Feature_dataframes/Feature_CNN_16batch_middle50skip0_grey.csv"

colnames = []
# creating column name list
for i in range(64):
    colnames.append("col_" + str(i))
    
index_for_classif = []
X = pd.DataFrame(columns = colnames)

count = 0

layer_name = model.layers[-2].name #for getting second last layer name

# for skip 0 middle 50%, the extraction will happen for each Patient-VideoID combination at a time. This is to reduce the memory consumption
for key in train['key'].unique():
    count+=1
    index_for_classif = list(train[train['key']==key].index)
    
    print(len(index_for_classif))
    
    X_New = X_Train[index_for_classif]
    specific_layer_output = backend.function([model.layers[0].input], [model.get_layer(layer_name).output])
    layer_output = specific_layer_output([X_New])[0]
    
    df = pd.DataFrame(layer_output ,columns = colnames)
    X = pd.concat([X,df])

    if count == 2:
        X = X.reset_index(drop = True)
        X.to_csv(feature_df_file, index = False)

        X = pd.DataFrame(columns = colnames)
    elif count%2 == 0:
        X = X.reset_index(drop = True)
        # Save the updated DataFrame to the CSV file
        X.to_csv(feature_df_file, mode='a', header=False, index=False)

        X = pd.DataFrame(columns = colnames)

if count%2!=0:
    X = X.reset_index(drop = True)
    # Save the updated DataFrame to the CSV file
    X.to_csv(feature_df_file, mode='a', header=False, index=False)

    X = pd.DataFrame(columns = colnames)

print((time.time() - start)/60)

In [None]:
# IMPORTANT NOTE: if the code above doesn't work, run this      
import time
start = time.time()
from keras import backend

feature_df_file = "Feature_dataframes/Feature_CNN_16batch_middle50skip0_grey.csv"

colnames = []
# creating column name list
for i in range(64):
    colnames.append("col_" + str(i))
    
index_for_classif = []
X = pd.DataFrame(columns = colnames)

count = 0

layer_name = model.layers[-2].name #for getting second last layer name

# for skip 0 middle 50%, the extraction will happen for each Patient-VideoID combination at a time. This is to reduce the memory consumption
for key in train['key'].unique():
    count+=1
    index_for_classif = list(train[train['key']==key].index)
    
    print(len(index_for_classif))
    
    X_New = X_Train[index_for_classif]
    specific_layer_output = backend.function([model.layers[0].input], [model.get_layer(layer_name).output])
    layer_output = specific_layer_output([X_New])[0]
    
    df = pd.DataFrame(layer_output ,columns = colnames)
    X = pd.concat([X,df])

    if count == 1:
        X = X.reset_index(drop = True)
        X.to_csv(feature_df_file, index = False)

        X = pd.DataFrame(columns = colnames)
    else:
        X = X.reset_index(drop = True)
        # Save the updated DataFrame to the CSV file
        X.to_csv(feature_df_file, mode='a', header=False, index=False)

        X = pd.DataFrame(columns = colnames)

print((time.time() - start)/60)

## ML Classifier modelling at Frame level

In [None]:
# Saving File
# (kept for reference: one-shot saves of an in-memory feature table `X`, one line per frame set)

# # Middle 50%
# X.to_csv("Feature_dataframes/Feature_CNN_16batch_middle50skip2_grey.csv", index = False) # middle 50% skip 2 
# X.to_csv("Feature_dataframes/Feature_CNN_16batch_middle50skip1_grey.csv", index = False) # middle 50% skip 1
# X.to_csv("Feature_dataframes/Feature_CNN_16batch_middle50skip0_grey.csv", index = False) # middle 50% skip 0

# reading the feature csv written by the extraction cells above
# NOTE(review): later cells slice this table by position with `train_len`, so its
# rows are assumed to be in the same order as the rows of `train` -- confirm.
X = pd.read_csv("Feature_dataframes/Feature_CNN_16batch_middle50skip0_grey.csv") 
X

In [None]:
# Frame-level target: DiseaseID of every row of `train`, as a NumPy array.
y = train['DiseaseID'].values
y

In [None]:
# splitting train and test
# Same positional cut as the CNN split: the first `train_len` rows train the
# classifiers, the remaining rows test them. The feature frames get a fresh
# 0-based index.
X_training = X.iloc[:train_len].reset_index(drop=True)
X_testing = X.iloc[train_len:].reset_index(drop=True)
y_training, y_testing = y[:train_len], y[train_len:]

#### Modeling and Training

In [None]:
import time
start = time.time()

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# One entry per classifier is appended to every list of this dict.
dict_values_cv = {'Algorithm' : [], 'Parameters': [], 'Accuracy':[], 'F1':[], 'Precision':[], 'Recall':[] }

# Every classifier goes through exactly the same steps (5-fold grid search on
# the training features, prediction on the test features, metric bookkeeping),
# so the sweep is described as data and executed by a single loop.
# Each spec: (name stored in the results, estimator, parameter grid, progress message).
classifier_specs = [
    # 1. Logistic Regression
    # The default 'lbfgs' solver cannot fit an l1 penalty: those fits fail and
    # score NaN, which silently wastes half of a combined grid. The l1
    # candidates therefore get the 'liblinear' solver, while the l2 candidates
    # keep the default solver. The l2 grid is listed first so that it still
    # wins exact ties.
    (
        "Logistic Regression",
        LogisticRegression(random_state=SEED),
        [
            {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l2']},
            {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1'], 'solver': ['liblinear']},
        ],
        "Logistic Regression done!",
    ),
    # 2. Gaussian NB (nothing to tune; the grid search only cross-validates it)
    (
        "Gaussian Naive Bayes",
        GaussianNB(),
        {},
        "Gaussian NB done!",
    ),
    # 3. SVC
    (
        "SVM Classifier",
        SVC(random_state=SEED),
        {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'gamma': ['scale', 'auto'],
        },
        "SVC done!",
    ),
    # 4. Random Forest Classifier
    (
        "Random Forest Classifier",
        RandomForestClassifier(random_state=SEED),
        {
            'n_estimators': [50, 100, 200, 300],
            'max_depth': [None, 10, 20, 30, 50],
        },
        "Random Forest done!",
    ),
    # 5. Bagging Classifier (bagged linear SVCs)
    (
        "SVC Bagging Classifier",
        BaggingClassifier(estimator=SVC(kernel="linear"), random_state= SEED),
        {
            'n_estimators': [10, 50, 100, 200],
            'max_samples': [0.5, 1.0],
            'max_features': [0.5, 1.0],
        },
        "SVC Bagging done!",
    ),
    # 6. KNN Classifier
    (
        "K-Nearest Neighbors Classifier",
        KNeighborsClassifier(),
        {
            'n_neighbors': [3, 5, 7, 10, 20, 30],
            'weights': ['uniform', 'distance'],
        },
        "KNN done!",
    ),
    # 7. Decision Tree Classifier
    (
        "Decision Tree Classifier",
        DecisionTreeClassifier(random_state=SEED),
        {
            'criterion': ['gini', 'entropy'],
            'max_depth': [None, 10, 20, 30, 50],
            'min_samples_split': [2, 5, 10, 20],
            'min_samples_leaf': [1, 2, 4, 20],
        },
        "Decision Tree done!",
    ),
]

# NOTE: the loop deliberately keeps the notebook-level names `model`,
# `param_grid`, `grid_search`, `best_estimator` and `y_pred`, so after this
# cell they hold the values of the last classifier, exactly as before. In
# particular `model` no longer refers to the Keras network afterwards.
for algorithm_name, model, param_grid, done_message in classifier_specs:
    grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=2)
    grid_search.fit(X_training, y_training)

    # GridSearchCV (refit=True by default) has already re-fitted the best
    # configuration on the whole training set; fitting it again would only
    # repeat that work.
    best_estimator = grid_search.best_estimator_

    y_pred = best_estimator.predict(X_testing)

    dict_values_cv['Algorithm'].append(algorithm_name)
    dict_values_cv['Parameters'].append(grid_search.best_params_)
    dict_values_cv['Accuracy'].append(np.round(accuracy_score(y_testing, y_pred),4))
    dict_values_cv['F1'].append(np.round(f1_score(y_testing, y_pred, average='binary'),4))
    dict_values_cv['Precision'].append(np.round(precision_score(y_testing, y_pred, average='binary'),4))
    dict_values_cv['Recall'].append(np.round(recall_score(y_testing, y_pred, average='binary'),4))

    print(done_message)

# elapsed wall-clock time in minutes
print((time.time() - start)/60)

### Results

In [None]:
# Markdown rendering of the classifier results currently held in dict_values_cv
# (DataFrame.to_markdown needs the optional `tabulate` package).
print(pd.DataFrame(dict_values_cv).to_markdown()) # middle50%_skip0_grey

In [None]:
# Prints the same dict_values_cv as the other result cells; the label only
# holds if the sweep above was run on the skip-1 features.
print(pd.DataFrame(dict_values_cv).to_markdown()) # middle50%_skip1_grey

In [None]:
# Prints the same dict_values_cv as the other result cells; the label only
# holds if the sweep above was run on the skip-2 features.
print(pd.DataFrame(dict_values_cv).to_markdown()) # middle50%_skip2_grey

## ML Classifier modelling at Clip level

In [None]:
# Saving File
# (kept for reference: one-shot saves of an in-memory feature table `X`, one line per frame set)
# # Middle 50%
# X.to_csv("Feature_dataframes/Feature_CNN_16batch_middle50skip2_grey.csv", index = False) # middle 50% skip 2 
# X.to_csv("Feature_dataframes/Feature_CNN_16batch_middle50skip1_grey.csv", index = False) # middle 50% skip 1
# X.to_csv("Feature_dataframes/Feature_CNN_16batch_middle50skip0_grey.csv", index = False) # middle 50% skip 0

# read feature csv (frame-level features; they are averaged per clip further down)
X = pd.read_csv("Feature_dataframes/Feature_CNN_16batch_middle50skip0_grey.csv")

X

In [None]:
# Put the clip identifiers next to the frame features (aligned on the index,
# i.e. by row position) and keep the identifiers with the labels as well.
# NOTE(review): re-running this cell concatenates the identifier columns again.
X = pd.concat([X, train[['Paciente ', 'Video ID']]], axis=1)
y = train[['Paciente ', 'Video ID', 'DiseaseID']]

# splitting train and test: same positional cut at `train_len` as before;
# only the feature frames get a fresh 0-based index.
X_training = X.iloc[:train_len].reset_index(drop=True)
X_testing = X.iloc[train_len:].reset_index(drop=True)
y_training, y_testing = y.iloc[:train_len], y.iloc[train_len:]

#### Averaging out the feature values at Clip level

In [None]:
# Collapse the frame-level tables to one row per clip: features are averaged
# over the clip's frames, labels are de-duplicated. Both sides are sorted by
# (patient, video) so that feature row i and label i refer to the same clip.
# The prints show the rows of patient "R11" as a spot check of that alignment.

# --- training clips ---
X_training = (
    X_training
    .groupby(by=['Paciente ', 'Video ID'])
    .mean()
    .reset_index()
    .sort_values(by=['Paciente ', 'Video ID'])
    .reset_index(drop=True)
)
key_training = X_training[['Paciente ', 'Video ID']]
print(X_training[X_training['Paciente '] == "R11"])
X_training = X_training.drop(columns=['Paciente ', 'Video ID'])

y_training = (
    y_training[['Paciente ', 'Video ID', 'DiseaseID']]
    .drop_duplicates()
    .reset_index(drop=True)
    .sort_values(by=['Paciente ', 'Video ID'])
    .reset_index(drop=True)
)
print(y_training[y_training['Paciente '] == "R11"])
y_training = y_training['DiseaseID'].values

# --- testing clips ---
X_testing = (
    X_testing
    .groupby(by=['Paciente ', 'Video ID'])
    .mean()
    .reset_index()
    .sort_values(by=['Paciente ', 'Video ID'])
    .reset_index(drop=True)
)
key_testing = X_testing[['Paciente ', 'Video ID']]
print(X_testing[X_testing['Paciente '] == "R11"])
X_testing = X_testing.drop(columns=['Paciente ', 'Video ID'])

y_testing = (
    y_testing[['Paciente ', 'Video ID', 'DiseaseID']]
    .drop_duplicates()
    .reset_index(drop=True)
    .sort_values(by=['Paciente ', 'Video ID'])
    .reset_index(drop=True)
)
print(y_testing[y_testing['Paciente '] == "R11"])
y_testing = y_testing['DiseaseID'].values

# Clip-level tables for all clips: training clips first, then testing clips.
X = pd.concat([X_training, X_testing], ignore_index=True)
y = np.concatenate([y_training, y_testing])

#### Modeling and Training

In [None]:
# Clip-level classifier sweep: same procedure as at frame level, but fitted on
# the per-clip averaged features prepared above.
import time
start = time.time()  # start of the wall-clock timer for this cell

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# One entry per classifier is appended to every list of this dict
# (this replaces the frame-level results held under the same name).
dict_values_cv = {'Algorithm' : [], 'Parameters': [], 'Accuracy':[], 'F1':[], 'Precision':[], 'Recall':[] }

# 1. Logistic Regression
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}

model = LogisticRegression(random_state=SEED)

grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_training, y_training)

best_estimator = grid_search.best_estimator_

best_estimator.fit(X_training, y_training) # retrain on the training set

y_pred = best_estimator.predict(X_testing)

dict_values_cv['Algorithm'].append("Logistic Regression")
dict_values_cv['Parameters'].append(grid_search.best_params_)
dict_values_cv['Accuracy'].append(np.round(accuracy_score(y_testing, y_pred),4))
dict_values_cv['F1'].append(np.round(f1_score(y_testing, y_pred, average='binary'),4))
dict_values_cv['Precision'].append(np.round(precision_score(y_testing, y_pred, average='binary'),4))
dict_values_cv['Recall'].append(np.round(recall_score(y_testing, y_pred, average='binary'),4))

print("Logistic Regression done!")

# 2. Guassian NB
param_grid = {}

model = GaussianNB()

grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_training, y_training)

best_estimator = grid_search.best_estimator_

best_estimator.fit(X_training, y_training) # retrain on the training set

y_pred = best_estimator.predict(X_testing)

dict_values_cv['Algorithm'].append("Gaussian Naive Bayes")
dict_values_cv['Parameters'].append(grid_search.best_params_)
dict_values_cv['Accuracy'].append(np.round(accuracy_score(y_testing, y_pred),4))
dict_values_cv['F1'].append(np.round(f1_score(y_testing, y_pred, average='binary'),4))
dict_values_cv['Precision'].append(np.round(precision_score(y_testing, y_pred, average='binary'),4))
dict_values_cv['Recall'].append(np.round(recall_score(y_testing, y_pred, average='binary'),4))

print("Gaussian NB done!")

# 3. SVC
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

model = SVC(random_state=SEED)

grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_training, y_training)

best_estimator = grid_search.best_estimator_

best_estimator.fit(X_training, y_training) # retrain on the training set

y_pred = best_estimator.predict(X_testing)

dict_values_cv['Algorithm'].append("SVM Classifier")
dict_values_cv['Parameters'].append(grid_search.best_params_)
dict_values_cv['Accuracy'].append(np.round(accuracy_score(y_testing, y_pred),4))
dict_values_cv['F1'].append(np.round(f1_score(y_testing, y_pred, average='binary'),4))
dict_values_cv['Precision'].append(np.round(precision_score(y_testing, y_pred, average='binary'),4))
dict_values_cv['Recall'].append(np.round(recall_score(y_testing, y_pred, average='binary'),4))

print("SVC done!")

# 3. Random Forest Classifier
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30, 50]
}

model = RandomForestClassifier(random_state=SEED)

grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_training, y_training)

best_estimator = grid_search.best_estimator_

best_estimator.fit(X_training, y_training) # retrain on the training set

y_pred = best_estimator.predict(X_testing)

dict_values_cv['Algorithm'].append("Random Forest Classifier")
dict_values_cv['Parameters'].append(grid_search.best_params_)
dict_values_cv['Accuracy'].append(np.round(accuracy_score(y_testing, y_pred),4))
dict_values_cv['F1'].append(np.round(f1_score(y_testing, y_pred, average='binary'),4))
dict_values_cv['Precision'].append(np.round(precision_score(y_testing, y_pred, average='binary'),4))
dict_values_cv['Recall'].append(np.round(recall_score(y_testing, y_pred, average='binary'),4))

print("Random Forest done!")

# 4. Bagging Classifier
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_samples': [0.5, 1.0],
    'max_features': [0.5, 1.0]
}

model = BaggingClassifier(estimator=SVC(kernel="linear"), random_state= SEED)

grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_training, y_training)

best_estimator = grid_search.best_estimator_

best_estimator.fit(X_training, y_training) # retrain on the training set

y_pred = best_estimator.predict(X_testing)

dict_values_cv['Algorithm'].append("SVC Bagging Classifier")
dict_values_cv['Parameters'].append(grid_search.best_params_)
dict_values_cv['Accuracy'].append(np.round(accuracy_score(y_testing, y_pred),4))
dict_values_cv['F1'].append(np.round(f1_score(y_testing, y_pred, average='binary'),4))
dict_values_cv['Precision'].append(np.round(precision_score(y_testing, y_pred, average='binary'),4))
dict_values_cv['Recall'].append(np.round(recall_score(y_testing, y_pred, average='binary'),4))

print("SVC Bagging done!")

# 5. KNN Classifier
param_grid = {
    'n_neighbors': [3, 5, 7, 10, 20, 30],
    'weights': ['uniform', 'distance']
}

model = KNeighborsClassifier()

grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_training, y_training)

best_estimator = grid_search.best_estimator_

best_estimator.fit(X_training, y_training) # retrain on the training set

y_pred = best_estimator.predict(X_testing)

dict_values_cv['Algorithm'].append("K-Nearest Neighbors Classifier")
dict_values_cv['Parameters'].append(grid_search.best_params_)
dict_values_cv['Accuracy'].append(np.round(accuracy_score(y_testing, y_pred),4))
dict_values_cv['F1'].append(np.round(f1_score(y_testing, y_pred, average='binary'),4))
dict_values_cv['Precision'].append(np.round(precision_score(y_testing, y_pred, average='binary'),4))
dict_values_cv['Recall'].append(np.round(recall_score(y_testing, y_pred, average='binary'),4))

print("KNN done!")

# 5. Decision Tree Classifier
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 20]
}

model = DecisionTreeClassifier(random_state=SEED)

grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_training, y_training)

best_estimator = grid_search.best_estimator_

best_estimator.fit(X_training, y_training) # retrain on the training set

y_pred = best_estimator.predict(X_testing)

dict_values_cv['Algorithm'].append("Decision Tree Classifier")
dict_values_cv['Parameters'].append(grid_search.best_params_)
dict_values_cv['Accuracy'].append(np.round(accuracy_score(y_testing, y_pred),4))
dict_values_cv['F1'].append(np.round(f1_score(y_testing, y_pred, average='binary'),4))
dict_values_cv['Precision'].append(np.round(precision_score(y_testing, y_pred, average='binary'),4))
dict_values_cv['Recall'].append(np.round(recall_score(y_testing, y_pred, average='binary'),4))

print("Decision Tree done!")

print((time.time() - start)/60)

### Results

In [None]:
# Results table as markdown text. Run label: middle50%_skip0_grey
# (presumably the frame folder used for this run; confirm before re-running).
print(
    pd.DataFrame(data=dict_values_cv).to_markdown()
)

In [None]:
# Results table as markdown text. Run label: middle50%_skip1_grey
# (presumably the frame folder used for this run; confirm before re-running).
print(
    pd.DataFrame(data=dict_values_cv).to_markdown()
)

In [None]:
# Results table as markdown text. Run label: middle50%_skip2_grey
# (presumably the frame folder used for this run; confirm before re-running).
print(
    pd.DataFrame(data=dict_values_cv).to_markdown()
)

## For the best model

In [None]:
# NOTE(review): `best_estimator` is whichever estimator the training cell
# assigned last; confirm that is the model this section is meant to describe.

# Class probabilities for every clip (training clips followed by test clips).
y_pred_proba_all = best_estimator.predict_proba(X)

# Hard class predictions for the training clips alone and for all clips.
y_pred_train, y_pred_all = (
    best_estimator.predict(features) for features in (X_training, X)
)

In [None]:
# Training split: accuracy, number of clips, then a per-clip mask that is
# True wherever the prediction disagrees with the label.
print(np.round(accuracy_score(y_training, y_pred_train), 4))
print(len(y_training))
print(np.not_equal(y_training, y_pred_train))

# Test split: accuracy, disagreement mask, then number of clips.
# `y_pred` holds the test predictions left behind by the training cell.
print(np.round(accuracy_score(y_testing, y_pred), 4))
print(np.not_equal(y_testing, y_pred))
print(len(y_testing))

# Consistency checks: train + test stitched together should reproduce the
# combined arrays, so both masks are expected to be all False.
print(np.not_equal(np.concatenate((y_training, y_testing)), y))
print(np.not_equal(np.concatenate((y_pred_train, y_pred)), y_pred_all))



In [None]:
# Metrics over ALL clips. `X` / `y` contain the training clips as well as the
# test clips, so these numbers are not a held-out estimate.
print(np.round(accuracy_score(y, y_pred_all), 4))

# Precision, recall and F1 for the positive class, in that order.
for binary_metric in (precision_score, recall_score, f1_score):
    print(np.round(binary_metric(y, y_pred_all, average='binary'), 4))

In [None]:
# Tag every clip key with the split it belongs to.
# `assign` returns a new frame instead of writing a column into a frame that
# was obtained by column selection from another frame, which avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
key_training = key_training.assign(Split='train')
key_testing = key_testing.assign(Split='test')

In [None]:
# One row per clip: keys and split tag, true label, predicted label, and the
# highest class probability (i.e. the probability of the predicted class).
# Rows are training clips followed by test clips, matching `y` / `y_pred_all`.
clip_keys_all = [key_training, key_testing]
output_predictions = (
    pd.concat(clip_keys_all)
    .reset_index(drop = True)
    .assign(**{
        'DiseaseID': y,
        'Predictions': y_pred_all,
        'Probability/Confidence of the prediction': y_pred_proba_all.max(axis=1),
    })
)

print(output_predictions)

# Spot-check a single patient.
is_r11 = output_predictions['Paciente '] == 'R11'
print(output_predictions[is_r11])

In [None]:
# Write the per-clip predictions to an Excel file in the working directory,
# without the DataFrame index column.
output_predictions.to_excel(
    excel_writer='Prediction_output_with_confidence.xlsx',
    index=False,
)