In [97]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
import cv2 
import os 
from random import shuffle 
from tqdm import tqdm 
from PIL import Image
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report

import warnings
warnings.filterwarnings('ignore')
import os

In [98]:
image_subset = []

with open("../data/CXR8/all_list.txt", 'r') as allImages:
    for line in allImages:
        image_subset.append(str.strip(line))
    
    allImages.close()

In [99]:
data_entry = pd.read_csv("../data/CXR8/Data_Entry_2017_v2020.csv")
data_filtered_entry = data_entry[data_entry['Image Index'].apply(lambda index: index in image_subset)]

In [100]:
data_filtered_entry = data_filtered_entry.drop(columns=['Finding Labels', 'Follow-up #'], axis=1)

In [101]:
data_filtered_entry

Unnamed: 0,Image Index,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
2,00000001_002.png,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,2,80,M,PA,2500,2048,0.171,0.171
10,00000003_007.png,3,80,F,PA,2582,2905,0.143,0.143
12,00000004_000.png,4,82,M,AP,2500,2048,0.168,0.168
20,00000005_007.png,5,70,F,PA,2566,2681,0.143,0.143
...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,30801,38,M,PA,2048,2500,0.168,0.168
112116,00030802_000.png,30802,28,M,PA,2048,2500,0.168,0.168
112117,00030803_000.png,30803,42,F,PA,2048,2500,0.168,0.168
112118,00030804_000.png,30804,29,F,PA,2048,2500,0.168,0.168


In [102]:
#LongTailCXR
lt_test = pd.read_csv("../data/CXR8/LongTailCXR/nih-cxr-lt_single-label_test.csv")
lt_train = pd.read_csv("../data/CXR8/LongTailCXR/nih-cxr-lt_single-label_train.csv")

#PruneCXR
prune_test = pd.read_csv("../data/CXR8/PruneCXR/miccai2023_nih-cxr-lt_labels_test.csv")
prune_val = pd.read_csv("../data/CXR8/PruneCXR/miccai2023_nih-cxr-lt_labels_val.csv")
prune_train = pd.read_csv("../data/CXR8/PruneCXR/miccai2023_nih-cxr-lt_labels_train.csv")

In [103]:
prune_data = pd.concat([prune_test, prune_train, prune_val])
prune_filtered_data = prune_data[prune_data['id'].apply(lambda index: index in image_subset)]

In [104]:
lt_data = pd.concat([lt_test, lt_train])
lt_filtered_data = lt_data[lt_data['id'].apply(lambda index: index in image_subset)]

In [105]:
train_images = "../data/CXR8/images/train_val_images"

test_images = "../data/CXR8/images/test_images"


In [106]:
prune_filtered_data

Unnamed: 0,id,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,...,Pleural Thickening,Pneumonia,Pneumothorax,Pneumoperitoneum,Pneumomediastinum,Subcutaneous Emphysema,Tortuous Aorta,Calcification of the Aorta,No Finding,subj_id
46,00000013_046.png,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,13
107,00000032_060.png,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,32
108,00000072_000.png,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,72
112,00000092_003.png,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,92
153,00000116_040.png,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12528,00030762_000.png,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,30762
12529,00030771_000.png,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,30771
12530,00030776_000.png,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,30776
12531,00030781_000.png,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,30781


In [107]:
label_columns = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
                 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 
                 'Infiltration', 'Mass', 'Nodule', 'Pleural Thickening', 
                 'Pneumonia', 'Pneumothorax', 'Pneumoperitoneum', 'Pneumomediastinum', 
                 'Subcutaneous Emphysema', 'Tortuous Aorta', 'Calcification of the Aorta', 'No Finding']

In [110]:
def train_test_preprocess_data(image_size):
    train_data = []
    train_labels = []

    test_data = []
    test_labels = []

    for image1 in tqdm(os.listdir(train_images)): 
        path = os.path.join(train_images, image1)
        img1 = cv2.imread(path, cv2.IMREAD_GRAYSCALE) 
        img1 = cv2.resize(img1, (image_size, image_size))
        train_data.append(img1)
        train_labels.append(prune_filtered_data[prune_filtered_data['id'] == image1][label_columns].to_numpy())
    
    for image2 in tqdm(os.listdir(test_images)): 
        path = os.path.join(test_images, image2)
        img2 = cv2.imread(path, cv2.IMREAD_GRAYSCALE) 
        img2 = cv2.resize(img2, (image_size, image_size))
        test_data.append(img2)
        test_labels.append(prune_filtered_data[prune_filtered_data['id'] == image2][label_columns].to_numpy()) 
    
    return train_data, train_labels, test_data, test_labels

In [111]:
train_data, train_labels, test_data, test_labels = train_test_preprocess_data(image_size = 128)

100%|██████████| 28008/28008 [03:30<00:00, 133.28it/s]
100%|██████████| 2797/2797 [00:20<00:00, 135.41it/s]


In [112]:
x_data=np.concatenate((train_data,test_data),axis=0)
x_data = (x_data-np.min(x_data))/(np.max(x_data)-np.min(x_data))

In [134]:
y_data=train_labels + test_labels

In [132]:
y_data = np.reshape(x_data.shape[0], 1)

In [136]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.15, random_state=42)
number_of_train = x_train.shape[0]
number_of_test = x_test.shape[0]

In [137]:
x_train_flatten = x_train.reshape(number_of_train,x_train.shape[1]*x_train.shape[2])
x_test_flatten = x_test .reshape(number_of_test,x_test.shape[1]*x_test.shape[2])
print("X train flatten",x_train_flatten.shape)
print("X test flatten",x_test_flatten.shape)

X train flatten (26184, 16384)
X test flatten (4621, 16384)


In [143]:
model=LogisticRegression(multi_class='multinomial', random_state=42, solver='lbfgs', max_iter=500)
model.fit(x_train, y_train)


# Make predictions
y_pred = model.predict(x_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


ValueError: Found array with dim 3. None expected <= 2.

In [None]:
log_reg= LogisticRegression(C=1,penalty="l1")
log_reg.fit(x_train.T,y_train.T)
print("test accuracy: {} ".format(log_reg.fit(x_test.T, y_test.T).score(x_test.T, y_test.T)))
print("train accuracy: {} ".format(log_reg.fit(x_train.T, y_train.T).score(x_train.T, y_train.T)))

In [None]:
def initialize_weights_and_bias(dimension):
    w = np.full((dimension,1),0.01)
    b = 0.0
    return w, b

def sigmoid(z):
    y_head = 1/(1+np.exp(-z))
    return y_head

def forward_backward_propagation(w,b,x_train,y_train):
    # forward propagation
    z = np.dot(w.T,x_train) + b
    y_head = sigmoid(z)
    loss = -y_train*np.log(y_head)-(1-y_train)*np.log(1-y_head)
    cost = (np.sum(loss))/x_train.shape[1]
    # backward propagation
    derivative_weight = (np.dot(x_train,((y_head-y_train).T)))/x_train.shape[1]
    derivative_bias = np.sum(y_head-y_train)/x_train.shape[1]
    gradients = {"derivative_weight": derivative_weight,"derivative_bias": derivative_bias}
    return cost,gradients

def update(w, b, x_train, y_train, learning_rate,number_of_iterarion):
    cost_list = []
    cost_list2 = []
    index = []
    
    for i in range(number_of_iterarion):
        
        cost,gradients = forward_backward_propagation(w,b,x_train,y_train)
        cost_list.append(cost)
        
        w = w - learning_rate * gradients["derivative_weight"]
        b = b - learning_rate * gradients["derivative_bias"]
        if i % 100 == 0:
            cost_list2.append(cost)
            index.append(i)
            print ("Cost after iteration %i: %f" %(i, cost))
    
    parameters = {"weight": w,"bias": b}
    return parameters, gradients, cost_list

def predict(w,b,x_test):
    
    z = sigmoid(np.dot(w.T,x_test)+b)
    Y_prediction = np.zeros((1,x_test.shape[1]))

    for i in range(z.shape[1]):
        if z[0,i]<= 0.5:
            Y_prediction[0,i] = 0
        else:
            Y_prediction[0,i] = 1

    return Y_prediction

def logistic_regression(x_train, y_train, x_test, y_test, learning_rate ,  num_iterations):

    dimension =  x_train.shape[0]
    w,b = initialize_weights_and_bias(dimension)

    parameters, gradients, cost_list = update(w, b, x_train, y_train, learning_rate,num_iterations)
    
    y_prediction_test = predict(parameters["weight"],parameters["bias"],x_test)
    y_prediction_train = predict(parameters["weight"],parameters["bias"],x_train)
    
    print("Test Accuracy: {} %".format(round(100 - np.mean(np.abs(y_prediction_test - y_test)) * 100,2)))
    print("Train Accuracy: {} %".format(round(100 - np.mean(np.abs(y_prediction_train - y_train)) * 100,2)))

In [None]:
logistic_regression(x_train, y_train, x_test, y_test,learning_rate = 0.01, num_iterations = 500)

Cost after iteration 0: nan
Cost after iteration 100: 0.407350
Cost after iteration 200: 0.245656
Cost after iteration 300: 0.244309
Cost after iteration 400: 0.243111
Test Accuracy: 92.95 %
Train Accuracy: 93.11 %
