In [204]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
import cv2 
import os 
from random import shuffle 
from tqdm import tqdm 
from PIL import Image

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report, root_mean_squared_error

import warnings
warnings.filterwarnings('ignore')
import os

In [165]:
image_subset = []

with open("../data/CXR8/all_list.txt", 'r') as allImages:
    for line in allImages:
        image_subset.append(str.strip(line))
    
    allImages.close()

In [166]:
data_entry = pd.read_csv("../data/CXR8/Data_Entry_2017_v2020.csv")
data_filtered_entry = data_entry[data_entry['Image Index'].apply(lambda index: index in image_subset)]

In [167]:
data_filtered_entry = data_filtered_entry.drop(columns=['Finding Labels', 'Follow-up #'], axis=1)

In [168]:
#LongTailCXR
lt_test = pd.read_csv("../data/CXR8/LongTailCXR/nih-cxr-lt_single-label_test.csv")
lt_train = pd.read_csv("../data/CXR8/LongTailCXR/nih-cxr-lt_single-label_train.csv")

#PruneCXR
prune_test = pd.read_csv("../data/CXR8/PruneCXR/miccai2023_nih-cxr-lt_labels_test.csv")
prune_val = pd.read_csv("../data/CXR8/PruneCXR/miccai2023_nih-cxr-lt_labels_val.csv")
prune_train = pd.read_csv("../data/CXR8/PruneCXR/miccai2023_nih-cxr-lt_labels_train.csv")

In [169]:
prune_data = pd.concat([prune_test, prune_train, prune_val])
prune_filtered_data = prune_data[prune_data['id'].apply(lambda index: index in image_subset)]

In [170]:
lt_data = pd.concat([lt_test, lt_train])
lt_filtered_data = lt_data[lt_data['id'].apply(lambda index: index in image_subset)]

In [207]:
lt_filtered_data

Unnamed: 0,id,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,...,Pleural_Thickening,Pneumonia,Pneumothorax,Pneumoperitoneum,Pneumomediastinum,Subcutaneous Emphysema,Tortuous Aorta,Calcification of the Aorta,No Finding,subject_id
6,00000003_007.png,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,3
7,00000015_000.png,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,15
8,00000018_000.png,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,18
9,00000025_000.png,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,25
11,00000035_001.png,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68052,00030800_000.png,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,30800
68054,00030802_000.png,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,30802
68055,00030803_000.png,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,30803
68056,00030804_000.png,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,30804


In [171]:
train_images = "../data/CXR8/images/train_val_images"

test_images = "../data/CXR8/images/test_images"


In [172]:
label_columns = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
                 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 
                 'Infiltration', 'Mass', 'Nodule', 'Pleural Thickening', 
                 'Pneumonia', 'Pneumothorax', 'Pneumoperitoneum', 'Pneumomediastinum', 
                 'Subcutaneous Emphysema', 'Tortuous Aorta', 'Calcification of the Aorta', 'No Finding']

In [179]:
def train_test_preprocess_data(image_size):
    train_data = []
    train_labels = []

    test_data = []
    test_labels = []

    for image1 in tqdm(os.listdir(train_images)): 
        path = os.path.join(train_images, image1)
        img1 = cv2.imread(path, cv2.IMREAD_GRAYSCALE) 
        img1 = cv2.resize(img1, (image_size, image_size))
        train_data.append(img1)
        train_labels.append(prune_filtered_data[prune_filtered_data['id'] == image1][label_columns].to_numpy())
    
    for image2 in tqdm(os.listdir(test_images)): 
        path = os.path.join(test_images, image2)
        img2 = cv2.imread(path, cv2.IMREAD_GRAYSCALE) 
        img2 = cv2.resize(img2, (image_size, image_size))
        test_data.append(img2)
        test_labels.append(prune_filtered_data[prune_filtered_data['id'] == image2][label_columns].to_numpy()) 
    
    return train_data, np.array(train_labels), test_data, np.array(test_labels)

In [180]:
train_data, train_labels, test_data, test_labels = train_test_preprocess_data(image_size = 128)

100%|██████████| 28008/28008 [03:22<00:00, 138.06it/s]
100%|██████████| 2797/2797 [00:20<00:00, 137.48it/s]


In [181]:
x_data=np.concatenate((train_data,test_data),axis=0)

#min-max scaling
x_data = (x_data-np.min(x_data))/(np.max(x_data)-np.min(x_data))

In [185]:
test_labels

array([[[0, 0, 0, ..., 0, 0, 1]],

       [[0, 0, 0, ..., 0, 0, 1]],

       [[0, 0, 0, ..., 0, 0, 0]],

       ...,

       [[0, 0, 0, ..., 0, 0, 1]],

       [[0, 0, 0, ..., 0, 0, 1]],

       [[0, 0, 0, ..., 0, 0, 0]]])

In [186]:
y_data=np.concatenate((train_labels,test_labels),axis=0)

In [192]:
y_data = y_data.reshape(30805, 20)

In [193]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.15, random_state=42)
number_of_train = x_train.shape[0]
number_of_test = x_test.shape[0]

In [194]:
x_train_flatten = x_train.reshape(number_of_train,x_train.shape[1]*x_train.shape[2])
x_test_flatten = x_test .reshape(number_of_test,x_test.shape[1]*x_test.shape[2])
print("X train flatten",x_train_flatten.shape)
print("X test flatten",x_test_flatten.shape)

X train flatten (26184, 16384)
X test flatten (4621, 16384)


In [203]:
model=LogisticRegression(multi_class='multinomial', random_state=42, solver='lbfgs', max_iter=500)
model.fit(x_train_flatten, y_train)


# Make predictions
y_pred = model.predict(x_test_flatten)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


ValueError: y should be a 1d array, got an array of shape (26184, 20) instead.

In [197]:
md = 3
print("\nCreating decision tree max_depth=" + str(md))
model = tree.DecisionTreeClassifier(max_depth=md) 
model.fit(x_train_flatten, y_train)
print("Done ")

politic = model.predict(x_test_flatten)
# 3. evaluate
acc_train = model.score(x_test_flatten, y_test)


Creating decision tree max_depth=3
Done 


In [200]:
acc_train

0.6085262930101709

In [199]:
accuracy_score(politic, y_test)

0.6085262930101709

In [206]:
root_mean_squared_error(y_test, politic)

np.float64(0.15963961106674884)