# Chest X-Ray Model 1

Date: 8/18/2024

Author: Sylas Chacko

In [6]:
import numpy as np
import pandas as pd
import csv
import tensorflow as tf
import keras
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
import os
from pathlib import Path


## Data Preprocessing: Labels

Train Labels

In [1]:
def list_files_in_folder(folder_path, output_csv):
    """Write the names of the regular files directly inside a folder to a CSV.

    The CSV has a single ``File_Name`` header row followed by one row per
    file, in sorted order. Sub-directories are skipped and the listing is not
    recursive.

    Args:
        folder_path: Directory whose immediate files are listed.
        output_csv: Path of the CSV file to create (overwritten if present).

    Raises:
        OSError: If ``folder_path`` cannot be listed or ``output_csv`` cannot
            be opened for writing.
    """
    # os.listdir returns entries in arbitrary order; sort so that re-running
    # the cell always produces the same CSV.
    file_names = sorted(
        entry for entry in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, entry))
    )

    # Explicit UTF-8: pandas.read_csv decodes as UTF-8 by default, whereas
    # open() without an encoding falls back to the platform locale encoding.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['File_Name'])
        writer.writerows([name] for name in file_names)


# Write the listing to the exact path the label-matching cell reads it back
# from, so the result does not depend on the notebook's working directory.
folder_path = r'C:\Users\sylas\OneDrive\Projects\medical_images\data\sample\s_train_pipeline\s_train_final'
output_csv = r'C:\Users\sylas\OneDrive\Projects\medical_images\train_file_names.csv'
list_files_in_folder(folder_path, output_csv)


In [6]:
# Define paths to the CSV files and directory
labels_file_path = r'C:\Users\sylas\OneDrive\Projects\medical_images\data\sample_labels.csv'
file_names_file = r'C:\Users\sylas\OneDrive\Projects\medical_images\train_file_names.csv'
output_csv = r'C:\Users\sylas\OneDrive\Projects\medical_images\Train_Image_Labels.csv'

# Read the CSV files
labels_df = pd.read_csv(labels_file_path)
file_names_df = pd.read_csv(file_names_file)

# Index the files on disk by the first 12 characters of their name. Files
# sharing a prefix (the augmented versions of one image) end up in the same
# bucket, so each label row needs a single dictionary lookup instead of a
# scan over every file name.
files_by_prefix = {}
for file_name in file_names_df['File_Name']:
    files_by_prefix.setdefault(file_name[:12], []).append(file_name)

# Truncate the 'Image Index' in sample_labels.csv to the same 12-character key
labels_df['Truncated_Image_Index'] = labels_df['Image Index'].str[:12]

# One output record per (file, label) pair, in label-row order and, within a
# label row, in file-listing order.
data = [
    {'File_Name': match, 'Disease': disease}
    for prefix, disease in zip(labels_df['Truncated_Image_Index'], labels_df['Finding Labels'])
    for match in files_by_prefix.get(prefix, [])
]

# Report files that received no label instead of dropping them silently.
labelled_prefixes = set(labels_df['Truncated_Image_Index'].dropna())
unlabelled_count = sum(
    len(names) for prefix, names in files_by_prefix.items()
    if prefix not in labelled_prefixes
)
if unlabelled_count:
    print(f'Warning: {unlabelled_count} file(s) have no entry in the labels CSV and were left out.')

# Check if data was found and create a DataFrame
if data:
    df = pd.DataFrame(data)
    df.to_csv(output_csv, index=False)
    print('Image Labels CSV created at:', output_csv)
else:
    print('No matching data processed. Check file paths and file availability.')


Image Labels CSV created at: C:\Users\sylas\OneDrive\Projects\medical_images\Image_Labels.csv


Test Labels

In [4]:
def list_files_in_folder(folder_path, output_csv):
    """Write the names of the regular files directly inside a folder to a CSV.

    NOTE: a function with this name and signature is also defined in the
    'Train Labels' section; running this cell rebinds the name.

    The CSV has a single ``File_Name`` header row followed by one row per
    file, in sorted order. Sub-directories are skipped and the listing is not
    recursive.

    Args:
        folder_path: Directory whose immediate files are listed.
        output_csv: Path of the CSV file to create (overwritten if present).

    Raises:
        OSError: If ``folder_path`` cannot be listed or ``output_csv`` cannot
            be opened for writing.
    """
    # os.listdir returns entries in arbitrary order; sort so that re-running
    # the cell always produces the same CSV.
    file_names = sorted(
        entry for entry in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, entry))
    )

    # Explicit UTF-8: pandas.read_csv decodes as UTF-8 by default, whereas
    # open() without an encoding falls back to the platform locale encoding.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['File_Name'])
        writer.writerows([name] for name in file_names)


# Write the listing to the exact path the label-matching cell reads it back
# from, so the result does not depend on the notebook's working directory.
folder_path = r'C:\Users\sylas\OneDrive\Projects\medical_images\data\sample\s_test_pipeline\s_test_final'
output_csv = r'C:\Users\sylas\OneDrive\Projects\medical_images\test_file_names.csv'
list_files_in_folder(folder_path, output_csv)


In [5]:
# Define paths to the CSV files and directory
labels_file_path = r'C:\Users\sylas\OneDrive\Projects\medical_images\data\sample_labels.csv'
file_names_file = r'C:\Users\sylas\OneDrive\Projects\medical_images\test_file_names.csv'
output_csv = r'C:\Users\sylas\OneDrive\Projects\medical_images\Test_Image_Labels.csv'

# Read the CSV files
labels_df = pd.read_csv(labels_file_path)
file_names_df = pd.read_csv(file_names_file)

# Index the files on disk by the first 12 characters of their name. Files
# sharing a prefix (the augmented versions of one image) end up in the same
# bucket, so each label row needs a single dictionary lookup instead of a
# scan over every file name.
files_by_prefix = {}
for file_name in file_names_df['File_Name']:
    files_by_prefix.setdefault(file_name[:12], []).append(file_name)

# Truncate the 'Image Index' in sample_labels.csv to the same 12-character key
labels_df['Truncated_Image_Index'] = labels_df['Image Index'].str[:12]

# One output record per (file, label) pair, in label-row order and, within a
# label row, in file-listing order.
data = [
    {'File_Name': match, 'Disease': disease}
    for prefix, disease in zip(labels_df['Truncated_Image_Index'], labels_df['Finding Labels'])
    for match in files_by_prefix.get(prefix, [])
]

# Report files that received no label instead of dropping them silently.
labelled_prefixes = set(labels_df['Truncated_Image_Index'].dropna())
unlabelled_count = sum(
    len(names) for prefix, names in files_by_prefix.items()
    if prefix not in labelled_prefixes
)
if unlabelled_count:
    print(f'Warning: {unlabelled_count} file(s) have no entry in the labels CSV and were left out.')

# Check if data was found and create a DataFrame
if data:
    df = pd.DataFrame(data)
    df.to_csv(output_csv, index=False)
    print('Image Labels CSV created at:', output_csv)
else:
    print('No matching data processed. Check file paths and file availability.')


Image Labels CSV created at: C:\Users\sylas\OneDrive\Projects\medical_images\Test_Image_Labels.csv


## Model

In [7]:
# Paths to the preprocessed images and their label CSVs
train_images_path = r'C:\Users\sylas\OneDrive\Projects\medical_images\data\sample\s_train_pipeline\s_train_final'
# Must be the same folder that was listed to build test_file_names.csv
# ('s_test_final'). The image loader skips files it cannot find, so a
# misspelled folder yields an empty test set without raising.
test_images_path = r'C:\Users\sylas\OneDrive\Projects\medical_images\data\sample\s_test_pipeline\s_test_final'
train_labels_path = r'C:\Users\sylas\OneDrive\Projects\medical_images\Train_Image_Labels.csv'
test_labels_path = r'C:\Users\sylas\OneDrive\Projects\medical_images\Test_Image_Labels.csv'

# Function to load preprocessed images and labels
def load_images_and_labels(images_path, labels_df):
    """Load the ``.npy`` images listed in a label table together with their labels.

    Args:
        images_path: Directory containing the image files.
        labels_df: DataFrame with a ``File_Name`` column (file name inside
            ``images_path``) and a ``Disease`` column (the label).

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays in the row order of
        ``labels_df``. Rows whose file does not exist are left out of both
        arrays; a ``UserWarning`` reports how many were skipped.
    """
    import warnings

    images = []
    labels = []
    missing_count = 0

    # Iterate the two columns directly; no per-row Series construction needed.
    for file_name, disease_label in zip(labels_df['File_Name'], labels_df['Disease']):
        image_path = os.path.join(images_path, file_name)

        if not os.path.exists(image_path):
            missing_count += 1
            continue

        # Assumes each .npy file already holds an array of shape (224, 224, 3)
        # -- TODO confirm against the preprocessing pipeline.
        images.append(np.load(image_path))
        labels.append(disease_label)

    if missing_count:
        # Skipping stays best-effort, but make it visible: a wrong folder
        # would otherwise produce an empty dataset with no indication why.
        warnings.warn(
            f'{missing_count} of {len(labels_df)} listed file(s) were not found '
            f'in {images_path!r} and were skipped.'
        )

    return np.array(images), np.array(labels)

# Load labels
train_labels_df = pd.read_csv(train_labels_path)
test_labels_df = pd.read_csv(test_labels_path)

# Load the images and corresponding labels
X_train, y_train = load_images_and_labels(train_images_path, train_labels_df)
X_test, y_test = load_images_and_labels(test_images_path, test_labels_df)

if len(X_train) == 0:
    raise ValueError('No training images were loaded; check train_images_path and train_labels_path.')

# Work in float32: dividing an integer or float64 array by 255.0 would produce
# float64, doubling the memory needed to hold the image arrays.
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Scale to [0, 1] only when the pixels are still on a 0-255 scale, so data that
# was already normalized during preprocessing is not divided a second time.
# The decision is taken once, on the training set, so both splits are treated alike.
if X_train.max() > 1.0:
    X_train /= 255.0
    X_test /= 255.0

# Encode labels to integers. Classes are collected from both splits: a label
# string that occurs only in the test set would otherwise raise a KeyError.
# NOTE(review): if 'Finding Labels' can hold several findings in one string,
# each distinct combination becomes its own class here -- confirm that is intended.
class_names = sorted(set(y_train.tolist()) | set(y_test.tolist()))
label_mapping = {label: idx for idx, label in enumerate(class_names)}
y_train = np.array([label_mapping[label] for label in y_train.tolist()])
y_test = np.array([label_mapping[label] for label in y_test.tolist()])

# Convert labels to categorical (one-hot encoding)
y_train = to_categorical(y_train, num_classes=len(label_mapping))
y_test = to_categorical(y_test, num_classes=len(label_mapping))

# Pre-trained VGG16 convolutional stack: ImageNet weights, no classifier head
base_model = VGG16(include_top=False, weights='imagenet', input_shape=(224, 224, 3))

# Keep the pre-trained weights fixed; only the new head below gets trained
for frozen_layer in base_model.layers:
    frozen_layer.trainable = False

# Classification head: flatten -> 256-unit ReLU -> 50% dropout -> softmax over the classes
head_layers = [
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(len(label_mapping), activation='softmax'),
]
x = base_model.output
for head_layer in head_layers:
    x = head_layer(x)

# Full network: VGG16 input through to the new softmax output
model = Model(inputs=base_model.input, outputs=x)

# Compile for one-hot encoded targets
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Random augmentations applied to the training images as batches are drawn.
# NOTE(review): horizontal_flip mirrors the images left/right -- confirm that
# is acceptable for chest X-rays.
augmentation_options = {
    'rotation_range': 20,
    'width_shift_range': 0.2,
    'height_shift_range': 0.2,
    'shear_range': 0.2,
    'zoom_range': 0.2,
    'horizontal_flip': True,
    'fill_mode': 'nearest',
}
datagen = ImageDataGenerator(**augmentation_options)

# Train the model.
# Step counts are left for Keras to derive from the data. The previous
# `len(...) // 32` values evaluated to 0 for fewer than 32 samples, and
# `validation_steps` capped validation at whole batches of the in-memory
# test arrays, leaving the remainder unevaluated.
# Validation is skipped when no test images were loaded.
model.fit(
    datagen.flow(X_train, y_train, batch_size=32),
    validation_data=(X_test, y_test) if len(X_test) else None,
    epochs=20,
)

# Save the trained model
model.save('vgg16_chest_xray_model.h5')
print("Model training completed and saved as 'vgg16_chest_xray_model.h5'")

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
113/808 [===>..........................] - ETA: 1:07:47 - loss: 2.4226 - accuracy: 0.5749

KeyboardInterrupt: 