<a href="https://colab.research.google.com/github/vivaanjhaveri/chest-xray/blob/main/chest_xray.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chest XRay Classification

### UBC Medicine Datathon 2025



---



### Import Statements

In [2]:
import os
import sys
import csv
import random

import IPython
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import HTML

sys.path.append(os.path.join(os.path.abspath(".."), "code"))
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interact, interactive
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_validate, train_test_split
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.multioutput import MultiOutputClassifier

## Exploratory Data Analysis

### 1. Overview of the Dataset
   The dataset contains metadata about chest X‑ray images. Key columns include:

- **Image Index:** Identifier or filename of the image.
- **Finding Labels:** One or more findings per image (separated by `|`).
- **Patient Age:** Age of the patient.
- **Patient Gender:** Gender of the patient.
- And additional clinical information.

The cells below display the first few rows and provide summary statistics for further exploration.

- **BBox_List_2017.csv:** Contains bounding box coordinates for regions of interest in the images.
- **Data_Entry_2017.csv:** Contains class labels and additional patient metadata.

In [3]:
# bbox_df = pd.read_csv('/BBox_List_2017.csv')
# data_df = pd.read_csv('/Data_Entry_2017.csv')

# # Display the shape of the datasets
# print('BBox_list_2017.csv shape:', bbox_df.shape)
# print('Data_Entry_2017.csv shape:', data_df.shape)

# Display the first few rows of each dataset
# print('\nFirst five rows of BBox_list_2017.csv:')
# print(bbox_df.head())

# print('\nFirst five rows of Data_Entry_2017.csv:')
# print(data_df.head())

In [4]:
# Inspect dataset info and missing values
# print('--- BBox_list_2017.csv Info ---')
# print(bbox_df.info())
# print('Missing values in BBox_list_2017.csv:')
# print(bbox_df.isnull().sum())

# print('\n--- Data_Entry_2017.csv Info ---')
# print(data_df.info())
# print('Missing values in Data_Entry_2017.csv:')
# print(data_df.isnull().sum())

# # Analyze the distribution of disease classes in Data_Entry_2017
# class_counts = data_df['Finding Labels'].value_counts()
# print('\nDistribution of Disease Classes:')
# print(class_counts)

In [5]:
# # Step 4: Merge the two datasets on the "Image Index" column
# merged_df = pd.merge(data_df, bbox_df, on='Image Index', how='left')

# # Display the shape of the merged dataframe
# print('Merged DataFrame shape:', merged_df.shape)

# # Display the first few rows of the merged dataframe
# print(merged_df.head())

### 2. Analysis of Finding Labels:
In this section, we split the `Finding Labels` column (which may contain multiple labels separated by `|`), count the frequency of each finding, and then visualize the distribution.

In [6]:
from collections import Counter

# # Split the 'Finding Labels' column and explode the list into separate rows
# all_labels = metadata['Finding Labels'].str.split('|').explode()

# # Count the occurrences of each label
# label_counts = Counter(all_labels)

# # Convert the counts to a DataFrame for visualization
# labels_df = pd.DataFrame.from_dict(label_counts, orient='index', columns=['Count'])
# labels_df = labels_df.sort_values(by='Count', ascending=False)

# # print("Top 10 Findings:")
# print(labels_df.head(10))

### 3. Demographic Analysis

Next, we examine the patient demographics. We will visualize:

- **Patient Age Distribution:** Using a histogram with KDE.
- **Patient Gender Distribution:** Using a count plot.

In [7]:
# plt.figure(figsize=(12, 6))
# sns.barplot(x=labels_df.index, y=labels_df['Count'], palette='viridis')
# plt.xticks(rotation=45, ha='right')
# plt.title('Distribution of Findings in NIH Chest X‑ray Dataset')
# plt.xlabel('Finding')
# plt.ylabel('Count')
# plt.tight_layout()
# plt.show()

In [8]:
# plt.figure(figsize=(8, 4))
# sns.histplot(metadata['Patient Age'].dropna(), kde=True, bins=30)
# plt.title('Distribution of Patient Age')
# plt.xlabel('Age')
# plt.ylabel('Frequency')
# plt.tight_layout()
# plt.show()

In [9]:
# plt.figure(figsize=(6, 4))
# sns.countplot(x='Patient Gender', data=metadata, palette='Set2')
# plt.title('Patient Gender Distribution')
# plt.xlabel('Gender')
# plt.ylabel('Count')
# plt.tight_layout()
# plt.show()

### 4. Co-occurrence Analysis of Findings

In this section, we compute a co-occurrence matrix to analyze how often different findings appear together in the dataset. This helps to understand relationships between various pathological findings.

In [10]:
# import itertools

# # Get a sorted list of unique findings
# unique_findings = sorted(list(all_labels.unique()))

# # Initialize a DataFrame for the co-occurrence matrix
# co_occurrence = pd.DataFrame(0, index=unique_findings, columns=unique_findings)

# # Populate the matrix by iterating over each record
# for labels in metadata['Finding Labels'].dropna():
#     label_list = labels.split('|')
#     # Update counts for each combination of findings
#     for label1, label2 in itertools.combinations(label_list, 2):
#         co_occurrence.loc[label1, label2] += 1
#         co_occurrence.loc[label2, label1] += 1
#     # Also increment self-occurrence for each label
#     for label in label_list:
#         co_occurrence.loc[label, label] += 1

# # Plot the co-occurrence heatmap
# plt.figure(figsize=(10, 8))
# sns.heatmap(co_occurrence, cmap='viridis', linewidths=0.5)
# plt.title('Co-occurrence Matrix of Findings')
# plt.xlabel('Finding')
# plt.ylabel('Finding')
# plt.tight_layout()
# plt.show()



---



## Model Setup

In this section, we will create a train-test split and build a classification task. For demonstration purposes, we will use a simplified binary classification:

- **Normal vs. Abnormal:**  
  We define images with the label "No Finding" as *Normal* (0) and those with any other findings as *Abnormal* (1).

We will use patient demographics (e.g., Age and Gender) as features. Note that in practice you would typically extract image features from the actual chest X‑ray images, but here we use available metadata for demonstration.

We will then train:
- A baseline model using a `DummyClassifier`.
- A Support Vector Machine (SVM) classifier with an RBF kernel.
- Hyperparameter tuning will be performed using GridSearchCV or RandomizedSearchCV.

Finally, we compare both models using classification metrics such as accuracy, precision, recall, and F1-score.


## Data Splitting 

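Before any classifier is trained, the dataset is split into two pieces: a training set and a testing set.

The cell below prepares the data for that step: it loads the feature table (`xray_input.csv`), separates the `x*` feature columns from the 14 finding labels, and trains an autoencoder that compresses the features to a 100-dimensional representation.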



In [None]:

import pandas as pd


csv_file = "xray_input.csv"
df = pd.read_csv(csv_file)
label_cols = [
    "Atelectasis", "Consolidation", "Infiltration", "Pneumothorax", "Edema",
    "Emphysema", "Fibrosis", "Effusion", "Pneumonia", "Pleural_Thickening",
    "Cardiomegaly", "Nodule", "Mass", "Hernia",
]

# Create a l
# list of the x-columns
x_cols = [col for col in df.columns if col.startswith("x")]
X = df[x_cols][:]
y = df[label_cols][:]
print(X.shape, y.shape)

dim = 100 

# Convert the data to a PyTorch tensor
X_data_tensor = torch.tensor(X.to_numpy(), dtype=torch.float32)

# Create a dataset and dataloader for batch processing
dataset = TensorDataset(X_data_tensor)
dataloader = DataLoader(dataset, batch_size=dim, shuffle=True)

# Define the autoencoder architecture
class Autoencoder(nn.Module):
    def __init__(self, input_dim=2048, encoding_dim=dim):
        super(Autoencoder, self).__init__()
        # Encoder: reduces dimensionality
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, encoding_dim),
            nn.ReLU()
        )
        # Decoder: reconstructs the input from the encoded representation
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, input_dim),
            nn.Sigmoid()  # Using Sigmoid if your data is normalized between 0 and 1
        )
        
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Instantiate the model, loss function, and optimizer
input_dim = 2048
encoding_dim = 100  # Adjust as desired for compression
model = Autoencoder(input_dim, encoding_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
num_epochs = 10  # Adjust number of epochs based on your needs
for epoch in range(num_epochs):
    for batch in dataloader:
        inputs = batch[0]
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# After training, use the encoder part to get the compressed representation
with torch.no_grad():
    X_reduced_tensor = model.encoder(X_data_tensor)

print("Compressed representation shape:", X_reduced_tensor.shape)






(112120, 2048) (112120, 14)
Epoch 1/10, Loss: 0.02096376195549965
