### PyHealth Model PR
Authors: Aditya Asthana, Krish Desai

In [1]:
%pip install torch torchvision pandas matplotlib torchxrayvision

Collecting torch
  Using cached torch-2.2.2-cp38-none-macosx_10_9_x86_64.whl (150.6 MB)
Collecting torchvision
  Using cached torchvision-0.17.2-cp38-cp38-macosx_10_13_x86_64.whl (1.7 MB)
Collecting pandas
  Using cached pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl (11.7 MB)
Collecting matplotlib
  Downloading matplotlib-3.7.5-cp38-cp38-macosx_10_12_x86_64.whl (7.4 MB)
[K     |████████████████████████████████| 7.4 MB 2.5 MB/s eta 0:00:01
[?25hCollecting torchxrayvision
  Downloading torchxrayvision-1.3.4-py3-none-any.whl (29.0 MB)
[K     |████████████████████████████████| 29.0 MB 51.6 MB/s eta 0:00:01 0:00:01
[?25hCollecting networkx
  Using cached networkx-3.1-py3-none-any.whl (2.1 MB)
Collecting jinja2
  Downloading jinja2-3.1.6-py3-none-any.whl (134 kB)
[K     |████████████████████████████████| 134 kB 87.7 MB/s eta 0:00:01
[?25hCollecting sympy
  Using cached sympy-1.13.3-py3-none-any.whl (6.2 MB)
Collecting fsspec
  Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[K  

In [2]:
# **Cell 2: Imports & Globals**
# --------------------------------------------------------------
import os
import pandas as pd
import numpy as np
from pathlib import Path
from PIL import Image

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

import torchxrayvision as xrv
import matplotlib.pyplot as plt

# Filesystem layout
# Raw scans are read from this folder.
IMAGE_DIR = Path("data_sources") / "images"
# Generated CSV manifests are written here; make sure the folder exists up front.
MANIFEST_DIR = Path("outputs")
MANIFEST_DIR.mkdir(exist_ok=True, parents=True)


  from tqdm.autonotebook import tqdm


In [7]:
# **Revised Cell 3: Load Pretrained Model & Preprocessor (CPU, 1-channel)**
import torchxrayvision as xrv
import torch
from torchvision import transforms
from PIL import Image

# Load the TorchXRayVision DenseNet-121 with the "chex" pretrained weights
# (single-channel, 224x224 input).
model = xrv.models.DenseNet(weights="densenet121-res224-chex")
model = model.eval()  # inference only; keep on CPU (no .cuda())

# Preprocessing: resize → center-crop → grayscale → to-tensor → rescale.
# TorchXRayVision models expect pixel values in [-1024, 1024] (the range
# produced by xrv.datasets.normalize), not ImageNet-style mean/std
# normalisation, so the [0, 1] output of ToTensor is mapped linearly onto
# that range.
preprocess = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.Grayscale(num_output_channels=1),             # ensure 1 channel
    transforms.ToTensor(),                                   # shape [1,224,224], values in [0,1]
    transforms.Lambda(lambda t: (2.0 * t - 1.0) * 1024.0),   # [0,1] → [-1024,1024]
])


In [8]:
# **Revised Cell 4: Generate Pseudo-Labels (1-channel inputs)**
import os
import pandas as pd
import torch

# Only regular image files are labelled: IMAGE_DIR can also hold sub-folders
# (e.g. the domainB folder of augmented copies) and OS metadata files such as
# .DS_Store, none of which PIL can open.
IMAGE_SUFFIXES = {".png", ".jpg", ".jpeg"}

# Position of the pneumonia output; fixed for the model, so look it up once.
idx = model.pathologies.index("Pneumonia")

records = []
for fname in sorted(os.listdir(IMAGE_DIR)):
    path = IMAGE_DIR / fname
    if not path.is_file() or path.suffix.lower() not in IMAGE_SUFFIXES:
        continue
    # Open as grayscale and close the file handle as soon as the tensor exists.
    with Image.open(path) as img:
        tensor = preprocess(img.convert("L")).unsqueeze(0)  # shape [1,1,224,224]
    with torch.no_grad():
        scores = model(tensor).numpy().squeeze()            # one score per model.pathologies entry
    # With pretrained weights TorchXRayVision already applies a sigmoid and
    # rescales each output so that 0.5 is the operating threshold. Applying a
    # second sigmoid would squash every score into [0.5, 0.73] and label every
    # image positive, so the score is used directly.
    prob = float(scores[idx])
    label = int(prob >= 0.5)
    records.append({"image_path": str(path), "label": label})

# Explicit columns keep the manifest schema stable even if no image was found.
df_labels = pd.DataFrame(records, columns=["image_path", "label"])
df_labels.to_csv(MANIFEST_DIR/"pseudo_labels.csv", index=False)
df_labels.head()


[W NNPACK.cpp:64] Could not initialize NNPACK! Reason: Unsupported hardware.


Unnamed: 0,image_path,label
0,data_sources/images/00000001_000.png,1
1,data_sources/images/00000001_001.png,1
2,data_sources/images/00000001_002.png,1
3,data_sources/images/00000002_000.png,1
4,data_sources/images/00000003_000.png,1


#### 2. Create Two Domains
- **Domain A:** the original images, unchanged  
- **Domain B:** an augmented copy of every image (random brightness/contrast jitter followed by a Gaussian blur), saved to disk to simulate a second “dataset”  


In [11]:
# **Cell 5: Define Augmentation & Build Manifests**
# --------------------------------------------------------------
aug_transform = transforms.Compose([
    transforms.ColorJitter(brightness=0.5, contrast=0.5),
    transforms.GaussianBlur(5),
])

# prepare lists
manifests = {"A": [], "B": []}

for _, row in df_labels.iterrows():
    path, lbl = row["image_path"], row["label"]
    # Domain A record
    manifests["A"].append({"image_path": path, "label": lbl, "domain": 0})
    # Domain B: save augmented copy to a temp folder
    img = Image.open(path).convert("RGB")
    aug = aug_transform(img)
    save_path = IMAGE_DIR/"domainB"/os.path.basename(path)
    save_path.parent.mkdir(exist_ok=True)
    aug.save(save_path)
    manifests["B"].append({"image_path": str(save_path), "label": lbl, "domain": 1})

!pip install scikit-learn
# Split each into train/val/test (80/10/10)
from sklearn.model_selection import train_test_split
for dom, recs in manifests.items():
    df = pd.DataFrame(recs)
    train, temp = train_test_split(df, stratify=df.label, test_size=0.2, random_state=0)
    val, test = train_test_split(temp, stratify=temp.label, test_size=0.5, random_state=0)
    train.to_csv(MANIFEST_DIR/f"{dom}_train.csv", index=False)
    val.to_csv(MANIFEST_DIR/f"{dom}_val.csv", index=False)
    test.to_csv(MANIFEST_DIR/f"{dom}_test.csv", index=False)


Collecting scikit-learn
  Using cached scikit_learn-1.3.2-cp38-cp38-macosx_10_9_x86_64.whl (10.1 MB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Collecting joblib>=1.1.1
  Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.3.2 threadpoolctl-3.5.0
You should consider upgrading via the '/Users/adityaasthana/uiuc-mcs/CS598_Final_Report/cs598env/bin/python -m pip install --upgrade pip' command.[0m


In [12]:
# **Cell 6: Dataset Class**
# --------------------------------------------------------------
class ChestDataset(Dataset):
    """Image dataset backed by a manifest CSV.

    The manifest must provide the columns ``image_path``, ``label`` and
    ``domain``; each item is the transformed image together with the label
    and domain values of its row.

    Args:
        manifest_csv: Path of the CSV file to read with ``pd.read_csv``.
        transform: Callable applied to the RGB PIL image. When ``None`` a
            default resize → center-crop → to-tensor → normalize pipeline
            is used.
    """

    def __init__(self, manifest_csv, transform=None):
        self.df = pd.read_csv(manifest_csv)
        # Compare against None rather than relying on truthiness: a valid
        # transform may be falsy (e.g. an empty nn.Sequential) and must not be
        # silently replaced by the default pipeline.
        if transform is None:
            transform = transforms.Compose([
                transforms.Resize(224),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                # A single mean/std value is applied to every channel.
                transforms.Normalize(mean=[0.485], std=[0.229]),
            ])
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the manifest."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label, domain)`` for the row at position ``idx``."""
        row = self.df.iloc[idx]
        # The context manager closes the file handle once the pixels have been
        # converted, so long training runs do not accumulate open files.
        with Image.open(row.image_path) as raw:
            img = self.transform(raw.convert("RGB"))
        return img, row.label, row.domain

def make_loader(dom, split, bs=32, shuffle=True):
    """Create a DataLoader for one domain/split manifest.

    Args:
        dom: Domain key used in the manifest file name.
        split: Split name used in the manifest file name.
        bs: Batch size.
        shuffle: Whether the loader shuffles the samples.

    Returns:
        A DataLoader over ``MANIFEST_DIR/{dom}_{split}.csv``.
    """
    manifest_csv = MANIFEST_DIR/f"{dom}_{split}.csv"
    dataset = ChestDataset(manifest_csv)
    return DataLoader(dataset, shuffle=shuffle, batch_size=bs)

In [13]:
# **Cell 7 (CPU only): Model Factory & Training Loop**
import torch.nn as nn
import torch.optim as optim

def make_model():
    """Build a randomly initialised DenseNet-121 with a single-logit head.

    Returns:
        The torchvision DenseNet-121 whose classifier has been replaced by a
        ``Linear(in_features, 1)`` layer. The model is left on the CPU.
    """
    # `weights=None` replaces the deprecated `pretrained=False` flag
    # (torchvision >= 0.13) and likewise loads no pretrained parameters.
    m = models.densenet121(weights=None)
    m.classifier = nn.Linear(m.classifier.in_features, 1)
    return m  # CPU model

def train_erm(domains, max_steps=5000, lr=1e-4):
    """Train one model on the pooled training data of several domains (ERM).

    Every step performs one optimizer update per domain, drawing the next
    batch from that domain's training loader and restarting the loader when
    it is exhausted.

    Args:
        domains: Iterable of domain keys understood by ``make_loader``.
        max_steps: Number of outer steps (each covers every domain once).
        lr: Learning rate for the Adam optimizer.

    Returns:
        The trained model (on the CPU).
    """
    loaders = {d: make_loader(d, "train") for d in domains}
    iters = {d: iter(loader) for d, loader in loaders.items()}
    model = make_model()            # CPU
    model.train()
    opt = optim.Adam(model.parameters(), lr=lr)
    crit = nn.BCEWithLogitsLoss()

    for step in range(max_steps):
        for d in domains:
            try:
                imgs, labs, _ = next(iters[d])
            except StopIteration:
                # Loader exhausted: start a new pass over this domain.
                iters[d] = iter(loaders[d])
                imgs, labs, _ = next(iters[d])
            opt.zero_grad()
            # Drop only the logit dimension. A bare .squeeze() would also drop
            # the batch dimension when the last batch holds a single sample,
            # making the prediction 0-d and the loss raise a shape mismatch.
            preds = model(imgs).squeeze(1)
            loss = crit(preds, labs.float())
            loss.backward()
            opt.step()
        if (step+1) % 1000 == 0:
            print(f"Step {step+1}/{max_steps}")
    return model
