# Project Baseline Setup:

### Architecture
Run pre-training with contrastive learning (MoCo framework)
* Use backbone - `ResNet50`
* **Perform SSL pre-training** of the backbone using contrastive learning (MoCo): the pretext task augments the medical imaging dataset (chest X-rays) to create positive and negative pairs
  * Produce: `moco_resnet50_encoder.pth`
* **Transfer Learning:** Fine-tune pre-trained ResNet for Pneumonia Chest XR classification
  * Produce: `finetuned_resnet50_medical.pth`

### Pre-training Dataset: CheXpert
* Subset: Pneumonia classification only; smaller dataset (to accommodate class imbalance)

### Fine-tuning Dataset: NIH Pneumonia Dataset

###

In [1]:
# import libraries
import os
import sys
import argparse
from tqdm import tqdm

import numpy as np
import seaborn as sns
import pandas as pd
import math
from copy import deepcopy

%matplotlib inline
import matplotlib.pyplot as plt
from IPython.display import display

import pickle
from datetime import datetime

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, datasets
from torch.utils.data import random_split, DataLoader

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score

from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import label_binarize

# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Colab Needs

In [2]:
# Attach Google Drive to the Colab VM so the project files under MyDrive are reachable.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Define filepaths to required input (scripts, data) and outputs

In [None]:
# ----------------------------------------------------
# Inputs root
# ----------------------------------------------------
# Google Drive folder holding the project's data, sources and artifacts.
# Note the trailing "/": the paths below are built by plain concatenation.
FP_ROOT = "/content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/"

# ----------------------------------------------------
# Dataset info
# ----------------------------------------------------
# Image archives (zip files)
# NIH
NIH_DATASET_PATH_SPLIT = FP_ROOT + "Data/NIH_Chest_XR_Pneumonia.zip"
# CheXpert
DATASET_PATH_SPLIT = FP_ROOT + "Data/CheXpert_reduced_dataset_split.zip"

# Labels: exactly one of the two groups below should be active at a time,
# because both bind the same three variable names.
# NIH
TRAIN_LABELS_CSV = FP_ROOT + "Data/nih_train.csv"
VAL_LABELS_CSV = FP_ROOT + "Data/nih_val.csv"
TEST_LABELS_CSV = FP_ROOT + "Data/nih_test.csv"

# CheXpert (uncomment these and comment out the NIH group to switch datasets)
# TRAIN_LABELS_CSV = FP_ROOT + "Data/final_project_updated_names_train.csv"
# VAL_LABELS_CSV = FP_ROOT + "Data/final_project_updated_names_val.csv"
# TEST_LABELS_CSV = FP_ROOT + "Data/final_project_updated_names_test.csv"

# ----------------------------------------------------
# Model SRC
# ----------------------------------------------------
SRC_ROOT = FP_ROOT + "src/"
# SRC_ROOT already ends with "/", so use os.path.join instead of f"{SRC_ROOT}/..."
# to avoid producing "src//train_moco.py" and "src//moco".
# NOTE(review): the pre-training cell launches "moco/train_moco.py" from src/ —
# confirm whether TRAIN_SCRIPT should point inside the moco/ folder instead.
TRAIN_SCRIPT = os.path.join(SRC_ROOT, "train_moco.py")
MOCO_FOLDER = os.path.join(SRC_ROOT, "moco")

# ----------------------------------------------------
# Outputs
# ----------------------------------------------------
# Directory passed to the training scripts as --artifact_root.
ROOT_ARTIFACT_SAVE = FP_ROOT + "artifacts/"

In [4]:
# Make the project's src/ directory importable; skip when it is already registered
# so re-running the cell does not add a duplicate entry.
_src_on_path = SRC_ROOT in sys.path
if not _src_on_path:
    sys.path.append(SRC_ROOT)
    print(f"Added {SRC_ROOT} to sys.path")

Added /content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/src/ to sys.path


In [5]:
# Debug check: print the interpreter's module search path to confirm SRC_ROOT was registered.
print(sys.path)

['/content', '/env/python', '/usr/lib/python312.zip', '/usr/lib/python3.12', '/usr/lib/python3.12/lib-dynload', '', '/usr/local/lib/python3.12/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.12/dist-packages/IPython/extensions', '/root/.ipython', '/tmp/tmp8w85yjzj', '/content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/src/']


## Unzip data

NIH

In [5]:
# Unzip the dataset (image) files to /tmp
DATA_DEST_UNZIPPED = "/tmp/NIH_Chest/"
os.makedirs(DATA_DEST_UNZIPPED, exist_ok=True)
!unzip "{NIH_DATASET_PATH_SPLIT}" -d {DATA_DEST_UNZIPPED} # need to use "" to accomdate space

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0481-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0482-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0483-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0484-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0485-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0486-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0487-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0488-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0489-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0490-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0491-0001-0001.jpeg  
  infl

In [5]:
# The archive extracts into a top-level NIH_Chest_XR_Pneumonia/ folder,
# so re-point the dataset root at that folder.
DATA_DEST_UNZIPPED = "/tmp/NIH_Chest/NIH_Chest_XR_Pneumonia"

# Training split: the train/ sub-folder of the dataset root.
DATA_DEST_UNZIPPED_TRAIN = f"{DATA_DEST_UNZIPPED}/train"

CheXpert

In [None]:
# Unzip the dataset (image) files to /tmp
DATA_DEST_UNZIPPED = "/tmp/CheXpert_dataset/"
os.makedirs(DATA_DEST_UNZIPPED, exist_ok=True)
!unzip "{DATASET_PATH_SPLIT}" -d {DATA_DEST_UNZIPPED} # need to use "" to accomdate space

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /tmp/CheXpert_dataset/CheXpert_reduced_dataset_split/train/patient40777_study2_view1_frontal.jpg  
  inflating: /tmp/CheXpert_dataset/CheXpert_reduced_dataset_split/train/patient40779_study2_view1_frontal.jpg  
  inflating: /tmp/CheXpert_dataset/CheXpert_reduced_dataset_split/train/patient40779_study2_view2_frontal.jpg  
  inflating: /tmp/CheXpert_dataset/CheXpert_reduced_dataset_split/train/patient40781_study1_view1_frontal.jpg  
  inflating: /tmp/CheXpert_dataset/CheXpert_reduced_dataset_split/train/patient40788_study1_view1_frontal.jpg  
  inflating: /tmp/CheXpert_dataset/CheXpert_reduced_dataset_split/train/patient40788_study6_view1_frontal.jpg  
  inflating: /tmp/CheXpert_dataset/CheXpert_reduced_dataset_split/train/patient40791_study2_view1_frontal.jpg  
  inflating: /tmp/CheXpert_dataset/CheXpert_reduced_dataset_split/train/patient40796_study1_view1_frontal.jpg  
  inflating: /tmp/CheXpert_dataset/CheX

In [None]:
# The archive extracts into a top-level CheXpert_reduced_dataset_split/ folder,
# so re-point the dataset root at that folder (trailing "/" kept).
# NOTE(review): the NIH cell binds these same two names; whichever cell ran last
# decides which image root the later training cells receive.
DATA_DEST_UNZIPPED = "/tmp/CheXpert_dataset/CheXpert_reduced_dataset_split/"

# Training split: the train sub-folder of the dataset root.
DATA_DEST_UNZIPPED_TRAIN = f"{DATA_DEST_UNZIPPED}train"

## 1) Run Pre-training - Contrastive Learning

In [6]:
%cd "/content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/src"

/content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/src


In [None]:
# Hyperparams
# Mini-batch size handed to moco/train_moco.py through its --batch_size flag.
batch_size = 64

In [None]:
! python moco/train_moco.py \
    --train_csv_path "$TRAIN_LABELS_CSV" \
    --root_dir "$DATA_DEST_UNZIPPED_TRAIN" \
    --artifact_root "$ROOT_ARTIFACT_SAVE" \
    --batch_size "$batch_size" \
    --num_epochs 10 \
    --test_num_classes 2

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 3/199:  74% 323/438 [06:14<01:19,  1.45it/s, loss=7.82][2025-11-17 02:37:10] Epoch 3, Loss: 7.8186
Epoch 3/199:  74% 324/438 [06:14<03:11,  1.68s/it, loss=7.74][2025-11-17 02:37:11] Epoch 3, Loss: 7.7365
Epoch 3/199:  74% 325/438 [06:14<02:19,  1.24s/it, loss=7.63][2025-11-17 02:37:11] Epoch 3, Loss: 7.6314
Epoch 3/199:  74% 326/438 [06:14<01:43,  1.08it/s, loss=7.94][2025-11-17 02:37:11] Epoch 3, Loss: 7.9382
Epoch 3/199:  75% 327/438 [06:18<01:18,  1.41it/s, loss=7.81][2025-11-17 02:37:15] Epoch 3, Loss: 7.8093
Epoch 3/199:  75% 328/438 [06:18<03:05,  1.69s/it, loss=7.77][2025-11-17 02:37:15] Epoch 3, Loss: 7.7664
Epoch 3/199:  75% 329/438 [06:19<02:15,  1.24s/it, loss=7.68][2025-11-17 02:37:15] Epoch 3, Loss: 7.6786
Epoch 3/199:  75% 330/438 [06:19<01:40,  1.08it/s, loss=7.77][2025-11-17 02:37:16] Epoch 3, Loss: 7.7734
Epoch 3/199:  76% 331/438 [06:24<01:16,  1.41it/s, loss=7.78][2025-11-17 02:37:20] Epoch 3, Los

## 2) Fine tune the classifier

In [6]:
%cd "/content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/src"

/content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/src


In [7]:
# Load in the pre-trained baseline MoCo model

# Project-local import: needs SRC_ROOT on sys.path (or src/ as the working directory).
from moco.model_builder import MoCo

# Instantiate the MoCo model with updated momentum.
# Pass the device type chosen in the imports cell instead of hard-coding 'cuda';
# presumably MoCo accepts 'cpu' as well, which lets the cell run without a GPU — verify.
moco_model = MoCo(dim=128, K=65536, m=0.999, T=0.2, pretrained=True, device=device.type)

# Path of the pre-training checkpoint that the fine-tuning command receives as --pretrained_encoder.
# ROOT_ARTIFACT_SAVE already ends with "/", so os.path.join avoids the "artifacts//" double separator.
MOCO_BACKBONE_SAVE_PATH = os.path.join(ROOT_ARTIFACT_SAVE, "moco_checkpoint_epoch_10.pth")
# NOTE(review): checkpoint loading is disabled below, so moco_model keeps the weights
# it was constructed with — confirm this is intended.
#checkpoint = torch.load(MOCO_BACKBONE_SAVE_PATH, map_location=device)
#moco_model.load_state_dict(checkpoint['model_state'])

Using ImageNet pretrained weights for ResNet50: For encoder_q


### Run finetuning using the finetune_resnet module

In [9]:
# Display the artifact directory that the fine-tuning command below receives as --artifact_root.
ROOT_ARTIFACT_SAVE

'/content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/artifacts/'

In [9]:
! python finetune_resnet.py \
    --train_csv "$TRAIN_LABELS_CSV" \
    --val_csv "$VAL_LABELS_CSV" \
    --test_csv "$TEST_LABELS_CSV" \
    --root_dir "$DATA_DEST_UNZIPPED" \
    --pretrained_encoder "$MOCO_BACKBONE_SAVE_PATH" \
    --artifact_root "$ROOT_ARTIFACT_SAVE" \
    --n_epochs 8

Created log file:  /content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/artifacts/training_log_20251118_032730.txt
Loading training and validation datasets...
Training CSV: /content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/Data/nih_train.csv
 * Images - Train Root Directory: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train
Validation CSV: /content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/Data/nih_val.csv
 * Images - Val Root Directory: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/val
[2025-11-18 03:27:31] Loading pretrained encoder from: /content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/artifacts//moco_checkpoint_epoch_10.pth
[2025-11-18 03:27:31] Loaded pretrained encoder. missing keys: ['fc.weight', 'fc.bias'], unexpected: []
[2025-11-18 03:27:31] Starting finetuning...
Finetune Epoch 0: 100% 163/163 [00:23<00:00,  6.99it/s, loss=0.134]
[2025-11-18 03:27:56] Epoch 0: train_acc = 0.9469 | val_acc = 0.8333
[2025-11-18 03:27:56] Epoch 0: train_loss = 0.1335 | val_loss = 0.4207
Finetune Epoch 1: 100% 1