## Transfer Learning
Use the pre-trained backbone produced by `run_pretrain_moco_vit_hybrid.ipynb`.

### Environment Setup

In [1]:
# import libraries
import os
import sys
import argparse
from tqdm import tqdm

import numpy as np
import seaborn as sns
import pandas as pd
import math
from copy import deepcopy

%matplotlib inline
import matplotlib.pyplot as plt
from IPython.display import display

import pickle
from datetime import datetime

import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Colab Setup

In [2]:
# Colab only: attach Google Drive so the project folder under MyDrive is
# reachable from the notebook's filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


Define file paths to the required inputs (scripts, data) and to the outputs.

In [3]:
# ====================================================
# Project root on the mounted Google Drive
# (every input and output path below is derived from it)
# ====================================================
FP_ROOT = "/content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/"
DATA_ROOT = os.path.join(FP_ROOT, "Data")

# ====================================================
# Datasets
# ====================================================
# Zipped image archives (NIH, then CheXpert)
NIH_DATASET_PATH_SPLIT = os.path.join(DATA_ROOT, "NIH_Chest_XR_Pneumonia.zip")
CHEXPERT_DATASET_PATH_SPLIT = os.path.join(
    DATA_ROOT, "CheXpert_reduced_dataset_split_transfer_binary.zip"
)

# Label CSVs - NIH
TRAIN_LABELS_CSV = os.path.join(DATA_ROOT, "nih_train.csv")
VAL_LABELS_CSV = os.path.join(DATA_ROOT, "nih_val.csv")
TEST_LABELS_CSV = os.path.join(DATA_ROOT, "nih_test.csv")

# Label CSVs - CheXpert (binary Pneumonia labels)
CHEXPERT_TRAIN_LABELS_CSV = os.path.join(
    DATA_ROOT, "0_final_project_updated_names_train_transfer_binary.csv"
)
CHEXPERT_VAL_LABELS_CSV = os.path.join(
    DATA_ROOT, "0_final_project_updated_names_val_transfer_binary.csv"
)
CHEXPERT_TEST_LABELS_CSV = os.path.join(
    DATA_ROOT, "0_final_project_updated_names_test_transfer_binary.csv"
)

# ====================================================
# Project source code
# ====================================================
# The empty final component keeps the trailing "/" on the directory path.
SRC_ROOT = os.path.join(FP_ROOT, "src", "")

# ====================================================
# Output artifacts
# ====================================================
# Trailing "/" preserved here as well.
ROOT_ARTIFACT_SAVE = os.path.join(FP_ROOT, "artifacts", "")

In [4]:
# Register the project's src/ directory on the kernel's import search path.
# It is appended only when absent, so re-running the cell adds no duplicate.
if sys.path.count(SRC_ROOT) == 0:
    sys.path.append(SRC_ROOT)
    print(f"Added {SRC_ROOT} to sys.path")

# Show the resulting search path for inspection.
print(sys.path)

Added /content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/src/ to sys.path
['/content', '/env/python', '/usr/lib/python312.zip', '/usr/lib/python3.12', '/usr/lib/python3.12/lib-dynload', '', '/usr/local/lib/python3.12/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.12/dist-packages/IPython/extensions', '/root/.ipython', '/content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/src/']


## Unzip data

NIH

In [5]:
# Unzip the dataset (image) files to /tmp
DATA_DEST_UNZIPPED = "/tmp/NIH_Chest/"
os.makedirs(DATA_DEST_UNZIPPED, exist_ok=True)
!unzip "{NIH_DATASET_PATH_SPLIT}" -d {DATA_DEST_UNZIPPED} # need to use "" to accomdate space

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0481-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0482-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0483-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0484-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0485-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0486-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0487-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0488-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0489-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0490-0001.jpeg  
  inflating: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train/NORMAL/IM-0491-0001-0001.jpeg  
  infl

In [6]:
# The archive unpacks into a top-level folder named after the zip file, so
# re-point the dataset root at that folder.
DATA_DEST_UNZIPPED = os.path.join("/tmp/NIH_Chest", "NIH_Chest_XR_Pneumonia")

## CheXpert

In [7]:
# Unzip the dataset (image) files to /tmp
CHEXPERT_DATA_DEST_UNZIPPED = "/tmp/CheXpert_dataset/"
os.makedirs(CHEXPERT_DATA_DEST_UNZIPPED, exist_ok=True)
!unzip "{CHEXPERT_DATASET_PATH_SPLIT}" -d {CHEXPERT_DATA_DEST_UNZIPPED} # need to use "" to accomdate space

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /tmp/CheXpert_dataset/CheXpert_reduced_dataset_split_transfer_binary/train/patient25303_study3_view2_lateral.jpg  
  inflating: /tmp/CheXpert_dataset/CheXpert_reduced_dataset_split_transfer_binary/train/patient25325_study2_view2_lateral.jpg  
  inflating: /tmp/CheXpert_dataset/CheXpert_reduced_dataset_split_transfer_binary/train/patient25327_study2_view1_frontal.jpg  
  inflating: /tmp/CheXpert_dataset/CheXpert_reduced_dataset_split_transfer_binary/train/patient25329_study2_view1_frontal.jpg  
  inflating: /tmp/CheXpert_dataset/CheXpert_reduced_dataset_split_transfer_binary/train/patient25331_study1_view1_frontal.jpg  
  inflating: /tmp/CheXpert_dataset/CheXpert_reduced_dataset_split_transfer_binary/train/patient25335_study1_view2_lateral.jpg  
  inflating: /tmp/CheXpert_dataset/CheXpert_reduced_dataset_split_transfer_binary/train/patient25337_study1_view1_frontal.jpg  
  inflating: /tmp/CheXpert_dataset/CheX

In [8]:
# The archive unpacks into a top-level folder named after the zip file, so
# re-point the dataset root at that folder.
CHEXPERT_DATA_DEST_UNZIPPED = os.path.join(
    "/tmp/CheXpert_dataset", "CheXpert_reduced_dataset_split_transfer_binary"
)

## 2) Fine tune the classifier

In [9]:
%cd "/content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/src"

# Load in the pre-trained baseline MoCo model

# Extract the checkpoint state dict
MOCO_BACKBONE_SAVE_PATH = f"{ROOT_ARTIFACT_SAVE}/vit_hybrid_moco_encoder.pth"

/content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/src


### Run fine-tuning using the `finetune_vit_hybrid.py` script

## NIH

In [10]:
! python finetune_vit_hybrid.py \
    --train_csv "$TRAIN_LABELS_CSV" \
    --val_csv "$VAL_LABELS_CSV" \
    --test_csv "$TEST_LABELS_CSV" \
    --root_dir "$DATA_DEST_UNZIPPED" \
    --pretrained_encoder "$MOCO_BACKBONE_SAVE_PATH" \
    --artifact_root "$ROOT_ARTIFACT_SAVE" \
    --n_epochs 35 \
    --batch_size 64 \
    --subtitle "NIH_ViT_hybrid_moco_finetune"

Created log file:  /content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/artifacts/finetune_ViT_hybrid_training_log_20251202_062852.txt
Loading train dataset...
CSV: /content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/Data/nih_train.csv
 * Images - Train Root Directory: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/train
Unique labels in column 'Pneumonia': [0 1]
Loading val dataset...
CSV: /content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/Data/nih_val.csv
 * Images - Val Root Directory: /tmp/NIH_Chest/NIH_Chest_XR_Pneumonia/val
Unique labels in column 'Pneumonia': [0 1]
[2025-12-02 06:28:54] Loading pretrained MoCo model from /content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/artifacts//vit_hybrid_moco_encoder.pth...
[2025-12-02 06:29:01] Detected 'model_state' in checkpoint. Extracting encoder_q weights.
[2025-12-02 06:29:01] Loaded encoder_q. missing=[], unexpected=['encoder_q_proj.0.weight', 'encoder_q_proj.0.bias', 'encoder_q_proj.2.weight', 'encoder_q_proj.2.bias']
[2025-12-02 06:29:01] Buildin

## CheXpert

`Pneumonia` Binary - Negative and Positive Only

Experiment 1: Small updates to the backbone layers, using the MoCo-pretrained encoder as the starting point

In [11]:
! python finetune_vit_hybrid.py \
    --train_csv "$CHEXPERT_TRAIN_LABELS_CSV" \
    --val_csv "$CHEXPERT_VAL_LABELS_CSV" \
    --test_csv "$CHEXPERT_TEST_LABELS_CSV" \
    --root_dir "$CHEXPERT_DATA_DEST_UNZIPPED" \
    --pretrained_encoder "$MOCO_BACKBONE_SAVE_PATH" \
    --artifact_root "$ROOT_ARTIFACT_SAVE" \
    --n_epochs 35 \
    --batch_size 64 \
    --num_classes 2 \
    --subtitle "CheXpert_Pneumonia_binary_ViT_Hybrid" \
    --label_col "Pneumonia"

Created log file:  /content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/artifacts/finetune_ViT_hybrid_training_log_20251202_072447.txt
Loading train dataset...
CSV: /content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/Data/0_final_project_updated_names_train_transfer_binary.csv
 * Images - Train Root Directory: /tmp/CheXpert_dataset/CheXpert_reduced_dataset_split_transfer_binary/train
Unique labels in column 'Pneumonia': [1 0]
Loading val dataset...
CSV: /content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/Data/0_final_project_updated_names_val_transfer_binary.csv
 * Images - Val Root Directory: /tmp/CheXpert_dataset/CheXpert_reduced_dataset_split_transfer_binary/val
Unique labels in column 'Pneumonia': [1 0]
[2025-12-02 07:24:49] Loading pretrained MoCo model from /content/drive/MyDrive/Colab Notebooks/7_Py_DL/FP/artifacts//vit_hybrid_moco_encoder.pth...
[2025-12-02 07:24:50] Detected 'model_state' in checkpoint. Extracting encoder_q weights.
[2025-12-02 07:24:50] Loaded encoder_q. missing=[],