In [None]:
%%capture
# %%capture (which must stay the first line of the cell) hides pip's verbose output.
# Step 1: install Unsloth from PyPI so that its dependencies get resolved and installed.
!pip install unsloth
# Also get the latest nightly Unsloth!
# Step 2: swap the PyPI build for the current GitHub head; --no-deps leaves the
# dependencies installed in step 1 untouched.
# NOTE(review): nothing here is version-pinned. The outputs recorded further down were
# produced with Unsloth 2024.12.4 / Transformers 4.46.3 / Torch 2.5.1+cu121, so a re-run
# may pick up different code.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
import gc
import json
import os
import shutil
import sys
from datetime import datetime
from functools import partial

from tqdm import tqdm
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import BitsAndBytesConfig
from transformers import MllamaForConditionalGeneration, AutoTokenizer, AutoProcessor
from peft import LoraConfig
from peft import get_peft_model
from peft import prepare_model_for_kbit_training

from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from unsloth import FastVisionModel # FastLanguageModel for LLMs
from trl import SFTTrainer, SFTConfig


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


### Setup


#### Directories/Helper Code

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive/.
# NOTE(review): project_dir in the next cell is the relative path 'drive/...', so the
# notebook presumably relies on the working directory being /content -- confirm.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
## General Settings
# NOTE: project_dir is relative to the current working directory and contains a
# space ("My Laptop"), so it must never be spliced into an unquoted shell command.
project_dir = 'drive/Othercomputers/My Laptop/ML-Quiz-XRay-ReportGeneration/'
data_dir = os.path.join(project_dir, 'data')
image_dir = os.path.join(data_dir, 'images')
json_fpath = os.path.join(project_dir, 'results/task1_convert_validation_annotations/annotation_quiz_all_modified.json')
output_dir = os.path.join(project_dir, 'results/task2_llama_3_2_vision_train')
overwrite=True


def reset_output_dir(path, overwrite=False):
    """Make sure ``path`` exists as a directory, optionally wiping it first.

    The deletion is done with ``shutil.rmtree`` instead of a shell ``rm -rf``
    so that paths containing spaces or shell metacharacters are handled
    correctly, and ``os.makedirs`` is used so missing parent directories are
    created as well.

    Args:
        path: Directory to prepare.
        overwrite: When True and ``path`` already exists, it is deleted
            (recursively if it is a directory) before being recreated empty.

    Returns:
        None
    """
    if overwrite and os.path.lexists(path):
        print('removing output directory')
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path)
        else:
            # A plain file or a symlink is sitting where the directory should be.
            os.remove(path)
    os.makedirs(path, exist_ok=True)


## Setup output directory
reset_output_dir(output_dir, overwrite=overwrite)

removing output directory


In [None]:
# Make the project's own modules (stored under <project_dir>/src) importable.
sys.path.append(os.path.join(project_dir, 'src'))

# NOTE(review): the wildcard imports hide where names come from. Later cells use
# XRayImageDataset and convert_to_conversation, which presumably originate in these two
# modules -- confirm and switch to explicit imports.
from data_utils import *
from llama_3_2_vision_unsloth_helpers import *

In [None]:
# Sanity check: show what is inside the src directory that was just added to sys.path.
os.listdir(os.path.join(project_dir, 'src'))

['qwenl2_sandbox.ipynb',
 'qwenl2_helpers.py',
 'Untitled.ipynb',
 'Untitled1.ipynb',
 'molmo_sandbox.ipynb',
 'llama_3_2_vision_unsloth.ipynb',
 '.ipynb_checkpoints',
 '__pycache__',
 'input_prep.py',
 'llama_3_2_vision_helpers.py',
 'llama_3_2_vision_unsloth_helpers.py',
 'unsloth_compiled_cache',
 'data_utils.py',
 'auto_clicker.py']

#### Model

In [None]:
## Model Setup
# Print a banner plus timestamp so the duration of the checkpoint load can be read
# off the cell output.
print('Model Setup')
print(datetime.now())

# Load the Llama 3.2 Vision Instruct checkpoint through Unsloth. The second return
# value is named `tokenizer` but is later called with both an image and text.
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for long context
    load_in_4bit=True,                     # 4bit reduces memory use; False gives 16bit LoRA
)

# Report where the weights ended up.
print(model.device)

Model Setup
2024-12-08 22:36:19.970474
==((====))==  Unsloth 2024.12.4: Fast Mllama vision patching. Transformers: 4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

cuda:0


In [None]:
# Display the module tree of the freshly loaded base model (LoRA adapters are attached in a later cell).
model

MllamaForConditionalGeneration(
  (vision_model): MllamaVisionModel(
    (patch_embedding): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), padding=valid, bias=False)
    (gated_positional_embedding): MllamaPrecomputedPositionEmbedding(
      (tile_embedding): Embedding(9, 8197120)
    )
    (pre_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
      (embedding): Embedding(9, 5120)
    )
    (post_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
      (embedding): Embedding(9, 5120)
    )
    (layernorm_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    (layernorm_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    (transformer): MllamaVisionEncoder(
      (layers): ModuleList(
        (0-12): 13 x MllamaVisionEncoderLayer(
          (self_attn): MllamaVisionSdpaAttention(
            (q_proj): Linear4bit(in_features=1280, out_features=1280, bias=False)
            (k_proj): Linear4bit(in_features=1280, out_features

In [None]:
## Setup LoRA model
# Wrap the loaded model with LoRA adapters. All four switches are on, so vision and
# language layers as well as attention and MLP modules are fine-tuned.
model = FastVisionModel.get_peft_model(
    model,
    # Which parts of the network are fine-tuned (set to False to exclude a part).
    finetune_vision_layers=True,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    # Adapter hyper-parameters.
    r=16,                 # larger rank: higher accuracy, but might overfit
    lora_alpha=16,        # recommended to be at least r
    lora_dropout=0,
    bias="none",
    use_rslora=False,     # rank stabilized LoRA is supported but not used here
    loftq_config=None,    # LoftQ is supported but not used here
    random_state=3407,
    # target_modules is optional; a list (or "all-linear") can be given if needed.
)

In [None]:
# Display the model again now that FastVisionModel.get_peft_model has wrapped it.
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MllamaForConditionalGeneration(
      (vision_model): MllamaVisionModel(
        (patch_embedding): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), padding=valid, bias=False)
        (gated_positional_embedding): MllamaPrecomputedPositionEmbedding(
          (tile_embedding): Embedding(9, 8197120)
        )
        (pre_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (post_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (layernorm_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (layernorm_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (transformer): MllamaVisionEncoder(
          (layers): ModuleList(
            (0-12): 13 x MllamaVisionEncoderLayer(
              (self_attn): MllamaVisionSdpaAttention(
               

#### Dataset

In [None]:
def convert_dataset_split(dataset, desc):
    """Convert every sample of ``dataset`` into the conversation record used for training.

    Each sample is unpacked as ``(image, sample_id, report_text)`` and passed to
    ``convert_to_conversation(image, report_text)``; the sample id is not used.

    Args:
        dataset: Indexable dataset supporting ``len()`` and integer ``[]`` access.
        desc: Message printed (followed by a timestamp) before the conversion starts.

    Returns:
        list: One converted record per sample, in dataset order.
    """
    print(desc)
    print(datetime.now())
    converted = []
    for idx in tqdm(range(len(dataset))):
        image, _sample_id, report_text = dataset[idx]
        converted.append(convert_to_conversation(image, report_text))
    # One collection per split is enough. The printed example record contains the
    # image object itself, so collecting after every sample frees nothing and only
    # slows the loop down.
    gc.collect()
    return converted


ds_train = XRayImageDataset(top_image_dir=image_dir, json_fpath=json_fpath, split='train', inference_mode=False, img_size=224)
ds_val = XRayImageDataset(top_image_dir=image_dir, json_fpath=json_fpath, split='val', inference_mode=False, img_size=224)
# For a quick smoke test, shrink both splits first:
# ds_train.subsample(n_subsample=12)
# ds_val.subsample(n_subsample=12)

ds_train_converted = convert_dataset_split(ds_train, 'converting training data')
ds_val_converted = convert_dataset_split(ds_val, 'converting validation data')
print(datetime.now())

# Show one converted record so the exact model input format is visible.
print("example input for model")
print(ds_train_converted[0])

converting training data
2024-12-08 22:37:21.749577


100%|██████████| 2069/2069 [29:31<00:00,  1.17it/s]


converting validation data
2024-12-08 23:06:53.029721


100%|██████████| 296/296 [05:57<00:00,  1.21s/it]

2024-12-08 23:12:50.733103
example input for model
{'messages': [{'role': 'user', 'content': [{'type': 'text', 'text': 'Summarize the input image(s) in the following json format \n{\n    "lung": "summary of lung related findings, empty if no findings",\n    "heart": "summary of heart related findings, empty if no findings",\n    "bone": "summary of bone related findings, empty if no findings",\n    "mediastinal": "summary of mediastinal related findings, empty if no findings",\n    "others": "summary of any findings not related to lung, heart, bone, or mediastinal"\n}\n'}, {'type': 'image', 'image': <PIL.Image.Image image mode=L size=224x224 at 0x7C83D1AEECE0>}]}, {'role': 'assistant', 'content': [{'type': 'text', 'text': '{"bone": "Degenerative changes are present in the spine.", "heart": "Heart size and pulmonary vascularity appear within normal limits.", "lung": "Lungs are free of focal airspace disease. No pneumothorax or pleural effusion is seen.", "mediastinal": "", "others": "A 




### Examine How Model Performs Prior to Training

In [None]:
from transformers import TextStreamer

print(datetime.now())

# Switch Unsloth to inference mode before generating, exactly as the post-training
# check does. The training cell switches back with FastVisionModel.for_training(model).
FastVisionModel.for_inference(model)

# Baseline: ask the not-yet-fine-tuned model to describe the first training image.
image, _, _ = ds_train[0]
# NOTE(review): the converted training records printed earlier put the text part before
# the image part and end the instruction with a trailing newline. This prompt puts the
# image first and has no trailing newline -- confirm the difference is intended.
instruction = 'Summarize the input image(s) in the following json format \n{\n    "lung": "summary of lung related findings, empty if no findings",\n    "heart": "summary of heart related findings, empty if no findings",\n    "bone": "summary of bone related findings, empty if no findings",\n    "mediastinal": "summary of mediastinal related findings, empty if no findings",\n    "others": "summary of any findings not related to lung, heart, bone, or mediastinal"\n}'

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
# Render the chat template to text, then let the processor combine it with the image.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

# Stream the generated tokens to the cell output as they are produced (prompt omitted).
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

2024-12-08 23:12:50.743309
The lung appears clear. No lung-related findings were reported.

There was no heart-related finding. No heart enlargement, and normal pulmonary vessels, normal cardiac silhouette, normal left atrium.

No bone or joint abnormality is seen.

The mediastinum appears unremarkable.

There were no other significant abnormality reported.

**Summary** 

A chest x-ray showed a normal lung with no pulmonary findings.  No abnormality was reported in the heart, mediastinum, or bones.<|eot_id|>


### Run Training

In [None]:
print(datetime.now())
FastVisionModel.for_training(model)  # Enable for training!

# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = is_bf16_supported()

training_args = SFTConfig(
    output_dir=output_dir,
    seed=3407,
    report_to="none",  # For Weights and Biases
    logging_steps=1,
    # Schedule: 2 samples per device x 2 accumulation steps.
    num_train_epochs=3,  # Set this instead of max_steps for full training runs
    # max_steps=30,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    warmup_steps=5,
    # Optimiser.
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    weight_decay=0.01,
    bf16=use_bf16,
    fp16=not use_bf16,
    # You MUST put the below items for vision finetuning:
    remove_unused_columns=False,
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True},
    dataset_num_proc=4,
    max_seq_length=2048,
)

# NOTE(review): no evaluation strategy is configured here, so eval_dataset is
# presumably never evaluated during training -- confirm.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=UnslothVisionDataCollator(model, tokenizer),  # Must use!
    train_dataset=ds_train_converted,
    eval_dataset=ds_val_converted,
    args=training_args,
)
trainer_stats = trainer.train()

2024-12-08 23:13:55.684357


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,069 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 2
\        /    Total batch size = 4 | Total steps = 1,551
 "-____-"     Number of trainable parameters = 67,174,400
🦥 Unsloth needs about 1-3 minutes to load everything - please wait!


Step,Training Loss
1,2.0534
2,2.0232
3,1.9647
4,1.5299
5,1.4101
6,1.24
7,1.0123
8,0.7657
9,0.6891
10,0.556


In [None]:
# Print the current time (this cell runs right after training), then display the
# object returned by trainer.train().
print(datetime.now())
trainer_stats

2024-12-09 07:05:53.133539


TrainOutput(global_step=1551, training_loss=0.2515734536903547, metrics={'train_runtime': 28313.4259, 'train_samples_per_second': 0.219, 'train_steps_per_second': 0.055, 'total_flos': 7.702795271219942e+16, 'train_loss': 0.2515734536903547, 'epoch': 2.9971014492753625})

In [None]:
## Save model
# Write the model and the tokenizer side by side into <output_dir>/lora_model.
lora_model_dir = os.path.join(output_dir, 'lora_model')
model.save_pretrained(lora_model_dir)  # Local saving
tokenizer.save_pretrained(lora_model_dir)

[]

#### Examine How the Trained Model Performs on the Same Example Image

In [None]:
from transformers import TextStreamer

FastVisionModel.for_inference(model)  # Enable for inference!

# Same sample and same prompt as the pre-training check, so both generations can be
# compared directly.
image, _, _ = ds_train[0]
# NOTE(review): the converted training records printed earlier put the text part before
# the image part and end the instruction with a trailing newline. This prompt puts the
# image first and has no trailing newline -- confirm the difference is intended.
instruction = 'Summarize the input image(s) in the following json format \n{\n    "lung": "summary of lung related findings, empty if no findings",\n    "heart": "summary of heart related findings, empty if no findings",\n    "bone": "summary of bone related findings, empty if no findings",\n    "mediastinal": "summary of mediastinal related findings, empty if no findings",\n    "others": "summary of any findings not related to lung, heart, bone, or mediastinal"\n}'

# Single-turn conversation: one user message holding an image slot and the instruction.
messages = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": instruction},
    ],
}]

# Render the chat template to text, then let the processor combine it with the image.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(image, input_text, add_special_tokens=False, return_tensors="pt").to("cuda")

# Stream the generated tokens to the cell output as they are produced (prompt omitted).
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
    temperature=1.5,
    min_p=0.1,
)

{"bone": "", "heart": "", "lung": "Low lung volumes without focal consolidation. No pneumothorax.", "mediastinal": "Mediastinal contours are stable.", "others": ""}<|eot_id|>


In [None]:
# Fetch the text element of the same training sample to compare against the generation above.
_, _, text = ds_train[0]

In [None]:
# Display the reference text of the sample.
text

'{"bone": "Degenerative changes are present in the spine.", "heart": "Heart size and pulmonary vascularity appear within normal limits.", "lung": "Lungs are free of focal airspace disease. No pneumothorax or pleural effusion is seen.", "mediastinal": "", "others": "A large hiatal hernia is noted."}'