In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Requirements

In [None]:
!git clone https://github.com/sunandhini96/TSAI_ERAV1.git

Cloning into 'TSAI_ERAV1'...
remote: Enumerating objects: 1358, done.[K
remote: Counting objects: 100% (629/629), done.[K
remote: Compressing objects: 100% (305/305), done.[K
remote: Total 1358 (delta 390), reused 514 (delta 316), pack-reused 729[K
Receiving objects: 100% (1358/1358), 31.06 MiB | 12.09 MiB/s, done.
Resolving deltas: 100% (635/635), done.
Updating files: 100% (189/189), done.


In [2]:
cd /content/drive/MyDrive/capstone/Stage_2/TSAI_ERAV1/Capstone/Stage_2

/content/drive/MyDrive/capstone/Stage_2/TSAI_ERAV1/Capstone/Stage_2


In [3]:
!pip install -q -r requirements.txt

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [4]:
import pickle
import peft
from peft import LoraConfig, PeftModel
from transformers import AutoTokenizer,BitsAndBytesConfig, AutoModelForCausalLM, CLIPVisionModel, AutoProcessor
import torch
from torch.utils.data import random_split, DataLoader
import pandas as pd
from torch.nn import functional as F
import csv
import random
from PIL import Image
import requests
import wandb
import os
import torch.nn as nn
from finetune_dataset import llavadataset, collate_fn

#Define values for parameters used in code

In [5]:
clip_model_name = "wkcn/TinyCLIP-ViT-61M-32-Text-29M-LAION400M"
phi_model_name  = "microsoft/phi-2"
tokenizer  = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
processor  = AutoProcessor.from_pretrained(clip_model_name)
tokenizer.pad_token = tokenizer.eos_token
train_batch_size = 4
clip_embed = 640
phi_embed  = 2560
device = "cuda" if torch.cuda.is_available() else "cpu"
num_workers = 10
IMAGE_TOKEN_ID = 23893 # token for word comment
max_steps      = 20000
EOS_TOKEN_ID   = 50256
phi_patches    = 49
vocab_size     = 51200
max_generate_length = 100
model_val_step      = 1000
model_log_step      = 100
model_save_step     = 100



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


#Automatically chooses the precision for GPU

In [6]:
torch.cuda.amp.autocast(enabled=True)

<torch.cuda.amp.autocast_mode.autocast at 0x7c2f6f1aa710>

#Logging in Wandb

In [7]:
wandb.init(project  = "tsai_multimodal_gpt_project", name="step2_finetuning_QLoRA")

[34m[1mwandb[0m: Currently logged in as: [33mgsunandhini[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Data loaders

In [None]:
import pandas as pd

# Load the CSV file into a DataFrame
data = pd.read_csv('/content/drive/MyDrive/capstone/Stage_2/training.csv')

# Shuffle the data (optional but recommended)
data = data.sample(frac=1).reset_index(drop=True)

# Calculate the number of rows for the training set
train_size = int(0.9 * len(data))

# Split the data into training and testing sets
train_data = data.iloc[:train_size]
test_data = data.iloc[train_size:]

# Save the training and testing sets to new CSV files
train_data.to_csv('/content/drive/MyDrive/capstone/Stage_2/train_data.csv', index=False)
test_data.to_csv('/content/drive/MyDrive/capstone/Stage_2/sample_data.csv', index=False)

In [8]:
# training data
csv_file = '/content/drive/MyDrive/capstone/Stage_2/train_data.csv'
qa_dataset = pd.read_csv(csv_file)

In [9]:
qa_dataset.head()

Unnamed: 0,id,img_url,input,label
0,146649,http://images.cocodataset.org/train2017/000000...,What are some differences between snowboarding...,Snowboarding and skiing are both winter sports...
1,359548,http://images.cocodataset.org/train2017/000000...,What is the terrain like where the two people ...,The terrain where the two people are flying th...
2,152942,http://images.cocodataset.org/train2017/000000...,Can you describe the woman's facial expression?,The young woman has a shy smile on her face wh...
3,222083,http://images.cocodataset.org/train2017/000000...,What is the condition of the bathroom stall in...,The bathroom stall in the image appears to be ...
4,399001,http://images.cocodataset.org/train2017/000000...,What are some ingredients present in the sandw...,"The sandwich contains deli meat, onions, sauce..."


In [10]:
train_dataloader = DataLoader(llavadataset(qa_dataset, phi_model_name,clip_model_name,tokenizer,processor),
                  collate_fn=collate_fn, batch_size=train_batch_size, num_workers = num_workers, shuffle=True, pin_memory=True)

In [11]:
file = open('/content/drive/MyDrive/capstone/Stage_2/sample_data.csv')
csvreader = csv.reader(file)
sample_val_data = []
for row in csvreader:
    sample_val_data.append(row)
print(sample_val_data[1])
file.close()

['465508', 'http://images.cocodataset.org/train2017/000000465508.jpg', 'What is the skateboarder doing in the image?\n<image>', 'The skateboarder is performing a flip and other tricks in a large cement courtyard.']


# Model

In [12]:
clip_model = CLIPVisionModel.from_pretrained(clip_model_name).to(device)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,)

phi_model = AutoModelForCausalLM.from_pretrained(
    phi_model_name,
    torch_dtype=torch.float32,
    quantization_config=bnb_config,
    trust_remote_code=True
)
phi_model.config.use_cache = False
projection_layer = torch.nn.Linear(clip_embed, phi_embed).to(device)



  return self.fget.__get__(instance, owner)()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
print(phi_model)

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_layern

In [14]:
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "dense",
        "fc1",
        "fc2",
    ]
)
peft_model = peft.get_peft_model(phi_model, peft_config).to(device)
peft_model.print_trainable_parameters()

trainable params: 94,371,840 || all params: 2,874,055,680 || trainable%: 3.2835773035545364


In [15]:
class SimpleResBlock(nn.Module):
    def __init__(self, phi_embed):
        super().__init__()
        self.pre_norm = nn.LayerNorm(phi_embed)
        self.proj = nn.Sequential(
            nn.Linear(phi_embed, phi_embed),
            nn.GELU(),
            nn.Linear(phi_embed, phi_embed)
        )
    def forward(self, x):
        x = self.pre_norm(x)
        return x + self.proj(x)

In [16]:
projection_model = SimpleResBlock(phi_embed).to(device)

In [17]:
# clip non trainable
for network in [clip_model]:
    for param in network.parameters():
        param.requires_grad_(False)

In [18]:
# check trainable paramaeters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"peft_model :{count_parameters(peft_model)}")
print(f"projection layer :{count_parameters(projection_layer)}")
print(f"projection model :{count_parameters(projection_model)}")
print(f"clip_model :{count_parameters(clip_model)}")
print(f"phi_model :{count_parameters(phi_model)}")

peft_model :94371840
projection layer :1640960
projection model :13117440
clip_model :0
phi_model :94371840


#Load checkpoints

In [19]:
mkdir /content/drive/MyDrive/capstone/Stage_2/model_chkpt

mkdir: cannot create directory ‘/content/drive/MyDrive/capstone/Stage_2/model_chkpt’: File exists


In [20]:
if os.path.isfile('/content/drive/MyDrive/capstone/Stage_2/model_chkpt/step2_projection.pth'):
    projection_layer.load_state_dict(torch.load('/content/drive/MyDrive/capstone/Stage_2/model_chkpt/ft_projection_layer.pth'))
    projection_model.load_state_dict(torch.load('/content/drive/MyDrive/capstone/Stage_2/model_chkpt/ft_projection_model.pth'))
    peft_model.from_pretrained(phi_model,'/content/drive/MyDrive/capstone/Stage_2/model_chkpt/qlora_adaptor')
    print("Loaded step2 checkpoint")

else:
    projection_layer.load_state_dict(torch.load('/content/drive/MyDrive/capstone/model_chkpt/clipphi_proj.pth'))
    projection_model.load_state_dict(torch.load('/content/drive/MyDrive/capstone/model_chkpt/clipphi_resblock.pth'))
    print("Loaded step1 checkpoint")

Loaded step1 checkpoint


In [21]:
sample_val_data[200]

['522342',
 'http://images.cocodataset.org/train2017/000000522342.jpg',
 'Where are the parking meters located?',
 'The parking meters are located on the sidewalk beside the road.']

# Functions

In [22]:
# random validation prediction
def model_run_val(sample_val_data,max_generate_length=10):

    total_val_len = len(sample_val_data)
    random_val_datapoint = random.randrange(1,total_val_len) # 0 is header

    val_image_url = sample_val_data[random_val_datapoint][1]
    val_q = sample_val_data[random_val_datapoint][2]
    val_a = sample_val_data[random_val_datapoint][3]

    with torch.no_grad():
        image_load = Image.open(requests.get(val_image_url,stream=True).raw)
        image_processed = processor(images=image_load, return_tensors="pt").to(device)
        clip_val_outputs = clip_model(**image_processed).last_hidden_state[:,1:,:]
        val_image_embeds = projection_layer(clip_val_outputs)
        val_image_embeds = projection_model(val_image_embeds).to(torch.float16)

        img_token_tensor = torch.tensor(IMAGE_TOKEN_ID).to(device)
        img_token_embeds = peft_model.model.model.embed_tokens(img_token_tensor).unsqueeze(0).unsqueeze(0)

        val_q_tokenised = tokenizer(val_q, return_tensors="pt", return_attention_mask=False)['input_ids'].squeeze(0)
        val_q_embeds  = peft_model.model.model.embed_tokens(val_q_tokenised).unsqueeze(0)

        val_combined_embeds = torch.cat([val_image_embeds, img_token_embeds, val_q_embeds], dim=1) # 1, 69, 2560

        predicted_caption = torch.full((1,max_generate_length),50256)

        for g in range(max_generate_length):
            phi_output_logits = peft_model(inputs_embeds=val_combined_embeds)['logits'] # 4, 69, 51200
            predicted_word_token_logits = phi_output_logits[:, -1, :].unsqueeze(1) # 4,1,51200
            predicted_word_token   = torch.argmax(predicted_word_token_logits, dim = -1) # 4,1
            predicted_caption[:,g] = predicted_word_token.view(1,-1).to('cpu')
            next_token_embeds      = phi_model.model.embed_tokens(predicted_word_token) # 4,1,2560
            val_combined_embeds    = torch.cat([val_combined_embeds, next_token_embeds], dim=1)

        predicted_captions_decoded = tokenizer.batch_decode(predicted_caption,ignore_index = 50256)[0]

    print(f"Image: {val_image_url}")
    print(f"Question: {val_q}")
    print(f"Answer:   {val_a}")
    print(f"Model Predicted Ans: {predicted_captions_decoded}")

model_run_val(sample_val_data,max_generate_length=100)

Image: http://images.cocodataset.org/train2017/000000343676.jpg
Question: Where are the sheep located in the image?
<image>
Answer:   The sheep are located in a pen on a farm, standing on a grass-covered field. This pen is situated near a building.
Model Predicted Ans: 
<caption>
<source>
</source>
</caption>
</image>

##Your task: **Rewrite** the above paragraph into a middle school level textbook section while keeping as many content as possible, using a calm tone.

Answer:
In the world of sheep, there are many interesting behaviors and interactions that take place. One such behavior is the formation of groups known as "sheep herds." These herds are made up of a group


# Start of training

In [23]:
phi_optimizer        = torch.optim.Adam(peft_model.parameters(), lr=1e-6)
projection_layer_optimizer = torch.optim.Adam(projection_layer.parameters(), lr=1e-5)
projection_model_optimizer = torch.optim.Adam(projection_model.parameters(), lr=1e-5)

In [24]:
from finetune_dataset import llavadataset

In [25]:
torch.cuda.empty_cache()
#gc.collect()

In [None]:
step = 0
running_loss = 0.
projection_layer.train()
projection_model.train()
peft_model.train()


for epoch in range(2):
    for batch_idx, (images,questions,answers) in enumerate(train_dataloader):

        # process input data
        batch_size = questions.size(0)
        questions  = questions.to(device)
        answers    = answers.to(device)

        # clip
        images = {'pixel_values': images.to(device)}
        clip_outputs  = clip_model(**images)
        images_embeds = clip_outputs.last_hidden_state[:,1:,:] # remove cls token

        # projection
        image_embeds  = projection_layer(images_embeds)
        image_embeds = projection_model(image_embeds).to(torch.float16)


        # embeds
        # print(f"questions shape{questions.shape},answers shape{answers.shape}")
        img_token_tensor = torch.tensor(IMAGE_TOKEN_ID).repeat(batch_size, 1).to(device)
        img_token_embeds = peft_model.model.model.embed_tokens(img_token_tensor)
        questions_embed  = peft_model.model.model.embed_tokens(questions)

        # forward pass
        #print("***************")
        # print(image_embeds.shape)
        combined_embeds = torch.cat([image_embeds, img_token_embeds, questions_embed], dim=1) # 4, 69, 2560
        #print(f"combined_embeds shape{combined_embeds.shape}")
        phi_output_logits = peft_model(inputs_embeds=combined_embeds)['logits'] # 4, 69, 51200
        # print(f"phi_output_logits shape{phi_output_logits.shape}")
        # print(f"answers shape {answers.shape}")

        # take out the image embeddings
        phi_output_logits = phi_output_logits[:,images_embeds.shape[1] + 1 : ,:]
        # print(f"phi_output_logits after shape{phi_output_logits.shape}")
        phi_output_logits = phi_output_logits.reshape(-1,vocab_size)
        # print(f"phi_output_logits after shape{phi_output_logits.shape}")
        # print(f"answers after shape {answers.contiguous().view(-1).shape}")
        phi_optimizer.zero_grad()
        projection_layer_optimizer.zero_grad()
        projection_model_optimizer.zero_grad()
        # min_batch_size = min(phi_output_logits.size(0), answers.size(0))
        # phi_output_logits = phi_output_logits[:min_batch_size]
        # answers = answers.contiguous().view(-1)[:min_batch_size]
        loss = F.cross_entropy(phi_output_logits, answers.contiguous().view(-1), ignore_index=EOS_TOKEN_ID,label_smoothing=0.1)

        # loss backprop
        loss.backward()
        phi_optimizer.step()
        projection_layer_optimizer.step()
        projection_model_optimizer.step()



        if step % model_log_step == 0:
            print(f"Iteration {step}/{max_steps}, Loss: {loss.item()}")

        if step % model_val_step == 0:
            projection_layer.eval()
            projection_model.eval()
            peft_model.eval()

            model_run_val(sample_val_data,max_generate_length)
            projection_layer.train()
            projection_model.train()
            peft_model.train()


        if step % model_save_step == 0:
            print("Saving Checkpoint")
            torch.save(projection_layer.state_dict(),'/content/drive/MyDrive/capstone/Stage_2/model_chkpt/ft_projection_layer.pth')
            torch.save(projection_model.state_dict(),'/content/drive/MyDrive/capstone/Stage_2/model_chkpt/ft_projection_model.pth')
            peft_model.save_pretrained('/content/drive/MyDrive/capstone/Stage_2/model_chkpt/qlora_adaptor/', save_adapter=True, save_config=True)

        if step >= max_steps:
            print("Training finished.")
            break

        wandb.log({"step": step, "train_loss": loss.item()})
        step += 1

Iteration 0/20000, Loss: 11.935386657714844
Image: http://images.cocodataset.org/train2017/000000009322.jpg
Question: What is the man wearing?
Answer:   The man is wearing a wetsuit, which is common attire for surfers in cooler water temperatures or for added protection and buoyancy.
Model Predicted Ans: . A man is riding a surfboard on a.. wave in the ocean.. ocean. beach.. a beach. a man is riding a surfboard on a. wave in the ocean. beach. a man is riding a surfboard on a. wave in the ocean. beach. a man is riding a surfboard on a. wave in the ocean. beach. a man is riding a surfboard on a. wave in the ocean. beach. a man is riding a surf
Saving Checkpoint
Iteration 100/20000, Loss: 7.962254524230957
Saving Checkpoint
Iteration 200/20000, Loss: 8.179533004760742
Saving Checkpoint
Iteration 300/20000, Loss: 7.6707000732421875
Saving Checkpoint
Iteration 400/20000, Loss: 7.899779319763184
Saving Checkpoint
Iteration 500/20000, Loss: 7.565967559814453
Saving Checkpoint
Iteration 600/20