In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import numpy as np
import pandas as pd
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
import wandb

In [2]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [3]:
model_id = "google/gemma-1.1-2b-it"

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
videos_df = pd.read_csv('csvs/videos_df.csv', index_col=0)

In [6]:
videos_df.head()

Unnamed: 0_level_0,video_description,video_transcript,video_name,channel_name,prompt
video_url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
https://www.youtube.com/watch?v=Z_EliVUkuFA,Summary extraction error: Unexpected response ...,The video discusses the potential of Google De...,DeepMind’s New AI: Assistant From The Future!,Two Minute Papers,"[""A video about the limitations of Google Deep..."
https://www.youtube.com/watch?v=_2bzwNyIjkY,The video provides information about Andrew Pr...,The video highlights the many improvements in ...,Blender 4.1 - Create Virtual Worlds…For Free!,Two Minute Papers,['A video about the split viewer node in Blend...
https://www.youtube.com/watch?v=1YEX4t79e0Q,Summary extraction error: Unexpected response ...,"OpenAI’s text to video AI, Sora took the world...",OpenAI Sora: Beauty And Horror!,Two Minute Papers,"[""A video about the AI's ability to create abs..."
https://www.youtube.com/watch?v=IS0xphCc5rI,"The conference ""Fully Connected"" is about expl...",OpenAI’s Sora is a brilliant new text-to-video...,OpenAI Sora Just Supercharged Filmmaking!,Two Minute Papers,['A video about exploring the intersection bet...
https://www.youtube.com/watch?v=Y9cwnHor8es,The video provides information about a paper o...,The video highlights the advancements in artif...,NVIDIA GTC: This Is The Future Of Everything!,Two Minute Papers,['A video about the potential impact of artifi...


In [7]:
dataset = videos_df[['video_description', 'video_name', 'prompt']]

In [8]:
dataset.head()

Unnamed: 0_level_0,video_description,video_name,prompt
video_url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://www.youtube.com/watch?v=Z_EliVUkuFA,Summary extraction error: Unexpected response ...,DeepMind’s New AI: Assistant From The Future!,"[""A video about the limitations of Google Deep..."
https://www.youtube.com/watch?v=_2bzwNyIjkY,The video provides information about Andrew Pr...,Blender 4.1 - Create Virtual Worlds…For Free!,['A video about the split viewer node in Blend...
https://www.youtube.com/watch?v=1YEX4t79e0Q,Summary extraction error: Unexpected response ...,OpenAI Sora: Beauty And Horror!,"[""A video about the AI's ability to create abs..."
https://www.youtube.com/watch?v=IS0xphCc5rI,"The conference ""Fully Connected"" is about expl...",OpenAI Sora Just Supercharged Filmmaking!,['A video about exploring the intersection bet...
https://www.youtube.com/watch?v=Y9cwnHor8es,The video provides information about a paper o...,NVIDIA GTC: This Is The Future Of Everything!,['A video about the potential impact of artifi...


In [9]:
# Look into the first row of the dataset
row = dataset.iloc[1].to_dict()

In [10]:
row

{'video_description': "The video provides information about Andrew Price's donut tutorial and a paper on simulations that look almost like reality. The video also mentions the Patreon supporters who make Two Minute Papers possible.",
 'video_name': 'Blender 4.1 - Create Virtual Worlds…For Free!',
 'prompt': "['A video about the split viewer node in Blender 4.1.']"}

In [11]:
def prompt_row(row):
    return (f"Below is a video title and description summary. " \
            f"Write a prompt that can be used to adress the video description. " \
            f"### Description:\n{row['video_description']}\n\n ### title:\n{row['video_name']}").format_map(row)

In [12]:
row2 = prompt_row(row)

In [13]:
row2

"Below is a video title and description summary. Write a prompt that can be used to adress the video description. ### Description:\nThe video provides information about Andrew Price's donut tutorial and a paper on simulations that look almost like reality. The video also mentions the Patreon supporters who make Two Minute Papers possible.\n\n ### title:\nBlender 4.1 - Create Virtual Worlds…For Free!"

In [14]:
prompts = [prompt_row(row) for _, row in dataset.iterrows()]

In [15]:
len(prompts)

876

In [16]:
EOS_TOKEN = "</s>"
outputs = [row['prompt'] + EOS_TOKEN for _, row in dataset.iterrows()]

In [17]:
outputs[0]

'["A video about the limitations of Google DeepMind\'s Gemini 1.5 Pro AI in"]</s>'

In [18]:
dataset = [{"prompt":s, "output":t, "example": s+t} for s,t in zip(prompts, outputs)]

In [19]:
dataset[0]

{'prompt': 'Below is a video title and description summary. Write a prompt that can be used to adress the video description. ### Description:\nSummary extraction error: Unexpected response format.\n\n ### title:\nDeepMind’s New AI: Assistant From The Future!',
 'output': '["A video about the limitations of Google DeepMind\'s Gemini 1.5 Pro AI in"]</s>',
 'example': 'Below is a video title and description summary. Write a prompt that can be used to adress the video description. ### Description:\nSummary extraction error: Unexpected response format.\n\n ### title:\nDeepMind’s New AI: Assistant From The Future!["A video about the limitations of Google DeepMind\'s Gemini 1.5 Pro AI in"]</s>'}

In [20]:
model_id = 'google/gemma-1.1-2b-it'
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

In [21]:
tokenizer.encode("My experiments are going strong!")

[2, 2926, 13818, 708, 2319, 3779, 235341]

In [22]:
with wandb.init(project="VideoFinder"):
    at = wandb.Artifact(
        name="video_prompts",
        type="dataset",
        description="Prompts for video descriptions",
    )
    at.add_file('csvs/videos_df.csv')

    table = wandb.Table(data=dataset)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33merikice1[0m ([33mwarik21[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.2654494382022472, max=1.0…

In [23]:
import random

random.shuffle(dataset)

train_dataset = dataset[:800]
eval_dataset = dataset[800:]

train_table = wandb.Table(dataframe=pd.DataFrame(train_dataset))
eval_table  = wandb.Table(dataframe=pd.DataFrame(eval_dataset))


with wandb.init(project="VideoFinder", job_type="split_data"):
    wandb.log({"train_dataset":train_table, "eval_dataset":eval_table})

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

VBox(children=(Label(value='2.189 MB of 2.189 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [24]:
def pack(dataset, max_seq_len=1024):
    tkds_ids = tokenizer([s["example"] for s in dataset])["input_ids"]
    
    all_token_ids = []
    for tokenized_input in tkds_ids:
        all_token_ids.extend(tokenized_input + [tokenizer.eos_token_id])
    
    packed_ds = []
    for i in range(0, len(all_token_ids), max_seq_len+1):
        input_ids = all_token_ids[i : i + max_seq_len+1]
        if len(input_ids) == (max_seq_len+1):
            packed_ds.append({"input_ids": input_ids[:-1], "labels": input_ids[1:]})  # < --- ‼️ ⛔️
	    # if you use the model.output.loss you don't need to shift, it is done for you!
    return packed_ds


train_ds_packed = pack(train_dataset)
eval_ds_packed = pack(eval_dataset)

In [25]:
import json
def save_jsonl(data, filename):
    with open(filename, 'w') as file:
        for entry in data:
            json.dump(entry, file)
            file.write('\n')


# dump everything to jsonl files
save_jsonl(train_ds_packed, "train_packed_VF.jsonl")
save_jsonl(eval_ds_packed, "eval_packed_VF.jsonl")

In [26]:
packed_at = wandb.Artifact(
    name="packed_VideoFinder",
    type="dataset",
    description="VideoFinder dataset packed in sequences",
    metadata={"max_seq_len":1024, "model_id":model_id})

packed_at.add_file("train_packed_VF.jsonl")
packed_at.add_file("eval_packed_VF.jsonl")

# log the artifact to the project, we can give this run a job_type like `preprocess`
with wandb.init(project="VideoFinder", job_type="preprocess"):
    wandb.log_artifact(packed_at)

VBox(children=(Label(value='1.331 MB of 1.331 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [27]:
from pathlib import Path

run = wandb.init(project="VideoFinder", job_type="train")

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011277777776639495, max=1.0…

In [28]:
artifact = run.use_artifact("packed_VideoFinder:latest", type="dataset")
artifact_dir = artifact.download()

[34m[1mwandb[0m:   2 of 2 files downloaded.  


In [29]:
artifact_dir

'c:\\Users\\eriki\\OneDrive\\Documents\\all_folder\\other_projects\\VideoFinder\\artifacts\\packed_VideoFinder-v1'

In [30]:
max_seq_len = artifact.metadata["max_seq_len"]

In [31]:
from torch.utils.data import DataLoader
from transformers import default_data_collator


batch_size = 1


train_dataloader = DataLoader(
    train_ds_packed,
    batch_size=batch_size,
    collate_fn=default_data_collator, # we don't need any special collator 😎
)


eval_dataloader = DataLoader(
    eval_ds_packed,
    batch_size=batch_size,
    collate_fn=default_data_collator,
    shuffle=False,
)


In [32]:
b = next(iter(train_dataloader))
b.keys(), b["input_ids"][0][:25], b["labels"][0][:25]


(dict_keys(['input_ids', 'labels']),
 tensor([     2,  33501,    603,    476,   3569,   4680,    578,   5966,  13367,
         235265,  15615,    476,  18335,    674,    798,    614,   1671,    577,
          75147,    573,   3569,   5966, 235265,  43774,   8290]),
 tensor([ 33501,    603,    476,   3569,   4680,    578,   5966,  13367, 235265,
          15615,    476,  18335,    674,    798,    614,   1671,    577,  75147,
            573,   3569,   5966, 235265,  43774,   8290, 235292]))

In [33]:
from types import SimpleNamespace


gradient_accumulation_steps = 32 // batch_size


config = SimpleNamespace(
    model_id=model_id,
    dataset_name="VideoFinder",
    precision="bf16",  # faster and better than fp16, requires new GPUs
    n_freeze=8,  # How many layers we don't train, LLama 7B has 32. #TODO : check the number of layers
    lr=2e-4,
    n_eval_samples=10, # How many samples to generate on validation
    max_seq_len=max_seq_len, # Length of the sequences to pack
    epochs=3,  # we do 3 pasess over the dataset.
    gradient_accumulation_steps=gradient_accumulation_steps,  # evey how many iterations we update the gradients, simulates larger batch sizes
    batch_size=batch_size,  # what my GPU can handle, depends on how many layers are we training  
    log_model=True,  # upload the model to W&B?
    mom=0.9, # optim param
    gradient_checkpointing = True,  # saves even more memory
    freeze_embed = True,  # why train this? let's keep them frozen ❄️
)


config.total_train_steps = config.epochs * len(train_dataloader) // config.gradient_accumulation_steps


In [34]:
model = AutoModelForCausalLM.from_pretrained(
    config.model_id,
    device_map=0,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    use_cache=False,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [35]:
def param_count(m):
    params = sum([p.numel() for p in m.parameters()])/1_000_000
    trainable_params = sum([p.numel() for p in m.parameters() if p.requires_grad])/1_000_000
    print(f"Total params: {params:.2f}M, Trainable: {trainable_params:.2f}M")
    return params, trainable_params

params, trainable_params = param_count(model)

Total params: 2506.17M, Trainable: 2506.17M


In [36]:
n_freeze = 8 # you can play with this parameter


# freeze layers (disable gradients)
for param in model.parameters(): param.requires_grad = False
for param in model.lm_head.parameters(): param.requires_grad = True
for param in model.model.layers[n_freeze:].parameters(): param.requires_grad = True


In [37]:
from transformers import get_cosine_schedule_with_warmup


optim = torch.optim.Adam(model.parameters(), lr=config.lr, betas=(0.9,0.99), eps=1e-5)
scheduler = get_cosine_schedule_with_warmup(
    optim,
    num_training_steps=config.total_train_steps,
    num_warmup_steps=config.total_train_steps // 10,
)


def loss_fn(x, y):
    "A Flat CrossEntropy" 
    return torch.nn.functional.cross_entropy(x.view(-1, x.shape[-1]), y.view(-1))


In [38]:
from types import SimpleNamespace
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained(config.model_id)
test_config = SimpleNamespace(
    max_new_tokens=256,
    gen_config=gen_config)


def generate(prompt, max_new_tokens=100, gen_config=gen_config):
    with torch.inference_mode():
        tokenized_prompt = tokenizer(prompt, return_tensors='pt')['input_ids'].cuda()
        output = model.generate(tokenized_prompt, 
                            max_new_tokens=max_new_tokens, 
                            generation_config=gen_config)
    return tokenizer.decode(output[0][len(tokenized_prompt[0]):], skip_special_tokens=True)


In [39]:
from tqdm.auto import tqdm

def prompt_table(prompts, log=True):
    table = wandb.Table(columns=["prompt", "generation", "concat", "max_new_tokens", "temperature", "top_p"])
    for prompt in tqdm(prompts):
        out = generate(prompt, test_config.max_new_tokens, test_config.gen_config)
        table.add_data(prompt, out, prompt+out, test_config.max_new_tokens, test_config.gen_config.temperature, test_config.gen_config.top_p)
    if log:
        wandb.log({"predictions":table})
    return table


In [40]:
def to_gpu(tensor_dict):
    return {k: v.to('cuda') for k, v in tensor_dict.items()}

class Accuracy:
    "A simple Accuracy function compatible with HF models"
    def __init__(self):
        self.count = 0
        self.tp = 0.
    def update(self, logits, labels):
        logits, labels = logits.argmax(dim=-1).view(-1).cpu(), labels.view(-1).cpu()
        tp = (logits == labels).sum()
        self.count += len(logits)
        self.tp += tp
        return tp / len(logits)
    def compute(self):
        return self.tp / self.count

In [41]:
@torch.no_grad()
def validate():
    model.eval()
    eval_acc = Accuracy()
    loss, total_steps = 0., 0
    for step, batch in enumerate(pbar:=tqdm(eval_dataloader, leave=False)):
        pbar.set_description(f"doing validation")
        batch = to_gpu(batch)
        total_steps += 1
        with torch.amp.autocast("cuda", dtype=torch.bfloat16):
            out = model(**batch)
            loss += loss_fn(out.logits, batch["labels"])  # you could use out.loss and not shift the dataset
        eval_acc.update(out.logits, batch["labels"])
    # we log results at the end
    wandb.log({"eval/loss": loss.item() / total_steps,
               "eval/accuracy": eval_acc.compute()})
    prompt_table(eval_dataset[:config.n_eval_samples], log=True)
    model.train()

In [42]:
from pathlib import Path
def save_model(model, model_name, models_folder="models", log=False):
    """Save the model to wandb as an artifact
    Args:
        model (nn.Module): Model to save.
        model_name (str): Name of the model.
        models_folder (str, optional): Folder to save the model. Defaults to "models".
    """
    model_name = f"{wandb.run.id}_{model_name}"
    file_name = Path(f"{models_folder}/{model_name}")
    file_name.parent.mkdir(parents=True, exist_ok=True)
    model.save_pretrained(file_name, safe_serialization=True)
    # save tokenizer for easy inference
    tokenizer = AutoTokenizer.from_pretrained(model.name_or_path)
    tokenizer.save_pretrained(model_name)
    if log:
        at = wandb.Artifact(model_name, type="model")
        at.add_dir(file_name)
        wandb.log_artifact(at)

In [43]:
wandb.init(project="alpaca_ft", # the project I am working on
           tags=["baseline","7b"],
           job_type="train",
           config=config) # the Hyperparameters I want to keep track of

# Training
acc = Accuracy()
model.train()
train_step = 0
for epoch in tqdm(range(config.epochs)):
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = to_gpu(batch)
        with torch.amp.autocast("cuda", dtype=torch.bfloat16):
            out = model(**batch)
            loss = loss_fn(out.logits, batch["labels"]) / config.gradient_accumulation_steps  # you could use out.loss and not shift the dataset  
            loss.backward()
        if step%config.gradient_accumulation_steps == 0:
            # we can log the metrics to W&B
            wandb.log({"train/loss": loss.item() * config.gradient_accumulation_steps,
                       "train/accuracy": acc.update(out.logits, batch["labels"]),
                       "train/learning_rate": scheduler.get_last_lr()[0],
                       "train/global_step": train_step})
            optim.step()
            scheduler.step()
            optim.zero_grad(set_to_none=True)
            train_step += 1
    validate()   

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


OutOfMemoryError: CUDA out of memory. Tried to allocate 1000.00 MiB. GPU 0 has a total capacity of 6.00 GiB of which 0 bytes is free. Of the allocated memory 11.39 GiB is allocated by PyTorch, and 35.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)