In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import regex as re
import os
import torch
from datasets import Dataset
from sklearn.model_selection import *
np.bool = bool
import torch.nn as nn
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch.nn.functional as F
# import regex as re
# from datasets import load_dataset, Dataset, DatasetDict
# import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader
# import wandb


# Sanity check: report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

# Load the lecture / learning-objective table (columns used later: `lecture`, `question_group`).
data = pd.read_parquet("merged.pq")
# Bare last expression so the notebook renders the frame as the cell output.
data

True


Unnamed: 0,lecture,question_group
0,Watch a video about Evolution by Natural Selec...,Identify and describe the properties of life\n...
1,Watch a video about the Scientific Method. Fig...,Identify the shared characteristics of the nat...
2,Watch a video about electrons and how the elec...,Describe matter and elements\nDescribe the int...
3,Watch a video about why we need oxygen and how...,Describe the properties of water that are crit...
4,Watch a video about proteins and protein enzym...,Describe the ways in which carbon is critical ...
...,...,...
801,Types of Higher Education Programs Today’s stu...,Read an example of the classification rhetoric...
802,How to Grow Tomatoes from a Seedling Growing t...,Read an example of the process analysis rhetor...
803,Defining Good Students Means More than Just Gr...,Read an example of the definition rhetorical mode
804,Comparing and Contrasting London and Washingto...,Read an example of the compare and contrast rh...


In [3]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible.
train_df, val_df = train_test_split(data, test_size=0.2, random_state=42)

In [4]:
# Hugging Face checkpoint id; also used when the base model is loaded for fine-tuning.
model_name="microsoft/phi-1_5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Register a dedicated padding token so create_dataset can pad every example to a fixed length.
# NOTE(review): this adds a new vocabulary entry — confirm the model's embedding table covers its id.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def create_dataset(df, max_length=2048):
    """
    Build a fixed-length causal-LM fine-tuning dataset from lecture / objective pairs.

    Every example is laid out as

        "###Lecture:\\n\\n" + lecture + "\\n\\n###Learning Objectives:\\n\\n" + question_group

    The lecture part, the objectives header and the target are tokenized
    separately and concatenated at the token level. This gives two guarantees
    that a search inside the jointly-tokenized prompt cannot:

    * the label span starts exactly after the objectives header (it can never
      land on a matching token inside the lecture), and
    * only the lecture is truncated, so the target is always kept in full.

    Args:
        df: DataFrame with string columns `lecture` and `question_group`.
        max_length: Total number of tokens per example (content + padding).

    Returns:
        A `datasets.Dataset` with columns `input_ids`, `attention_mask` and
        `labels`, each a list of `max_length` ints. `labels` is -100
        everywhere except on the `question_group` tokens.

    Rows whose target is empty, or so long that no lecture token would fit,
    are skipped; the number of skipped rows is printed.
    """
    data_dict = {
        'input_ids': [],
        'attention_mask': [],
        'labels': []
    }

    # The objectives header is identical for every row, so tokenize it once.
    separator_ids = tokenizer.encode("\n\n###Learning Objectives:\n\n", add_special_tokens=False)

    # Fall back to EOS when no dedicated padding token has been registered.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    skipped_rows = 0

    for _, row in df.iterrows():
        lecture = row['lecture']
        question_group = row['question_group']

        question_ids = tokenizer.encode(question_group, add_special_tokens=False)

        # Token budget left for the "###Lecture:" header plus the lecture text.
        head_budget = max_length - len(separator_ids) - len(question_ids)
        if not question_ids or head_budget < 1:
            skipped_rows += 1
            continue

        # Truncate at the token level so the budget is exact; the tail of the
        # lecture is dropped, never the target.
        head_ids = tokenizer(
            "###Lecture:\n\n" + lecture,
            truncation=True,
            max_length=head_budget,
        )['input_ids']

        prompt_ids = head_ids + separator_ids
        sequence_ids = prompt_ids + question_ids
        pad_count = max_length - len(sequence_ids)

        input_ids = sequence_ids + [pad_id] * pad_count
        attention_mask = [1] * len(sequence_ids) + [0] * pad_count
        # Loss is computed on the target only: prompt and padding are masked with -100.
        labels = [-100] * len(prompt_ids) + question_ids + [-100] * pad_count

        data_dict['input_ids'].append(input_ids)
        data_dict['attention_mask'].append(attention_mask)
        data_dict['labels'].append(labels)

    if skipped_rows:
        print(f"create_dataset: skipped {skipped_rows} row(s) that did not fit in {max_length} tokens")

    return Dataset.from_dict(data_dict)




# Tokenize both splits up front; each row becomes fixed-length input_ids / attention_mask / labels.
train_dataset = create_dataset(train_df)
# NOTE(review): val_dataset is not referenced by any later cell visible in this notebook.
val_dataset = create_dataset(val_df)


In [5]:
def get_trainable_params(model):
    """
    Print how many of the model's parameters are trainable.

    Args:
        model: Any object exposing `parameters()` that yields tensors with
            `numel()` and `requires_grad` (e.g. a `torch.nn.Module`).

    Prints a single line with the trainable count, the total count and the
    trainable percentage. A model without parameters reports 0.0% instead of
    raising ZeroDivisionError.
    """
    trainable_params = 0
    all_param = 0
    for param in model.parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the ratio: parameter-free modules (e.g. nn.Identity) have all_param == 0.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [6]:
# Dump one tokenized training example token by token to eyeball the masking:
# labels should be -100 everywhere except on the learning-objective tokens.
example = train_dataset[1]
# Iterate over the example's own length instead of a hard-coded 2048.
for token_id, mask_value, label in zip(example['input_ids'], example['attention_mask'], example['labels']):
    print("token_id:", token_id, "|| attention_mask:", mask_value, "|| label:", label, "original text:", tokenizer.decode(token_id))

token_id: 21017 || attention_mask: 1 || label: -100 original text: ###
token_id: 43 || attention_mask: 1 || label: -100 original text: L
token_id: 478 || attention_mask: 1 || label: -100 original text: ect
token_id: 495 || attention_mask: 1 || label: -100 original text: ure
token_id: 25 || attention_mask: 1 || label: -100 original text: :
token_id: 198 || attention_mask: 1 || label: -100 original text: 

token_id: 198 || attention_mask: 1 || label: -100 original text: 

token_id: 464 || attention_mask: 1 || label: -100 original text: The
token_id: 26632 || attention_mask: 1 || label: -100 original text:  Elements
token_id: 286 || attention_mask: 1 || label: -100 original text:  of
token_id: 262 || attention_mask: 1 || label: -100 original text:  the
token_id: 3611 || attention_mask: 1 || label: -100 original text:  General
token_id: 9344 || attention_mask: 1 || label: -100 original text:  Environment
token_id: 25 || attention_mask: 1 || label: -100 original text: :
token_id: 350 || att

# model training

In [12]:


# Toggle: True loads the previously saved fine-tuned model, False fine-tunes from the base checkpoint.
# load_saved_model = False
load_saved_model = True

if load_saved_model:
    # NOTE(review): torch.load unpickles arbitrary Python objects. Only load a
    # "model.pt" written by the torch.save call at the end of this cell, never
    # an untrusted file. Recent PyTorch releases may also require
    # weights_only=False to load a fully pickled model — confirm for your version.
    model = torch.load("model.pt")
    model.eval()

else:
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).cuda()
    model.train()

    # Freeze everything, then unfreeze only the last 7 entries of model.layers.
    for param in model.parameters():
        param.requires_grad = False
    for param in model.layers[-7:].parameters():
        param.requires_grad = True

    get_trainable_params(model)

    trainer:transformers.Trainer = transformers.Trainer(
        model=model, 
        train_dataset=train_dataset,
        args=transformers.TrainingArguments(
            per_device_train_batch_size=1, 
            gradient_accumulation_steps=1,
            # NOTE(review): with lr_scheduler_type="constant" this warmup value is
            # presumably unused ("constant_with_warmup" is the variant that applies
            # warmup) — confirm the intended schedule.
            warmup_steps=30, 
            # Only a positive max_steps overrides num_train_epochs, so one epoch is run.
            max_steps=0, 
            num_train_epochs=1,
            learning_rate=1e-5, 
            fp16=True,
            logging_steps=10, 
            output_dir='outputs',
            report_to="none",
            lr_scheduler_type="constant",
            save_strategy="no"
        ),
        # default_data_collator batches the dataset's own `labels` column unchanged.
        # DataCollatorForLanguageModeling(mlm=False) rebuilds labels from input_ids,
        # which would throw away the -100 prompt mask produced by create_dataset and
        # train on the lecture text as well as the learning objectives.
        data_collator=transformers.default_data_collator
    )
    
    trainer.train()
    model.eval()
    # Pickles the whole model object; this is the file the load branch above reads back.
    torch.save(model, "model.pt")


# Bare annotation for editor/type-checker support; it has no runtime effect.
model:transformers.PreTrainedModel


# model testing

In [None]:
# Settings for the qualitative generation check below.
MAX_CONTEXT_TOKENS = 2048
MAX_NEW_TOKENS = 60
NUM_PREVIEW_EXAMPLES = 5

# The objectives header is tokenized on its own and appended AFTER truncation,
# so a long lecture can never push it out of the prompt.
# NOTE(review): tokenizing the two pieces separately can differ from tokenizing
# the joined string at the boundary (e.g. when the lecture slice ends in whitespace).
objectives_header_ids = tokenizer.encode("\n\n###Learning Objectives:\n\n", add_special_tokens=False)

# Leave room in the context window for the header and for the generated tokens.
lecture_budget = MAX_CONTEXT_TOKENS - MAX_NEW_TOKENS - len(objectives_header_ids)

with torch.no_grad():
    # check if the model works
    # Bounded by len(val_df) so a small validation split does not raise IndexError.
    for row_idx in range(min(NUM_PREVIEW_EXAMPLES, len(val_df))):
        row = val_df.iloc[row_idx]
        lecture = row['lecture']

        lecture_part_ids = tokenizer(
            "###Lecture:\n\n" + lecture[:4500],
            max_length=lecture_budget,
            truncation=True,
        )['input_ids']
        # Only input_ids are passed to generate (no attention mask), as before.
        input_ids = torch.tensor([lecture_part_ids + objectives_header_ids]).cuda()

        # Show the final prompt token (no padding is applied to a single prompt).
        print("last token:", input_ids[0][-1])

        with torch.cuda.amp.autocast():
            output_tokens = model.generate(input_ids=input_ids, max_new_tokens=MAX_NEW_TOKENS, temperature=1, do_sample=True, top_p=0.95, num_return_sequences=1)

        # The decoded sequence contains the prompt followed by the generated objectives.
        print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True), '\n\n')
        for question in row["question_group"].split('\n'):
            print('original:', question)

        print("\n\n\n\n\n")