In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import regex as re
import os
import torch
from datasets import Dataset
from sklearn.model_selection import *
np.bool = bool
import torch.nn as nn
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch.nn.functional as F
# import regex as re
# from datasets import load_dataset, Dataset, DatasetDict
# import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader
# import wandb


print(torch.cuda.is_available())

data = pd.read_parquet("merged.pq")
data

True


Unnamed: 0,lecture,question_group
0,Watch a video about Evolution by Natural Selec...,Identify and describe the properties of life\n...
1,Watch a video about the Scientific Method. Fig...,Identify the shared characteristics of the nat...
2,Watch a video about electrons and how the elec...,Describe matter and elements\nDescribe the int...
3,Watch a video about why we need oxygen and how...,Describe the properties of water that are crit...
4,Watch a video about proteins and protein enzym...,Describe the ways in which carbon is critical ...
...,...,...
801,Types of Higher Education Programs Today’s stu...,Read an example of the classification rhetoric...
802,How to Grow Tomatoes from a Seedling Growing t...,Read an example of the process analysis rhetor...
803,Defining Good Students Means More than Just Gr...,Read an example of the definition rhetorical mode
804,Comparing and Contrasting London and Washingto...,Read an example of the compare and contrast rh...


In [3]:
train_df, val_df = train_test_split(data, test_size=0.2, random_state=42)
# train_df = train_df[:10]
# val_df = val_df[:10]

In [4]:
model_name="microsoft/phi-1_5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def create_dataset(df):
    data_dict = {
        'input_ids': [],
        'attention_mask': [],
        'labels': []
    }

    for _, row in df.iterrows():
        lecture = row['lecture']
        question_group = row['question_group']

        # Tokenizing a prompt without the lecture to determine its size
        prompt_without_lecture = "###Lecture:\n\n" + " " + "\n\n###Learning Objectives:\n\n" + question_group
        prompt_without_lecture_length = len(tokenizer.tokenize(prompt_without_lecture))
        max_lecture_length = 2048 - prompt_without_lecture_length

        # Truncate the lecture
        avg_token_size = 3
        max_lecture_characters = (max_lecture_length * avg_token_size) 
        truncated_lecture = lecture[:max_lecture_characters]

        # Tokenize
        prompt = "###Lecture:\n\n" + truncated_lecture + "\n\n###Learning Objectives:\n\n" + question_group
        tokens = tokenizer(prompt, truncation=True, max_length=2048, padding='max_length', return_tensors='pt')

        input_ids = tokens['input_ids'].squeeze().tolist()
        attention_mask = tokens['attention_mask'].squeeze().tolist()

        # Labeling: -100 for all tokens except those in question_group
        labels = -100 * torch.ones_like(tokens['input_ids']).squeeze()
        question_tokens = tokenizer.encode(question_group, add_special_tokens=False)
        idx_start_positions = (tokens['input_ids'].squeeze() == question_tokens[0]).nonzero().squeeze()

        # Check if any starting positions were found
        if idx_start_positions.numel() == 0:  # No positions found
            continue
        elif len(idx_start_positions.size()) == 0:  # Only one position found
            idx_start = idx_start_positions.item()
        else:  # Multiple positions found
            idx_start = idx_start_positions[0].item()
        
        labels[idx_start:idx_start+len(question_tokens)] = tokens['input_ids'].squeeze()[idx_start:idx_start+len(question_tokens)]
        labels = labels.tolist()

        data_dict['input_ids'].append(input_ids)
        data_dict['attention_mask'].append(attention_mask)
        data_dict['labels'].append(labels)

    return Dataset.from_dict(data_dict)




train_dataset = create_dataset(train_df)
val_dataset = create_dataset(val_df)


In [5]:
def get_trainable_params(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [6]:
example = train_dataset.__getitem__(1)
for i in range(2048):
    print("token_id:", example['input_ids'][i], "|| attention_mask:", example['attention_mask'][i], "|| label:", example['labels'][i], "original text:", tokenizer.decode(example['input_ids'][i]))

token_id: 21017 || attention_mask: 1 || label: -100 original text: ###
token_id: 43 || attention_mask: 1 || label: -100 original text: L
token_id: 478 || attention_mask: 1 || label: -100 original text: ect
token_id: 495 || attention_mask: 1 || label: -100 original text: ure
token_id: 25 || attention_mask: 1 || label: -100 original text: :
token_id: 198 || attention_mask: 1 || label: -100 original text: 

token_id: 198 || attention_mask: 1 || label: -100 original text: 

token_id: 464 || attention_mask: 1 || label: -100 original text: The
token_id: 26632 || attention_mask: 1 || label: -100 original text:  Elements
token_id: 286 || attention_mask: 1 || label: -100 original text:  of
token_id: 262 || attention_mask: 1 || label: -100 original text:  the
token_id: 3611 || attention_mask: 1 || label: -100 original text:  General
token_id: 9344 || attention_mask: 1 || label: -100 original text:  Environment
token_id: 25 || attention_mask: 1 || label: -100 original text: :
token_id: 350 || att

# model training

In [7]:


load_saved_model = False
# load_saved_model = True

if load_saved_model:
    model = torch.load("model.pt")
    model.eval()

else:
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).cuda()
    model.train()

    for param in model.parameters():
        param.requires_grad = False
    for param in model.layers[-11:].parameters():
        param.requires_grad = True

    get_trainable_params(model)

    trainer:transformers.Trainer = transformers.Trainer(
        model=model, 
        train_dataset=train_dataset,
        args=transformers.TrainingArguments(
            per_device_train_batch_size=1, 
            gradient_accumulation_steps=1,
            warmup_steps=30, 
            max_steps=0, 
            num_train_epochs=2,
            learning_rate=1e-5, 
            fp16=True,
            logging_steps=10, 
            output_dir='outputs',
            report_to="none",
            lr_scheduler_type="constant",
            save_strategy="no",
            evaluation_strategy="epoch"
        ),
        eval_dataset=val_dataset,
        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
    )
    
    trainer.train()
    model.eval()
    torch.save(model, "model.pt")


model:transformers.PreTrainedModel


trainable params: 608454656 || all params: 1418270720 || trainable%: 42.90116459571273

Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin c:\ProgramData\Anaconda3\lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.dll
CUDA SETUP: CUDA runtime path found: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin\cudart64_110.dll
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary c:\ProgramData\Anaconda3\lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.dll...


  warn(msg)
  warn(msg)


  0%|          | 0/1288 [00:00<?, ?it/s]

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.5438, 'learning_rate': 1e-05, 'epoch': 0.02}
{'loss': 2.4443, 'learning_rate': 1e-05, 'epoch': 0.03}
{'loss': 2.2723, 'learning_rate': 1e-05, 'epoch': 0.05}
{'loss': 2.4036, 'learning_rate': 1e-05, 'epoch': 0.06}
{'loss': 2.2514, 'learning_rate': 1e-05, 'epoch': 0.08}
{'loss': 2.5326, 'learning_rate': 1e-05, 'epoch': 0.09}
{'loss': 2.404, 'learning_rate': 1e-05, 'epoch': 0.11}
{'loss': 2.3788, 'learning_rate': 1e-05, 'epoch': 0.12}
{'loss': 2.3714, 'learning_rate': 1e-05, 'epoch': 0.14}
{'loss': 2.2638, 'learning_rate': 1e-05, 'epoch': 0.16}
{'loss': 2.156, 'learning_rate': 1e-05, 'epoch': 0.17}
{'loss': 2.4223, 'learning_rate': 1e-05, 'epoch': 0.19}
{'loss': 2.1931, 'learning_rate': 1e-05, 'epoch': 0.2}
{'loss': 2.1037, 'learning_rate': 1e-05, 'epoch': 0.22}
{'loss': 2.3846, 'learning_rate': 1e-05, 'epoch': 0.23}
{'loss': 2.3, 'learning_rate': 1e-05, 'epoch': 0.25}
{'loss': 2.2947, 'learning_rate': 1e-05, 'epoch': 0.26}
{'loss': 2.3094, 'learning_rate': 1e-05, 'epoch': 0.28

  0%|          | 0/21 [00:00<?, ?it/s]

{'eval_loss': 2.2807137966156006, 'eval_runtime': 129.0679, 'eval_samples_per_second': 1.255, 'eval_steps_per_second': 0.163, 'epoch': 1.0}
{'loss': 2.103, 'learning_rate': 1e-05, 'epoch': 1.01}
{'loss': 2.0404, 'learning_rate': 1e-05, 'epoch': 1.02}
{'loss': 2.0909, 'learning_rate': 1e-05, 'epoch': 1.04}
{'loss': 2.1279, 'learning_rate': 1e-05, 'epoch': 1.06}
{'loss': 2.0235, 'learning_rate': 1e-05, 'epoch': 1.07}
{'loss': 1.9387, 'learning_rate': 1e-05, 'epoch': 1.09}
{'loss': 2.0016, 'learning_rate': 1e-05, 'epoch': 1.1}
{'loss': 2.0385, 'learning_rate': 1e-05, 'epoch': 1.12}
{'loss': 2.1281, 'learning_rate': 1e-05, 'epoch': 1.13}
{'loss': 2.0417, 'learning_rate': 1e-05, 'epoch': 1.15}
{'loss': 1.9699, 'learning_rate': 1e-05, 'epoch': 1.16}
{'loss': 2.0572, 'learning_rate': 1e-05, 'epoch': 1.18}
{'loss': 2.0917, 'learning_rate': 1e-05, 'epoch': 1.2}
{'loss': 2.1962, 'learning_rate': 1e-05, 'epoch': 1.21}
{'loss': 1.9323, 'learning_rate': 1e-05, 'epoch': 1.23}
{'loss': 1.9052, 'learn

  0%|          | 0/21 [00:00<?, ?it/s]

{'eval_loss': 2.2500364780426025, 'eval_runtime': 128.9613, 'eval_samples_per_second': 1.256, 'eval_steps_per_second': 0.163, 'epoch': 2.0}
{'train_runtime': 960.269, 'train_samples_per_second': 1.341, 'train_steps_per_second': 1.341, 'train_loss': 2.145574182457065, 'epoch': 2.0}


# model testing

In [9]:
with torch.no_grad():
  # check if the model works
  for id in range(0,5):
    lecture = val_df.iloc[id]['lecture']
    prompt = "###Lecture:\n\n" + lecture[:4500] + "\n\n###Learning Objectives number 1 to 10:\n\n1. Describe"

    example_batch = tokenizer(prompt, return_tensors='pt', return_attention_mask=False, max_length=2048, truncation=True)
    example_batch = {k: v.cuda() for k, v in example_batch.items()}

    with torch.cuda.amp.autocast():
      output_tokens = model.generate(**example_batch, max_new_tokens=60, temperature=1, do_sample=True, top_p=0.95, num_return_sequences=1)

    print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True), '\n\n')
    for question in val_df.iloc[id]["question_group"].split('\n'):
      print('original:', question)

    print("\n\n\n\n\n")



 ###Lecture:

11 Key Considerations Regardless of where your open working group may fall on the spectrum of formal to informal, there are certain things to consider doing and places you can look for support. Keep a record Kick off your committee by establishing a shared digital place where agendas, minutes, best practices, and other documents can reside. Avoid documents becoming orphaned in individual emails. Inventory the different ways to communicate with your community at your institution and establish when, how, and what you will communicate out from your group. One approach that has been taken by a number of open working groups is to consider an open way to document and keep a record. At the University of British Columbia, the UBC Wiki (MediaWiki) is used for sharing all agendas, activities, and members in the open. You may want to look at the Open Ed Tech Collaborative apps available via Sandstorm for collaborative editing tools that will allow you to share and edit documents. 