In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1,2,3,4,5,6,7"

In [2]:
import pandas as pd


data = pd.read_csv("data/train.csv")

In [5]:
# Installing More Dependencies
import torch
from datasets import load_dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer, SFTConfig
import os
from typing import List
from torch.utils.data import Dataset, DataLoader


In [6]:
class DatasetBase(Dataset):
    def __getitem__(self, index: int):
        # return self.data[index]
        item = self.data[index]
        # Explicitly return only the required fields
        return {
            "text": item["text"],
            "label": item["label"]
        }

    def __len__(self):
        return len(self.data)

In [None]:
class ConversationDataset(DatasetBase):
    def __init__(self, data: List[dict], tokenizer: AutoTokenizer):
        system_prompt = "You are an math AI tutor respond to a student based on the conversation history to solve a math question."
        self.data = [
            {
                "text": tokenizer.apply_chat_template([
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"Math problem: {sample['question']}\nStudent incorrect solution: {sample['student_incorrect_solution']}\nConversation history: {sample['conversation']}"}
                ], tokenize=False),
                "label": sample["teacher_responses"]
            }
            for sample in data
        ]

In [None]:
class DataCollator:
    def __init__(self, tokenizer: AutoTokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
    
        all_prompts = [sample["text"] for sample in batch]
        prompts_tokenized = self.tokenizer(all_prompts, return_tensors="pt", padding=True)

        all_inputs = [sample["text"] + sample["label"] + self.tokenizer.eos_token for sample in batch]
        inputs_tokenized = self.tokenizer(all_inputs, return_tensors="pt", padding=True)
        prompt_lens = prompts_tokenized.attention_mask.sum(dim=1)
        labels = inputs_tokenized.input_ids.clone()
        padding_mask = torch.arange(labels.shape[1]).repeat(labels.shape[0], 1) < prompt_lens.unsqueeze(1)
        labels[padding_mask] = -100
        labels = labels.masked_fill(inputs_tokenized.attention_mask == 0, -100)
        return {
            "input_ids": inputs_tokenized.input_ids,
            "attention_mask": inputs_tokenized.attention_mask,
            "labels": labels
        }

In [9]:
train_data = []
for idx, row in data[:500].iterrows():
    
    for idx, conversation in enumerate(row["format_conversation_teacher"]): 
        train_data.append({
            "question": row["question"],
            "student_incorrect_solution": row["student_incorrect_solution"],
            "conversation": conversation,
            "teacher_responses": row["teacher_responses"][idx]
            })


In [10]:
test_data = train_data[int(len(train_data) * 0.9):]
train_data = train_data[:int(len(train_data) * 0.9)]

In [None]:
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=64,
        bias="none",
        target_modules= ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        task_type="CAUSAL_LM", 
)

In [None]:
training_arguments = SFTConfig(
        output_dir="sft_test",              ### output directory
        per_device_train_batch_size=4,
        gradient_accumulation_steps=16,
        optim="paged_adamw_32bit",
        learning_rate=1e-4,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=10,
        num_train_epochs=3,
        max_steps=200,
        fp16=True,
        push_to_hub=False,
        max_seq_length=1024,
        remove_unused_columns=False,        ### need to include for including "text"
        label_names=["labels"]
    )


In [None]:
def get_model_and_tokenizer(model_id):
  tokenizer = AutoTokenizer.from_pretrained(model_id)
  tokenizer.pad_token = tokenizer.eos_token
  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="float16", bnb_4bit_use_double_quant=True
  )
  model = AutoModelForCausalLM.from_pretrained(
      model_id, quantization_config=bnb_config, device_map="auto", 
  )
  model.config.use_cache=False
  model.config.pretraining_tp=1
  return model, tokenizer

model_id="meta-llama/Meta-Llama-3.1-8B-Instruct"

model, tokenizer = get_model_and_tokenizer(model_id)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
trainer = SFTTrainer(
        model=model,
        train_dataset=ConversationDataset(train_data, tokenizer),
        eval_dataset=ConversationDataset(test_data, tokenizer),
        data_collator=DataCollator(tokenizer),
        peft_config=peft_config,
        args=training_arguments,
        tokenizer=tokenizer,
    )

  trainer = SFTTrainer(


In [28]:
# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

# save model at the end of training
trainer.save_model()

Step,Training Loss
10,2.4262
20,1.8428
30,1.8379
40,1.6544
50,1.6816
60,1.649
70,1.6145
80,1.5564
90,1.5628
100,1.4732
