In [1]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, Trainer, pipeline
from peft import LoraConfig
from datasets import Dataset
import datasets
from trl import SFTTrainer, PPOTrainer

from tqdm import tqdm
#load model name
model_name = "meta-llama/Llama-3.2-3B-Instruct"
import argparse

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model = AutoModelForCausalLM.from_pretrained(model_name,
                                            # low_cpu_mem_usage=True,
                                            torch_dtype = torch.float16,
                                            device_map={'': torch.cuda.current_device()}
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.31it/s]


In [7]:
question = "Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?"

In [8]:
prompt = f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful AI assistant for travel tips and recommendations<|eot_id|><|start_header_id|>user<|end_header_id|>
{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>'''

In [9]:
output = pipeline('text-generation', model=model, tokenizer=tokenizer)
ans = output(prompt.format(question = question), max_length=200)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[{'generated_text': "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are a helpful AI assistant for travel tips and recommendations<|eot_id|><|start_header_id|>user<|end_header_id|>\nBetty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nLet's break it down step by step:\n\n1. Betty needs $100 for the wallet.\n2. She already has half of the money, which is $100 / 2 = $50.\n3. Her parents give her $15.\n4. Her grandparents give her twice as much as her parents, which is $15 x 2 = $30.\n5. So, the total amount of money Betty now has is $50 (her own) + $15 (from parents) + $30 (from grandparents) = $95.\n6. To find out"}]

In [None]:
print(ans[0]['generated_text'])

In [12]:
peft_params = LoraConfig(
    r=128,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
"lm_head",
], 
)

In [None]:
dataset = datasets.load_dataset("gsm8k", "main")

train_dataset, test_dataset = dataset['train'], dataset['test']
# half_dataset = test_dataset.select(range(int(len(test_dataset) // 1.5)))
# test_dataset = [i for i in test_dataset if i not in half_dataset]
# test_dataset = Dataset.from_list(test_dataset)
#add half of the test dataset to the train dataset
train_dataset = datasets.concatenate_datasets([train_dataset, half_dataset])
def preprocess_function(examples):
    inputs = [f"<|im_start|>user\n{prompt}<|im_end|>\n" for prompt in examples["question"]]
    # print(inputs[0])
    targets = [f"<|im_start|>assistant\n{completion}<|im_end|>\n" for completion in examples["answer"]]
    # print(targets[0])
    # model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    model_inputs = tokenizer(inputs,
                            #  max_length=512, 
                            padding = False
                             )
    # labels = tokenizer(targets, max_length=512, truncation=True, padding = True)
    labels = tokenizer(targets,
                      #  max_length=512, 
                      padding = False
                       )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)

tokenized_eval_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=test_dataset.column_names)

Map: 100%|██████████| 8352/8352 [00:00<00:00, 11053.98 examples/s]
Map: 100%|██████████| 440/440 [00:00<00:00, 12004.15 examples/s]


In [9]:
training_params = TrainingArguments(
    output_dir="./gpt2-xl-results",
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    logging_steps=50,
    learning_rate=2e-4,
    logging_dir="./logs",
    save_strategy="epoch",
    # fp16=True,
    bf16 = True,
    optim="adamw_torch"
)

In [13]:
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_eval_dataset,
    peft_config=peft_params,
    max_seq_length=None,
    args=training_params,
    packing=False,
    tokenizer = tokenizer,
    
)



In [14]:
trainer.train()

You are not running the flash-attention implementation, expect numerical differences.


Step,Training Loss
50,0.5885
100,0.499
150,0.4363
200,0.4939
250,0.4351
300,0.5031
350,0.3629
400,0.3558
450,0.4527
500,0.2884




TrainOutput(global_step=41760, training_loss=0.26966988442044604, metrics={'train_runtime': 7206.7136, 'train_samples_per_second': 5.795, 'train_steps_per_second': 5.795, 'total_flos': 7.91839062423552e+16, 'train_loss': 0.26966988442044604, 'epoch': 5.0})

In [4]:
question = "Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?"
prompt1 = '''<|im_start|>user
{question}<|im_end|>

<|im_start|>assistant'''

input1 = prompt1.format(question = question)
print(input1)

<|im_start|>user
Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?<|im_end|>

<|im_start|>assistant


In [27]:
tokenizer("<|im_end|>")['input_ids']

[529, 29989, 326, 29918, 355, 29989, 29958]

In [29]:
tokenizer("<|")['input_ids']

[529, 29989]

In [21]:
output = model.generate(tokenizer(input1, 
return_tensors='pt').input_ids.to('cuda'), 
max_new_tokens = 2048,
eos_token_id = tokenizer("<|im_end|>")['input_ids'],
temperature = 0.2
)
output = tokenizer.decode(output[0])
a = output.split("<|im_start|>assistant\n")[1]
print(a)



Weng earns $12 an hour for babysitting. Since there are 60 minutes in an hour, she earns $12/60 = $0.20 per minute. 
Yesterday, she did 50 minutes of babysitting, so she earned 50 * $0.20 = $10. 
Therefore, Weng earned $10 for babysitting yesterday. 
<|


In [8]:
import os
import time
from groq import Groq
from credential import GROQ_API_KEY
client = Groq(
    api_key= GROQ_API_KEY#get tokenn from groq
)

In [19]:
def generate_answer_groq(prompt):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="llama3-70b-8192",
        temperature=0,
        max_tokens=2048,
    )

    return chat_completion.choices[0].message.content

In [10]:
prefix_prompt = '''Your task is to evaluate a generated answer from a model by comparing it with a correct reference answer. Determine if the generated answer matches the correct answer. If the generated answer is correct, respond with 1. If it is incorrect, respond with 0. Do not provide any other responses besides 1 or 0.'''


In [33]:
def generate_prompt(question, answer, i):
    input = f'''<|im_start|>user{question}<|im_end|>assistant'''
    output = model.generate(tokenizer(input, 
return_tensors='pt').input_ids.to('cuda'), 
max_new_tokens = 2048,
eos_token_id = tokenizer("<|im_end|>")['input_ids'],
temperature = 0.2
)
    answer_model = tokenizer.decode(output[0])
    
    try:
        answer_model = answer_model.split("assistant")[1]
    except:
        print(f'ans from question {i} is not in the correct format')
        # print(answer_model)
    return f'''{prefix_prompt}\n\ntrue: {answer}\n\npred: {answer_model}'''

In [29]:
import time
from tqdm import tqdm

In [38]:
cnt = 0
for question, answer in test_dataset:
    for i in tqdm(range(len(test_dataset))):
        
        prompt = generate_prompt(test_dataset[question][i], test_dataset[answer][i],i)
        # print(prompt)
        score = generate_answer_groq(prompt)
        if score == 1 or score == '1':
            cnt += 1
        time.sleep(0.2)
        # print("\n")
print(cnt)
        

  0%|          | 0/440 [00:01<?, ?it/s]


KeyboardInterrupt: 

In [None]:

output = model.generate(tokenizer(input1, 
return_tensors='pt').input_ids.to('cuda'), 
max_new_tokens = 2048,
eos_token_id = tokenizer("<|im_end|>")['input_ids']
)
answer_model = tokenizer.decode(output[0])
true_answer