In [21]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [26]:
import json
import random

def split_json(json_file, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    with open(json_file, 'r') as f:
        data = json.load(f)
    
    random.shuffle(data)
    
    total_size = len(data)
    train_size = int(total_size * train_ratio)
    val_size = int(total_size * val_ratio)
    
    train_data = data[:train_size]
    val_data = data[train_size:train_size + val_size]
    test_data = data[train_size + val_size:]
    
    with open('train.json', 'w') as f:
        json.dump(train_data, f, indent=4)
    with open('val.json', 'w') as f:
        json.dump(val_data, f, indent=4)
    with open('test.json', 'w') as f:
        json.dump(test_data, f, indent=4)

split_json('data_final.json')

In [8]:
from datasets import load_dataset

train_dataset = load_dataset('json', data_files="train.json")
eval_dataset = load_dataset('json', data_files="val.json")
test_dataset = load_dataset('json', data_files='test.json')

In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")

In [22]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "mistralai/Mistral-7B-v0.1"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        quantization_config=bnb_config,
        cache_dir=".cache",
        token=HF_TOKEN
)

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length=512,
    padding_side="left",
    add_eos_token=True,
    token=HF_TOKEN
)

tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards: 100%|██████████| 2/2 [00:18<00:00,  9.26s/it]


In [23]:
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

def generate_and_tokenize_prompt(data_point):
    full_prompt =f"""You are Ranker, a Search model whose job is to identify which Search Results are most relevant to the provided query.
You will be given a query and 5 Search Results related to the query.
Rank the Search Results in the order of how well they answer the query or how related they are to the query.
Return only a list containing the ranked ordering of the Search Results. The list must contain only the indexes of the Search Results.

### Query:
{data_point["query"]}

### Search Results:
{data_point["search_results"]}

### Output: {data_point["output"]}
"""
    return tokenize(full_prompt)


In [26]:
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)

eval_prompt = """You are Ranker, a Search model whose job is to identify which Search Results are most relevant to the provided query.
You will be given a query and 5 Search Results related to the query.
Rank the Search Results in the order of how well they answer the query or how related they are to the query.
Return only a list containing the ranked ordering of the Search Results. The list must contain only the indexes of the Search Results.

### Search Results:
Search Result 1: Products - Health E Stats - Prevalence of Overweight, Obesity, and Extreme Obesity Among Adults Aged 20 and Over: United States, 1960\u20131962 Through 2017\u20132018\nSnippet: Crude estimates (not age adjusted) for 2017\u20132018 are 31.1% for overweight, 42.5% for obesity, and 9.0% for severe obesity categories. 2Age adjusted by the direct method to the U.S. Census 2000 estimates using the age groups 20\u201339, 40\u201359, and 60\u201374. The 1960\u20131962 National Health ...\n\nSearch Result 2: Obesity and overweight\nSnippet: In 2022, 2.5 billion adults (18 years and older) were overweight. Of these, 890 million were living with obesity. In 2022, 43% of adults aged 18 years and over were overweight and 16% were living with obesity.\n\nSearch Result 3: Overweight & Obesity Statistics - NIDDK\nSnippet: Trends in overweight, obesity, and severe obesity for children, adolescents, and adults.\n\nSearch Result 4: FastStats - Overweight Prevalence\nSnippet: Percent of adults age 20 and older with overweight, including obesity: 73.6% (2017-2018) Source: Prevalence of Overweight, Obesity, and Severe Obesity Among Adults Ages 20 and Older: United States, 1960-1962 Through 2017-2018\n\nSearch Result 5: Adult Obesity Facts | Overweight & Obesity | CDC\nSnippet: [Read CDC National Center for Health ... non-Hispanic Asian adults (16.1%). The obesity prevalence was 39.8% among adults aged 20 to 39 years, 44.3% among adults aged 40 to 59 years, and 41.5% among adults aged 60 and older....\n\n

### Output:
"""

# Re-init the tokenizer so it doesn't add padding or eos token
eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
)

model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(eval_tokenizer.decode(model.generate(**model_input, max_new_tokens=16)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


You are Ranker, a Search model whose job is to identify which Search Results are most relevant to the provided query.
You will be given a query and 5 Search Results related to the query.
Rank the Search Results in the order of how well they answer the query or how related they are to the query.
Return only a list containing the ranked ordering of the Search Results. The list must contain only the indexes of the Search Results.

### Search Results:
Search Result 1: Products - Health E Stats - Prevalence of Overweight, Obesity, and Extreme Obesity Among Adults Aged 20 and Over: United States, 1960–1962 Through 2017–2018
Snippet: Crude estimates (not age adjusted) for 2017–2018 are 31.1% for overweight, 42.5% for obesity, and 9.0% for severe obesity categories. 2Age adjusted by the direct method to the U.S. Census 2000 estimates using the age groups 20–39, 40–59, and 60–74. The 1960–1962 National Health ...

Search Result 2: Obesity and overweight
Snippet: In 2022, 2.5 billion adults (18 

In [41]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [42]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

# Apply the accelerator. You can comment this out to remove the accelerator.
model = accelerator.prepare_model(model)

trainable params: 21260288 || all params: 3773331456 || trainable%: 0.5634354746703705


In [49]:
import transformers
from datetime import datetime

project = "finetuned"
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset["train"],
    eval_dataset=tokenized_val_dataset["train"],
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=5,
        per_device_train_batch_size=4,
        gradient_checkpointing=True,
        gradient_accumulation_steps=4,
        max_steps=1000,
        learning_rate=2.5e-5,
        logging_steps=50,
        fp16=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",
        save_strategy="steps",
        save_steps=50, 
        evaluation_strategy="steps",
        eval_steps=50,
        do_eval=True,
        report_to="wandb",
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Step,Training Loss,Validation Loss
50,1.1265,1.112961
100,1.0818,1.093339
150,1.0461,1.081161
200,1.0254,1.0757
250,0.9868,1.077971
300,0.9625,1.08408
350,0.9282,1.083759
400,0.8932,1.09977
450,0.8618,1.113886
500,0.8271,1.137496


Checkpoint destination directory ./mistral-viggo-finetune/checkpoint-50 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./mistral-viggo-finetune/checkpoint-100 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./mistral-viggo-finetune/checkpoint-150 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./mistral-viggo-finetune/checkpoint-200 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./mistral-viggo-finetune/checkpoint-250 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./mistral-viggo-finetune/checkpoint-300 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./mistral-vigg

TrainOutput(global_step=1000, training_loss=0.8426206741333008, metrics={'train_runtime': 6031.7671, 'train_samples_per_second': 5.305, 'train_steps_per_second': 0.166, 'total_flos': 6.988615569679319e+17, 'train_loss': 0.8426206741333008, 'epoch': 17.02})

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "mistralai/Mistral-7B-v0.1"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Mistral, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="auto",
    trust_remote_code=True,
    cache_dir=".cache",
    token=HF_TOKEN
)

eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
    trust_remote_code=True,
    cache_dir=".cache",
    token=HF_TOKEN
)

[2024-05-01 10:28:11,735] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Loading checkpoint shards: 100%|██████████| 2/2 [00:21<00:00, 10.53s/it]


In [25]:
from peft import PeftModel

ft_model = PeftModel.from_pretrained(base_model, "mistral-finetuned/checkpoint-1000")

In [5]:
eval_prompt = """You are Ranker, a Search model whose job is to identify which Search Results are most relevant to the provided query.
You will be given a query and 5 Search Results related to the query.
Rank the Search Results in the order of how well they answer the query or how related they are to the query.
Return only a list containing the ranked ordering of the Search Results. The list must contain only the indexes of the Search Results.

### Search Results:
Search Result 1: Products - Health E Stats - Prevalence of Overweight, Obesity, and Extreme Obesity Among Adults Aged 20 and Over: United States, 1960\u20131962 Through 2017\u20132018\nSnippet: Crude estimates (not age adjusted) for 2017\u20132018 are 31.1% for overweight, 42.5% for obesity, and 9.0% for severe obesity categories. 2Age adjusted by the direct method to the U.S. Census 2000 estimates using the age groups 20\u201339, 40\u201359, and 60\u201374. The 1960\u20131962 National Health ...\n\nSearch Result 2: Obesity and overweight\nSnippet: In 2022, 2.5 billion adults (18 years and older) were overweight. Of these, 890 million were living with obesity. In 2022, 43% of adults aged 18 years and over were overweight and 16% were living with obesity.\n\nSearch Result 3: Overweight & Obesity Statistics - NIDDK\nSnippet: Trends in overweight, obesity, and severe obesity for children, adolescents, and adults.\n\nSearch Result 4: FastStats - Overweight Prevalence\nSnippet: Percent of adults age 20 and older with overweight, including obesity: 73.6% (2017-2018) Source: Prevalence of Overweight, Obesity, and Severe Obesity Among Adults Ages 20 and Older: United States, 1960-1962 Through 2017-2018\n\nSearch Result 5: Adult Obesity Facts | Overweight & Obesity | CDC\nSnippet: [Read CDC National Center for Health ... non-Hispanic Asian adults (16.1%). The obesity prevalence was 39.8% among adults aged 20 to 39 years, 44.3% among adults aged 40 to 59 years, and 41.5% among adults aged 60 and older....\n\n

### Output:
"""

model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

ft_model.eval()
with torch.no_grad():
    print(eval_tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=15)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


You are Ranker, a Search model whose job is to identify which Search Results are most relevant to the provided query.
You will be given a query and 5 Search Results related to the query.
Rank the Search Results in the order of how well they answer the query or how related they are to the query.
Return only a list containing the ranked ordering of the Search Results. The list must contain only the indexes of the Search Results.

### Search Results:
Search Result 1: Products - Health E Stats - Prevalence of Overweight, Obesity, and Extreme Obesity Among Adults Aged 20 and Over: United States, 1960–1962 Through 2017–2018
Snippet: Crude estimates (not age adjusted) for 2017–2018 are 31.1% for overweight, 42.5% for obesity, and 9.0% for severe obesity categories. 2Age adjusted by the direct method to the U.S. Census 2000 estimates using the age groups 20–39, 40–59, and 60–74. The 1960–1962 National Health ...

Search Result 2: Obesity and overweight
Snippet: In 2022, 2.5 billion adults (18 

In [18]:
import numpy as np

def precision_at_k(true_ranking, predicted_ranking, k):
    true_set = set(true_ranking[:k])
    predicted_set = set(predicted_ranking[:k])
    return len(true_set.intersection(predicted_set)) / k

def recall_at_k(true_ranking, predicted_ranking, k):
    true_set = set(true_ranking[:k])
    predicted_set = set(predicted_ranking[:k])
    return len(true_set.intersection(predicted_set)) / len(true_set)

def average_precision(true_ranking, predicted_ranking, k=None):
    if k is None:
        k = len(true_ranking)
    precisions = [precision_at_k(true_ranking, predicted_ranking, i+1) for i in range(k)]
    return np.mean(precisions)

def mean_recall_at_k(true_rankings, predicted_rankings, k):
    recalls = [recall_at_k(true, pred, k) for true, pred in zip(true_rankings, predicted_rankings)]
    return np.mean(recalls)

def mean_average_precision(true_rankings, predicted_rankings, k=None):
    average_precisions = [average_precision(true, pred, k) for true, pred in zip(true_rankings, predicted_rankings)]
    return np.mean(average_precisions)

def ndcg_at_k(true_ranking, predicted_ranking, k):
    dcg = 0
    idcg = 0
    for i in range(k):
        if i < len(true_ranking):
            dcg += 1 / np.log2(i + 2) if predicted_ranking[i] in true_ranking else 0
            idcg += 1 / np.log2(i + 2)
    return dcg / idcg

def mean_ndcg(true_rankings, predicted_rankings, k=None):
    ndcgs = [ndcg_at_k(true, pred, k) for true, pred in zip(true_rankings, predicted_rankings)]
    return np.mean(ndcgs)

def reciprocal_rank(true_ranking, predicted_ranking):
    for i, item in enumerate(predicted_ranking):
        if item in true_ranking:
            return 1 / (i + 1)
    return 0

def mean_reciprocal_rank(true_rankings, predicted_rankings):
    rr = [reciprocal_rank(true, pred) for true, pred in zip(true_rankings, predicted_rankings)]
    return np.mean(rr)

In [16]:
import json 

true_lst = []
predicted_lst = []

for data_point in test_dataset["train"]:
    eval_prompt = f"""You are Ranker, a Search model whose job is to identify which Search Results are most relevant to the provided query.
You will be given a query and 5 Search Results related to the query.
Rank the Search Results in the order of how well they answer the query or how related they are to the query.
Return only a list containing the ranked ordering of the Search Results. The list must contain only the indexes of the Search Results.

### Query:
{data_point["query"]}

### Search Results:
{data_point["search_results"]}

### Output:
"""

    model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

    ft_model.eval()
    with torch.no_grad():
        response = (eval_tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=15)[0], skip_special_tokens=True, pad_token_id=eval_tokenizer.eos_token_id))
        output_part = response.split("### Output:", 1)[1]
        try:
            output_list_str = json.loads(output_part.strip())
        except Exception as e:
            print(f"Couldn't json parse")
            print(output_list_str)
            continue
        
        predicted = [item for item in output_list_str]
        
        predicted_lst.append(predicted)
        
        true = json.loads(data_point["output"])
        
        true_lst.append(true)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

Couldn't json parse
[5, 1, 4, 2, 3]


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

Couldn't json parse
[1, 2, 3, 4, 5]


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [19]:
print("Mean Precision @1:", mean_average_precision(true_lst, predicted_lst, 1))
print("Mean Recall @1:", mean_recall_at_k(true_lst, predicted_lst, 1))
print("Mean NDCG @1:", mean_ndcg(true_lst, predicted_lst, 1))
print("Mean Precision @3:", mean_average_precision(true_lst, predicted_lst, 3))
print("Mean Recall @3:", mean_recall_at_k(true_lst, predicted_lst, 3))
print("Mean NDCG @3:", mean_ndcg(true_lst, predicted_lst, 1))
print("Mean Reciprocal Rank:", mean_reciprocal_rank(true_lst, predicted_lst))

Mean Precision @1: 0.5321888412017167
Mean Recall @1: 0.5321888412017167
Mean NDCG @1: 1.0
Mean Precision @3: 0.6671435383881736
Mean Recall @3: 0.7954220314735335
Mean NDCG @3: 1.0
Mean Reciprocal Rank: 1.0
