In [1]:
from datasets import load_dataset
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding
from sentence_transformers import InputExample
from dotenv import load_dotenv
import os
from sentence_transformers import  CrossEncoder
from torch.utils.data import DataLoader
from nvidia_smi import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
from eval import MSEEval

In [2]:
# accelerator = Accelerator()

device = "cuda:1"

In [3]:
val_set_size = 0.05

base_model = "BAAI/bge-reranker-large"
dataset = load_dataset("parquet", data_files="../data/dataset/itrf_dataset_reranker_processed.parquet")
output_dir = "../models/itrf_reranker-large"
len(dataset["train"])
# open("../data/dataset/itrf_dataset_llm.parquet")

# split the data to train/val set
train_val = dataset["train"].train_test_split(
    test_size=val_set_size, shuffle=True, seed=2024
)
train_data = train_val["train"].shuffle(seed=2024)
val_data = train_val["test"].shuffle(seed=2024)

In [4]:
train_samples = [InputExample(texts=[ex["query"], ex["context"]], label=ex["llm_weighted_softmax"]) for ex in train_data]
val_samples = [InputExample(texts=[ex["query"], ex["context"]], label=ex["llm_weighted_softmax"]) for ex in val_data]

In [5]:
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=16)
val_dataloader = DataLoader(val_samples, shuffle=True, batch_size=3)
evaluator = MSEEval(val_dataloader, )

In [6]:
# Training
cross_encoder = CrossEncoder(base_model, num_labels=1, device=device)

In [7]:
cross_encoder.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=1,
    warmup_steps=100,
    evaluation_steps=3,
    output_path=output_dir,
    save_best_model=True,
    use_amp=True,
    scheduler= 'warmupcosine',
    show_progress_bar=True,
)


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/16965 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/4762 [00:00<?, ?it/s]

ValueError: too many values to unpack (expected 2)

In [17]:
torch.cuda.empty_cache()

In [None]:
# optimizer_params={"lr": 2e-5, "eps": 1e-6, "correct_bias": False}, # Using AdamW optimizer
# weight_decay=0.01, # Using stardard weight decay
# optimizer_class=torch.optim.AdamW, # Using AdamW optimizer
# activation_fct=torch.nn.Sigmoid(), # Using BCEWithLogitsLoss so no activation is needed
# num_workers=4,
# accelerator=accelerator,
# loss_fct = None, # Using default loss function (BCEWithLogitsLoss)
# max_grad_norm: float = 1,

In [None]:
from LM_Cocktail import mix_models, mix_models_with_data

# By merging the fine-tuned model and the base model, LM-Cocktail can significantly enhance performance in downstream task while maintaining performance in other unrelated tasks.
# Optional (https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/reranker)
# Mix fine-tuned model and base model; then save it to output_path: ./mixed_model_1
model = mix_models(
    model_names_or_paths=["BAAI/bge-reranker-base", "your_fine-tuned_model"], 
    model_type='reranker', 
    weights=[0.5, 0.5],  # you can change the weights to get a better trade-off.
    output_path='./mixed_model_1')

# Deprecated

In [None]:
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [None]:
max_seq_length = 2048 # Supports automatic RoPE Scaling, so choose any number
packing = True # Packing multiple examples into one sequence
base_model = "BAAI/bge-reranker-large"
output_dir = "../models/itrf/reranker"

# Load environment variables from .env file
load_dotenv("../.env")

# Access the environment variable
token = os.getenv('HUGGINGFACE_TOKEN')
write_token = os.getenv('HUGGINGFACE__WRITE_TOKEN')

# Load model
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-base', device_map=device)
model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-base', device_map=device)
model.eval()


In [None]:
pairs = [['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]
with torch.no_grad():
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
    scores = model(**inputs, return_dict=True).logits.view(-1, ).float()
    print([i for i in torch.nn.functional.sigmoid(scores)])

In [None]:
tokenized = tokenizer.predict(
            (pairs[1][0],
            pairs[1][1]),
            # truncation=True,
            # padding=False,
        )

In [None]:
datacollator = DataCollatorWithPadding(tokenizer)

datacollator([tokenized])