In [None]:
!pip install sentence_transformers

In [None]:
from sentence_transformers import CrossEncoder, InputExample
from torch.utils.data import DataLoader
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
import math
import pandas as pd
import torch

# Load the mined (query, document, label) dataset and split it for training/evaluation.
MAX_LENGTH = 512  # Reduced MAX_LENGTH
TRAIN_FRACTION = 0.9  # Share of rows used for training; the remainder is held out for evaluation

df = pd.read_csv('/kaggle/input/finetuune/corpus_part_6.csv')
df = df.dropna(subset=['query', 'document', 'label'])  # Remove rows with missing values
# Text columns must be strings and the label a float for InputExample.
df = df.astype({'query': str, 'document': str, 'label': float})

# Display dataset info
df.info()

# Split data positionally: the first TRAIN_FRACTION of rows (90%) for training,
# the remaining rows (10%) for evaluation.
# NOTE(review): rows are not shuffled before the split, so the evaluation set is
# simply the tail of the CSV -- confirm the file is not ordered by query or label.
split_idx = int(TRAIN_FRACTION * len(df))
train_df = df.iloc[:split_idx]
eval_df = df.iloc[split_idx:]

# Prepare training and evaluation data.
# The columns are zipped directly instead of calling DataFrame.iterrows(), which
# constructs a Series object for every row and was run three times over the data.
train_data = [
    InputExample(texts=[query, document], label=label)
    for query, document, label in zip(
        train_df['query'], train_df['document'], train_df['label']
    )
]

# Evaluation inputs: a list of [query, document] pairs and matching integer labels.
eval_pairs = [
    [query, document]
    for query, document in zip(eval_df['query'], eval_df['document'])
]
eval_labels = [int(label) for label in eval_df['label']]

# Evaluate on a smaller subset (the first EVAL_SUBSET_SIZE held-out pairs).
EVAL_SUBSET_SIZE = 1000
evaluator = CEBinaryClassificationEvaluator(
    sentence_pairs=eval_pairs[:EVAL_SUBSET_SIZE],
    labels=eval_labels[:EVAL_SUBSET_SIZE],
    name="train-eval"
)

# Initialize CrossEncoder model from the previously fine-tuned checkpoint.
model_name = "/kaggle/input/bge_finetune_5file/transformers/default/1/bge_reranker_finetune_24Dec_latest4"

# Pick the device explicitly: use the GPU when one is present, otherwise fall back
# to CPU instead of raising on an unconditional .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cpu':
    print("WARNING: no CUDA device available, running on CPU")

# Passing `device` to the constructor keeps the CrossEncoder's own target device
# in agreement with where the underlying model is placed below.
cross_encoder = CrossEncoder(model_name, num_labels=1, max_length=MAX_LENGTH, device=device)
cross_encoder.model = cross_encoder.model.to(device)

print("CrossEncoder device:", cross_encoder.model.device)

# Create DataLoader for training
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=4)  # Reduced batch_size

num_epochs = 1
# Warm up the learning rate over the first 10% of all training steps.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

print("Start training")
torch.cuda.empty_cache()

# Directory that fit() writes to when the evaluator reports a new best score
# (save_best_model=True).
best_model_path = "/kaggle/working/bge_finetune_24Dec"

cross_encoder.fit(
    train_dataloader=train_dataloader,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    optimizer_params={'lr': 3e-5},
    output_path=best_model_path,
    evaluator=evaluator,
    evaluation_steps=2000,  # Reduced evaluation frequency
    save_best_model=True,
    show_progress_bar=True,
    # Mixed precision only makes sense on a CUDA device; disable it on CPU.
    use_amp=torch.cuda.is_available()
)

# Persist the model as it is in memory after training. This is the final state,
# which is not necessarily the best-scoring checkpoint written to best_model_path.
# NOTE(review): confirm against the installed sentence-transformers version whether
# fit() reloads the best weights at the end.
save_path = "/kaggle/working/bge_reranker_finetune_24Dec_latest5"
cross_encoder.save(save_path)

print(
    f"Training complete. Final model saved to {save_path}; "
    f"best-scoring checkpoint (if any) is in {best_model_path}."
)
