## Training a cross-encoder for trec-covid ranking 

This notebook trains a ranking model on synthetic training data generated by a large language model.


In [None]:
!pip3 install --upgrade pandas requests sentence-transformers transformers pyarrow

In [None]:
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers import evaluation
import os
import pandas as pd
import torch
from torch.utils.data import DataLoader

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

In [None]:
# URL of the parquet file holding the training data.
train_data = (
  "https://data.vespa-cloud.com/sample-apps-data/"
  "trec_covid_train_data_k1.parquet"
)

Data file pre-processing

In [None]:
# Read the parquet file referenced by `train_data` into a DataFrame.
df = pd.read_parquet(train_data)

In [None]:
# Add a numeric `gain` column: 1 when `relevant` is truthy, 0 otherwise.
df['gain'] = [1 if is_relevant else 0 for is_relevant in df['relevant']]

In [None]:
def replace_none(text):
  """Return ``text`` unchanged, or an empty string when the value is missing.

  A value counts as missing when it is ``None`` or a float NaN. Every
  other value, including the empty string, is returned as is.

  Args:
    text: The value to normalise, typically a string or ``None``.

  Returns:
    ``''`` for a missing value, otherwise ``text`` itself.
  """
  # Identity check: `== None` would dispatch to the operand's __eq__.
  if text is None:
    return ''
  # NaN is the only float that does not compare equal to itself.
  if isinstance(text, float) and text != text:
    return ''
  return text

In [None]:
# Show the DataFrame as the cell output for a quick visual check.
df

In [None]:
# Build one InputExample per row: the query paired with "<title> <abstract>"
# (missing fields become empty strings), labelled with the row's gain as a float.
train_samples = [
  InputExample(
    texts=[
      record['query'],
      replace_none(record['title']) + ' ' + replace_none(record['abstract']),
    ],
    label=float(record['gain']),
  )
  for _, record in df.iterrows()
]
# Shuffled batches of 64; the final incomplete batch is dropped.
train_dataloader = DataLoader(
  train_samples,
  batch_size=64,
  shuffle=True,
  drop_last=True,
)

## Train cross-encoder 
Define the model and training parameters. Notice the number of labels is one. 

In [None]:
# Model checkpoint to fine-tune and the parameters handed to CrossEncoder.
model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
num_labels = 1
max_length = 256

# Training hyper-parameters.
num_epochs = 2
warmup_steps = 96
lr = 4e-7

# Identity activation: the model's output is returned without a transformation.
model = CrossEncoder(
  model_name,
  num_labels=num_labels,
  max_length=max_length,
  default_activation_function=torch.nn.Identity(),
  device=device,
)

In [None]:
# Fine-tune the cross-encoder on the training batches.
# `warmup_steps` is passed explicitly; without it the value defined alongside
# the other hyper-parameters is never used and fit() applies its own default.
model.fit(
  train_dataloader=train_dataloader,
  epochs=num_epochs,
  warmup_steps=warmup_steps,
  optimizer_params={'lr': lr},
)
# Persist the fine-tuned model to the local "model" directory.
model.save("model")

Training is done. Now we upload the model weights and the tokenizer to the Hugging Face Hub.

In [None]:
# Hugging Face access token used to upload the model.
# Read from the HF_TOKEN environment variable so the secret is not stored in
# the notebook; falls back to the placeholder value when the variable is unset.
token = os.environ.get('HF_TOKEN', 'HF_TOKEN_KEY')

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

In [None]:
# Load the model from the local ./model/ directory as a sequence-classification model.
automodel = AutoModelForSequenceClassification.from_pretrained("./model/")

In [None]:
# Load the tokenizer from the same local ./model/ directory.
autotokenizer = AutoTokenizer.from_pretrained("./model/")

In [None]:
# Repository name passed to push_to_hub for both the model and the tokenizer.
name = "trec_covid_synthetic"

In [None]:
# Upload the model weights; `token` replaces the deprecated `use_auth_token` keyword.
automodel.push_to_hub(name, token=token)

In [None]:
# Upload the tokenizer files; `token` replaces the deprecated `use_auth_token` keyword.
autotokenizer.push_to_hub(name, token=token)