# Finetuning the model

In this notebook, I try finetuning the embedding model with two objectives: masked language modeling (MLM) and a contrastive objective.

## Setup

In [None]:
#!pip install transformers faiss-gpu faiss-cpu torch
#!pip install tira ir-datasets python-terrier
#!pip install sentence-transformers

In [None]:
import os
import json

import numpy as np
import pandas as pd
import torch
import pyterrier as pt
import faiss

# Encoder and Tokenizer models
from transformers import AutoTokenizer

# Tira and Pyterrier Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded
from tira.third_party_integrations import ir_datasets
from tira.rest_api_client import Client

In [None]:
# Initialise PyTerrier before any `pt.*` call is made further down.
# NOTE(review): presumably this starts PyTerrier's backend if it is not yet
# running — confirm against the TIRA integration docs.
ensure_pyterrier_is_loaded()
# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
tira = Client()

# Pandas display settings: lift the limits on column width, column count,
# line width and row count, and fix how floats are rendered.
_display_options = {
    "display.max_colwidth": None,
    "display.max_columns": None,
    "display.width": None,
    "display.precision": 4,
    "display.max_rows": None,
    # Floats are formatted with five decimal places.
    "display.float_format": "{:.5f}".format,
}
for _option, _setting in _display_options.items():
    pd.set_option(_option, _setting)


# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"device: {device}")

## Dataset and Text Corpus

In [None]:
from modules.data import load_corpus, train_test_split, relevant_corpus

# Location of the corpus file handed to `load_corpus`.
corpus_path = "./dataset_corpus.json"
corpus = load_corpus(corpus_path) # TODO: corpus preprocessing?

# For testing on smaller corpus
# NOTE(review): `relevant_corpus` presumably keeps only the documents that are
# relevant for the training dataset — verify in modules.data.
dataset = pt.get_dataset("irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training")
corpus = relevant_corpus(corpus, dataset)

# Split the (reduced) corpus into training and validation texts and report the sizes.
train_texts, val_texts = train_test_split(corpus)
print(f"{len(train_texts)} training samples, {len(val_texts)} val samples.")

## MLM Finetuning

In [None]:
from modules.model import FTModel
from modules.dataset import get_dataloader
from modules.train import train


# Base checkpoint to finetune and the training objective.
# NOTE(review): `mode` is passed to FTModel, get_dataloader and train alike;
# "mlm" presumably selects masked language modeling — confirm in modules.
model_name = "prajjwal1/bert-tiny"
mode = "mlm"

# Tokenizer and model are both built from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = FTModel(model_name, mode)

# Training batches are shuffled; validation batches are not.
train_dataloader = get_dataloader(tokenizer, train_texts, mode, batch_size=4, shuffle=True)
val_dataloader = get_dataloader(tokenizer, val_texts, mode, batch_size=4, shuffle=False)

print("Starting Training")
# Run 3 epochs with learning rate 2e-5; the result is kept in `trained_model`.
trained_model = train(model, train_dataloader, val_dataloader, epochs=3, lr=2e-5, mode=mode)

In [None]:
# Name passed to `save_pretrained` for both the encoder and the tokenizer.
new_name = "bert-tiny-ft-mlm-ep3"

# NOTE(review): `trained_model.model.model` presumably unwraps the FTModel
# wrapper and its task head down to the bare encoder — verify in modules.model.
encoder = trained_model.model.model
# Encoder and tokenizer are saved under the same name.
encoder.save_pretrained(new_name)
tokenizer.save_pretrained(new_name)

## Contrastive Finetuning

In [None]:
# The class is spelled `FTModel` (as imported and used in the MLM cell and
# instantiated below); the former `FtModel` import did not match that name.
from modules.model import FTModel
from modules.dataset import get_dataloader
from modules.train import train

# Base checkpoint to finetune and the training objective.
# NOTE(review): `mode` is passed to FTModel, get_dataloader and train alike;
# "contrastive" presumably selects a contrastive loss — confirm in modules.
model_name = "prajjwal1/bert-tiny"
mode = "contrastive"

# Start again from the base checkpoint rather than from the MLM-finetuned model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = FTModel(model_name, mode)

# NOTE(review): the validation loader is shuffled here, whereas the MLM cell
# uses shuffle=False for validation — confirm this difference is intentional.
train_dataloader = get_dataloader(tokenizer, train_texts, mode, batch_size=4, shuffle=True)
val_dataloader = get_dataloader(tokenizer, val_texts, mode, batch_size=4, shuffle=True)

print("Starting Training")
# Run 1 epoch with learning rate 2e-5.
# NOTE(review): this rebinds `trained_model`; the contrastive model is not
# saved in this cell.
trained_model = train(model, train_dataloader, val_dataloader, epochs=1, lr=2e-5, mode=mode)

## (Colab) Save the model to Google Drive

In [None]:
# Colab only: mount Google Drive at /content/drive so files can be copied there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from shutil import copyfile, copytree

# `quelle` (source) is the saved model directory under /content;
# `ziel` (destination) is a directory of the same name inside the models
# folder of the mounted Drive.
quelle = f"/content/{new_name}"
ziel = os.path.join("/content/drive/My Drive/models", os.path.basename(quelle))
# dirs_exist_ok=True lets this cell be re-run after the model was already
# copied once; without it copytree raises FileExistsError on the second run.
copytree(quelle, ziel, dirs_exist_ok=True)