# Preamble

## Drive integration

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset and saved
# classifiers under /content/drive are reachable.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

## GPU

In [None]:
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Free GPU memory

In [None]:
import gc


def free_gpu_memory():
    """Run the Python garbage collector, then empty PyTorch's CUDA cache.

    Collecting garbage first drops unreachable objects before the cached
    CUDA memory is released.
    """
    gc.collect()
    torch.cuda.empty_cache()

## Imports

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import pandas as pd

# Classifier Main

In [None]:
# Load the labelled queries; later cells read the 'text' and 'label' columns.
queries_csv_path = '/content/drive/MyDrive/data/csv/queries.csv'
q1 = pd.read_csv(queries_csv_path)

In [None]:
# Preview the first five rows (head() defaults to n=5).
q1.head()

In [None]:
# Class name -> integer id used as the training target.
label_mapping = {'quantitative analysis': 0, 'general information': 1, 'miscellaneous': 2}

# Encode only while the column still holds raw class names, so re-running this
# cell does not map already-encoded ids to NaN.
if not q1['label'].isin(list(label_mapping.values())).all():
    encoded_labels = q1['label'].map(label_mapping)
    unmapped_rows = encoded_labels.isna()
    if unmapped_rows.any():
        # Fail loudly: NaN targets would otherwise become a float label tensor.
        unknown_labels = sorted(q1.loc[unmapped_rows, 'label'].astype(str).unique())
        raise ValueError(f"Labels missing from label_mapping: {unknown_labels}")
    q1['label'] = encoded_labels.astype('int64')

In [None]:
# Load the pre-trained BioBERT checkpoint: its tokenizer plus a
# sequence-classification model with three output labels.
biobert_checkpoint = 'dmis-lab/biobert-v1.1'
tokenizer = BertTokenizer.from_pretrained(biobert_checkpoint)
model = BertForSequenceClassification.from_pretrained(
    biobert_checkpoint,
    num_labels=3,
)
# Move the model to the selected device (kept as the cell's last expression).
model.to(device)

In [None]:
# Tokenize every query in a single batch, with padding enabled and
# truncation at 256 tokens, returning PyTorch tensors.
query_texts = q1['text'].tolist()
encoded_batch = tokenizer(
    query_texts,
    return_tensors="pt",
    max_length=256,
    truncation=True,
    padding=True,
)

In [None]:
# Pull the model inputs out of the tokenizer output and turn the label
# column into a tensor.
input_ids, attention_masks = (
    encoded_batch['input_ids'],
    encoded_batch['attention_mask'],
)
label_values = q1['label'].values
labels = torch.tensor(label_values)

In [None]:
# Data split
# Split ids, masks and labels in ONE call so every row keeps its own mask and
# label. Separate unseeded calls shuffle independently and misalign them.
# The fixed seed makes the 90/10 stratified split reproducible across runs.
split_seed = 42
(
    train_inputs, val_inputs,
    train_masks, val_masks,
    train_labels, val_labels,
) = train_test_split(
    input_ids, attention_masks, labels,
    test_size=0.1, stratify=labels, random_state=split_seed,
)

In [None]:
# Wrap the split tensors as (input_ids, attention_mask, label) datasets.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)

# Loaders: random order for training, fixed order for validation.
loader_batch_size = 32
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=loader_batch_size)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=loader_batch_size)

In [None]:
# Fine-tuning configuration for the Trainer.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Evaluate and checkpoint on the same per-epoch schedule, and reload the
    # best checkpoint when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Batch sizes and training length.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
)

In [None]:
def collate_tensor_triples(batch):
    """Collate ``(input_ids, attention_mask, label)`` tuples into a model-input dict.

    ``TensorDataset`` yields plain tuples, while ``Trainer`` feeds each collated
    batch to the model as keyword arguments. This stacks every column and names
    it the way ``BertForSequenceClassification.forward`` expects.

    Args:
        batch: list of ``(input_ids, attention_mask, label)`` tensor tuples.

    Returns:
        dict with stacked ``input_ids``, ``attention_mask`` and ``labels`` tensors.
    """
    input_ids, attention_mask, labels = (torch.stack(column) for column in zip(*batch))
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    # Explicit collator: the tokenizer-based default cannot handle tuple samples.
    data_collator=collate_tensor_triples,
)

In [None]:
# Release unreferenced objects and cached CUDA memory before training, via the
# notebook's free_gpu_memory helper (gc.collect() + torch.cuda.empty_cache()).
free_gpu_memory()

In [None]:
# Run fine-tuning with the Trainer configured earlier in the notebook.
trainer.train()

In [None]:
# Switch the fine-tuned model to evaluation mode ahead of the inference cells.
model.eval()
print("Model loaded successfully.")

In [None]:
# Define batch classification function
def classify_batch(model, tokenizer, texts, label_dict, max_length=256):
    """Classify a batch of texts and return their human-readable labels.

    Args:
        model: sequence-classification model whose output exposes ``.logits``.
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        texts: iterable of strings to classify.
        label_dict: mapping from predicted class id to label name.
        max_length: truncation length in tokens; the default matches the
            value used when encoding the training data.

    Returns:
        List of label names, one per input text, in input order.
    """
    texts = list(texts)
    if not texts:
        # Nothing to classify; avoid calling the tokenizer on an empty batch.
        return []

    # Tokenize here with the same settings as the training data, instead of
    # relying on an `encode_data` helper that the notebook never defines.
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    # Send inputs to wherever the model lives, so the call also works when the
    # model is not on the notebook-level `device`.
    model_device = next(model.parameters()).device
    input_ids = encoded["input_ids"].to(model_device)
    attention_masks = encoded["attention_mask"].to(model_device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)
    predictions = torch.argmax(outputs.logits, dim=1).cpu().tolist()
    return [label_dict[p] for p in predictions]

In [None]:
# Inference
# Class id -> label name; ids are the positions in this tuple.
class_names = ('quantitative analysis', 'general information', 'miscellaneous')
label_dict = dict(enumerate(class_names))

In [None]:
# Sample queries for a quick manual check of the classifier: arithmetic and
# general-knowledge questions, cardiovascular-health questions, and one
# unrelated request.
new_texts = [
    "How much is 5 multiplied by 10?",
    "What is the capital of France?",
    "What is the square root of 16?",
    "How many planets are there in the solar system?",
    "What is my risk for cardiovascular disease if my blood pressure goes up to 180?",
    "What causes the buildup of plaque in the arteries?",
    "What are the main causes of atherosclerosis?",
    "Is there a correlation between developing diabetes and the risk of cardiovascular disease?",
    "Will developing diabetes affect my risk of developing cardiovascular disease?",
    "Can I get tickets to the 9:00 showing of Cats?",
    "What will happen to my risk of cardiovascular disease if my blood pressure increases by 50%?"
]

In [None]:
# Classify the sample queries and print each one next to its predicted label.
predictions = classify_batch(model, tokenizer, new_texts, label_dict)
for query, predicted_label in zip(new_texts, predictions):
    print(query, "->", predicted_label)

## Save model

In [None]:
# Save the fine-tuned classifier to Drive.
classifier_save_dir = '/content/drive/MyDrive/classifiers/v1'
model.save_pretrained(classifier_save_dir)

In [None]:
# Reload the saved classifier from Drive, move it to the selected device and
# put it in evaluation mode.
saved_classifier_dir = '/content/drive/MyDrive/classifiers/v1'
classifier_v1 = BertForSequenceClassification.from_pretrained(saved_classifier_dir)
classifier_v1 = classifier_v1.to(device).eval()
print("Classifier loaded successfully.")