# Preamble

## Drive integration

In [None]:
# Mount Google Drive at /content/drive; the dataset loaded further down is read
# from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

## GPU

In [None]:
import torch
# Use the GPU when CUDA is usable in this runtime; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Free GPU memory

In [None]:
import gc
def free_gpu_memory() -> None:
    """Reclaim unreferenced Python objects, then release PyTorch's cached CUDA memory.

    The garbage collector runs first so that tensors which are no longer
    referenced are freed before the CUDA cache is emptied.
    """
    gc.collect()
    torch.cuda.empty_cache()

## Imports

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import pandas as pd

# Classifier Main

In [None]:
# Load the query dataset from the mounted Drive. The cells below read its
# 'text' column (tokenizer input) and its 'label' column (class name).
q1 = pd.read_csv('/content/drive/MyDrive/data/csv/queries.csv')

In [None]:
# Preview the first five rows (five is the default for head()).
q1.head()

In [None]:
# Encode the class names in the 'label' column as integer class ids.
label_mapping = {'quantitative analysis': 0, 'general information': 1, 'miscellaneous':2}
encoded_labels = q1['label'].map(label_mapping)

# Series.map() silently yields NaN for every value that is not a key of the
# dict (typos, missing values, or this cell being re-run on labels that are
# already encoded). Fail here instead of building a float/NaN label tensor
# that only breaks much later.
unknown_labels = q1.loc[encoded_labels.isna(), 'label'].unique()
if len(unknown_labels) > 0:
    raise ValueError(
        f"Unmapped values in 'label' column: {unknown_labels.tolist()!r}; "
        f"expected one of {list(label_mapping)!r} "
        "(was this cell already run on this DataFrame?)"
    )

# All values are mapped at this point, so the column is safely integer-typed.
q1['label'] = encoded_labels.astype('int64')

In [None]:
# Load the pre-trained BioBERT tokenizer and a 3-class sequence-classification
# model from the same checkpoint, then move the model to the selected device.
checkpoint = 'dmis-lab/biobert-v1.1'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
model.to(device)

In [None]:
# Tokenize every query in a single batch: pad to a common length, truncate at
# 256 tokens and return PyTorch tensors.
query_texts = list(q1['text'])
encoded_batch = tokenizer(
    query_texts, padding=True, truncation=True, max_length=256, return_tensors="pt",
)

In [None]:
# Pull the token ids and attention masks out of the tokenizer output, and
# turn the encoded label column into a tensor.
input_ids, attention_masks = (
    encoded_batch[key] for key in ('input_ids', 'attention_mask')
)
labels = torch.tensor(q1['label'].values)

In [None]:
# Data split (90% train / 10% validation, stratified by label).
#
# Token ids, attention masks and labels are split in ONE call so that all
# three share the same shuffled row order. Two separate calls without a fixed
# random_state shuffle independently, which pairs each sequence with the
# attention mask of a different sample.
(
    train_inputs, val_inputs,
    train_masks, val_masks,
    train_labels, val_labels,
) = train_test_split(
    input_ids, attention_masks, labels,
    test_size = 0.1, stratify = labels
)

In [None]:
# Bundle (token ids, attention mask, label) triples into datasets.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)

# Training batches are drawn in random order; validation batches keep the
# dataset order. Both loaders use batches of 32.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=32,
)
val_dataloader = DataLoader(
    val_data,
    sampler=val_sampler,
    batch_size=32,
)