# Installs

# Drive integration

In [None]:
# Mount Google Drive at /content/drive; the CSV and saved classifiers used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# GPU

In [None]:
import torch
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Free GPU memory

In [None]:
import gc
def free_gpu_memory() -> None:
    """Run the Python garbage collector, then ask torch to empty its CUDA cache.

    The collection happens first so objects that are no longer referenced
    are gone before the cache is emptied.
    """
    gc.collect()
    torch.cuda.empty_cache()

# Classifier Main

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
import pandas as pd

# Labelled queries; the preview below shows a `text` column and a string `label` column.
q1 = pd.read_csv('/content/drive/MyDrive/data/csv/queries.csv')

In [None]:
# Preview the first rows to check the columns that were loaded.
q1.head(5)

Unnamed: 0,text,label
0,What is my risk of developing heart disease ba...,quantitative analysis
1,Can you calculate the likelihood of having a h...,quantitative analysis
2,How does smoking increase my risk of developin...,quantitative analysis
3,"If I stop smoking, how much will it decrease m...",quantitative analysis
4,What is the estimated risk reduction in heart ...,quantitative analysis


In [None]:
# Class name -> integer id expected by the 3-way classification head.
label_mapping = {'quantitative analysis': 0, 'general information': 1, 'miscellaneous':2}

# The frame loaded above is `q1` (there is no `data` in this notebook), and the
# label tensor built later reads q1['label'], so the encoding must land there.
# .get(v, v) leaves already-encoded ids untouched, so re-running this cell is
# harmless instead of turning every label into NaN.
q1['label'] = q1['label'].map(lambda value: label_mapping.get(value, value))

# Fail loudly on typos / missing labels rather than training on NaN targets.
unknown_labels = set(q1['label'].unique()) - set(label_mapping.values())
assert not unknown_labels, f"Unmapped labels in q1['label']: {unknown_labels}"
q1['label'] = q1['label'].astype('int64')

In [None]:
# Tokenizer and classifier both come from the same pre-trained BioBERT checkpoint;
# the classifier gets a 3-way classification head.
biobert_checkpoint = 'dmis-lab/biobert-v1.1'
tokenizer = BertTokenizer.from_pretrained(biobert_checkpoint)
model = BertForSequenceClassification.from_pretrained(
    biobert_checkpoint,
    num_labels=3,
)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Garbage-collect and empty the CUDA cache before tokenizing the dataset.
free_gpu_memory()

In [None]:
# Tokenization and input formatting
def encode_data(tokenizer, texts, max_len=256):
  """Tokenize texts into fixed-length id and attention-mask tensors.

  Args:
    tokenizer: Callable tokenizer (e.g. the BertTokenizer loaded in this
      notebook) that accepts a list of strings plus padding/truncation
      options and returns a mapping with 'input_ids' and 'attention_mask'.
    texts: Iterable of strings (list, pandas Series, ...).
    max_len: Length every sequence is padded or truncated to.

  Returns:
    Tuple (input_ids, attention_masks), one row per input text. The tensors
    are returned as the tokenizer produced them and are NOT moved to any
    device here: callers move each batch to the device they need, which
    avoids depending on a notebook-level global and keeps the full dataset
    out of GPU memory.
  """
  texts = list(texts)

  # Nothing to tokenize: return correctly shaped empty tensors instead of
  # failing inside the tokenizer / torch.cat.
  if not texts:
    empty_ids = torch.empty((0, max_len), dtype=torch.long)
    return empty_ids, empty_ids.clone()

  # One batched call replaces the per-text encode_plus loop + torch.cat.
  encoded = tokenizer(
    texts,
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding='max_length',
    return_attention_mask=True,
    return_tensors='pt',
  )

  return encoded['input_ids'], encoded['attention_mask']

In [None]:
texts = q1['text']  # query strings to classify
labels = torch.tensor(q1['label'].values)  # label column as a tensor (must be numeric by this point)

# Tokenize every query once, up front.
input_ids, attention_masks = encode_data(tokenizer, texts)

In [None]:
# Data split
# Ids, masks and labels are split in ONE call so all three share the same
# shuffle. Two separate unseeded calls permute the masks independently of the
# ids, pairing each input with some other example's attention mask.
# random_state makes the 90/10 split reproducible across runs.
(
    train_inputs, validation_inputs,
    train_masks, validation_masks,
    train_labels, validation_labels,
) = train_test_split(
    input_ids, attention_masks, labels,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Training batches: 32 examples each, drawn through a RandomSampler.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=32,
)

In [None]:
# Validation batches: 32 examples each, visited in a fixed sequential order.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    validation_data,
    sampler=validation_sampler,
    batch_size=32,
)

In [None]:
# Number of full passes over the training set; also sizes the LR schedule below.
epochs = 32

In [None]:
# Model training setup
# torch.optim.AdamW replaces the deprecated transformers.AdamW.
# weight_decay=0.0 keeps the deprecated class's default (torch's own default
# is 0.01), so the optimisation itself is unchanged.
optimizer = torch.optim.AdamW(
  model.parameters(),
  lr = 2e-5,
  eps = 1e-8,
  weight_decay = 0.0,
)



In [None]:
# The schedule spans every optimizer step of the run (batches per epoch x epochs), with no warmup.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
# Move the model's weights onto the selected device before training starts.
model.to(device)
print("Model loaded successfully.")

Model loaded successfully.


In [None]:
# Define loss function
# Cross-entropy is applied to the raw logits and integer class ids in the loop below.
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
# Garbage-collect and empty the CUDA cache right before the training loop.
free_gpu_memory()

In [None]:
# Fine-tuning: each epoch runs one optimisation pass over the training batches,
# then one evaluation pass over the validation batches.
for epoch in range(epochs):
    # ---- optimisation pass ----
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        # A batch is (input ids, attention mask, labels), in TensorDataset order.
        b_input_ids, b_input_mask, b_labels = (tensor.to(device) for tensor in batch)

        model.zero_grad()

        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs.logits

        loss = loss_fn(logits, b_labels)
        total_loss += loss.item()

        # Backprop, clip the gradient norm to 1.0, then advance optimizer and LR schedule.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss}")

    # ---- evaluation pass ----
    model.eval()
    eval_loss = 0
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in validation_dataloader:
            b_input_ids, b_input_mask, b_labels = (tensor.to(device) for tensor in batch)

            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs.logits

            loss = loss_fn(logits, b_labels)
            eval_loss += loss.item()

            # Highest-scoring class per example, collected on the CPU for sklearn.
            predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            true_labels.extend(b_labels.cpu().numpy())

    eval_loss = eval_loss / len(validation_dataloader)
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels,
        predictions,
        average='macro',
    )

    print(f"Validation loss: {eval_loss}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-score: {f1}")

Average training loss: 1.0402063488960267
Validation loss: 0.9813263416290283, Accuracy: 0.4375, Precision: 0.48055555555555557, Recall: 0.5396825396825397, F1-score: 0.43636363636363634
Average training loss: 0.9161432266235352
Validation loss: 0.9095447063446045, Accuracy: 0.5625, Precision: 0.6964285714285715, Recall: 0.626984126984127, F1-score: 0.5151515151515151
Average training loss: 0.8082299828529358
Validation loss: 0.778789222240448, Accuracy: 0.5625, Precision: 0.6964285714285715, Recall: 0.626984126984127, F1-score: 0.5151515151515151
Average training loss: 0.7203036427497864
Validation loss: 0.685928225517273, Accuracy: 0.75, Precision: 0.7916666666666666, Recall: 0.7777777777777777, F1-score: 0.6999999999999998
Average training loss: 0.5869138956069946
Validation loss: 0.5788795351982117, Accuracy: 0.875, Precision: 0.875, Recall: 0.8888888888888888, F1-score: 0.8634920634920634
Average training loss: 0.5067092537879944
Validation loss: 0.4737767279148102, Accuracy: 0.87

In [None]:
# Put the fine-tuned model on the device and into eval mode before running predictions.
model.to(device)
model.eval()
print("Model loaded successfully.")

Model loaded successfully.


In [None]:
# Hand-written example queries used below to spot-check the classifier.
new_texts = [
    "How much is 5 multiplied by 10?",
    "What is the capital of France?",
    "What is the square root of 16?",
    "How many planets are there in the solar system?",
    "What is my risk for cardiovascular disease if my blood pressure goes up to 180?",
    "What causes the buildup of plaque in the arteries?",
    "What are the main causes of atherosclerosis?",
    "Is there a correlation between developing diabetes and the risk of cardiovascular disease?",
    "Will developing diabetes affect my risk of developing cardiovascular disease?",
    "Can I get tickets to the 9:00 showing of Cats?",
    "What will happen to my risk of cardiovascular disease if my blood pressure increases by 50%?"
]

In [None]:
import random
# Randomize the order of the example queries in place.
# NOTE(review): no seed is set, so the order differs on every run and on every re-run of this cell.
random.shuffle(new_texts)
new_texts

['Is there a correlation between developing diabetes and the risk of cardiovascular disease?',
 'What will happen to my risk of cardiovascular disease if my blood pressure increases by 50%?',
 'What causes the buildup of plaque in the arteries?',
 'What is my risk for cardiovascular disease if my blood pressure goes up to 180?',
 'What are the main causes of atherosclerosis?',
 'Can I get tickets to the 9:00 showing of Cats?',
 'Will developing diabetes affect my risk of developing cardiovascular disease?',
 'What is the capital of France?',
 'What is the square root of 16?',
 'How many planets are there in the solar system?',
 'How much is 5 multiplied by 10?']

In [None]:
# Class id -> class name, used to turn a predicted index back into a readable label.
label_dict = dict(enumerate([
    'quantitative analysis',
    'general information',
    'miscellaneous',
]))
label_dict

{0: 'quantitative analysis', 1: 'general information', 2: 'miscellaneous'}

In [None]:
def classify(model, tokenizer, text, label_dict):
    """Predict the label name for a single piece of text.

    Args:
        model: Sequence-classification model whose output exposes `.logits`.
        tokenizer: Tokenizer passed through to `encode_data`.
        text: The query string to classify.
        label_dict: Mapping from predicted class index to label name.

    Returns:
        The entry of `label_dict` for the highest-scoring class.

    Raises:
        KeyError: If the predicted index is not a key of `label_dict`.
    """
    # Encode the text using the provided tokenizer
    input_ids, attention_masks = encode_data(tokenizer, [text])

    # Follow the model's own device rather than the notebook-level `device`,
    # so a model that lives on a different device still gets matching inputs.
    model_device = next(model.parameters()).device
    input_ids = input_ids.to(model_device)
    attention_masks = attention_masks.to(model_device)

    # Predict in eval mode so the result does not depend on whether the caller
    # remembered model.eval(); the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks)
    finally:
        model.train(was_training)

    # Extract the predicted index and map to label name
    prediction_idx = torch.argmax(outputs.logits, dim=1).item()
    return label_dict[prediction_idx]

In [None]:
# Inference: print each example query next to the label the classifier assigns it.
for new_text in new_texts:
    predicted_label = classify(model, tokenizer, new_text, label_dict)
    print(new_text, predicted_label)

Is there a correlation between developing diabetes and the risk of cardiovascular disease? general information
What will happen to my risk of cardiovascular disease if my blood pressure increases by 50%? quantitative analysis
What causes the buildup of plaque in the arteries? general information
What is my risk for cardiovascular disease if my blood pressure goes up to 180? quantitative analysis
What are the main causes of atherosclerosis? general information
Can I get tickets to the 9:00 showing of Cats? miscellaneous
Will developing diabetes affect my risk of developing cardiovascular disease? quantitative analysis
What is the capital of France? miscellaneous
What is the square root of 16? miscellaneous
How many planets are there in the solar system? miscellaneous
How much is 5 multiplied by 10? miscellaneous


In [None]:
# Persist the fine-tuned classifier to Drive.
# NOTE(review): only the model is saved here; the tokenizer is not written alongside it.
model.save_pretrained('/content/drive/MyDrive/classifiers/v2')

In [None]:
# Reload the checkpoint saved to Drive, then place it on the active device in eval mode.
classifier_v2 = BertForSequenceClassification.from_pretrained(
    '/content/drive/MyDrive/classifiers/v2'
)
classifier_v2 = classifier_v2.to(device).eval()
print("Classifier loaded successfully.")

Classifier loaded successfully.
