# **Import data**

In [38]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Sampler
from transformers import BertTokenizer, BertModel, get_scheduler, get_linear_schedule_with_warmup
from torch.optim import AdamW
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils import resample


import random, os
from google.colab import drive

In [2]:
!pip install huggingface_hub[hf_xet]

Collecting hf-xet>=0.1.4 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.1.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading hf_xet-1.1.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hf-xet
Successfully installed hf-xet-1.1.0


In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
torch.cuda.empty_cache()

In [6]:
DATA_PATH = '/content/drive/MyDrive/DevWorkshop_data'

TRAIN_PATH = DATA_PATH + '/train.csv'
TEST_PATH = DATA_PATH + '/test.csv'

In [7]:
df_train = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)

# **Preprocess data**

## **Check data**

In [8]:
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ["PYTHONHASHSEED"] = str(seed)

In [9]:
df_train.head()

Unnamed: 0,topic_id,PID,relevance,title,abstract
0,CD007431,7072537,0,Lumbar spondylolisthesis. Clinical syndrome an...,"The paper gives a survey, based on literature ..."
1,CD007431,8748845,0,The C-reactive protein for detection of early ...,The tendency for short hospitalization after l...
2,CD007431,3819738,0,Pain in sciatica depresses lower limb nocicept...,The inhibitory effects of acute pain produced ...
3,CD007431,7941692,0,[Satisfaction following automated percutaneous...,182 patients assessed their condition after au...
4,CD007431,16261104,0,Adjacent segment degeneration at T1-T2 present...,A case report of a T1-T2 herniated disc adjace...


Amount of unique topics - train

In [39]:
uq_topics = df_train['topic_id'].unique()

len(uq_topics)

99

Amount of unique topics - test

In [40]:
uq_topics_test = df_test['topic_id'].unique()

len(uq_topics_test)

28

Topics from train that are also in test

In [41]:
[x for x in uq_topics if x in uq_topics_test]

['CD011686', 'CD011571', 'CD012164']

So for each topic there is need to create train_test split

Topics with most positive 'relevance':

In [42]:
df_train.groupby('topic_id').sum('relevance').sort_values(by='relevance', ascending=False)

Unnamed: 0_level_0,PID,relevance
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
CD011975,67435351855,414
CD010213,158206893401,402
CD012599,66093127885,402
CD009925,43777216611,314
CD011984,67413914183,307
...,...,...
CD006715,22770758,1
CD011549,78868246171,1
CD011571,99935044,1
CD007868,83228784,0


In [43]:
len(df_train.groupby('topic_id').sum('relevance')[df_train.groupby('topic_id').sum('relevance')['relevance'] >= 70])

23

In [10]:
top_topics = df_train.groupby('topic_id').sum('relevance')[
                df_train.groupby('topic_id').sum('relevance')['relevance'] >= 70
                ].sort_values(by='relevance', ascending=False).index

### Find the most balanced topics

In [11]:
# df_train \
#   .groupby('topic_id') \
#   .sum('relevance') \
#   .sort_values(by='relevance', ascending=False) \
#   .iloc[0:20]

balanced_topics = df_train \
  .groupby('topic_id')['relevance'] \
  .agg(
      rel_count=lambda x: (x == 1).sum(),
      not_rel_count=lambda x: (x == 0).sum(),
      count='count'
  ) \
  .sort_values(by='rel_count', ascending=False) \
  .iloc[0:15]

balanced_topics['balance_score'] = abs(balanced_topics['rel_count'] - balanced_topics['not_rel_count'])

balanced_topics.sort_values(by='balance_score', ascending=True)


Unnamed: 0_level_0,rel_count,not_rel_count,count,balance_score
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CD011431,184,661,845,477
CD008122,133,847,980,714
CD011134,141,1074,1215,933
CD008054,113,1823,1936,1710
CD010502,166,1975,2141,1809
CD009925,314,4113,4427,3799
CD012010,209,4697,4906,4488
CD012599,402,5165,5567,4763
CD011975,414,5214,5628,4800
CD011984,307,5320,5627,5013


## **Dataset and DataLoader**

In [18]:
def split_Xy(topic, df=df_train):
  df_topic = df[df['topic_id'] == topic]
  df_topic.dropna(inplace=True)
  df_topic.reset_index(drop=True, inplace=True)

  X = df_topic.drop(columns=['relevance', 'topic_id', 'PID'], axis=1) # we leave only topic and abstract
  y = df_topic['relevance']

  return X, y

Now checking optimal max_length for this dataset

In [37]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [36]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def calculate_max_length(X):
  title_lens = [len(tokenizer.tokenize(t)) for t in X.iloc[:, 0]]
  abstract_lens = [len(tokenizer.tokenize(a)) for a in X.iloc[:, 1]]

  print("Title stats:", np.percentile(title_lens, [50, 75, 90, 95]))
  print("Abstract stats:", np.percentile(abstract_lens, [50, 75, 90, 95]))
  print("Total stats:", np.percentile(title_lens + abstract_lens, [50, 75, 90, 95]))


for topic in top_topics[:5]:
  print(topic)
  X, y = split_Xy(topic)
  calculate_max_length(X)
  print()

CD011975


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_topic.dropna(inplace=True)


Title stats: [21. 26. 33. 38.]
Abstract stats: [151.  271.  353.1 403. ]
Total stats: [ 32.   151.   296.   353.05]

CD010213


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_topic.dropna(inplace=True)


Title stats: [22. 28. 34. 39.]
Abstract stats: [145. 252. 349. 413.]
Total stats: [ 36. 145. 280. 349.]

CD012599


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_topic.dropna(inplace=True)


Title stats: [21. 26. 33. 38.]
Abstract stats: [154.   271.75 354.3  403.15]
Total stats: [ 32.   154.   297.   354.25]

CD009925


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_topic.dropna(inplace=True)


Title stats: [20. 26. 33. 38.]
Abstract stats: [179. 285. 364. 417.]
Total stats: [ 33. 179. 306. 364.]

CD011984


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_topic.dropna(inplace=True)


Title stats: [21. 26. 33. 38.]
Abstract stats: [151.  271.  353.2 403. ]
Total stats: [ 32.   151.   296.   353.15]



In [44]:
class TextDataset(Dataset):
    def __init__(self, X, y, tokenizer=BertTokenizer.from_pretrained('bert-base-uncased'), max_length=350):
        self.tokenizer = tokenizer
        self.max_length = max_length

        topics = X['title'].astype(str).tolist()
        abstracts = X['abstract'].astype(str).tolist()

        encodings = self.tokenizer(
            topics,
            abstracts,
            truncation=True,
            return_overflowing_tokens=False,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        self.input_ids = encodings['input_ids']
        self.attention_mask = encodings['attention_mask']
        self.token_type_ids = encodings['token_type_ids']
        self.y = torch.tensor(y.values, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'token_type_ids': self.token_type_ids[idx],
            'labels': self.y[idx]
        }


In [45]:
def preprocess(topic, df=df_train):
  set_seed(42)

  df_majority = df[df['relevance'] == 0]
  df_minority = df[df['relevance'] == 1]

  X_maj, y_maj = split_Xy(topic, df_majority)
  X_min, y_min = split_Xy(topic, df_minority)

  X_rsmpld, y_resmpld = resample(
    X_maj,
    y_maj,
    replace=False,
    n_samples=len(y_min),
    random_state=42
  )

  X_balanced = pd.concat([X_rsmpld, X_min])
  y_balanced = pd.concat([y_resmpld, y_min])

  X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, stratify=y_balanced, test_size=0.25, random_state=42)

  train_dataset = TextDataset(X_train, y_train)
  test_dataset = TextDataset(X_test, y_test)

  train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
  test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

  return train_dataset, test_dataset, train_loader, test_loader

# **Model**

In [46]:
class BERTClassifier(nn.Module):
    def __init__(self, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        x = outputs.pooler_output
        x = self.dropout(x)
        x = self.fc(x)

        return x.squeeze(-1)

# **Training**

In [47]:
NUM_OF_EPOCHS = 10

In [48]:
def prepare_for_training(train_dataset, train_loader, lr=2e-5):
  set_seed(42)

  model = BERTClassifier()
  optimizer = AdamW(model.parameters(), lr=lr)
  loss_fn = nn.BCEWithLogitsLoss()  # For binary classification

  total_steps = len(train_loader) * NUM_OF_EPOCHS
  scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=int(0.1 * total_steps),
      num_training_steps=total_steps
  )

  y_train = train_dataset.y
  num_pos = (y_train == 1).sum()
  num_neg = (y_train == 0).sum()
  pos_weight = torch.tensor([num_neg / num_pos], dtype=torch.float32)

  loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model = model.to(device)
  loss_fn = loss_fn.to(device)

  return model, optimizer, loss_fn, scheduler, device

In [49]:
def train(model, optimizer, loss_fn, scheduler, device, train_loader, num_epochs=NUM_OF_EPOCHS):
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask, token_type_ids)
            loss = loss_fn(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f}")

    return model

# **Evaluation**

In [50]:
def evaluate(model, test_loader, device, threshold=0.5):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask, token_type_ids)
            probs = torch.sigmoid(outputs)  # [batch_size]

            all_preds.append(probs.cpu())
            all_labels.append(labels.cpu())

    all_preds = torch.cat(all_preds)
    all_labels = torch.cat(all_labels)

    # # Binary predictions
    # preds_binary = (all_preds >= threshold).int()

    # # Metrics
    # accuracy = accuracy_score(all_labels, preds_binary)
    # f1 = f1_score(all_labels, preds_binary)
    # precision = precision_score(all_labels, preds_binary)
    # recall = recall_score(all_labels, preds_binary)

    thresholds = [0.1 * i for i in range(1, 10)]
    for t in thresholds:
        preds_binary = (all_preds >= t).int()
        f1 = f1_score(all_labels, preds_binary)
        precision = precision_score(all_labels, preds_binary, zero_division=0)
        recall = recall_score(all_labels, preds_binary, zero_division=0)
        accuracy = accuracy_score(all_labels, preds_binary)
        print(f"Threshold: {t:.1f} | F1: {f1:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | Accuracy: {accuracy:.4f}")


    # print({
    #     "accuracy": accuracy,
    #     "f1_score": f1,
    #     "precision": precision,
    #     "recall": recall
    # })

# **Whole pipeline**

In [56]:
def train_and_evaluate(topic, df=df_train):
  set_seed(42)

  train_dataset, test_dataset, train_loader, test_loader = preprocess(topic, df)

  model, optimizer, loss_fn, scheduler, device = prepare_for_training(train_dataset, train_loader)

  model = train(model, optimizer, loss_fn, scheduler, device, train_loader)

  yield model, test_loader, device

  save_dir = '/content/drive/MyDrive/DevWorkshop_data/models'
  os.makedirs(save_dir, exist_ok=True)

  save_path = os.path.join(save_dir, f"bert_classifier_{topic}_balanced.pt")
  torch.save(model.state_dict(), save_path)
  print(f"Model saved to: {save_path}")

  evaluate(model, test_loader, device)

  return model, test_loader, device

# **TESTS**

In [52]:
top_topics[0]

'CD011975'

In [53]:
# IGNORE ERRORS
import transformers
transformers.logging.set_verbosity_error()

In [54]:
model, test_loader, device = train_and_evaluate(top_topics[0])

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 36/36 [00:32<00:00,  1.09it/s]


Epoch 1 | Loss: 0.6921


Epoch 2: 100%|██████████| 36/36 [00:33<00:00,  1.08it/s]


Epoch 2 | Loss: 0.4994


Epoch 3: 100%|██████████| 36/36 [00:34<00:00,  1.04it/s]


Epoch 3 | Loss: 0.3651


Epoch 4: 100%|██████████| 36/36 [00:34<00:00,  1.04it/s]


Epoch 4 | Loss: 0.2714


Epoch 5: 100%|██████████| 36/36 [00:34<00:00,  1.05it/s]


Epoch 5 | Loss: 0.1739


Epoch 6: 100%|██████████| 36/36 [00:34<00:00,  1.04it/s]


Epoch 6 | Loss: 0.0848


Epoch 7: 100%|██████████| 36/36 [00:34<00:00,  1.05it/s]


Epoch 7 | Loss: 0.0578


Epoch 8: 100%|██████████| 36/36 [00:34<00:00,  1.05it/s]


Epoch 8 | Loss: 0.0439


Epoch 9: 100%|██████████| 36/36 [00:34<00:00,  1.05it/s]


Epoch 9 | Loss: 0.0388


Epoch 10: 100%|██████████| 36/36 [00:34<00:00,  1.04it/s]


Epoch 10 | Loss: 0.0363
Model saved to: /content/drive/MyDrive/DevWorkshop_data/models/bert_classifier_CD011975_balanced.pt


Evaluating: 100%|██████████| 12/12 [00:03<00:00,  3.11it/s]

Threshold: 0.1 | F1: 0.8148 | Precision: 0.7273 | Recall: 0.9263 | Accuracy: 0.7895
Threshold: 0.2 | F1: 0.8095 | Precision: 0.7391 | Recall: 0.8947 | Accuracy: 0.7895
Threshold: 0.3 | F1: 0.8195 | Precision: 0.7636 | Recall: 0.8842 | Accuracy: 0.8053
Threshold: 0.4 | F1: 0.8137 | Precision: 0.7615 | Recall: 0.8737 | Accuracy: 0.8000
Threshold: 0.5 | F1: 0.8020 | Precision: 0.7570 | Recall: 0.8526 | Accuracy: 0.7895
Threshold: 0.6 | F1: 0.7900 | Precision: 0.7524 | Recall: 0.8316 | Accuracy: 0.7789
Threshold: 0.7 | F1: 0.8061 | Precision: 0.7822 | Recall: 0.8316 | Accuracy: 0.8000
Threshold: 0.8 | F1: 0.7979 | Precision: 0.7857 | Recall: 0.8105 | Accuracy: 0.7947
Threshold: 0.9 | F1: 0.7937 | Precision: 0.7979 | Recall: 0.7895 | Accuracy: 0.7947





ValueError: not enough values to unpack (expected 3, got 1)

In [55]:
#load model
train_dataset, test_dataset, train_loader, test_loader = preprocess(top_topics[0])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERTClassifier()
model.load_state_dict(torch.load(
    '/content/drive/MyDrive/DevWorkshop_data/models/bert_classifier_CD011975_balanced.pt',
    map_location=torch.device('cpu')
))
model = model.to(device)

evaluate(model, test_loader, device)

Evaluating: 100%|██████████| 12/12 [00:03<00:00,  3.12it/s]

Threshold: 0.1 | F1: 0.8148 | Precision: 0.7273 | Recall: 0.9263 | Accuracy: 0.7895
Threshold: 0.2 | F1: 0.8095 | Precision: 0.7391 | Recall: 0.8947 | Accuracy: 0.7895
Threshold: 0.3 | F1: 0.8195 | Precision: 0.7636 | Recall: 0.8842 | Accuracy: 0.8053
Threshold: 0.4 | F1: 0.8137 | Precision: 0.7615 | Recall: 0.8737 | Accuracy: 0.8000
Threshold: 0.5 | F1: 0.8020 | Precision: 0.7570 | Recall: 0.8526 | Accuracy: 0.7895
Threshold: 0.6 | F1: 0.7900 | Precision: 0.7524 | Recall: 0.8316 | Accuracy: 0.7789
Threshold: 0.7 | F1: 0.8061 | Precision: 0.7822 | Recall: 0.8316 | Accuracy: 0.8000
Threshold: 0.8 | F1: 0.7979 | Precision: 0.7857 | Recall: 0.8105 | Accuracy: 0.7947
Threshold: 0.9 | F1: 0.7937 | Precision: 0.7979 | Recall: 0.7895 | Accuracy: 0.7947



