# **Import data**

In [1]:
# !apt-get update
# !apt-get install -y build-essential python3-dev
# !pip install captum==0.4.0 torch   torchtext==0.6.0 --no-cache-dir

In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Sampler
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, get_scheduler, get_linear_schedule_with_warmup
from torch.optim import AdamW
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils import resample


import random, os
# from google.colab import drive

from collections import Counter

In [2]:
# !pip install huggingface_hub[hf_xet]

In [3]:
drive.mount('/content/drive')

NameError: name 'drive' is not defined

In [5]:
torch.cuda.empty_cache()

In [7]:
DATA_PATH = 'data'

TRAIN_PATH = DATA_PATH + '/train.csv'
TEST_PATH = DATA_PATH + '/test.csv'

In [8]:
df_train = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)

# **Preprocess data**

## **Check data**

In [4]:
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ["PYTHONHASHSEED"] = str(seed)

In [9]:
df_train.head()

Unnamed: 0,topic_id,PID,relevance,title,abstract
0,CD007431,7072537,0,Lumbar spondylolisthesis. Clinical syndrome an...,"The paper gives a survey, based on literature ..."
1,CD007431,8748845,0,The C-reactive protein for detection of early ...,The tendency for short hospitalization after l...
2,CD007431,3819738,0,Pain in sciatica depresses lower limb nocicept...,The inhibitory effects of acute pain produced ...
3,CD007431,7941692,0,[Satisfaction following automated percutaneous...,182 patients assessed their condition after au...
4,CD007431,16261104,0,Adjacent segment degeneration at T1-T2 present...,A case report of a T1-T2 herniated disc adjace...


Amount of unique topics - train

In [10]:
uq_topics = df_train['topic_id'].unique()

len(uq_topics)

99

Amount of unique topics - test

In [11]:
uq_topics_test = df_test['topic_id'].unique()

len(uq_topics_test)

28

Topics from train that are also in test

In [12]:
[x for x in uq_topics if x in uq_topics_test]

['CD011686', 'CD011571', 'CD012164']

So for each topic there is need to create train_test split

Topics with most positive 'relevance':

In [13]:
df_train.groupby('topic_id').sum('relevance').sort_values(by='relevance', ascending=False)

Unnamed: 0_level_0,PID,relevance
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
CD011975,67435351855,414
CD010213,158206893401,402
CD012599,66093127885,402
CD009925,43777216611,314
CD011984,67413914183,307
...,...,...
CD006715,22770758,1
CD011549,78868246171,1
CD011571,99935044,1
CD007868,83228784,0


In [14]:
len(df_train.groupby('topic_id').sum('relevance')[df_train.groupby('topic_id').sum('relevance')['relevance'] >= 70])

23

In [22]:
top_topics = df_train.groupby('topic_id').sum('relevance')[
                df_train.groupby('topic_id').sum('relevance')['relevance'] >= 70
                ].sort_values(by='relevance', ascending=False).index

### Find the most balanced topics

In [16]:
# df_train \
#   .groupby('topic_id') \
#   .sum('relevance') \
#   .sort_values(by='relevance', ascending=False) \
#   .iloc[0:20]

balanced_topics = df_train \
  .groupby('topic_id')['relevance'] \
  .agg(
      rel_count=lambda x: (x == 1).sum(),
      not_rel_count=lambda x: (x == 0).sum(),
      count='count'
  ) \
  .sort_values(by='rel_count', ascending=False) \
  .iloc[0:15]

balanced_topics['balance_score'] = abs(balanced_topics['rel_count'] - balanced_topics['not_rel_count'])

balanced_topics.sort_values(by='balance_score', ascending=True)


Unnamed: 0_level_0,rel_count,not_rel_count,count,balance_score
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CD011431,184,661,845,477
CD008122,133,847,980,714
CD011134,141,1074,1215,933
CD008054,113,1823,1936,1710
CD010502,166,1975,2141,1809
CD009925,314,4113,4427,3799
CD012010,209,4697,4906,4488
CD012599,402,5165,5567,4763
CD011975,414,5214,5628,4800
CD011984,307,5320,5627,5013


## **Dataset and DataLoader**

In [10]:
def split_Xy(topic, df=df_train):
  df_topic = df[df['topic_id'] == topic]
  df_topic.dropna(inplace=True)
  df_topic.reset_index(drop=True, inplace=True)

  X = df_topic.drop(columns=['relevance', 'topic_id', 'PID'], axis=1) # we leave only topic and abstract
  y = df_topic['relevance']

  return X, y

Now checking optimal max_length for this dataset

In [11]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def calculate_max_length(X):
  title_lens = [len(tokenizer.tokenize(t)) for t in X.iloc[:, 0]]
  abstract_lens = [len(tokenizer.tokenize(a)) for a in X.iloc[:, 1]]

  print("Title stats:", np.percentile(title_lens, [50, 75, 90, 95]))
  print("Abstract stats:", np.percentile(abstract_lens, [50, 75, 90, 95]))
  print("Total stats:", np.percentile(title_lens + abstract_lens, [50, 75, 90, 95]))


for topic in top_topics[:5]:
  print(topic)
  X, y = split_Xy(topic)
  calculate_max_length(X)
  print()

NameError: name 'top_topics' is not defined

In [13]:
class TextDataset(Dataset):
    def __init__(self, X, y, tokenizer=BertTokenizer.from_pretrained('bert-base-uncased'), max_length=350):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.X = X

        topics = X['title'].astype(str).tolist()
        abstracts = X['abstract'].astype(str).tolist()

        encodings = self.tokenizer(
            topics,
            abstracts,
            truncation=True,
            return_overflowing_tokens=False,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        self.input_ids = encodings['input_ids']
        self.attention_mask = encodings['attention_mask']
        self.token_type_ids = encodings['token_type_ids']
        self.y = torch.tensor(y.values, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'token_type_ids': self.token_type_ids[idx],
            'labels': self.y[idx]
        }


In [21]:
# word_counter = Counter()
# for (label, line) in df_train:
#     word_counter.update(tokenizer(line))

# voc = Vocab(word_counter, min_freq=10)

def collate_batch(batch):
    labels = torch.tensor([label - 1 for label, _ in batch])
    text_list = [tokenizer(line) for _, line in batch]

    text = torch.tensor([voc[t] for tokens in text_list for t in tokens])
    # the offset of each example
    offsets = torch.tensor(
        [0] + [len(tokens) for tokens in text_list][:-1]
    ).cumsum(dim=0)

    return labels, text, offsets

In [14]:
def preprocess(topic, df=df_train):
  set_seed(42)

  df_majority = df[df['relevance'] == 0]
  df_minority = df[df['relevance'] == 1]

  X_maj, y_maj = split_Xy(topic, df_majority)
  X_min, y_min = split_Xy(topic, df_minority)

  X_rsmpld, y_resmpld = resample(
    X_maj,
    y_maj,
    replace=False,
    n_samples=len(y_min),
    random_state=42
  )

  X_balanced = pd.concat([X_rsmpld, X_min])
  y_balanced = pd.concat([y_resmpld, y_min])

  X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, stratify=y_balanced, test_size=0.25, random_state=42)

  train_dataset = TextDataset(X_train, y_train)
  test_dataset = TextDataset(X_test, y_test)

  # word_counter = Counter()

  # for (abstr, title) in zip(df_train['abstract'], df_train['title']):
  #   word_counter.update(tokenizer(abstr))
  #   word_counter.update(tokenizer(title))

  # voc = Vocab(word_counter, min_freq=10)

  # def collate_batch(batch):
  #   labels = torch.tensor([label - 1 for label, _ in batch])
  #   text_list = [tokenizer(line) for _, line in batch]

  #   # flatten tokens across the whole batch
  #   text = torch.tensor([voc[t] for tokens in text_list for t in tokens])
  #   # the offset of each example
  #   offsets = torch.tensor(
  #       [0] + [len(tokens) for tokens in text_list][:-1]
  #   ).cumsum(dim=0)

  #   return labels, text, offsets

  train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
  test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

  return train_dataset, test_dataset, train_loader, test_loader

# **Model**

In [15]:
from transformers import AutoConfig

class BERTClassifier(nn.Module):
    def __init__(self, dropout=0.3):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        # Change input size to 2 to match the logits shape
        self.fc = nn.Linear(2, 1)
        self.config = AutoConfig.from_pretrained('bert-base-uncased')
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        # Use outputs.logits instead of outputs.pooler_output
        x = outputs.logits  # Access the logits attribute for classification
        x = self.dropout(x)
        x = self.fc(x)

        return x.squeeze(-1)

# **Training**

In [16]:
NUM_OF_EPOCHS = 10

In [17]:
def prepare_for_training(train_dataset, train_loader, lr=2e-5):
  set_seed(42)

  model = BERTClassifier()
  optimizer = AdamW(model.parameters(), lr=lr)
  loss_fn = nn.BCEWithLogitsLoss()  # For binary classification

  total_steps = len(train_loader) * NUM_OF_EPOCHS
  scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=int(0.1 * total_steps),
      num_training_steps=total_steps
  )

  y_train = train_dataset.y
  num_pos = (y_train == 1).sum()
  num_neg = (y_train == 0).sum()
  pos_weight = torch.tensor([num_neg / num_pos], dtype=torch.float32)

  loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model = model.to(device)
  loss_fn = loss_fn.to(device)

  return model, optimizer, loss_fn, scheduler, device

In [18]:
def train(model, optimizer, loss_fn, scheduler, device, train_loader, num_epochs=NUM_OF_EPOCHS):
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask, token_type_ids)
            loss = loss_fn(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f}")

    return model

# **Evaluation**

In [19]:
def evaluate(model, test_loader, device, threshold=0.5):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask, token_type_ids)
            probs = torch.sigmoid(outputs)  # [batch_size]

            all_preds.append(probs.cpu())
            all_labels.append(labels.cpu())

    all_preds = torch.cat(all_preds)
    all_labels = torch.cat(all_labels)

    # # Binary predictions
    # preds_binary = (all_preds >= threshold).int()

    # # Metrics
    # accuracy = accuracy_score(all_labels, preds_binary)
    # f1 = f1_score(all_labels, preds_binary)
    # precision = precision_score(all_labels, preds_binary)
    # recall = recall_score(all_labels, preds_binary)

    thresholds = [0.1 * i for i in range(1, 10)]
    for t in thresholds:
        preds_binary = (all_preds >= t).int()
        f1 = f1_score(all_labels, preds_binary)
        precision = precision_score(all_labels, preds_binary, zero_division=0)
        recall = recall_score(all_labels, preds_binary, zero_division=0)
        accuracy = accuracy_score(all_labels, preds_binary)
        print(f"Threshold: {t:.1f} | F1: {f1:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | Accuracy: {accuracy:.4f}")


    # print({
    #     "accuracy": accuracy,
    #     "f1_score": f1,
    #     "precision": precision,
    #     "recall": recall
    # })

# **Whole pipeline**

In [20]:
def train_and_evaluate(topic, df=df_train):
  set_seed(42)

  train_dataset, test_dataset, train_loader, test_loader = preprocess(topic, df)

  model, optimizer, loss_fn, scheduler, device = prepare_for_training(train_dataset, train_loader)

  model = train(model, optimizer, loss_fn, scheduler, device, train_loader)

  yield model, test_loader, device

  save_dir = '/content/drive/MyDrive/DevWorkshop_data/models'
  os.makedirs(save_dir, exist_ok=True)

  save_path = os.path.join(save_dir, f"bert_classifier_{topic}_balanced_1.pt")
  torch.save(model.state_dict(), save_path)
  print(f"Model saved to: {save_path}")

  evaluate(model, test_loader, device)

# **TESTS**

In [36]:
top_topics[0]

'CD011975'

In [24]:
# IGNORE ERRORS
import transformers
transformers.logging.set_verbosity_error()

In [42]:
model, test_loader, device = train_and_evaluate(top_topics[0])

Epoch 1: 100%|██████████| 36/36 [00:32<00:00,  1.11it/s]


Epoch 1 | Loss: 0.7117


Epoch 2: 100%|██████████| 36/36 [00:33<00:00,  1.07it/s]


Epoch 2 | Loss: 0.6303


Epoch 3: 100%|██████████| 36/36 [00:33<00:00,  1.07it/s]


Epoch 3 | Loss: 0.5494


Epoch 4: 100%|██████████| 36/36 [00:33<00:00,  1.09it/s]


Epoch 4 | Loss: 0.4773


Epoch 5: 100%|██████████| 36/36 [00:33<00:00,  1.07it/s]


Epoch 5 | Loss: 0.4127


Epoch 6: 100%|██████████| 36/36 [00:33<00:00,  1.07it/s]


Epoch 6 | Loss: 0.3669


Epoch 7: 100%|██████████| 36/36 [00:33<00:00,  1.08it/s]


Epoch 7 | Loss: 0.2930


Epoch 8: 100%|██████████| 36/36 [00:33<00:00,  1.07it/s]


Epoch 8 | Loss: 0.2869


Epoch 9: 100%|██████████| 36/36 [00:33<00:00,  1.07it/s]


Epoch 9 | Loss: 0.2784


Epoch 10: 100%|██████████| 36/36 [00:33<00:00,  1.07it/s]


Epoch 10 | Loss: 0.2415
Model saved to: /content/drive/MyDrive/DevWorkshop_data/models/bert_classifier_CD011975_balanced_1.pt


Evaluating: 100%|██████████| 12/12 [00:03<00:00,  3.18it/s]

Threshold: 0.1 | F1: 0.8052 | Precision: 0.6838 | Recall: 0.9789 | Accuracy: 0.7632
Threshold: 0.2 | F1: 0.8186 | Precision: 0.7333 | Recall: 0.9263 | Accuracy: 0.7947
Threshold: 0.3 | F1: 0.8230 | Precision: 0.7544 | Recall: 0.9053 | Accuracy: 0.8053
Threshold: 0.4 | F1: 0.8252 | Precision: 0.7658 | Recall: 0.8947 | Accuracy: 0.8105
Threshold: 0.5 | F1: 0.8195 | Precision: 0.7636 | Recall: 0.8842 | Accuracy: 0.8053
Threshold: 0.6 | F1: 0.8159 | Precision: 0.7736 | Recall: 0.8632 | Accuracy: 0.8053
Threshold: 0.7 | F1: 0.8081 | Precision: 0.7767 | Recall: 0.8421 | Accuracy: 0.8000
Threshold: 0.8 | F1: 0.8021 | Precision: 0.7938 | Recall: 0.8105 | Accuracy: 0.8000
Threshold: 0.9 | F1: 0.0000 | Precision: 0.0000 | Recall: 0.0000 | Accuracy: 0.5000





ValueError: not enough values to unpack (expected 3, got 1)

In [25]:
#load model
train_dataset, test_dataset, train_loader, test_loader = preprocess(top_topics[0])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERTClassifier()
model.load_state_dict(torch.load(
    'bert_classifier_CD011975_balanced_1.pt',
    map_location=torch.device('cpu')
))
model = model.to(device)

# evaluate(model, test_loader, device)

model.safetensors:  14%|#4        | 62.9M/440M [00:00<?, ?B/s]

# **Model explainability**

In [52]:
model.eval()
pass

In [None]:
import torchtext

def forward_with_sigmoid(input):
    return torch.sigmoid(model(input))

TEXT = torchtext.data.Field(lower=True, tokenize=tokenizer)
Label = torchtext.data.LabelField(dtype = torch.float)

In [None]:
from torchtext import vocab

loaded_vectors = vocab.GloVe(name='6B', dim=100)

# If you prefer to use pre-downloaded glove vectors, you can load them with the following two command line
TEXT.build_vocab(train, vectors=loaded_vectors, max_size=len(loaded_vectors.stoi))

TEXT.vocab.set_vectors(stoi=loaded_vectors.stoi, vectors=loaded_vectors.vectors, dim=loaded_vectors.dim)
Label.build_vocab(train_df)

.vector_cache/glove.6B.zip: 862MB [02:40, 5.38MB/s]                           
100%|█████████▉| 399999/400000 [00:25<00:00, 15678.72it/s]


NameError: name 'train' is not defined

In [27]:
import shap
import scipy
import transformers

In [None]:
# def predict_proba(text_list):
#     inputs = tokenizer(text_list, return_tensors="pt", padding=True, truncation=True, max_length=512)
#     with torch.no_grad():
#         logits = model(**inputs).logits

#     probabilities = F.softmax(logits, dim=-1)
#     return probabilities.numpy()

def predict_proba_shap(text_list):
    model.eval() # Ensure model is in eval mode

    inputs = tokenizer(text_list, return_tensors="pt", padding=True, truncation=True, max_length=512)

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    token_type_ids = inputs['token_type_ids'].to(device)

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

    probabilities_positive = torch.sigmoid(logits).cpu().numpy() # Probability of the positive class
    probabilities_negative = 1 - probabilities_positive

    return np.vstack([probabilities_negative, probabilities_positive]).T


In [35]:
sent_analyzer = transformers.pipeline(
    "sentiment-analysis",
    tokenizer=tokenizer,
    return_all_scores=True,
    model=model.bert,
)

In [60]:
test_dataset.X.loc[290]['abstract']

'To compare the yield of multiple-marker biochemical screening with that of minor fetal anomalies observed on ultrasound for detection of aneuploidy in low-risk patients.'

In [64]:
sent_analyzer(test_dataset.X.loc[290])

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [44]:
background_adult = shap.maskers.Independent(X_adult, max_samples=100)
explainer = shap.Explainer(sent_analyzer, background_adult, output_names=['Irrelavant', 'Relevant'])

NameError: name 'X_adult' is not defined

In [45]:
shaps = explainer(test_dataset.X.loc[290])
shap.plots.text(shaps)

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  50%|█████     | 1/2 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 3it [01:24, 42.29s/it]               


In [43]:
shap.heatmap(shaps)

AttributeError: module 'shap' has no attribute 'heatmap'