In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW
from tqdm import tqdm

import random, os

In [36]:
torch.cuda.empty_cache()

In [2]:
# DO PODMIANKI!
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

X_train = df_train.drop(['relevance', 'topic_id', 'PID'], axis=1)
y_train = df_train['relevance']

X_test = df_test.drop(['relevance', 'topic_id', 'PID'], axis=1)
y_test = df_test['relevance']

In [3]:
X_train.columns

Index(['topic_title', 'topic_objective', 'article_title', 'abstract'], dtype='object')

In [4]:
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ["PYTHONHASHSEED"] = str(seed)

In [5]:
class TextDataset(Dataset):
    def __init__(self, X, y, tokenizer, text_cols, numerical_cols, max_length=128):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.text_cols = text_cols
        self.numerical_cols = numerical_cols
        self.max_length = max_length

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Concatenate the text columns together
        text = " ".join(str(self.X.iloc[idx][col]) for col in self.text_cols)

        # Tokenize text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        if self.numerical_cols:
            numerical_feats = torch.tensor(
                self.X.iloc[idx][self.numerical_cols].values,
                dtype=torch.float32
            )
        else:
            numerical_feats = torch.empty(0)  # or torch.zeros(1) if needed


        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'numerical_feats': numerical_feats,
            'labels': torch.tensor(self.y[idx], dtype=torch.float32)
        }


In [6]:
class BERTClassifier(nn.Module):
    def __init__(self, model_name='bert-base-uncased', num_numerical_feats=2, num_labels=1):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)

        self.fc = nn.Linear(self.bert.config.hidden_size + num_numerical_feats-2, num_labels)

    def forward(self, input_ids, attention_mask, numerical_feats):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # Shape: (batch_size, hidden_size)

        pooled_output = self.dropout(pooled_output)

        if numerical_feats.nelement() != 0:
            combined = torch.cat((pooled_output, numerical_feats), dim=1)
        else:
            combined = pooled_output


        logits = self.fc(combined)

        return logits


In [7]:
set_seed(42)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BERTClassifier(num_numerical_feats=2, num_labels=1)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
optimizer = AdamW(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()  # For binary classification

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [9]:
set_seed(42)

# DO PODMIANKI NAZWY KOLUMN!
train_dataset = TextDataset(
    X_train,
    y_train,
    tokenizer,
    text_cols=["topic_title", "topic_objective", "article_title", "abstract"],
    numerical_cols=[]
    )
test_dataset = TextDataset(
    X_test,
    y_test,
    tokenizer,
    text_cols=["topic_title", "topic_objective", "article_title", "abstract"],
    numerical_cols=[]
    )

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

In [10]:
set_seed(42)

epochs = 20

# Training loop
model.train()

for epoch in range(epochs):
    total_loss = 0

    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        numerical_feats = batch['numerical_feats'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask, numerical_feats)

        loss = criterion(outputs.squeeze(), labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f}")

100%|██████████| 155/155 [00:22<00:00,  6.75it/s]


Epoch 1 | Loss: 0.7631


100%|██████████| 155/155 [00:20<00:00,  7.42it/s]


Epoch 2 | Loss: 0.7412


100%|██████████| 155/155 [00:20<00:00,  7.52it/s]


Epoch 3 | Loss: 0.7465


100%|██████████| 155/155 [00:21<00:00,  7.31it/s]


Epoch 4 | Loss: 0.7495


100%|██████████| 155/155 [00:20<00:00,  7.65it/s]


Epoch 5 | Loss: 0.7605


100%|██████████| 155/155 [00:20<00:00,  7.53it/s]


Epoch 6 | Loss: 0.7289


100%|██████████| 155/155 [00:20<00:00,  7.46it/s]


Epoch 7 | Loss: 0.7517


100%|██████████| 155/155 [00:26<00:00,  5.79it/s]


Epoch 8 | Loss: 0.7306


100%|██████████| 155/155 [00:20<00:00,  7.46it/s]


Epoch 9 | Loss: 0.7548


100%|██████████| 155/155 [00:20<00:00,  7.62it/s]


Epoch 10 | Loss: 0.7488


100%|██████████| 155/155 [00:20<00:00,  7.58it/s]


Epoch 11 | Loss: 0.7359


100%|██████████| 155/155 [00:20<00:00,  7.58it/s]


Epoch 12 | Loss: 0.7624


100%|██████████| 155/155 [00:20<00:00,  7.54it/s]


Epoch 13 | Loss: 0.7578


100%|██████████| 155/155 [00:20<00:00,  7.41it/s]


Epoch 14 | Loss: 0.7403


100%|██████████| 155/155 [00:20<00:00,  7.57it/s]


Epoch 15 | Loss: 0.7372


100%|██████████| 155/155 [00:20<00:00,  7.54it/s]


Epoch 16 | Loss: 0.7285


100%|██████████| 155/155 [00:20<00:00,  7.64it/s]


Epoch 17 | Loss: 0.7677


100%|██████████| 155/155 [00:20<00:00,  7.47it/s]


Epoch 18 | Loss: 0.7238


100%|██████████| 155/155 [00:20<00:00,  7.45it/s]


Epoch 19 | Loss: 0.7276


100%|██████████| 155/155 [00:20<00:00,  7.63it/s]

Epoch 20 | Loss: 0.7712





In [11]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def evaluate(model, test_loader, device, threshold=0.5):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            numerical_feats = batch['numerical_feats'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask, numerical_feats)
            probs = torch.sigmoid(outputs.squeeze())

            all_preds.append(probs.cpu())
            all_labels.append(labels.cpu())

    all_preds = torch.cat(all_preds)
    all_labels = torch.cat(all_labels)

    # Apply threshold to get binary predictions
    preds_binary = (all_preds >= threshold).int()

    # Calculate metrics
    accuracy = accuracy_score(all_labels, preds_binary)
    f1 = f1_score(all_labels, preds_binary)
    precision = precision_score(all_labels, preds_binary)
    recall = recall_score(all_labels, preds_binary)

    print({
        "accuracy": accuracy,
        "f1_score": f1,
        "precision": precision,
        "recall": recall
    })


In [None]:
evaluate(model, test_loader, device)