In [None]:
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BertTokenizer, BertModel

In [None]:
# TODO: replace these placeholder paths with the real dataset files!
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Split each frame into features (every column except 'label') and the target.
X_train, y_train = df_train.drop(columns=['label']), df_train['label']
X_test, y_test = df_test.drop(columns=['label']), df_test['label']

In [None]:
def set_seed(seed: int = 42) -> None:
    """Seed the random number generators used in this notebook.

    Seeds NumPy, Python's ``random`` module and PyTorch (CPU and CUDA), and
    switches cuDNN to deterministic mode with benchmarking disabled.

    Args:
        seed: Value applied to every generator.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not only the current one; this call is silently
    # ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # PYTHONHASHSEED is read at interpreter start-up, so setting it here only
    # influences processes spawned after this call.
    os.environ["PYTHONHASHSEED"] = str(seed)

In [None]:
class TextDataset(Dataset):
    """Dataset pairing tokenized text with numerical features and a label.

    For each row, the values of ``text_cols`` are joined with single spaces and
    tokenized (truncated / padded to ``max_length``); the values of
    ``numerical_cols`` and the label are returned as float32 tensors.

    Args:
        X: DataFrame holding the text and numerical columns (rows are read
            positionally through ``.iloc``).
        y: Labels aligned positionally with the rows of ``X``; either a pandas
            object exposing ``.iloc`` or a plain indexable sequence.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` and
            expected to return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors that carry a leading batch dimension
            of 1.
        text_cols: Column names whose values are concatenated into the text.
        numerical_cols: Column names of the numerical features.
        max_length: Token length passed to the tokenizer.
    """

    def __init__(self, X, y, tokenizer, text_cols, numerical_cols, max_length=512):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.text_cols = text_cols
        self.numerical_cols = numerical_cols
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the tensors for the row at position ``idx``."""
        # Fetch the row once instead of once per column.
        row = self.X.iloc[idx]

        # Concatenate the text columns together
        text = " ".join(str(row[col]) for col in self.text_cols)

        # Tokenize text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        # A row that mixes text and numbers is an object-dtype Series, which
        # torch.tensor cannot convert; cast the numerical slice explicitly.
        numerical_values = np.asarray(row[self.numerical_cols].values, dtype=np.float32)
        numerical_feats = torch.tensor(numerical_values, dtype=torch.float32)

        # Positional lookup: `self.y[idx]` on a Series is label-based and picks
        # the wrong row (or raises KeyError) when the index is not 0..n-1.
        label = self.y.iloc[idx] if hasattr(self.y, "iloc") else self.y[idx]

        return {
            # squeeze(0) removes only the tokenizer's batch dimension.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'numerical_feats': numerical_feats,
            'labels': torch.tensor(label, dtype=torch.float32)
        }


In [None]:
class BERTClassifier(nn.Module):
    """BERT encoder whose pooled output is fused with numerical features.

    The pooled BERT output goes through dropout, is concatenated with the
    numerical features along dim 1, and is projected to ``num_labels`` logits
    by a single linear layer.

    Args:
        model_name: Name passed to ``BertModel.from_pretrained``.
        num_numerical_feats: Number of numerical features appended to the
            pooled BERT output.
        num_labels: Output size of the linear head.
    """

    def __init__(self, model_name='bert-base-uncased', num_numerical_feats=2, num_labels=1):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)

        # The head consumes the pooled text vector plus the numerical features.
        head_in_features = self.bert.config.hidden_size + num_numerical_feats
        self.fc = nn.Linear(head_in_features, num_labels)

    def forward(self, input_ids, attention_mask, numerical_feats):
        """Return the raw logits for a batch (no sigmoid applied)."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # pooler_output is expected to be (batch_size, hidden_size).
        text_repr = self.dropout(encoded.pooler_output)

        # Fuse text representation and numerical features per sample.
        fused = torch.cat((text_repr, numerical_feats), dim=1)
        return self.fc(fused)


In [None]:
set_seed(42)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# The model class defined in this notebook is BERTClassifier; the previous
# name, BERTWithNumericalFeatures, is not defined and raised a NameError.
model = BERTClassifier(num_numerical_feats=2, num_labels=1)

In [None]:
# Move the model to its device first so the optimizer is built from the
# parameters that are actually trained.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Fine-tuning all BERT weights needs a small step size; the previous value of
# 1e-3 is far above the usual 2e-5 to 5e-5 range and tends to destroy the
# pretrained weights.
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.BCEWithLogitsLoss()  # For binary classification

In [None]:
set_seed(42)

# TODO: replace these with the real column names of the dataset!
# Defined once so the train and test datasets cannot drift apart.
TEXT_COLS = ["topic", "abstract"]
NUMERICAL_COLS = ["num_feature1", "num_feature2"]

train_dataset = TextDataset(X_train, y_train, tokenizer, text_cols=TEXT_COLS, numerical_cols=NUMERICAL_COLS)
test_dataset = TextDataset(X_test, y_test, tokenizer, text_cols=TEXT_COLS, numerical_cols=NUMERICAL_COLS)

# Only the training data is shuffled; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
set_seed(42)

epochs = 20

# Training loop
for epoch in range(epochs):
    # Enter training mode at the start of every epoch so an evaluation pass
    # run in between cannot leave dropout disabled.
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        numerical_feats = batch['numerical_feats'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask, numerical_feats)

        # Drop only the trailing logit dimension. A bare squeeze() also removes
        # the batch dimension of a 1-sample final batch, and the loss then
        # raises a target/input size mismatch.
        loss = criterion(outputs.squeeze(-1), labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Guard against an empty loader to avoid a ZeroDivisionError.
    avg_loss = total_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f}")

In [None]:
def evaluate(model, val_loader, device):
    """Run ``model`` over ``val_loader`` and collect sigmoid probabilities.

    Args:
        model: Module called as ``model(input_ids, attention_mask,
            numerical_feats)`` and returning logits; put into eval mode here.
        val_loader: Iterable of batches, each a mapping with the keys
            ``'input_ids'``, ``'attention_mask'``, ``'numerical_feats'`` and
            ``'labels'``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(all_preds, all_labels)`` of CPU tensors concatenated over all
        batches; two empty tensors when ``val_loader`` yields no batches.
    """
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            numerical_feats = batch['numerical_feats'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask, numerical_feats)
            # Drop only the trailing logit dimension. A bare squeeze() turns a
            # 1-sample batch into a 0-dim tensor, which torch.cat rejects.
            probs = torch.sigmoid(outputs.squeeze(-1))

            all_preds.append(probs.cpu())
            all_labels.append(labels.cpu())

    # torch.cat raises on an empty list, so handle the no-batch case explicitly.
    if not all_preds:
        return torch.empty(0), torch.empty(0)

    all_preds = torch.cat(all_preds)
    all_labels = torch.cat(all_labels)

    return all_preds, all_labels

In [None]:
# Evaluate on the held-out loader built earlier in the notebook; the name
# `val_loader` used here before is not defined anywhere in this notebook.
preds, labels = evaluate(model, test_loader, device)