# In [2]:
import pandas as pd
import numpy as np
import json
import torch
from transformers import BertTokenizer
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import Dataset, DataLoader
import tqdm as notebook_tqdm

import mlflow
import mlflow.pytorch
from transformers import BertModel
import torch.nn as nn
from torch.optim import AdamW
from sklearn.metrics import f1_score, accuracy_score

# --- Data preparation ---
# Read train.json and transpose it before selecting columns.
train_df = pd.read_json('train.json').T

# The model input is the title and the abstract joined by one space.
train_df['Text'] = train_df['Title'] + " " + train_df['Abstract']
texts = train_df['Text'].tolist()
labels = train_df['Classes'].tolist()

# Fixed vocabulary of subject-area codes; the binarizer is given this list
# as its `classes` ordering.
label_classes = [
    'CE', 'ENV', 'BME', 'PE', 'METAL', 'ME', 'EE', 'CPE', 'OPTIC',
    'NANO', 'CHE', 'MATENG', 'AGRI', 'EDU', 'IE', 'SAFETY', 'MATH', 'MATSCI',
]

# Multi-hot encode every record's label collection against label_classes.
mlb = MultiLabelBinarizer(classes=label_classes)
binarized_labels = mlb.fit_transform(labels)

# Tokenizer loaded from the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Custom Dataset class for BERT input
class ScopusDataset(Dataset):
    """Dataset pairing raw texts with multi-label target vectors.

    Tokenization happens lazily in ``__getitem__`` through
    ``tokenizer.encode_plus``, with padding/truncation to ``max_len``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # One sample per text entry.
        return len(self.texts)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one sample."""
        sample_text = self.texts[index]
        sample_target = self.labels[index]

        encode_options = dict(
            max_length=self.max_len,
            truncation=True,
            add_special_tokens=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(sample_text, **encode_options)

        # The encoder output is flattened to 1-D per field.
        item = {
            field: encoded[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.FloatTensor(sample_target)  # Multi-label target
        return item

# Hyperparameters
MAX_LEN = 512     # length passed to the tokenizer for padding/truncation
BATCH_SIZE = 16   # samples per batch

# Wrap the texts and binarized targets in a dataset, then in a shuffling loader
train_dataset = ScopusDataset(
    texts=texts,
    labels=binarized_labels,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

from transformers import BertModel
import torch.nn as nn

class BERTMultiLabelClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear layer with one output per class."""

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(p=0.3)
        # The head's input width comes from the encoder's configured hidden size.
        hidden_dim = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_dim, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the linear head's raw scores (no sigmoid is applied here)."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularized = self.dropout(encoded.pooler_output)
        return self.out(regularized)

# Training function
def train_epoch(model, data_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Puts ``model`` in training mode, takes one optimizer step per batch and
    returns the summed batch losses divided by ``len(data_loader)``.
    """
    model.train()
    batch_losses = []
    batch_fields = ('input_ids', 'attention_mask', 'labels')

    for batch in data_loader:
        token_ids, mask, targets = (batch[name].to(device) for name in batch_fields)

        optimizer.zero_grad()
        logits = model(input_ids=token_ids, attention_mask=mask)
        batch_loss = criterion(logits, targets)
        batch_losses.append(batch_loss.item())

        batch_loss.backward()
        optimizer.step()

    return sum(batch_losses) / len(data_loader)

# Evaluation function
def eval_model(model, data_loader, criterion, device):
    """Score ``model`` on ``data_loader`` with gradients disabled.

    Returns a tuple ``(mean_loss, predictions, true_labels)`` where
    ``predictions`` holds the sigmoid of the model outputs and both arrays
    are the per-batch results stacked with ``np.vstack``.
    """
    model.eval()
    batch_losses, prob_chunks, target_chunks = [], [], []
    batch_fields = ('input_ids', 'attention_mask', 'labels')

    with torch.no_grad():
        for batch in data_loader:
            token_ids, mask, targets = (batch[name].to(device) for name in batch_fields)

            logits = model(input_ids=token_ids, attention_mask=mask)
            batch_losses.append(criterion(logits, targets).item())

            # Move results to host memory as NumPy arrays for later stacking.
            prob_chunks.append(torch.sigmoid(logits).cpu().numpy())
            target_chunks.append(targets.cpu().numpy())

    predictions = np.vstack(prob_chunks)
    true_labels = np.vstack(target_chunks)
    return sum(batch_losses) / len(data_loader), predictions, true_labels

# Training loop with MLflow
N_EPOCHS = 5
LEARNING_RATE = 2e-5   # single source for both the optimizer and the logged param
VAL_FRACTION = 0.1     # share of the labelled data held out for validation
SPLIT_SEED = 42        # fixed seed so the hold-out split is reproducible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# No validation/test loader is defined in this file, so per-epoch evaluation
# uses a hold-out split carved from train_dataset. The model is fitted on the
# remaining samples only, which keeps validation data out of training.
from torch.utils.data import random_split

n_val = max(1, int(len(train_dataset) * VAL_FRACTION))
fit_subset, val_subset = random_split(
    train_dataset,
    [len(train_dataset) - n_val, n_val],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
fit_loader = DataLoader(fit_subset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=BATCH_SIZE, shuffle=False)


# Start MLflow run
with mlflow.start_run():
    model = BERTMultiLabelClassifier(n_classes=len(label_classes))
    model = model.to(device)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    # The model returns raw scores; this loss applies the sigmoid internally.
    criterion = nn.BCEWithLogitsLoss()

    # Log hyperparameters
    mlflow.log_param("epochs", N_EPOCHS)
    mlflow.log_param("batch_size", BATCH_SIZE)
    mlflow.log_param("learning_rate", LEARNING_RATE)
    mlflow.log_param("model", "BERT-base-uncased")
    mlflow.log_param("val_fraction", VAL_FRACTION)

    for epoch in range(N_EPOCHS):
        train_loss = train_epoch(model, fit_loader, criterion, optimizer, device)
        val_loss, val_preds, val_true = eval_model(model, val_loader, criterion, device)

        # Convert predicted probabilities to binary labels (threshold of 0.5)
        val_preds = (val_preds > 0.5).astype(int)

        # Micro-averaged F1 over all label columns
        f1 = f1_score(val_true, val_preds, average='micro')

        # Log metrics for each epoch
        mlflow.log_metric("train_loss", train_loss, step=epoch)
        mlflow.log_metric("val_loss", val_loss, step=epoch)
        mlflow.log_metric("f1_score", f1, step=epoch)

        print(f'Epoch {epoch + 1}/{N_EPOCHS}')
        print(f'Train Loss: {train_loss:.4f}')
        print(f'Validation Loss: {val_loss:.4f}')
        print(f'F1 Score: {f1:.4f}')

    # Log the model at the end of training
    mlflow.pytorch.log_model(model, "bert_multi_label_model")


# Cell output: KeyboardInterrupt (execution was interrupted)