# News classification

## Libraries

In [None]:
import os
# Debugging aid: asks CUDA to run kernel launches synchronously so that errors
# are reported at the call that caused them. Set before torch is imported.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Progress bars: tqdm_disabled is passed as `disable=` to every tqdm call in
# this file; set it to True to silence them.
from tqdm import tqdm
tqdm_disabled = False

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from torch.nn.utils import clip_grad_norm_ as clip_grad_norm
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW


from transformers import BertModel as Model
from transformers import BertTokenizer as Tokenizer
from transformers import get_linear_schedule_with_warmup

In [None]:
# Set seed to ensure reproducibility
# The same value seeds NumPy's global RNG and PyTorch's default generator.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Set device to CUDA if available
# `device` is read as a module-level global by the training and evaluation code.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:')
print(device)
# CUDA version this PyTorch build was compiled against.
print(torch.version.cuda) 

In [None]:
# Checkpoint name used to load both the tokenizer and the BERT encoder.
pre_trained_model = 'bert-base-uncased'
# The training loops pass freeze_bert=(not finetuning) to epoch_step, so
# False keeps the BERT weights frozen and trains only the classifier head.
finetuning = False

## Hyperparameters

In [None]:
BATCH_SIZE =32  # samples per batch for all three DataLoaders
EPOCHS = 100  # epochs run by each training loop; also sizes the LR schedule
LEARNING_RATE = 2e-5  # learning rate handed to AdamW
MAX_TOKEN_LENGTH = 100  # every text is padded/truncated to this many tokens

## Load data

Load train, test and dev data.

In [None]:
# The files are read with header=None, so columns are addressed by position
# (0, 1, 2, ...) rather than by name.
train = pd.read_csv('./data/train.csv', sep=',',header=None)
test = pd.read_csv('./data/test.csv', sep=',',header=None)
dev = pd.read_csv('./data/development.csv', sep=',',header=None)
classes = pd.read_csv('./data/classes.csv', sep=',',header=None)

In [None]:
# Split sets into attributes and labels.
# Column 0 is the label; columns 1 and 2 are the two text fields that the
# dataset class later joins into a single string.
X_train, y_train = train[[1, 2]].to_numpy(), train[[0]].to_numpy().ravel()
X_test, y_test = test[[1, 2]].to_numpy(), test[[0]].to_numpy().ravel()
X_dev, y_dev = dev[[1, 2]].to_numpy(), dev[[0]].to_numpy().ravel()

# Turn the classes table into a {first column: second column} lookup.
classes = {row[0]: row[1] for row in classes.to_numpy()}

## Data pre-processing

In [None]:
# Define a tokenizer function
# Casing is left to the checkpoint's own tokenizer configuration. Forcing
# do_lower_case=False on an uncased checkpoint such as 'bert-base-uncased'
# would hand capitalised text to a vocabulary that only contains lowercase
# word pieces.
tokenizer = Tokenizer.from_pretrained(pre_trained_model)

In [None]:
def get_encoding(text, max_length, truncation=True):
    """Tokenize *text* with the module-level ``tokenizer``.

    Args:
        text: string to encode.
        max_length: length every sequence is padded to (and truncated to,
            when *truncation* is enabled).
        truncation: forwarded unchanged to the tokenizer.

    Returns:
        The tokenizer's encoding, requested as PyTorch tensors with an
        attention mask and without token type ids.
    """
    options = dict(
        max_length=max_length,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='max_length',
        truncation=truncation,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return tokenizer.encode_plus(text, **options)

## Dataset class

In [None]:
# Dataset wrapper so the raw arrays can be fed to a PyTorch DataLoader.

class NewsClassifierDataset(Dataset):
    """Pair each row's two text fields with its label, tokenizing on access."""

    def __init__(self, X, y):
        # X: indexable rows whose first two entries are strings.
        # y: indexable labels, aligned with the rows of X.
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of rows in X."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the raw text, label tensor and encoded tensors for row *i*."""
        row = self.X[i]
        text = row[0] + ' ' + row[1]
        label = torch.tensor(self.y[i], dtype=torch.long)

        encoded = get_encoding(text, MAX_TOKEN_LENGTH, truncation=True)

        # The encoding carries a leading batch axis of one; flatten it away so
        # the DataLoader can stack items into a batch.
        item = {'X': text, 'y': label}
        item['input_ids'] = encoded['input_ids'].flatten()
        item['attention_mask'] = encoded['attention_mask'].flatten()
        return item

In [None]:
num_workers = 0

# Only the training loader is shuffled, so each epoch visits the samples in a
# different order. torch.manual_seed(SEED) keeps that order reproducible.
# Development and test loaders keep file order.
train_data_loader = DataLoader(
    NewsClassifierDataset(X_train, y_train),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=num_workers
)

development_data_loader = DataLoader(
    NewsClassifierDataset(X_dev, y_dev),
    batch_size=BATCH_SIZE,
    num_workers=num_workers
)

test_data_loader = DataLoader(
    NewsClassifierDataset(X_test, y_test),
    batch_size=BATCH_SIZE,
    num_workers=num_workers
)

## Model

In [None]:
class NewsClassifierModel(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    ``forward`` returns raw, unnormalised class scores (logits).
    ``CrossEntropyLoss`` applies log-softmax itself, so applying softmax here
    as well would normalise twice and flatten the gradients. Softmax is
    monotonic, so the arg-max prediction is the same either way; callers that
    need probabilities can apply ``F.softmax(logits, dim=1)``.
    """

    def __init__(self, n_classes):
        """Load the pre-trained encoder and size the head for *n_classes*."""
        super().__init__()
        self.bert = Model.from_pretrained(pre_trained_model)
        self.drop = nn.Dropout(p=0.40)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
        # True while the BERT parameters have requires_grad switched off.
        self.frezzed_bert = False

    def forward(self, input_ids, attention_mask, freeze_bert=True):
        """Compute class logits for a batch.

        Args:
            input_ids: token id tensor passed to BERT.
            attention_mask: attention mask tensor passed to BERT.
            freeze_bert: when True the BERT parameters are excluded from
                gradient computation; when False they are trainable. The
                setting persists on the module until a call passes the
                opposite value.

        Returns:
            Tensor of unnormalised scores, one column per class.
        """
        # Toggle requires_grad only when the requested state differs from the
        # current one, so the parameter sweep is not repeated on every batch.
        freeze_bert = bool(freeze_bert)
        if freeze_bert != self.frezzed_bert:
            for param in self.bert.parameters():
                param.requires_grad = not freeze_bert
            self.frezzed_bert = freeze_bert

        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        pooled = self.drop(encoded.pooler_output)
        return self.out(pooled)

In [None]:
# One output unit per entry in the `classes` mapping; then move to `device`.
model = NewsClassifierModel(len(classes))
model = model.to(device)

In [None]:
# If model checkpoint is available, load it
# map_location moves the stored tensors onto the current `device`, so a
# checkpoint written on a GPU machine can still be opened on a CPU-only one.
if os.path.exists('./checkpoint.bin'):
    model.load_state_dict(torch.load('./checkpoint.bin', map_location=device))

## Training

In [None]:
# The optimizer is given every model parameter, BERT encoder included.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# Shared loss module; the `criterion` helper calls it.
loss_fn = CrossEntropyLoss().to(device)

In [None]:
def criterion(output, target):
    """Compute the loss between model scores and 1-based class labels.

    Args:
        output: 2-D tensor of per-class scores, one row per sample.
        target: 1-D integer tensor of class labels numbered from 1.

    Returns:
        The value of the module-level ``loss_fn`` applied to *output* and a
        one-hot encoding of *target*.
    """
    y_true = torch.zeros_like(output)

    # Labels start at 1, so label k marks column k - 1. A single indexed
    # assignment replaces the per-sample loop and its .item() call, which
    # forced one device-to-host sync per sample.
    rows = torch.arange(len(target), device=output.device)
    columns = (target.to(output.device) - 1).long()
    y_true[rows, columns] = 1

    return loss_fn(output, y_true)

In [None]:
# Linear decay with no warm-up. The step budget is one scheduler step per
# training batch for EPOCHS epochs.
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0,
    num_training_steps = (len(train_data_loader) * EPOCHS)
)

In [None]:
def eval(model, data_loader):
    """Score *model* on every batch of *data_loader* without gradients.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: iterable of batch dicts with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'y'`` (1-based labels).

    Returns:
        ``(accuracy, loss)`` as 0-dim tensors: accuracy in percent, and the
        ``criterion`` loss averaged over all samples. Both are 0 when the
        loader yields no samples.
    """
    # Start the counters as tensors so an empty loader still reaches the end.
    n_correct = torch.zeros((), dtype=torch.long, device=device)
    total_loss = torch.zeros((), device=device)
    n_samples = 0

    model = model.eval()

    with torch.no_grad():
        for sample in data_loader:
            input_ids = sample['input_ids'].to(device)
            attention_mask = sample['attention_mask'].to(device)
            targets = sample['y'].to(device)

            output = model(input_ids=input_ids, attention_mask=attention_mask)

            # Arg-max column indices are 0-based; labels are 1-based.
            predicted = torch.max(output, dim=1).indices + 1

            batch_size = len(targets)
            # Weight each batch by its size so the final value is the mean
            # over the whole set, not just the loss of the last batch.
            total_loss += criterion(output, targets) * batch_size
            n_correct += torch.sum(predicted == targets)
            n_samples += batch_size

    denominator = max(n_samples, 1)  # avoid dividing by zero on empty input
    accuracy = 100 * (n_correct.double() / denominator)
    loss = total_loss / denominator

    return accuracy, loss

In [None]:
def epoch_step(model, freeze_bert=False):
    """Run one optimisation pass over the module-level ``train_data_loader``.

    Uses the module-level ``optimizer``, ``scheduler``, ``criterion`` and
    ``device``.

    Args:
        model: classifier to train; put into training mode here.
        freeze_bert: forwarded to the model's forward call on every batch.

    Returns:
        ``(accuracy, mean_loss)``: training accuracy in percent over the
        batches seen, and the NumPy mean of the per-batch loss values.
    """
    batch_losses = []
    hits = 0
    seen = 0

    model = model.train()

    batches = tqdm(train_data_loader, desc='Training...', disable=tqdm_disabled)
    for batch in batches:

        optimizer.zero_grad()

        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        labels = batch['y'].to(device)

        scores = model(input_ids=ids, attention_mask=mask, freeze_bert=freeze_bert)

        # Arg-max column indices are 0-based; shift them to the 1-based labels.
        predicted = torch.max(scores, dim=1).indices
        predicted += 1

        loss = criterion(scores, labels)
        batch_losses.append(loss.item())

        hits += (predicted == labels).sum()
        seen += labels.shape[0]

        loss.backward()

        # Gradient clipping: cap the global gradient norm at 1.0 to mitigate
        # exploding gradients.
        clip_grad_norm(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

    accuracy = 100 * (hits.double() / seen)
    return accuracy, np.mean(batch_losses)

### Training model with BERT model weights frozen

In [None]:
train_history = []
dev_history = []
best_dev_accuracy = -1
for epoch in tqdm(range(EPOCHS), desc="Epochs", disable=tqdm_disabled):

    print(f"Training... {epoch+1}/{EPOCHS}")

    epoch_step(model, freeze_bert=(not finetuning))

    accuracy, loss = eval(model, train_data_loader)
    dev_acuracy, dev_loss = eval(model, development_data_loader)

    # Keep plain Python floats in the histories so they can be plotted and
    # compared without holding on to device tensors.
    accuracy, loss = float(accuracy), float(loss)
    dev_acuracy, dev_loss = float(dev_acuracy), float(dev_loss)

    train_history.append((epoch+1, accuracy, loss))
    dev_history.append((epoch+1, dev_acuracy, dev_loss))

    print(f"Epoch: {epoch+1}/{EPOCHS}, Loss: {loss:.4f}, Train accuracy: {accuracy:.4f}%, Dev accuracy:{dev_acuracy:.4f}%")
    # Save only when the development accuracy improves. Updating the tracked
    # best value is what keeps a later, worse epoch from overwriting the file.
    if dev_acuracy > best_dev_accuracy:
        torch.save(model.state_dict(), 'checkpoint.bin')
        best_dev_accuracy = dev_acuracy

In [None]:
## Plot the training history
# Each history entry is (epoch, accuracy, loss). Plot accuracy against epoch;
# float() also accepts the 0-dim tensors returned by eval().
plt.plot(
    [entry[0] for entry in train_history],
    [float(entry[1]) for entry in train_history],
    label="Train",
)
plt.plot(
    [entry[0] for entry in dev_history],
    [float(entry[1]) for entry in dev_history],
    label="Dev",
)
plt.title(pre_trained_model)
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(labels=['Train', 'Dev'], loc='upper left')
plt.show()

### Finetuning

In [None]:
# From here on the training loop passes freeze_bert=False, so the BERT
# weights are updated together with the classifier head.
finetuning = True

In [None]:
# The linear schedule built for the frozen-BERT phase decays to zero after
# len(train_data_loader) * EPOCHS steps, and that phase used all of them.
# Reusing it here would fine-tune with a learning rate of 0. Start this phase
# with a fresh optimizer and schedule; epoch_step reads both as module-level
# globals.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_data_loader) * EPOCHS)
)

train_history = []
dev_history = []
best_dev_accuracy = -1
for epoch in tqdm(range(EPOCHS), desc="Epochs", disable=tqdm_disabled):

    print(f"Training... {epoch+1}/{EPOCHS}")

    epoch_step(model, freeze_bert=(not finetuning))

    accuracy, loss = eval(model, train_data_loader)
    dev_acuracy, dev_loss = eval(model, development_data_loader)

    # Keep plain Python floats in the histories so they can be plotted and
    # compared without holding on to device tensors.
    accuracy, loss = float(accuracy), float(loss)
    dev_acuracy, dev_loss = float(dev_acuracy), float(dev_loss)

    train_history.append((epoch+1, accuracy, loss))
    dev_history.append((epoch+1, dev_acuracy, dev_loss))

    print(f"Epoch: {epoch+1}/{EPOCHS}, Loss: {loss:.4f}, Train accuracy: {accuracy:.4f}%, Dev accuracy:{dev_acuracy:.4f}%")
    # Save only when the development accuracy improves. Updating the tracked
    # best value is what keeps a later, worse epoch from overwriting the file.
    if dev_acuracy > best_dev_accuracy:
        torch.save(model.state_dict(), 'checkpoint.bin')
        best_dev_accuracy = dev_acuracy

In [None]:
## Plot the training history
# Each history entry is (epoch, accuracy, loss). Plot accuracy against epoch;
# float() also accepts the 0-dim tensors returned by eval().
plt.plot(
    [entry[0] for entry in train_history],
    [float(entry[1]) for entry in train_history],
    label="Train",
)
plt.plot(
    [entry[0] for entry in dev_history],
    [float(entry[1]) for entry in dev_history],
    label="Dev",
)
plt.title(pre_trained_model)
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(labels=['Train', 'Dev'], loc='upper left')
plt.show()

## Test 

In [None]:
# Evaluate model on test set
# NOTE(review): this scores the in-memory model as left by the last training
# epoch; 'checkpoint.bin' is not reloaded here — confirm that is intended.
test_acuracy, test_loss = eval(model, test_data_loader)
print(f"Loss: {test_loss:.4f},  Accuracy: {test_acuracy:.4f}%")