In [113]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import transformers
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler,TensorDataset
# from datasets import Dataset
from torch.utils.data import Dataset 
import torch
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm
import evaluate
from torch.nn import CrossEntropyLoss
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [114]:
df = pd.read_csv("data/balancednewcategory.csv")
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date,PESTEL_label
0,https://www.huffingtonpost.com/entry/mortgage-...,Mortgage Deal Reached In 2008 Shows Pitfalls T...,BUSINESS,"The Obama administration, which is pushing sta...",Loren Berlin,2012-02-05,Economic
1,https://www.huffingtonpost.com/entry/women-in-...,"Women in Business: Kate O'Brien Minson, Presid...",BUSINESS,Kate has lived and breathed the therapeutic ap...,"Laura Dunn, ContributorSocial Media and Commun...",2015-04-25,Economic
2,https://www.huffingtonpost.com/entry/like-athl...,"Like Athletes, Business Owners Need to Learn F...",BUSINESS,"Business owners and top executives can also ""w...","Mary Ellen Biery, ContributorResearch Speciali...",2015-01-19,Economic
3,https://www.huffingtonpost.com/entry/donald-tr...,Trump Could Trigger The Longest Recession Sinc...,BUSINESS,Yikes.,Ben Walsh,2016-06-27,Economic
4,https://www.huffingtonpost.com/entry/grocery-c...,Grocery Chains Made A Promise To The First Lad...,BUSINESS,An AP investigation found that major grocers o...,"Mike Schneider, AP",2015-12-07,Economic


In [115]:
df["content"] = df["headline"] + " " + df["short_description"]
df = df[['PESTEL_label', 'content']]
df.head()

Unnamed: 0,PESTEL_label,content
0,Economic,Mortgage Deal Reached In 2008 Shows Pitfalls T...
1,Economic,"Women in Business: Kate O'Brien Minson, Presid..."
2,Economic,"Like Athletes, Business Owners Need to Learn F..."
3,Economic,Trump Could Trigger The Longest Recession Sinc...
4,Economic,Grocery Chains Made A Promise To The First Lad...


In [116]:
# Convert content to string and handle 'NaN' values
df['content'] = df['content'].apply(lambda x: '' if pd.isna(x) else str(x))

In [117]:
def clean_text(text):
    text = text.replace('\n', ' ').strip()
    return text

df['content'] = df['content'].apply(clean_text)

In [118]:
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['PESTEL_label'], random_state=42)

In [119]:
class NewsDataset(Dataset):
    def __init__(self, data, tokenizer, max_len, pestel_to_idx):
        self.df = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pestel_to_idx = pestel_to_idx

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index] 
        content = row['content']
        label = row['PESTEL_label']

        inputs = self.tokenizer.encode_plus(
            content,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(self.pestel_to_idx[label], dtype=torch.long)
        }
        
pestel_to_idx = {
    "Political": 0,
    "Economic": 1,
    "Social": 2,
    "Technological": 3,
    "Environmental": 4,
    "Legal": 5
}

In [120]:
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
MAX_LEN = 256 # can try 128
BATCH_SIZE = 16 # can try 32 / 64

train_set = NewsDataset(train_df, tokenizer, MAX_LEN, pestel_to_idx)
test_set = NewsDataset(test_df, tokenizer, MAX_LEN, pestel_to_idx)



In [121]:
distilbert_model = transformers.DistilBertModel.from_pretrained('distilbert-base-uncased')

In [122]:
class PestelClassifier(torch.nn.Module):
    def __init__(self, distilbert, num_classes):
        super(PestelClassifier, self).__init__()
        self.distilbert = distilbert
        self.dropout = torch.nn.Dropout(0.3)
        self.output = torch.nn.Linear(768, num_classes)
        
        # self.classifier = torch.nn.Sequential(
        #     torch.nn.Linear(768, 256),
        #     torch.nn.ReLU(),
        #     torch.nn.Dropout(0.3),
        #     torch.nn.Linear(256, num_classes)
        # )


    def forward(self, ids, mask):
        output = self.distilbert(ids, attention_mask=mask)
        output = self.dropout(output[0][:, 0, :])  # CLS token
        output = self.output(output)
        return output
    
        # x = self.distilbert(ids, attention_mask=mask).last_hidden_state[:, 0, :]  # CLS
        # x = self.classifier(x)
        # return x

In [123]:
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)

In [124]:
num_classes = 6  # For PESTEL
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PestelClassifier(distilbert_model, num_classes)
model.to(device)

PestelClassifier(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): 

In [125]:
EPOCHS = 10
LEARNING_RATE = 1e-7 # can try higher 
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
loss_function = torch.nn.CrossEntropyLoss()

In [126]:
# change to methods and call -> train for train and test loader, eval for val loader

for epoch in range(EPOCHS):
    model.train()
    train_bar = tqdm(train_loader, desc=f"Epoch {epoch+1} - Training")
    for batch in train_bar:
        ids = batch['ids'].to(device)
        mask = batch['mask'].to(device)
        targets = batch['targets'].to(device)

        optimizer.zero_grad()
        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

        train_bar.set_postfix(loss=loss.item())

    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0
    
    test_bar = tqdm(test_loader, desc=f"Epoch {epoch+1} - Testing")
    for batch in test_bar:
        ids = batch['ids'].to(device)
        mask = batch['mask'].to(device)
        targets = batch['targets'].to(device)

        with torch.no_grad():
            outputs = model(ids, mask)
            loss = loss_function(outputs, targets)
            total_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total_correct += (predicted == targets).sum().item()
            total_samples += targets.size(0)

            test_bar.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(test_loader)
    accuracy = total_correct / total_samples

    print(f"Epoch: {epoch + 1}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

Epoch 1 - Training: 100%|██████████| 840/840 [30:34<00:00,  2.18s/it, loss=1.8] 
Epoch 1 - Testing: 100%|██████████| 210/210 [01:51<00:00,  1.89it/s, loss=1.79]


Epoch: 1, Loss: 1.7704, Accuracy: 0.2024


Epoch 2 - Training: 100%|██████████| 840/840 [32:21<00:00,  2.31s/it, loss=1.8]   
Epoch 2 - Testing: 100%|██████████| 210/210 [01:46<00:00,  1.98it/s, loss=1.77]


Epoch: 2, Loss: 1.7249, Accuracy: 0.3402


Epoch 3 - Training: 100%|██████████| 840/840 [27:38<00:00,  1.97s/it, loss=1.65]
Epoch 3 - Testing: 100%|██████████| 210/210 [01:33<00:00,  2.23it/s, loss=1.75]


Epoch: 3, Loss: 1.6590, Accuracy: 0.4497


Epoch 4 - Training: 100%|██████████| 840/840 [25:19<00:00,  1.81s/it, loss=1.76]
Epoch 4 - Testing: 100%|██████████| 210/210 [01:33<00:00,  2.25it/s, loss=1.71]


Epoch: 4, Loss: 1.5577, Accuracy: 0.5179


Epoch 5 - Training: 100%|██████████| 840/840 [26:59<00:00,  1.93s/it, loss=1.53]
Epoch 5 - Testing: 100%|██████████| 210/210 [01:45<00:00,  2.00it/s, loss=1.64]


Epoch: 5, Loss: 1.4435, Accuracy: 0.5723


Epoch 6 - Training:  53%|█████▎    | 447/840 [16:47<14:45,  2.25s/it, loss=1.24]


KeyboardInterrupt: 

In [None]:
def evaluate_bert_model(model, data_loader, loss_function, device, set_name="Test"):
    model.eval()
    total_loss = 0
    all_preds = []
    all_targets = []
    loop = tqdm(data_loader, desc=f"{set_name} Evaluation")

    with torch.no_grad():
        for batch in loop:
            ids = batch['ids'].to(device)
            mask = batch['mask'].to(device)
            targets = batch['targets'].to(device)

            outputs = model(ids, mask)
            loss = loss_function(outputs, targets)
            total_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().tolist())
            all_targets.extend(targets.cpu().tolist())

            loop.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(data_loader)
    accuracy = accuracy_score(all_targets, all_preds)
    precision = precision_score(all_targets, all_preds, average='macro', zero_division=0)
    recall = recall_score(all_targets, all_preds, average='macro', zero_division=0)
    f1 = f1_score(all_targets, all_preds, average='macro', zero_division=0)

    print(f"{set_name} | Loss: {avg_loss:.4f} | Acc: {accuracy*100:.2f}% | P: {precision:.4f} | R: {recall:.4f} | F1: {f1:.4f}")

    return avg_loss, accuracy, precision, recall, f1

In [None]:
model.load_state_dict(torch.load("./checkpoints/best_model.pt", map_location=device))
model.to(device)

test_loss, test_accuracy = evaluate_bert_model(
    model=model,
    data_loader=test_loader,
    loss_function=loss_function,
    device=device,
)

print(f"\nFinal Test Accuracy: {test_accuracy * 100:.2f}% | Test Loss: {test_loss:.4f}")