# Annotation Prediction Experiment

## Importing libraries

In [212]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from huggingface_hub import notebook_login

## Preparing data for BERT model

In [129]:
data = pd.read_csv("../data/annotated_data.csv")

In [130]:
data.head()

Unnamed: 0,index,followers,connections,time_spent,content_links,media_type,num_hashtags,hashtag_followers,hashtags,reactions,comments,views,content,label
0,4,6484.0,500+,2 months ago,[['https://www.linkedin.com/in/ACoAABhNxDUB9IX...,article,3,0,"[['#verifiedresumes', 'https://www.linkedin.co...",22,2,,I count myself fortunate to have spent time wi...,Others
1,23,6484.0,500+,10 months ago,"[['https://lnkd.in/exKRtb6', 'https://lnkd.in/...",image,0,0,[],22,1,,No-one can be sure how America will ‘snap back...,Educational Resources
2,28,6484.0,500+,11 months ago,"[['https://lnkd.in/evGsZSH', 'https://lnkd.in/...",article,5,0,"[['#apprenticeships', 'https://www.linkedin.co...",10,0,,We've known since the Great Depression that si...,Others
3,37,6484.0,500+,1 year ago,[['https://www.linkedin.com/feed/hashtag/?keyw...,video,1,0,"[['#apprenticeship', 'https://www.linkedin.com...",31,4,,Great to talk with Fox Business today on why c...,Trends
4,51,6484.0,500+,2 years ago,[['https://www.linkedin.com/feed/hashtag/?keyw...,article,1,0,"[['#apprenticeship', 'https://www.linkedin.com...",27,1,,Where can an #apprenticeship take you ? Grea...,Others


### Mapping Category label to index labels

In [131]:
data[['label']].value_counts()

label                 
Others                    1131
Interactive Promotions     107
Trends                     104
Professional Growth         98
Events                      90
Educational Resources       70
Name: count, dtype: int64

In [132]:
data = data[["content", "label"]]

label_mapping = {
    "Professional Growth": 1.0,
    "Events": 2.0,
    "Interactive Promotions": 3.0,
    "Educational Resources": 4.0,
    "Trends": 5.0,
    "Others": 0.0
}
index_label_mapping = {v: k for k, v in label_mapping.items()}

data["label"] = data["label"].map(label_mapping)

### Train Test Split

In [133]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["content"].tolist(), data["label"].tolist(), test_size=0.2, random_state=42
)

### Tokenization

In [189]:
#tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text, max_length=self.max_len, padding="max_length",
            truncation=True, return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

### Creating Dataloader and Handling Imbalance classes using WeightedRandomSampler

In [190]:
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
test_dataset = TextDataset(test_texts, test_labels, tokenizer)

In [191]:
class_counts = np.bincount([int(label) for label in train_labels])
class_weights = 1.0 / class_counts
sample_weights = [class_weights[int(label)] for label in train_labels]
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)

In [192]:
train_loader = DataLoader(train_dataset, batch_size=16, sampler=sampler)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

## Model Fine tuning and Evaluation

### Loading BERT and optimizer

In [204]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Bert Base Uncased
#model = BertForSequenceClassification.from_pretrained(
#    "bert-base-uncased", num_labels=len(label_mapping)
#).to(device)

# Distil Bert
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", 
                                                            num_labels=len(label_mapping)).to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [205]:
criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights, dtype=torch.float32).to(device))
optimizer = AdamW(model.parameters(), lr=2e-5)



### Training

In [206]:
epochs = 15

model.train()

for epoch in range(epochs):
    total_loss, total_correct, total_samples = 0, 0, 0
    all_preds, all_labels = [], [] 

    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch["input_ids"].to(device), batch["attention_mask"].to(device), batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(outputs.logits, dim=1)
        total_correct += (preds == labels).sum().item()
        total_samples += labels.size(0)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    f1 = f1_score(all_labels, all_preds, average="weighted")
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}, Accuracy: {total_correct/total_samples:.4f}, F1-score: {f1:.4f}")


Epoch 1, Loss: 127.8043, Accuracy: 0.2812, F1-score: 0.2378
Epoch 2, Loss: 69.0846, Accuracy: 0.6687, F1-score: 0.6095
Epoch 3, Loss: 29.8155, Accuracy: 0.7711, F1-score: 0.6982
Epoch 4, Loss: 13.7125, Accuracy: 0.8039, F1-score: 0.7236
Epoch 5, Loss: 7.4639, Accuracy: 0.8445, F1-score: 0.7766
Epoch 6, Loss: 4.9999, Accuracy: 0.8711, F1-score: 0.8335
Epoch 7, Loss: 3.7229, Accuracy: 0.9187, F1-score: 0.9085
Epoch 8, Loss: 2.6576, Accuracy: 0.9469, F1-score: 0.9430
Epoch 9, Loss: 1.6060, Accuracy: 0.9695, F1-score: 0.9681
Epoch 10, Loss: 1.2833, Accuracy: 0.9781, F1-score: 0.9775
Epoch 11, Loss: 1.0592, Accuracy: 0.9859, F1-score: 0.9857
Epoch 12, Loss: 0.9222, Accuracy: 0.9859, F1-score: 0.9857
Epoch 13, Loss: 0.6827, Accuracy: 0.9891, F1-score: 0.9889
Epoch 14, Loss: 0.5719, Accuracy: 0.9945, F1-score: 0.9945
Epoch 15, Loss: 0.4931, Accuracy: 0.9914, F1-score: 0.9913


### Evaluation

In [207]:
def evaluate(model, dataloader):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch["input_ids"].to(device), batch["attention_mask"].to(device), batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="weighted")
    print(f"Test Accuracy: {acc:.4f}")
    print(f"Test F1-score: {f1:.4f}")


In [208]:
evaluate(model, test_loader)

Test Accuracy: 0.7531
Test F1-score: 0.7555


## Model Saving, Loading and Prediction

### Model Saving

In [209]:
model_save_path = "../models/Distil_bert_post_classifier_v1.pth"
tokenizer_save_path = "../models/Distil_bert_tokenizer"
torch.save(model.state_dict(), model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)
print(f"Model saved to {model_save_path}")

Model saved to ../models/Distil_bert_post_classifier_v1.pth


### Prediction Function

In [210]:
def predict_single_text(text, model, tokenizer, device, index_label_mapping):
    model.eval()
    encoding = tokenizer(
        text, max_length=128, padding="max_length",
        truncation=True, return_tensors="pt",
    )
    input_ids, attention_mask = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1).cpu().detach().numpy()

    return index_label_mapping[preds[0]]

### Loading Saved Model from local repo

In [147]:
loaded_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_mapping)).to(device)
loaded_model.load_state_dict(torch.load(model_save_path))
loaded_model.eval()

tokenizer = BertTokenizer.from_pretrained(tokenizer_save_path)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Uploading Model on Huggingface

In [213]:
notebook_login()

model.push_to_hub("mujtabakk/DistilBert-LinkedIn-Posts-Classfication")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mujtabakk/DistilBert-LinkedIn-Posts-Classfication/commit/a760ca28eda3e1c4a37c5b0df1fb3c5614cd17f6', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='a760ca28eda3e1c4a37c5b0df1fb3c5614cd17f6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mujtabakk/DistilBert-LinkedIn-Posts-Classfication', endpoint='https://huggingface.co', repo_type='model', repo_id='mujtabakk/DistilBert-LinkedIn-Posts-Classfication'), pr_revision=None, pr_num=None)

In [215]:
tokenizer.push_to_hub("mujtabakk/DistilBert-LinkedIn-Posts-Classification")

CommitInfo(commit_url='https://huggingface.co/mujtabakk/DistilBert-LinkedIn-Posts-Classification/commit/e1618695a250d3ecced329675b23ae0db758718b', commit_message='Upload tokenizer', commit_description='', oid='e1618695a250d3ecced329675b23ae0db758718b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mujtabakk/DistilBert-LinkedIn-Posts-Classification', endpoint='https://huggingface.co', repo_type='model', repo_id='mujtabakk/DistilBert-LinkedIn-Posts-Classification'), pr_revision=None, pr_num=None)

### Loading model from Huggingface

In [218]:
model_name = "mujtabakk/DistilBert-LinkedIn-Posts-Classfication"
loaded_model2 = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained("../models/Distil_bert_tokenizer")

### Example Prediction

#### Gold Label: Events

In [219]:
sample_text = "Join us for the Annual Tech Innovation Summit on November 15th!"
predicted_label = predict_single_text(sample_text, loaded_model2, tokenizer, device, index_label_mapping)
print(f"Predicted Label: {predicted_label}")

Predicted Label: Events


#### Gold Label: Professional Growth

In [220]:
sample_text = "Just got a job offer from Google and Microsoft—Google offers better work-life balance, while Microsoft has better food. Which one should I choose?"
predicted_label = predict_single_text(sample_text, loaded_model2, tokenizer, device, index_label_mapping)
print(f"Predicted Label: {predicted_label}")

Predicted Label: Others


#### Gold Label: Events

In [221]:
sample_text = "Get free tickets to my amazing lecture that teaches you about deep neural networks at the PyCon event."
predicted_label = predict_single_text(sample_text, loaded_model2, tokenizer, device, index_label_mapping)
print(f"Predicted Label: {predicted_label}")

Predicted Label: Events


#### Gold Label: Educational Resources

In [222]:
sample_text = "Just completed a data science bootcamp! Amazing learning experience, but I think traditional degrees still have their advantages."
predicted_label = predict_single_text(sample_text, loaded_model2, tokenizer, device, index_label_mapping)
print(f"Predicted Label: {predicted_label}")

Predicted Label: Others


#### Gold Label: Trends

In [223]:
sample_text = "The latest breakthroughs in AI-generated art are stunning—but will they replace human creativity?"
predicted_label = predict_single_text(sample_text, loaded_model2, tokenizer, device, index_label_mapping)
print(f"Predicted Label: {predicted_label}")

Predicted Label: Others


#### Gold Label: Trends

In [224]:
sample_text = "The rise of remote work is reshaping the future of the workplace"
predicted_label = predict_single_text(sample_text, loaded_model2, tokenizer, device, index_label_mapping)
print(f"Predicted Label: {predicted_label}")

Predicted Label: Trends
