## Installing Packages

In [None]:
!pip3 install sklearn
!pip3 install transformers
!pip3 install sentence-transformers
!pip install early_stopping

In [None]:
from torch import cuda
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import one_hot
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau
from datasets import load_metric
from datasets import Dataset

import json
import numpy as np
from tqdm.auto import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

from sklearn.metrics import accuracy_score, precision_score, f1_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import DistilBertConfig, DistilBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from early_stopping import EarlyStopping

## DISTILBERT MODEL

In [None]:
def labels_to_text(labels, label_mapping):
    """Translate numeric class ids back into their category names.

    Args:
        labels: Iterable of numeric class ids.
        label_mapping: Dict mapping category name -> numeric id.

    Returns:
        List of category names, in the order of ``labels``. Ids that do not
        occur as a value of ``label_mapping`` are skipped, so the result can
        be shorter than the input.
    """
    # Invert name -> id into id -> name (a later name wins if two names share an id).
    id_to_name = dict((class_id, name) for name, class_id in label_mapping.items())

    names = []
    for class_id in labels:
        if class_id in id_to_name:
            names.append(id_to_name[class_id])
    return names

In [None]:
# Define your dataset class
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each sample with its class label.

    The base class is spelled out as ``torch.utils.data.Dataset`` on purpose:
    the import cell binds the bare name ``Dataset`` twice (first from
    ``torch.utils.data``, then from ``datasets``), so the bare name resolves
    to the Hugging Face class instead of the PyTorch one intended here.

    Args:
        texts: Indexable collection of samples.
        labels: Indexable collection of labels, aligned with ``texts``.
    """

    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of samples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        return self.texts[idx], self.labels[idx]

# Fine-tune DistilBERT as a 12-way topic classifier and report per-class metrics each epoch.

# Category name -> class id. Declared first so the classifier head is sized from it.
label_mapping = {
    "Politics": 0,
    "Health": 1,
    "Finance": 2,
    "Travel": 3,
    "Food": 4,
    "Education": 5,
    "Environment": 6,
    "Fashion": 7,
    "Science": 8,
    "Sports": 9,
    "Technology": 10,
    "Entertainment": 11
}

# Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(label_mapping)
)

# NOTE(review): presumably `ignore` is a warm-up count of checks; with ignore=20 and
# epochs=20 early stopping may never fire -- confirm against the early_stopping docs.
early_stopper = EarlyStopping(
    depth=5,
    ignore=20,
    method='consistency'
)

# Load your dataset: a JSON object of the form {category name: [sentence, ...]}
with open('/path/augmented/dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the dataset into parallel lists of sentences and integer class ids
texts = []
labels = []

for label, sentences in data.items():
    if label in label_mapping:
        for sentence in sentences:
            texts.append(sentence)
            labels.append(label_mapping[label])  # Single integer label
    else:
        print(f"Warning: Label '{label}' not found in label mapping.")

if not texts:
    # Fail with a clear message instead of an obscure tokenizer error further down.
    raise ValueError("No sentences were loaded; check the dataset file and its category names.")

# Tokenize texts (padded to the longest sentence, truncated to the model's maximum length)
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

# Convert labels to PyTorch tensor
labels = torch.tensor(labels)

# Split ids, attention masks and labels with the SAME shuffle so the rows stay aligned.
# The attention mask must travel with the ids; otherwise the model attends to padding tokens.
(train_texts, test_texts,
 train_masks, test_masks,
 train_labels, test_labels) = train_test_split(
    inputs['input_ids'], inputs['attention_mask'], labels,
    test_size=0.3, random_state=42
)

# Create DataLoader instances; every sample is ((input_ids, attention_mask), label)
train_dataset = MyDataset(list(zip(train_texts, train_masks)), train_labels)
test_dataset = MyDataset(list(zip(test_texts, test_masks)), test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Train the model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = ReduceLROnPlateau(optimizer, 'min')
# Not called in this cell: the loss is read from `outputs.loss` because `labels` is passed to the model.
criterion = torch.nn.CrossEntropyLoss()

# Fixed class order for the report, so it still works when a class is missing from a split.
class_ids = list(label_mapping.values())
class_names = list(label_mapping.keys())

epochs = 20
for epoch in range(epochs):
    # Training
    model.train()
    train_loss = 0
    # Batch variables get their own names so the full `inputs` / `labels` above are not overwritten.
    for (batch_ids, batch_mask), batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(input_ids=batch_ids, attention_mask=batch_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)

    # Evaluation
    model.eval()
    test_loss = 0
    y_true, y_pred = [], []
    with torch.no_grad():
        for (batch_ids, batch_mask), batch_labels in test_loader:
            outputs = model(input_ids=batch_ids, attention_mask=batch_mask, labels=batch_labels)
            test_loss += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=1)
            y_true.extend(batch_labels.tolist())
            y_pred.extend(predictions.tolist())
    test_loss /= len(test_loader)
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss}, Test Loss: {test_loss}")
    print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names))
    if early_stopper.check(test_loss):
        print('BREAKING THE TRAINING LOOP')
        break

    # Update the scheduler (lowers the learning rate when the test loss plateaus)
    scheduler.step(test_loss)
    lr = optimizer.param_groups[0]['lr']
    print(f"Learning rate: {lr}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20, Train Loss: 1.843723941933025, Test Loss: 0.8988357782363892
               precision    recall  f1-score   support

     Politics       0.85      0.88      0.87        33
       Health       0.76      0.93      0.84        30
      Finance       0.80      0.90      0.85        41
       Travel       0.87      0.97      0.92        35
         Food       0.97      0.94      0.95        32
    Education       0.90      1.00      0.95        36
  Environment       0.94      0.48      0.64        33
      Fashion       0.83      1.00      0.91        24
      Science       0.76      0.52      0.62        25
       Sports       0.97      1.00      0.98        31
   Technology       0.68      0.68      0.68        28
Entertainment       0.96      0.90      0.93        29

     accuracy                           0.86       377
    macro avg       0.86      0.85      0.84       377
 weighted avg       0.86      0.86      0.85       377

Learning rate: 5e-05
Epoch 2/20, Train Loss:

In [None]:
# Destination file for the fine-tuned weights (a path relative to the working directory).
model_path = "distilbert_classifier_model.pth"

# Serialize only the state_dict (the parameter tensors), not the module object itself.
trained_weights = model.state_dict()
torch.save(trained_weights, model_path)

print("Trained model saved at:", model_path)


In [None]:
import torch
import csv
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import pandas as pd

# Classify every line of a text file with the saved model and write the predictions to a CSV.

# Class id -> category name.
# NOTE: this rebinds `label_mapping` in the opposite direction (id -> name) to the
# training cell's name -> id dict; re-run the training cell before reusing that dict.
label_mapping = {
    0: "Politics",
    1: "Health",
    2: "Finance",
    3: "Travel",
    4: "Food",
    5: "Education",
    6: "Environment",
    7: "Fashion",
    8: "Science",
    9: "Sports",
    10: "Technology",
    11: "Entertainment"
}

# Load the saved model: rebuild the architecture, then restore the fine-tuned weights on CPU
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(label_mapping)
)
model.load_state_dict(torch.load("distilbert_classifier_model.pth", map_location=torch.device('cpu')))
model.eval()

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Read text from the input file: one sample per line, line endings removed.
# Blank lines are kept so that each ID in the CSV equals the zero-based line index.
input_file = "path/test_shuffle.txt"
with open(input_file, "r", encoding="utf-8") as f:
    texts = [line.rstrip("\r\n") for line in f]

# Perform inference in fixed-size batches so memory use does not grow with the file size
batch_size = 32
logit_chunks = []
with torch.no_grad():
    for start in range(0, len(texts), batch_size):
        inputs = tokenizer(
            texts[start:start + batch_size],
            return_tensors="pt", padding=True, truncation=True
        )
        outputs = model(**inputs)
        logit_chunks.append(outputs.logits)

if logit_chunks:
    logits = torch.cat(logit_chunks, dim=0)
else:
    # Empty input file: keep the (n_samples, n_classes) shape so the steps below still work.
    logits = torch.empty((0, len(label_mapping)))

# Map predicted labels to classes
predicted_labels = torch.argmax(logits, dim=1)
predicted_classes = [label_mapping[label.item()] for label in predicted_labels]

# Save results to CSV file
output_csv = "/path/inference_results.csv"
df = pd.DataFrame({"ID": range(len(texts)), "Label": predicted_classes})
df.to_csv(output_csv, index=False)

print("Inference results saved to:", output_csv)
