In [1]:
import os
import json
import pandas as pd
import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

In [2]:
# Set device

device = 'cuda' if cuda.is_available() else 'cpu'

# Initialize model and tokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=12, problem_type="multi_label_classification")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Load data
with open('train.json', 'r') as f:
    data = json.load(f)

# Define label mapping
label_mapping = {
    "Politics": 0, "Health": 1, "Finance": 2, "Travel": 3, "Food": 4,
    "Education": 5, "Environment": 6, "Fashion": 7, "Science": 8,
    "Sports": 9, "Technology": 10, "Entertainment": 11
}


# Initialize lists for sentences and labels
train_sentences = []
test_sentences = []
train_labels = []
test_labels = []


# Split into training and test sets per class
for label in data.keys():
    sentences = data[label]
    labels = [label_mapping[label]] * len(sentences)

    train_sents, test_sents, train_lbls, test_lbls = train_test_split(sentences, labels, test_size=0.1)

    train_sentences.extend(train_sents)
    test_sentences.extend(test_sents)
    train_labels.extend(train_lbls)
    test_labels.extend(test_lbls)


def one_hot_encode(label, num_classes):
    one_hot = [0.0] * num_classes
    one_hot[label] = 1.0
    return one_hot

# One-hot encode labels
num_classes = len(label_mapping)
train_labels = [one_hot_encode(label, num_classes) for label in train_labels]
test_labels = [one_hot_encode(label, num_classes) for label in test_labels]

class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):

        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}

        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Tokenize sentences
train_inputs = tokenizer(train_sentences, truncation=True, padding=True, return_tensors="pt")
test_inputs = tokenizer(test_sentences, truncation=True, padding=True, return_tensors="pt")

# TextDataset
train_dataset = TextDataset(train_inputs, train_labels)
test_dataset = TextDataset(test_inputs, test_labels)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [4]:
# Model Training

model.to(device)

optim = torch.optim.AdamW(model.parameters(), lr=5e-5)
epochs = 100

for epoch in range(epochs):  # Number of epochs

    model.train()
    total_loss = 0
    total_accuracy = 0
    i = 0
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # Get the logits from the model output

        _, predicted_labels = torch.max(logits, dim=1)


        label_true = torch.argmax(labels, dim = 1)
        correct = (predicted_labels == label_true).sum().item()

        # Compute cross-entropy loss
        loss_fn = torch.nn.CrossEntropyLoss()
        loss = loss_fn(logits, labels)

        loss.backward()
        optim.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    # Validation loop
    model.eval()
    val_loss = 0
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(test_loader)
    if (epoch+1)%10 == 0:
      print(f'Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}')



Epoch 10/100, Train Loss: 1.3451427221298218, Validation Loss: 0.7207010984420776
Epoch 20/100, Train Loss: 0.4760288894176483, Validation Loss: 0.7035806775093079
Epoch 30/100, Train Loss: 0.1962185651063919, Validation Loss: 0.6876175999641418
Epoch 40/100, Train Loss: 0.0994560718536377, Validation Loss: 0.6883150339126587
Epoch 50/100, Train Loss: 0.06522887200117111, Validation Loss: 0.6826522946357727
Epoch 60/100, Train Loss: 0.04786122217774391, Validation Loss: 0.6806918382644653
Epoch 70/100, Train Loss: 0.03901243209838867, Validation Loss: 0.6808144450187683
Epoch 80/100, Train Loss: 0.030921983532607555, Validation Loss: 0.679891049861908
Epoch 90/100, Train Loss: 0.026677953079342842, Validation Loss: 0.681160569190979
Epoch 100/100, Train Loss: 0.024675155989825726, Validation Loss: 0.6852779984474182


In [5]:

# Test

number_to_label = {value: key for key, value in label_mapping.items()}

# Define class for testing
class TestDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length=128):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        encoding = self.tokenizer(sentence, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return encoding

# Load test sentences
test_file = 'test_shuffle.txt'
with open(test_file, 'r') as file:
    test_sentences = file.readlines()

# Create dataset
test_dataset = TestDataset(test_sentences, tokenizer)

# Create DataLoader
batch_size = 4
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Model evaluation
model.eval()
predictions = []
for batch_idx, batch in enumerate(test_loader):
    input_ids = torch.squeeze(batch['input_ids'], dim=1).to(device)
    attention_mask = batch['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids = input_ids, attention_mask = attention_mask)

    # Process outputs (if necessary)
    logits = outputs.logits
    _, predicted_labels = torch.max(logits, dim=1)

    for label in predicted_labels.cpu().numpy():
        category = number_to_label[label.item()]
        predictions.append(category)

# Create final DataFrame
data = {"ID": range(len(predictions)), "Label": predictions}
df = pd.DataFrame(data)

# Save to a CSV file
output_file = "predictions_og_train.csv"
df.to_csv(output_file, index=False)

print(f"Predictions saved to {output_file}")


Predictions saved to predictions_og_train.csv


No validation set

In [6]:

# Load data
with open('train.json', 'r') as f:
    data = json.load(f)

# Define label mapping
label_mapping = {
    "Politics": 0, "Health": 1, "Finance": 2, "Travel": 3, "Food": 4,
    "Education": 5, "Environment": 6, "Fashion": 7, "Science": 8,
    "Sports": 9, "Technology": 10, "Entertainment": 11
}


# Initialize lists for sentences and labels
train_sentences = []
train_labels = []

# Generate training data
for label in data.keys():
    sentences = data[label]
    labels = [label_mapping[label]] * len(sentences)

    train_sentences.extend(sentences)
    train_labels.extend(labels)

def one_hot_encode(label, num_classes):
    one_hot = [0.0] * num_classes
    one_hot[label] = 1.0
    return one_hot

# One-hot encode labels
num_classes = len(label_mapping)
train_labels = [one_hot_encode(label, num_classes) for label in train_labels]

class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Tokenize sentences
train_inputs = tokenizer(train_sentences, truncation=True, padding=True, return_tensors="pt")

# Create TextDataset for training
train_dataset = TextDataset(train_inputs, train_labels)

# Create DataLoader for training set
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)


In [7]:

# Model Training


model.to(device)

optim = torch.optim.AdamW(model.parameters(), lr=5e-5)
epochs = 100

for epoch in range(epochs):  # Number of epochs

    model.train()
    total_loss = 0
    total_accuracy = 0
    i = 0
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # Get the logits from the model output

        _, predicted_labels = torch.max(logits, dim=1)

        label_true = torch.argmax(labels, dim = 1)
        correct = (predicted_labels == label_true).sum().item()

        # Compute loss
        loss_fn = torch.nn.CrossEntropyLoss()
        loss = loss_fn(logits, label_true)

        loss.backward()
        optim.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    if (epoch+1)%10 == 0:
      print(f'Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss}')




Epoch 10/100, Train Loss: 0.02659794936577479
Epoch 20/100, Train Loss: 0.012459679506719112
Epoch 30/100, Train Loss: 0.009110673641165098
Epoch 40/100, Train Loss: 0.007444395373264949
Epoch 50/100, Train Loss: 0.005695155511299769
Epoch 60/100, Train Loss: 0.0043899849988520145
Epoch 70/100, Train Loss: 0.0036366108494500318
Epoch 80/100, Train Loss: 0.0034003082352379956
Epoch 90/100, Train Loss: 0.002900610833118359
Epoch 100/100, Train Loss: 0.00244701545064648


In [8]:

# Test

number_to_label = {value: key for key, value in label_mapping.items()}

# Define class for testing
class TestDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length=128):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        encoding = self.tokenizer(sentence, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return encoding

# Load test sentences
test_file = 'test_shuffle.txt'
with open(test_file, 'r') as file:
    test_sentences = file.readlines()

# Create dataset
test_dataset = TestDataset(test_sentences, tokenizer)

# Create DataLoader
batch_size = 4
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Model evaluation
model.eval()
predictions = []
for batch_idx, batch in enumerate(test_loader):
    input_ids = torch.squeeze(batch['input_ids'], dim=1).to(device)
    attention_mask = batch['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids = input_ids, attention_mask = attention_mask)

    # Process outputs (if necessary)
    logits = outputs.logits
    _, predicted_labels = torch.max(logits, dim=1)


    for label in predicted_labels.cpu().numpy():
        category = number_to_label[label.item()]
        predictions.append(category)

# Create final DataFrame
data = {"ID": range(len(predictions)), "Label": predictions}
df = pd.DataFrame(data)

# Save to a CSV file
output_file = "predictions_augmented_train.csv"
df.to_csv(output_file, index=False)

print(f"Predictions saved to {output_file}")


Predictions saved to predictions_augmented_train.csv
