In [2]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [6]:
import torch
import torch.nn as nn
from transformers import Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from transformers import BertTokenizer

# Step 1: Define a Custom Transformer Model
class CustomTransformerModel(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_attention_heads, num_layers, num_classes):
        super(CustomTransformerModel, self).__init__()

        # Embedding layer for token inputs
        self.embedding = nn.Embedding(vocab_size, hidden_size)

        # Transformer Encoder layer (self-attention mechanism)
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=num_attention_heads,
            dim_feedforward=hidden_size * 4,  # Feedforward layer size
            dropout=0.1
        )

        # Stack of encoder layers
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

        # Fully connected layer for classification
        self.fc = nn.Linear(hidden_size, num_classes)

        # Cross entropy loss for classification
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask=None, labels=None):  # Add labels as an argument
        # Embedding
        x = self.embedding(input_ids)

        # Transformer encoder forward pass
        x = x.permute(1, 0, 2)  # Change to (seq_len, batch_size, hidden_size) format
        x = self.transformer_encoder(x)

        # Use the [CLS] token's hidden state (first token) for classification
        cls_output = x[0, :, :]  # [CLS] is the first token

        # Final fully connected layer (classification)
        logits = self.fc(cls_output)

        # Compute loss if labels are provided
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return (loss, logits) if loss is not None else logits  # Return loss during training, logits otherwise

# Step 2: Create a Specialized Dataset
data = {
    'text': [
        "The new AI model is groundbreaking.",  # Technology
        "Health is very important, eat balanced food.",  # Health
        "The government announced new policies today.",  # Politics
        "This new app is so useful, I highly recommend it.",  # Technology
        "The healthcare system needs improvement.",  # Health
        "The new election results are out.",  # Politics
    ],
    'label': [0, 1, 2, 0, 1, 2]  # 0 for Technology, 1 for Health, 2 for Politics
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)

# Split dataset into train and test sets (80% train, 20% test)
train_dataset, test_dataset = dataset.train_test_split(test_size=0.2).values()

# Step 3: Tokenize the Dataset
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_data(dataset):
    return dataset.map(lambda examples: tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128), batched=True)

train_dataset = tokenize_data(train_dataset)
test_dataset = tokenize_data(test_dataset)

# Add labels to tokenized datasets
train_dataset = train_dataset.add_column("labels", train_dataset['label'])
test_dataset = test_dataset.add_column("labels", test_dataset['label'])

# Step 4: Define Metrics Function (Accuracy)
def compute_metrics(pred):
    predictions = pred.predictions.argmax(-1)  # Take the index of max logits
    accuracy = accuracy_score(pred.label_ids, predictions)
    return {"accuracy": accuracy}

# Step 5: Set Up Training Arguments
training_args = TrainingArguments(
    output_dir="./results",          # Output directory for saving results
    evaluation_strategy="epoch",     # Evaluation strategy
    learning_rate=1e-4,              # Learning rate
    per_device_train_batch_size=8,   # Training batch size
    per_device_eval_batch_size=8,    # Evaluation batch size
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Weight decay for regularization
)

# Step 6: Initialize Model and Trainer
vocab_size = len(tokenizer)  # Vocabulary size based on tokenizer
hidden_size = 256  # Hidden size (number of features per token)
num_attention_heads = 4  # Number of attention heads
num_layers = 4  # Number of transformer layers
num_classes = 3  # Number of output classes (Technology, Health, Politics)

model = CustomTransformerModel(vocab_size=vocab_size, hidden_size=hidden_size, num_attention_heads=num_attention_heads, num_layers=num_layers, num_classes=num_classes)

trainer = Trainer(
    model=model,                        # Model to train
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=test_dataset,           # Evaluation dataset
    compute_metrics=compute_metrics      # Metrics function
)

# Step 7: Train the Model
trainer.train()

# Step 8: Evaluate the Model
eval_results = trainer.evaluate()

# Print Evaluation Results
print(f"Evaluation Results: {eval_results}")

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.379145,0.0
2,No log,1.692977,0.0
3,No log,1.727492,0.0


Evaluation Results: {'eval_loss': 1.7274924516677856, 'eval_accuracy': 0.0, 'eval_runtime': 0.1071, 'eval_samples_per_second': 18.675, 'eval_steps_per_second': 9.337, 'epoch': 3.0}
