In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [15]:
import torch
from transformers import BertTokenizer, BertModel, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score
import pandas as pd

# Step 1: Create a Multi-Task Dataset (Manually created for demonstration)
data = {
    'text': [
        "The new AI model is groundbreaking.",  # Technology topic
        "Health is very important, eat balanced food.",  # Health topic
        "The government announced new policies today.",  # Politics topic
        "This new app is so useful, I highly recommend it.",  # Technology topic
        "The healthcare system needs improvement.",  # Health topic
        "The new election results are out.",  # Politics topic
    ],
    'sentiment': [1, 1, 0, 1, 0, 0],  # 1 for positive, 0 for negative sentiment
    'topic': [0, 1, 2, 0, 1, 2]  # 0 for Technology, 1 for Health, 2 for Politics
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)

# Split the dataset into train and test sets (80% train, 20% test)
train_dataset, test_dataset = dataset.train_test_split(test_size=0.2).values()

# Step 2: Load Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Function to tokenize the dataset
def tokenize_data(dataset):
    return dataset.map(lambda examples: tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512), batched=True)

# Tokenize the datasets
train_dataset = tokenize_data(train_dataset)
test_dataset = tokenize_data(test_dataset)

# Step 3: Create a Multi-Task Model with Shared Layers
class MultiTaskBertModel(torch.nn.Module):
    def __init__(self, model_name, num_labels_sentiment, num_labels_topic):
        super(MultiTaskBertModel, self).__init__()
        # Load pre-trained BERT model
        self.bert = BertModel.from_pretrained(model_name)

        # Task-specific heads
        self.sentiment_head = torch.nn.Linear(self.bert.config.hidden_size, num_labels_sentiment)
        self.topic_head = torch.nn.Linear(self.bert.config.hidden_size, num_labels_topic)

    def forward(self, input_ids, attention_mask=None, labels=None):
        # Get BERT's output
        output = self.bert(input_ids, attention_mask=attention_mask)
        hidden_state = output.last_hidden_state[:, 0]  # Use the [CLS] token's hidden state

        # Perform classification for both tasks
        sentiment_logits = self.sentiment_head(hidden_state)
        topic_logits = self.topic_head(hidden_state)

        # Calculate loss if labels are provided
        if labels is not None:
            sentiment_labels = labels[:, 0]  # Sentiment labels
            topic_labels = labels[:, 1]      # Topic labels

            loss_fct = torch.nn.CrossEntropyLoss()

            sentiment_loss = loss_fct(sentiment_logits.view(-1, 2), sentiment_labels.view(-1))  # Sentiment loss
            topic_loss = loss_fct(topic_logits.view(-1, 3), topic_labels.view(-1))  # Topic loss

            # Combine the losses
            loss = sentiment_loss + topic_loss
            return {"loss": loss, "sentiment_logits": sentiment_logits, "topic_logits": topic_logits}

        # If labels are not provided, just return logits
        return {"sentiment_logits": sentiment_logits, "topic_logits": topic_logits}

# Step 4: Define Metrics Function (Accuracy for both tasks)
def compute_metrics(pred):
    # Assuming pred.predictions is a tuple/list of (sentiment_logits, topic_logits)
    sentiment_logits, topic_logits = pred.predictions

    sentiment_labels = pred.label_ids[:, 0]
    sentiment_predictions = sentiment_logits.argmax(-1)

    topic_labels = pred.label_ids[:, 1]
    topic_predictions = topic_logits.argmax(-1)

    sentiment_accuracy = accuracy_score(sentiment_labels, sentiment_predictions)
    topic_accuracy = accuracy_score(topic_labels, topic_predictions)

    return {"sentiment_accuracy": sentiment_accuracy, "topic_accuracy": topic_accuracy}

# Step 5: Set Up Training Arguments
training_args = TrainingArguments(
    output_dir="./results",          # Output directory for saving results
    evaluation_strategy="epoch",     # Evaluation strategy
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=8,   # Training batch size
    per_device_eval_batch_size=8,    # Evaluation batch size
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Weight decay for regularization
)

# Step 6: Initialize the Multi-Task Model
model = MultiTaskBertModel(model_name="bert-base-uncased", num_labels_sentiment=2, num_labels_topic=3)
import torch
from transformers import BertTokenizer, BertModel, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score
import pandas as pd

# Step 1: Create a Multi-Task Dataset (Manually created for demonstration)
data = {
    'text': [
        "The new AI model is groundbreaking.",  # Technology topic
        "Health is very important, eat balanced food.",  # Health topic
        "The government announced new policies today.",  # Politics topic
        "This new app is so useful, I highly recommend it.",  # Technology topic
        "The healthcare system needs improvement.",  # Health topic
        "The new election results are out.",  # Politics topic
    ],
    'sentiment': [1, 1, 0, 1, 0, 0],  # 1 for positive, 0 for negative sentiment
    'topic': [0, 1, 2, 0, 1, 2]  # 0 for Technology, 1 for Health, 2 for Politics
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)

# Split the dataset into train and test sets (80% train, 20% test)
dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset['train']
test_dataset = dataset['test']


# Step 2: Load Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Function to tokenize the dataset
def tokenize_data(dataset):
    return dataset.map(lambda examples: tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512), batched=True)

# Tokenize the datasets
train_dataset = tokenize_data(train_dataset)
test_dataset = tokenize_data(test_dataset)

# **Add the labels columns to the tokenized datasets:**
train_dataset = train_dataset.add_column("labels", [[i["sentiment"], i["topic"]] for i in train_dataset])
test_dataset = test_dataset.add_column("labels", [[i["sentiment"], i["topic"]] for i in test_dataset])


# Step 3: Create a Multi-Task Model with Shared Layers
class MultiTaskBertModel(torch.nn.Module):
    def __init__(self, model_name, num_labels_sentiment, num_labels_topic):
        super(MultiTaskBertModel, self).__init__()
        # Load pre-trained BERT model
        self.bert = BertModel.from_pretrained(model_name)

        # Task-specific heads
        self.sentiment_head = torch.nn.Linear(self.bert.config.hidden_size, num_labels_sentiment)
        self.topic_head = torch.nn.Linear(self.bert.config.hidden_size, num_labels_topic)

    def forward(self, input_ids, attention_mask=None, labels=None):
        # Get BERT's output
        output = self.bert(input_ids, attention_mask=attention_mask)
        hidden_state = output.last_hidden_state[:, 0]  # Use the [CLS] token's hidden state

        # Perform classification for both tasks
        sentiment_logits = self.sentiment_head(hidden_state)
        topic_logits = self.topic_head(hidden_state)

        # Calculate loss if labels are provided
        if labels is not None:
            sentiment_labels = labels[:, 0]  # Sentiment labels
            topic_labels = labels[:, 1]      # Topic labels

            loss_fct = torch.nn.CrossEntropyLoss()

            sentiment_loss = loss_fct(sentiment_logits.view(-1, 2), sentiment_labels.view(-1))  # Sentiment loss
            topic_loss = loss_fct(topic_logits.view(-1, 3), topic_labels.view(-1))  # Topic loss

            # Combine the losses
            loss = sentiment_loss + topic_loss
            return {"loss": loss, "sentiment_logits": sentiment_logits, "topic_logits": topic_logits}

        # If labels are not provided, just return logits
        return {"sentiment_logits": sentiment_logits, "topic_logits": topic_logits}

# Step 4: Define Metrics Function (Accuracy for both tasks)
def compute_metrics(pred):
    # Assuming pred.predictions is a tuple/list of (sentiment_logits, topic_logits)
    sentiment_logits, topic_logits = pred.predictions

    sentiment_labels = pred.label_ids[:, 0]
    sentiment_predictions = sentiment_logits.argmax(-1)

    topic_labels = pred.label_ids[:, 1]
    topic_predictions = topic_logits.argmax(-1)

    sentiment_accuracy = accuracy_score(sentiment_labels, sentiment_predictions)
    topic_accuracy = accuracy_score(topic_labels, topic_predictions)

    return {"sentiment_accuracy": sentiment_accuracy, "topic_accuracy": topic_accuracy}

# Step 5: Set Up Training Arguments
training_args = TrainingArguments(
    output_dir="./results",          # Output directory for saving results
    evaluation_strategy="epoch",     # Evaluation strategy
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=8,   # Training batch size
    per_device_eval_batch_size=8,    # Evaluation batch size
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Weight decay for regularization
)

# Step 6: Initialize the Multi-Task Model
model = MultiTaskBertModel(model_name="bert-base-uncased", num_labels_sentiment=2, num_labels_topic=3)

# Step 7: Initialize Trainer
trainer = Trainer(
    model=model,                        # Model to train
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=test_dataset,           # Evaluation dataset
    compute_metrics=compute_metrics      # Metrics function
)

# Step 8: Train the Model
trainer.train()

# Step 9: Evaluate the Model
eval_results = trainer.evaluate()

# Print Evaluation Results
print(f"Evaluation Results: {eval_results}")


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Sentiment Accuracy,Topic Accuracy
1,No log,1.815003,1.0,0.0
2,No log,1.918838,0.5,0.0
3,No log,1.974719,0.5,0.0


Evaluation Results: {'eval_loss': 1.974718689918518, 'eval_sentiment_accuracy': 0.5, 'eval_topic_accuracy': 0.0, 'eval_runtime': 3.4242, 'eval_samples_per_second': 0.584, 'eval_steps_per_second': 0.292, 'epoch': 3.0}
