# RoBERTa


## Install and Imports


In [None]:
import os

# Read the credentials from the environment when they are set, so real tokens
# never have to be pasted into (and accidentally committed with) the notebook.
# The original placeholder strings remain the fallback values.
HF_TOKEN = os.environ.get("HF_TOKEN", "put your token here")
WANDB_TOKEN = os.environ.get("WANDB_TOKEN", "put your token here")

In [None]:
!source ~/.bashrc

In [None]:
!huggingface-cli login --token={HF_TOKEN}

In [None]:
import wandb

# WANDB_TOKEN = userdata.get('WANDB_TOKEN')
wandb.login(key=WANDB_TOKEN)

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login
from tqdm import tqdm

## Experiment 1 - RoBERTa Base on Unprocessed Text


In [None]:
model_id = "roberta-base"
dataset_id = "ImperialIndians23/nlp_cw_data_unprocessed"
repository_id = "ImperialIndians23/RobertaBaseUnprocessed"

### Dataset - Load and Tokenize


In [None]:
dataset = load_dataset(dataset_id)

In [None]:
# Training split
train_dataset = dataset["train"]

# Validation split (passed to the Trainer as its evaluation set)
val_dataset = dataset["valid"]

In [None]:
# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)


def tokenize(batch):
    """Tokenize a batch of examples with the RoBERTa tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens.
    ``padding=True`` would only pad to the longest sequence of each ``map``
    batch, leaving rows of different lengths across batches that cannot be
    stacked into a single tensor at training time.

    Args:
        batch: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        batch["text"], padding="max_length", truncation=True, max_length=512
    )


train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

In [None]:
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
from collections import Counter

Counter(dataset["train"]["label"])

In [None]:
class_names = [0, 1]
num_labels = 2
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Map every class index to its label value (here the labels are the indices).
id2label = dict(enumerate(class_names))

# Load the base model configuration and attach the id2label mapping to it.
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})

### Metrics


In [None]:
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    """Compute binary classification metrics for a ``Trainer`` evaluation pass.

    Args:
        p: Evaluation output; ``p.predictions`` holds the per-class scores and
            ``p.label_ids`` the reference labels.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = p.label_ids
    # The predicted class is the index of the highest score on the last axis.
    y_pred = p.predictions.argmax(-1)
    # One call with average="binary" yields the same positive-class values as
    # the separate precision_score / recall_score / f1_score helpers.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

### Model - Definition and Training


In [None]:
# Pretrained encoder with a sequence-classification head, configured by `config`.
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

# Hyperparameters shared by TrainingArguments and the wandb run config.
training_config = {
    "learning_rate": 1e-5,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    # 4 accumulation steps x batch size 4 = 16 examples per optimizer step (per device).
    "gradient_accumulation_steps": 4,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 2,
    "lr_scheduler_type": "inverse_sqrt",
    "warmup_steps": 500,
    # Checkpoints are saved and evaluated once per epoch; the best one by
    # `metric_for_best_model` is reloaded when training ends.
    "load_best_model_at_end": True,
    "save_strategy": "epoch",
    # NOTE(review): newer transformers releases call this option
    # `eval_strategy` — confirm against the installed version.
    "evaluation_strategy": "epoch",
    "metric_for_best_model": "f1",  # the "f1" key returned by compute_metrics
}

# Fixed run settings (output location, logging, hub upload, wandb reporting)
# combined with the hyperparameters above.
training_args = TrainingArguments(
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    logging_steps=10,
    save_total_limit=3,
    push_to_hub=True,
    report_to="wandb",
    **training_config,
)

In [None]:
run = wandb.init(
    project="nlp_cw",
    name="roberta - unprocessed text - 3 epochs -1e-5 lr",
    # Track hyperparameters and run metadata
    config=training_config,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
# Save the tokenizer and create a model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()

# Push the results to the hub
trainer.push_to_hub()

### Evaluate


In [None]:
valid_dataset = dataset["valid"]

In [None]:
from transformers import pipeline
from tqdm import tqdm

# Load the fine-tuned model from `repository_id` as a classification pipeline.
classifier = pipeline("text-classification", repository_id)

batch_size = 4
predictions = []

# Retrieve all texts from the dataset
texts = [example["text"] for example in valid_dataset]

# Process texts in batches
for start in tqdm(range(0, len(texts), batch_size)):
    batch_texts = texts[start : start + batch_size]
    # Truncate to the 512-token limit used during training: without it the
    # pipeline passes over-long texts on unchanged and the model fails on them.
    # `batch_size` makes the pipeline run each chunk as one forward pass
    # instead of one text at a time.
    batch_results = classifier(
        batch_texts, batch_size=batch_size, truncation=True, max_length=512
    )

    # Extract and store predictions from results
    predictions.extend(result["label"] for result in batch_results)

In [None]:
Counter(predictions)

In [None]:
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

gold = dataset["valid"]["label"]
t1p = precision_score(gold, predictions)
t1r = recall_score(gold, predictions)
t1f = f1_score(gold, predictions)
print("Precision:", t1p)
print("Recall:", t1r)
print("F1:", t1f)
print("-" * 40)

## Experiment 2 : Downsample Negative Samples


In [None]:
from datasets import load_dataset

dataset = load_dataset("ImperialIndians23/nlp_cw_data_unprocessed")

In [None]:
from datasets import DatasetDict, Dataset, concatenate_datasets
import random

train_dataset = dataset["train"]

# Separate the dataset by label
label_0_dataset = train_dataset.filter(lambda example: example["label"] == 0)
label_1_dataset = train_dataset.filter(lambda example: example["label"] == 1)

num_label_1 = len(label_1_dataset)

# Keep two negatives per positive, but never ask for more negatives than
# exist: random.sample raises ValueError when k exceeds the population size.
desired_num_label_0 = min(2 * num_label_1, len(label_0_dataset))

# Fixed seed so the same negatives are selected on every run.
random.seed(42)
downsampled_label_0_indices = random.sample(
    range(len(label_0_dataset)), k=desired_num_label_0
)

downsampled_label_0_dataset = label_0_dataset.select(downsampled_label_0_indices)

# Concatenate downsampled label 0 dataset with label 1 dataset
balanced_train_dataset = concatenate_datasets(
    [downsampled_label_0_dataset, label_1_dataset]
)

# Only the training split is rebalanced; the other splits stay untouched.
dataset["train"] = balanced_train_dataset

In [None]:
dataset.push_to_hub("ImperialIndians23/nlp_cw_data_unprocessed_downsampled")

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login
from tqdm import tqdm

import wandb
import os
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    """Compute binary classification metrics for a ``Trainer`` evaluation pass.

    Args:
        p: Evaluation output; ``p.predictions`` holds the per-class scores and
            ``p.label_ids`` the reference labels.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = p.label_ids
    # The predicted class is the index of the highest score on the last axis.
    y_pred = p.predictions.argmax(-1)
    # One call with average="binary" yields the same positive-class values as
    # the separate precision_score / recall_score / f1_score helpers.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }


HF_TOKEN = "put your token here"
WANDB_TOKEN = "put your token here"

# WANDB_TOKEN = userdata.get('WANDB_TOKEN')
wandb.login(key=WANDB_TOKEN)

model_id = "roberta-base"
username = "ImperialIndians23"
dataset_id = "ImperialIndians23/nlp_cw_data_unprocessed_downsampled"

repository_id = "ImperialIndians23/RobertaBaseUnprocessedDownsampledLowLR"

dataset = load_dataset(dataset_id)

print("Processing the dataset...")

# Training split, plus shard 0 of the "valid" split held out as a test set.
# (test_dataset is tokenized below but is not passed to the Trainer in this cell.)
train_dataset = dataset["train"]
test_dataset = dataset["valid"].shard(num_shards=2, index=0)

# Validation dataset: shard 1 of the "valid" split, used as the Trainer's eval set
val_dataset = dataset["valid"].shard(num_shards=2, index=1)

# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)


def tokenize(batch):
    """Tokenize the ``"text"`` entries of a batch with the RoBERTa tokenizer.

    Sequences are truncated to at most 512 tokens and padded to the longest
    sequence in the batch. The ``map`` calls below pass a whole split as one
    batch, so every row of a split ends up with the same length.
    """
    encoded = tokenizer(
        batch["text"],
        max_length=512,
        truncation=True,
        padding=True,
    )
    return encoded


train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

num_labels = 2
class_names = [0, 1]
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

id2label = {i: label for i, label in enumerate(class_names)}

# Update the model's configuration with the id2label mapping
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})


model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

training_config = {
    "learning_rate": 1e-5,
    "num_train_epochs": 2,
    "weight_decay": 0.01,
    "gradient_accumulation_steps": 4,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 2,
    "lr_scheduler_type": "inverse_sqrt",
    "warmup_steps": 500,
    "load_best_model_at_end": True,
    "save_strategy": "epoch",
    "evaluation_strategy": "epoch",
    "metric_for_best_model": "f1",
}

training_args = TrainingArguments(
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    logging_steps=10,
    save_total_limit=3,
    push_to_hub=True,
    report_to="wandb",
    **training_config,
)

# Run name uses the dataset name without its "<user>/" namespace.
# `str.strip(username)` removes any of those *characters* from both ends, not
# the prefix (it produced "/nlp_cw_data_unprocessed_dow" here), so take
# everything after the first "/" instead.
dataset_name = dataset_id.split("/", 1)[-1]

run = wandb.init(
    project="nlp_cw",
    name=f"roberta - {dataset_name} - {training_config['num_train_epochs']} epochs - {training_config['learning_rate']}",
    # Track hyperparameters and run metadata
    config=training_config,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.evaluate()

# Save the tokenizer and create a model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()

# Push the results to the hub
trainer.push_to_hub()

## Experiment 3 - Processed Text


In [None]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login
from tqdm import tqdm

import wandb
import os
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    """Compute binary classification metrics for a ``Trainer`` evaluation pass.

    Args:
        p: Evaluation output; ``p.predictions`` holds the per-class scores and
            ``p.label_ids`` the reference labels.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = p.label_ids
    # The predicted class is the index of the highest score on the last axis.
    y_pred = p.predictions.argmax(-1)
    # One call with average="binary" yields the same positive-class values as
    # the separate precision_score / recall_score / f1_score helpers.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }


HF_TOKEN = "put your token here"
WANDB_TOKEN = "put your token here"

# WANDB_TOKEN = userdata.get('WANDB_TOKEN')
wandb.login(key=WANDB_TOKEN)

model_id = "roberta-base"
username = "ImperialIndians23/"
dataset_id = "ImperialIndians23/nlp_cw_data_processed"

repository_id = "ImperialIndians23/RobertaBaseProcessed"

dataset = load_dataset(dataset_id)

print("Processing the dataset...")

# Training and testing datasets
train_dataset = dataset["train"]

# Validation dataset
val_dataset = dataset["valid"]

# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)


def tokenize(batch):
    """Tokenize the ``"text"`` entries of a batch with the RoBERTa tokenizer.

    Sequences are truncated to at most 512 tokens and padded to the longest
    sequence in the batch. The ``map`` calls below pass a whole split as one
    batch, so every row of a split ends up with the same length.
    """
    encoded = tokenizer(
        batch["text"],
        max_length=512,
        truncation=True,
        padding=True,
    )
    return encoded


train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

num_labels = 2
class_names = [0, 1]
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

id2label = {i: label for i, label in enumerate(class_names)}

# Update the model's configuration with the id2label mapping
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})


model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

training_config = {
    "learning_rate": 1e-5,
    "num_train_epochs": 2,
    "weight_decay": 0.01,
    "gradient_accumulation_steps": 4,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 2,
    "lr_scheduler_type": "inverse_sqrt",
    "warmup_steps": 500,
    "load_best_model_at_end": True,
    "save_strategy": "epoch",
    "evaluation_strategy": "epoch",
    "metric_for_best_model": "f1",
}

training_args = TrainingArguments(
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    logging_steps=10,
    save_total_limit=3,
    push_to_hub=True,
    report_to="wandb",
    **training_config,
)

# Run name uses the dataset name without its "<user>/" namespace.
# `str.strip(username)` removes any of those *characters* from both ends, not
# the prefix (it produced "_cw_data_proc" here), so take everything after the
# first "/" instead.
dataset_name = dataset_id.split("/", 1)[-1]

run = wandb.init(
    project="nlp_cw",
    name=f"roberta - {dataset_name} - {training_config['num_train_epochs']} epochs - {training_config['learning_rate']}",
    # Track hyperparameters and run metadata
    config=training_config,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.evaluate()

# Save the tokenizer and create a model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()

# Push the results to the hub
trainer.push_to_hub()

## Experiment 4 - Down Sampling Processed Data


In [None]:
from datasets import load_dataset

dataset = load_dataset("ImperialIndians23/nlp_cw_data_processed")

In [None]:
from datasets import DatasetDict, Dataset, concatenate_datasets
import random

train_dataset = dataset["train"]

# Separate the dataset by label
label_0_dataset = train_dataset.filter(lambda example: example["label"] == 0)
label_1_dataset = train_dataset.filter(lambda example: example["label"] == 1)

num_label_1 = len(label_1_dataset)

# Keep two negatives per positive, but never ask for more negatives than
# exist: random.sample raises ValueError when k exceeds the population size.
desired_num_label_0 = min(2 * num_label_1, len(label_0_dataset))

# Fixed seed so the same negatives are selected on every run.
random.seed(42)
downsampled_label_0_indices = random.sample(
    range(len(label_0_dataset)), k=desired_num_label_0
)

downsampled_label_0_dataset = label_0_dataset.select(downsampled_label_0_indices)

# Concatenate downsampled label 0 dataset with label 1 dataset
balanced_train_dataset = concatenate_datasets(
    [downsampled_label_0_dataset, label_1_dataset]
)

# Only the training split is rebalanced; the other splits stay untouched.
dataset["train"] = balanced_train_dataset

In [None]:
dataset.push_to_hub("ImperialIndians23/nlp_cw_data_processed_downsampled")

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login
from tqdm import tqdm

import wandb
import os
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    """Compute binary classification metrics for a ``Trainer`` evaluation pass.

    Args:
        p: Evaluation output; ``p.predictions`` holds the per-class scores and
            ``p.label_ids`` the reference labels.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = p.label_ids
    # The predicted class is the index of the highest score on the last axis.
    y_pred = p.predictions.argmax(-1)
    # One call with average="binary" yields the same positive-class values as
    # the separate precision_score / recall_score / f1_score helpers.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }


HF_TOKEN = "put your token here"
WANDB_TOKEN = "put your token here"

# WANDB_TOKEN = userdata.get('WANDB_TOKEN')
wandb.login(key=WANDB_TOKEN)

model_id = "roberta-base"
username = "ImperialIndians23/"
dataset_id = "ImperialIndians23/nlp_cw_data_processed_downsampled"

repository_id = "ImperialIndians23/RobertaBaseProcessedDownsampled"

dataset = load_dataset(dataset_id)

print("Processing the dataset...")

# Training split, plus shard 0 of the "valid" split held out as a test set.
# (test_dataset is tokenized below but is not passed to the Trainer in this cell.)
train_dataset = dataset["train"]
test_dataset = dataset["valid"].shard(num_shards=2, index=0)

# Validation dataset: shard 1 of the "valid" split, used as the Trainer's eval set
val_dataset = dataset["valid"].shard(num_shards=2, index=1)

# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)


def tokenize(batch):
    """Tokenize the ``"text"`` entries of a batch with the RoBERTa tokenizer.

    Sequences are truncated to at most 512 tokens and padded to the longest
    sequence in the batch. The ``map`` calls below pass a whole split as one
    batch, so every row of a split ends up with the same length.
    """
    encoded = tokenizer(
        batch["text"],
        max_length=512,
        truncation=True,
        padding=True,
    )
    return encoded


train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

num_labels = 2
class_names = [0, 1]
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

id2label = {i: label for i, label in enumerate(class_names)}

# Update the model's configuration with the id2label mapping
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})


model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

training_config = {
    "learning_rate": 1e-5,
    "num_train_epochs": 4,
    "weight_decay": 0.01,
    "gradient_accumulation_steps": 4,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 2,
    "lr_scheduler_type": "inverse_sqrt",
    "warmup_steps": 500,
    "load_best_model_at_end": True,
    "save_strategy": "epoch",
    "evaluation_strategy": "epoch",
    "metric_for_best_model": "f1",
}

training_args = TrainingArguments(
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    logging_steps=10,
    save_total_limit=3,
    push_to_hub=True,
    report_to="wandb",
    **training_config,
)

# Run name uses the dataset name without its "<user>/" namespace.
# `str.strip(username)` removes any of those *characters* from both ends, not
# the prefix (it produced "_cw_data_processed_dow" here), so take everything
# after the first "/" instead.
dataset_name = dataset_id.split("/", 1)[-1]

run = wandb.init(
    project="nlp_cw",
    name=f"roberta - {dataset_name} - {training_config['num_train_epochs']} epochs - {training_config['learning_rate']}",
    # Track hyperparameters and run metadata
    config=training_config,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.evaluate()

# Save the tokenizer and create a model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()

# Push the results to the hub
trainer.push_to_hub()

## Experiment 5 - Custom RoBERTa model - Unprocessed Text (Downsampled) + Keyword


In [None]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login
from tqdm import tqdm

import wandb
import os
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from transformers import EvalPrediction

In [None]:
from datasets import load_dataset

# NOTE(review): this section is titled "Unprocessed Text (Downsampled)" and the
# training cell below sets dataset_id to "..._unprocessed_downsampled_keyword",
# yet the processed, downsampled dataset is what gets loaded here — confirm
# which dataset this experiment is meant to use.
dataset = load_dataset("ImperialIndians23/nlp_cw_data_processed_downsampled")

In [None]:
dataset

### Generate Embeddings for Community Keywords


In [None]:
from transformers import RobertaTokenizerFast

model_id = "roberta-base"


tokenizer = RobertaTokenizerFast.from_pretrained(model_id)


def tokenize_data(batch):
    """Tokenize the text and the community keyword of every example in a batch.

    Both fields are truncated and padded to the longest sequence in the batch
    (text: at most 512 tokens, community keyword: at most 64 tokens) and
    returned as PyTorch tensors.

    Returns:
        Dict with ``input_ids`` / ``attention_mask`` for the text and
        ``community_input_ids`` / ``community_attention_mask`` for the keyword.
    """
    common = {"padding": True, "truncation": True, "return_tensors": "pt"}

    # Main text
    text_encoding = tokenizer(batch["text"], max_length=512, **common)
    # Community keyword
    community_encoding = tokenizer(batch["community"], max_length=64, **common)

    encoded = {key: text_encoding[key] for key in ("input_ids", "attention_mask")}
    encoded.update(
        {
            f"community_{key}": community_encoding[key]
            for key in ("input_ids", "attention_mask")
        }
    )
    return encoded


dataset["train"] = dataset["train"].map(tokenize_data, batched=True)
dataset["valid"] = dataset["valid"].map(tokenize_data, batched=True)

In [None]:
dataset["train"].set_format(
    "torch",
    columns=[
        "input_ids",
        "attention_mask",
        "label",
        "community_input_ids",
        "community_attention_mask",
    ],
)
dataset["valid"].set_format(
    "torch",
    columns=[
        "input_ids",
        "attention_mask",
        "label",
        "community_input_ids",
        "community_attention_mask",
    ],
)

In [None]:
dataset

In [None]:
from transformers import DataCollatorWithPadding
import torch


class CustomDataCollatorWithPadding(DataCollatorWithPadding):
    """Collator that pads both the text fields and the community-keyword fields.

    The text fields and the label are delegated to ``DataCollatorWithPadding``;
    the ``community_*`` fields are padded here to the longest keyword sequence
    in the batch.
    """

    def __call__(self, features):
        """Build one padded batch from a list of per-example feature dicts.

        Args:
            features: List of dicts that each contain ``community_input_ids``
                and ``community_attention_mask`` next to the regular
                tokenizer fields.

        Returns:
            The parent collator's batch, extended with the padded
            ``community_input_ids`` and ``community_attention_mask`` tensors.
        """
        community_keys = ("community_input_ids", "community_attention_mask")

        # Collect the community fields as lists of 1-D tensors. The previous
        # implementation stacked them with torch.stack *before* padding, which
        # raises as soon as two examples have different keyword lengths.
        community_input_ids = [
            torch.as_tensor(feature["community_input_ids"]) for feature in features
        ]
        community_attention_mask = [
            torch.as_tensor(feature["community_attention_mask"])
            for feature in features
        ]

        # Hand only the remaining fields to the parent collator so it never
        # has to turn unpadded community sequences into a tensor.
        text_features = [
            {key: value for key, value in feature.items() if key not in community_keys}
            for feature in features
        ]
        batch = super().__call__(text_features)

        batch["community_input_ids"] = self.pad_tensors(
            community_input_ids, self.tokenizer.pad_token_id
        )
        batch["community_attention_mask"] = self.pad_tensors(community_attention_mask, 0)

        return batch

    def pad_tensors(self, tensors, pad_token_id):
        """Right-pad 1-D tensors with ``pad_token_id`` and stack them.

        Args:
            tensors: Non-empty sequence of 1-D tensors (or a 2-D tensor).
            pad_token_id: Fill value for the padded positions.

        Returns:
            A 2-D tensor whose rows all have the length of the longest input.
        """
        max_length = max(t.size(0) for t in tensors)
        # Pad each tensor to match the longest one
        return torch.stack(
            [
                torch.cat(
                    [
                        t,
                        torch.full(
                            (max_length - t.size(0),), pad_token_id, dtype=t.dtype
                        ),
                    ]
                )
                for t in tensors
            ]
        )

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import RobertaModel, RobertaTokenizer, AdamW
from torch import nn, optim
from torch.nn import functional as F


class RobertaClassifier(nn.Module):
    """RoBERTa classifier combining a text embedding with a keyword embedding.

    One shared RoBERTa encoder embeds both the text and the community keyword.
    The two embeddings are concatenated, passed through a linear mapper and
    dropout, and classified by a final linear layer.
    """

    def __init__(self, num_labels, config):
        """Create the encoder and the classification layers.

        Args:
            num_labels: Number of output classes.
            config: Configuration passed to ``RobertaModel.from_pretrained``.
        """
        super().__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base", config=config)
        # Text and keyword embeddings are concatenated, hence twice the hidden size.
        combined_size = self.roberta.config.hidden_size * 2
        self.mapper = nn.Linear(combined_size, combined_size)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(combined_size, num_labels)
        self.loss_fn = nn.CrossEntropyLoss()
        self.config = config

    def forward(
        self,
        input_ids,
        attention_mask,
        community_input_ids,
        community_attention_mask,
        labels=None,
    ):
        """Classify a batch of texts together with their community keywords.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise ``logits``.
        """
        # Element 1 of the encoder output is used as the sequence embedding.
        text_embedding = self.roberta(
            input_ids=input_ids, attention_mask=attention_mask
        )[1]
        community_embedding = self.roberta(
            input_ids=community_input_ids, attention_mask=community_attention_mask
        )[1]

        # Concatenate along the feature axis, then mapper -> dropout -> classifier.
        combined_embedding = torch.cat((text_embedding, community_embedding), dim=1)
        logits = self.classifier(self.dropout(self.mapper(combined_embedding)))

        # Without labels (inference) only the logits are returned.
        if labels is None:
            return logits
        return self.loss_fn(logits, labels), logits

In [None]:
num_labels = 2
class_names = [0, 1]
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

id2label = {i: label for i, label in enumerate(class_names)}

# Update the model's configuration with the id2label mapping
config = AutoConfig.from_pretrained("roberta-base")
config.update({"id2label": id2label})

model = RobertaClassifier(num_labels=2, config=config)

In [None]:
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    """Compute binary classification metrics for a ``Trainer`` evaluation pass.

    Args:
        p: Evaluation output; ``p.predictions`` holds the per-class scores and
            ``p.label_ids`` the reference labels.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = p.label_ids
    # The predicted class is the index of the highest score on the last axis.
    y_pred = p.predictions.argmax(-1)
    # One call with average="binary" yields the same positive-class values as
    # the separate precision_score / recall_score / f1_score helpers.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

### Training


In [None]:
model_id = "roberta-base"
username = "ImperialIndians23/"
dataset_id = "ImperialIndians23/nlp_cw_data_unprocessed_downsampled_keyword"

repository_id = "ImperialIndians23/RobertaBaseUnprocessedDownsampledKeywordDropout"

In [None]:
import wandb

WANDB_TOKEN = "put your token here"

# WANDB_TOKEN = userdata.get('WANDB_TOKEN')
wandb.login(key=WANDB_TOKEN)

In [None]:
training_config = {
    "learning_rate": 1e-5,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    "gradient_accumulation_steps": 4,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 2,
    "lr_scheduler_type": "inverse_sqrt",
    "warmup_steps": 500,
    "load_best_model_at_end": True,
    "save_strategy": "epoch",
    "evaluation_strategy": "epoch",
    "metric_for_best_model": "f1",
}

training_args = TrainingArguments(
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    logging_steps=10,
    save_total_limit=3,
    push_to_hub=True,
    report_to="wandb",
    **training_config,
)

In [None]:
run = wandb.init(
    project="nlp_cw",
    name=f"roberta - keyword unprocessed downsampled dropout - {training_config['num_train_epochs']} epochs",
    # Track hyperparameters and run metadata
    config=training_config,
)

In [None]:
data_collator = CustomDataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["valid"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
import os

# Target file for the raw state dict of the trained model.
final_model_path = "./nlp/ImperialIndians23/RobertaBaseUnprocessedDownsampledKeyword/final_model.pth"

# torch.save does not create missing parent directories, so make sure the
# folder exists before writing.
os.makedirs(os.path.dirname(final_model_path), exist_ok=True)
torch.save(trainer.model.state_dict(), final_model_path)

In [None]:
trainer.evaluate()

# Save the model weights and the tokenizer, and create a model card.
# `model` is a plain nn.Module (RobertaClassifier) and therefore has no
# `save_pretrained` method; Trainer.save_model also handles such models by
# writing their state dict to the target directory.
trainer.save_model(repository_id)
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()

# Push the results to the hub
trainer.push_to_hub()

In [None]:
# Rebuild the architecture, then restore the fine-tuned weights into it.
loaded_model = RobertaClassifier(num_labels=2, config=config)

model_save_path = "./nlp/ImperialIndians23/RobertaBaseUnprocessedDownsampledKeyword/final_model.pth"
# Map the checkpoint to the CPU so weights saved on a GPU machine can also be
# restored where no GPU is available; the evaluation cell moves the model to
# the selected device afterwards.
model_state_dict = torch.load(model_save_path, map_location="cpu")

loaded_model.load_state_dict(model_state_dict)

# Inference mode: disables dropout.
loaded_model.eval()
print("loaded..")

### Evaluation


In [None]:
from torch.utils.data import DataLoader

# Assuming dataset["valid"] is already tokenized and ready for input
valid_dataloader = DataLoader(
    dataset["valid"],
    batch_size=1,
    collate_fn=data_collator,
    shuffle=False,
)

In [None]:
dataset["valid"]

In [None]:
from tqdm import tqdm

loaded_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)

all_preds = []
all_labels = []

with torch.no_grad():  # No need to track gradients during evaluation
    for batch in tqdm(valid_dataloader):
        # Move batch to the same device as the loaded_model
        batch = {k: v.to(device) for k, v in batch.items()}
        # Labels are removed so the model is called in inference mode.
        labels = batch.pop("labels")
        outputs = loaded_model(**batch)
        # Without labels RobertaClassifier.forward returns the logits tensor
        # itself; with labels it returns (loss, logits). Indexing the bare
        # tensor with [0] would keep only the first example's logits.
        logits = outputs[1] if isinstance(outputs, tuple) else outputs
        preds = torch.argmax(logits, dim=-1)

        # Accumulate one entry per example, so any batch size works
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

In [None]:
assert len(all_preds) == len(all_labels)

In [None]:
all_preds = [int(x) for x in all_preds]

In [None]:
from collections import Counter

Counter(list(all_preds))

In [None]:
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

gold = dataset["valid"]["label"]
t1p = precision_score(gold, all_preds)
t1r = recall_score(gold, all_preds)
t1f = f1_score(gold, all_preds)
print("Precision:", t1p)
print("Recall:", t1r)
print("F1:", t1f)
print("-" * 40)

## Experiment 6 - Custom RoBERTa model - Processed Text (Downsampled) + Keyword


In [None]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login
from tqdm import tqdm

import wandb
import os
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from transformers import EvalPrediction
from transformers import DataCollatorWithPadding

from torch.utils.data import DataLoader
from transformers import RobertaModel, RobertaTokenizer, AdamW
from torch import nn, optim
from torch.nn import functional as F


class RobertaClassifier(nn.Module):
    """RoBERTa classifier combining a text embedding with a keyword embedding.

    One shared RoBERTa encoder embeds both the text and the community keyword.
    The two embeddings are concatenated, passed through a linear mapper and
    dropout, and classified by a final linear layer.
    """

    def __init__(self, num_labels, config):
        """Create the encoder and the classification layers.

        Args:
            num_labels: Number of output classes.
            config: Configuration passed to ``RobertaModel.from_pretrained``.
        """
        super().__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base", config=config)
        # Text and keyword embeddings are concatenated, hence twice the hidden size.
        combined_size = self.roberta.config.hidden_size * 2
        self.mapper = nn.Linear(combined_size, combined_size)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(combined_size, num_labels)
        self.loss_fn = nn.CrossEntropyLoss()
        self.config = config

    def forward(
        self,
        input_ids,
        attention_mask,
        community_input_ids,
        community_attention_mask,
        labels=None,
    ):
        """Classify a batch of texts together with their community keywords.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise ``logits``.
        """
        # Element 1 of the encoder output is used as the sequence embedding.
        text_embedding = self.roberta(
            input_ids=input_ids, attention_mask=attention_mask
        )[1]
        community_embedding = self.roberta(
            input_ids=community_input_ids, attention_mask=community_attention_mask
        )[1]

        # Concatenate along the feature axis, then mapper -> dropout -> classifier.
        combined_embedding = torch.cat((text_embedding, community_embedding), dim=1)
        logits = self.classifier(self.dropout(self.mapper(combined_embedding)))

        # Without labels (inference) only the logits are returned.
        if labels is None:
            return logits
        return self.loss_fn(logits, labels), logits


class CustomDataCollatorWithPadding(DataCollatorWithPadding):
    """Collator that also batches the community-keyword inputs.

    The parent class pads the regular text fields. The two community fields
    are padded here to the longest community sequence in the batch and added
    to the returned batch.
    """

    def __call__(self, features):
        """Collate ``features`` into one batch.

        Args:
            features: Per-example mappings that each contain
                ``community_input_ids`` and ``community_attention_mask`` in
                addition to the fields handled by the parent collator.

        Returns:
            The parent's batch with padded ``community_input_ids`` and
            ``community_attention_mask`` entries added.
        """
        community_keys = ("community_input_ids", "community_attention_mask")

        # Collect the community fields as a list of 1-D tensors. They must not
        # be stacked before padding: stacking requires equal lengths and would
        # raise on any batch whose community sequences differ in length.
        community_input_ids = [
            torch.as_tensor(feature["community_input_ids"]) for feature in features
        ]
        community_attention_mask = [
            torch.as_tensor(feature["community_attention_mask"])
            for feature in features
        ]

        # Hand only the remaining fields to the parent so it never has to
        # build a tensor out of (possibly ragged) community sequences.
        text_features = [
            {key: value for key, value in feature.items() if key not in community_keys}
            for feature in features
        ]
        batch = super().__call__(text_features)

        batch["community_input_ids"] = self.pad_tensors(
            community_input_ids, self.tokenizer.pad_token_id
        )
        batch["community_attention_mask"] = self.pad_tensors(
            community_attention_mask, 0
        )

        return batch

    def pad_tensors(self, tensors, pad_token_id):
        """Right-pad 1-D tensors with ``pad_token_id`` and stack them.

        Args:
            tensors: Iterable of 1-D tensors (a list, or the rows of a 2-D
                tensor).
            pad_token_id: Fill value for the padded positions.

        Returns:
            A 2-D tensor with one row per input, each as long as the longest
            input.
        """
        tensors = list(tensors)
        max_length = max(t.size(0) for t in tensors)
        # Pad each tensor to match the longest one
        padded = torch.stack(
            [
                torch.cat(
                    [
                        t,
                        torch.full(
                            (max_length - t.size(0),),
                            pad_token_id,
                            dtype=t.dtype,
                            device=t.device,
                        ),
                    ]
                )
                for t in tensors
            ]
        )
        return padded


def compute_metrics(p: EvalPrediction):
    """Score predictions against the gold labels.

    The predicted class is the argmax over the last axis of ``p.predictions``;
    it is compared with ``p.label_ids``.

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    predicted = p.predictions.argmax(-1)
    gold = p.label_ids
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted),
        "precision": precision_score(gold, predicted),
        "recall": recall_score(gold, predicted),
    }


# Credentials: replace both placeholders before running.
HF_TOKEN = "put your token here"
WANDB_TOKEN = "put your token here"

# WANDB_TOKEN = userdata.get('WANDB_TOKEN')
wandb.login(key=WANDB_TOKEN)

# Experiment switches selecting the dataset variant.
model_id = "roberta-base"
username = "ImperialIndians23"
downsampled = True
processed = True

# Derive the Hub dataset id and the W&B run name from the switches.
variant = "processed" if processed else "unprocessed"
dataset_id = f"ImperialIndians23/nlp_cw_data_{variant}"
run_name = f"roberta - keyword {variant} "
if downsampled:
    dataset_id = f"{dataset_id}_downsampled"
    run_name = f"{run_name}downsampled "


# Hyperparameters; unpacked into TrainingArguments and logged to W&B below.
training_config = dict(
    learning_rate=1e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    lr_scheduler_type="inverse_sqrt",
    warmup_steps=500,
    load_best_model_at_end=True,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    metric_for_best_model="f1",
)

run_name = f"{run_name}- {training_config['num_train_epochs']} epochs"


# Output directory / Hub repository for this experiment.
repository_id = "ImperialIndians23/RobertaBaseProcessedDownsampledKeywordDropout"

dataset = load_dataset(dataset_id)

print("Processing the dataset...")

tokenizer = RobertaTokenizerFast.from_pretrained(model_id)


def tokenize_data(batch):
    """Tokenize the ``text`` and ``community`` columns of a batch.

    Uses the module-level ``tokenizer``. Text is truncated to 512 tokens and
    the community keyword to 64; both are padded with ``padding=True`` and
    returned as PyTorch tensors.

    Returns:
        Dict with ``input_ids``/``attention_mask`` for the text and
        ``community_input_ids``/``community_attention_mask`` for the keyword.
    """

    def encode(column, limit):
        # Same tokenizer settings for both columns; only the length cap differs.
        return tokenizer(
            batch[column],
            padding=True,
            truncation=True,
            max_length=limit,
            return_tensors="pt",
        )

    text_encoding = encode("text", 512)
    community_encoding = encode("community", 64)

    return {
        "input_ids": text_encoding["input_ids"],
        "attention_mask": text_encoding["attention_mask"],
        "community_input_ids": community_encoding["input_ids"],
        "community_attention_mask": community_encoding["attention_mask"],
    }


# Tokenize both splits and expose the model inputs as torch tensors.
model_columns = [
    "input_ids",
    "attention_mask",
    "label",
    "community_input_ids",
    "community_attention_mask",
]
for split in ("train", "valid"):
    dataset[split] = dataset[split].map(tokenize_data, batched=True)
    dataset[split].set_format("torch", columns=model_columns)

num_labels = 2
class_names = [0, 1]
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Map each class index to its label and store the mapping on the config.
id2label = dict(enumerate(class_names))

config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})

model = RobertaClassifier(num_labels=num_labels, config=config)

In [None]:
# Trainer configuration: checkpoints and logs live under `repository_id`,
# metrics are reported to W&B, and the shared hyperparameters are unpacked
# from `training_config`.
training_args = TrainingArguments(
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    logging_steps=10,
    save_total_limit=3,
    push_to_hub=True,
    report_to="wandb",
    **training_config,
)

# Start the W&B run for this experiment.
run = wandb.init(
    project="nlp_cw",
    name=run_name,
    # Track hyperparameters and run metadata
    config=training_config,
)


# The custom collator also batches the community-keyword inputs.
data_collator = CustomDataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["valid"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# Evaluate on the validation split after training.
trainer.evaluate()

# Save the tokenizer and create a model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()

# Push the results to the hub
trainer.push_to_hub()

In [None]:
# Save the fine-tuned weights, reload them into a fresh RobertaClassifier and
# predict the validation split by hand.
import os

model_save_path = f"/vol/bitbucket/rm1623/nlp/{repository_id}/final_model.pth"

# torch.save does not create missing parent directories.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
torch.save(trainer.model.state_dict(), model_save_path)

loaded_model = RobertaClassifier(num_labels=2, config=config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# map_location lets the checkpoint load on a host without the original device.
model_state_dict = torch.load(model_save_path, map_location=device)

loaded_model.load_state_dict(model_state_dict)

loaded_model.to(device)
loaded_model.eval()
print("Loaded...")

from torch.utils.data import DataLoader

# Assuming dataset["valid"] is already tokenized and ready for input
valid_dataloader = DataLoader(
    dataset["valid"],
    batch_size=1,
    collate_fn=data_collator,
    shuffle=False,
)

from tqdm import tqdm

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(valid_dataloader):
        # Move batch to the same device as the loaded_model
        batch = {k: v.to(device) for k, v in batch.items()}
        labels = batch.pop("labels")
        outputs = loaded_model(**batch)
        # With no labels RobertaClassifier.forward returns the logits tensor
        # itself. Indexing it with [0] would keep only the first example of
        # the batch, so use the tensor as-is.
        logits = outputs.logits if hasattr(outputs, "logits") else outputs
        preds = torch.argmax(logits, dim=-1)

        # Accumulate one plain int per example so the lists stay flat for any
        # batch size.
        all_preds.extend(preds.detach().cpu().tolist())
        all_labels.extend(labels.detach().cpu().tolist())

In [None]:
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# Score the manually collected predictions against the gold validation labels.
gold = dataset["valid"]["label"]
t1p, t1r, t1f = (
    metric(gold, all_preds) for metric in (precision_score, recall_score, f1_score)
)
for caption, value in (("Precision:", t1p), ("Recall:", t1r), ("F1:", t1f)):
    print(caption, value)
print("-" * 40)

## Experiment 7: Experiment 6 but with more epochs


In [None]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login
from tqdm import tqdm

import wandb
import os
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from transformers import EvalPrediction
from transformers import DataCollatorWithPadding

from torch.utils.data import DataLoader
from transformers import RobertaModel, RobertaTokenizer, AdamW
from torch import nn, optim
from torch.nn import functional as F


class RobertaClassifier(nn.Module):
    """Classifier that fuses a text encoding with a community-keyword encoding.

    Both inputs go through the same RoBERTa encoder. Element 1 of each encoder
    output is taken, the two vectors are concatenated, passed through a linear
    mapper and dropout, and projected to ``num_labels`` logits.
    """

    def __init__(self, num_labels, config):
        """Build the shared encoder and the classification head.

        Args:
            num_labels: Number of output classes.
            config: Configuration passed to ``RobertaModel.from_pretrained``
                and kept on the instance as ``self.config``.
        """
        super().__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base", config=config)
        # The head works on two concatenated encoder vectors.
        fused_size = self.roberta.config.hidden_size * 2
        self.mapper = nn.Linear(fused_size, fused_size)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(fused_size, num_labels)
        self.loss_fn = nn.CrossEntropyLoss()
        self.config = config

    def _encode(self, ids, mask):
        """Run the shared encoder and return element 1 of its output."""
        return self.roberta(input_ids=ids, attention_mask=mask)[1]

    def forward(
        self,
        input_ids,
        attention_mask,
        community_input_ids,
        community_attention_mask,
        labels=None,
    ):
        """Compute logits, and the cross-entropy loss when ``labels`` is given.

        Returns:
            ``(loss, logits)`` if ``labels`` is not ``None``, otherwise just
            the ``logits`` tensor.
        """
        text_embedding = self._encode(input_ids, attention_mask)
        community_embedding = self._encode(
            community_input_ids, community_attention_mask
        )

        # Fuse along the feature axis, then map -> dropout -> classify.
        fused = torch.cat((text_embedding, community_embedding), dim=1)
        logits = self.classifier(self.dropout(self.mapper(fused)))

        if labels is None:
            return logits
        return self.loss_fn(logits, labels), logits


class CustomDataCollatorWithPadding(DataCollatorWithPadding):
    """Collator that also batches the community-keyword inputs.

    The parent class pads the regular text fields. The two community fields
    are padded here to the longest community sequence in the batch and added
    to the returned batch.
    """

    def __call__(self, features):
        """Collate ``features`` into one batch.

        Args:
            features: Per-example mappings that each contain
                ``community_input_ids`` and ``community_attention_mask`` in
                addition to the fields handled by the parent collator.

        Returns:
            The parent's batch with padded ``community_input_ids`` and
            ``community_attention_mask`` entries added.
        """
        community_keys = ("community_input_ids", "community_attention_mask")

        # Collect the community fields as a list of 1-D tensors. They must not
        # be stacked before padding: stacking requires equal lengths and would
        # raise on any batch whose community sequences differ in length.
        community_input_ids = [
            torch.as_tensor(feature["community_input_ids"]) for feature in features
        ]
        community_attention_mask = [
            torch.as_tensor(feature["community_attention_mask"])
            for feature in features
        ]

        # Hand only the remaining fields to the parent so it never has to
        # build a tensor out of (possibly ragged) community sequences.
        text_features = [
            {key: value for key, value in feature.items() if key not in community_keys}
            for feature in features
        ]
        batch = super().__call__(text_features)

        batch["community_input_ids"] = self.pad_tensors(
            community_input_ids, self.tokenizer.pad_token_id
        )
        batch["community_attention_mask"] = self.pad_tensors(
            community_attention_mask, 0
        )

        return batch

    def pad_tensors(self, tensors, pad_token_id):
        """Right-pad 1-D tensors with ``pad_token_id`` and stack them.

        Args:
            tensors: Iterable of 1-D tensors (a list, or the rows of a 2-D
                tensor).
            pad_token_id: Fill value for the padded positions.

        Returns:
            A 2-D tensor with one row per input, each as long as the longest
            input.
        """
        tensors = list(tensors)
        max_length = max(t.size(0) for t in tensors)
        # Pad each tensor to match the longest one
        padded = torch.stack(
            [
                torch.cat(
                    [
                        t,
                        torch.full(
                            (max_length - t.size(0),),
                            pad_token_id,
                            dtype=t.dtype,
                            device=t.device,
                        ),
                    ]
                )
                for t in tensors
            ]
        )
        return padded


def compute_metrics(p: EvalPrediction):
    """Score predictions against the gold labels.

    The predicted class is the argmax over the last axis of ``p.predictions``;
    it is compared with ``p.label_ids``.

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    predicted = p.predictions.argmax(-1)
    gold = p.label_ids
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted),
        "precision": precision_score(gold, predicted),
        "recall": recall_score(gold, predicted),
    }


# Credentials: replace both placeholders before running.
HF_TOKEN = "put your token here"
WANDB_TOKEN = "put your token here"

# WANDB_TOKEN = userdata.get('WANDB_TOKEN')
wandb.login(key=WANDB_TOKEN)

# Experiment switches selecting the dataset variant.
model_id = "roberta-base"
username = "ImperialIndians23"
downsampled = True
processed = True

# Derive the Hub dataset id and the W&B run name from the switches.
variant = "processed" if processed else "unprocessed"
dataset_id = f"ImperialIndians23/nlp_cw_data_{variant}"
run_name = f"roberta - keyword {variant} "
if downsampled:
    dataset_id = f"{dataset_id}_downsampled"
    run_name = f"{run_name}downsampled "


# Hyperparameters; unpacked into TrainingArguments and logged to W&B below.
training_config = dict(
    learning_rate=1e-5,
    num_train_epochs=7,
    weight_decay=0.01,
    gradient_accumulation_steps=4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    lr_scheduler_type="inverse_sqrt",
    warmup_steps=500,
    load_best_model_at_end=True,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    metric_for_best_model="f1",
)

run_name = f"{run_name}- {training_config['num_train_epochs']} epochs"

# Output directory / Hub repository for this experiment.
repository_id = "ImperialIndians23/RobertaBaseProcessedDownsampledKeywordDropoutE7"

dataset = load_dataset(dataset_id)

print("Processing the dataset...")

tokenizer = RobertaTokenizerFast.from_pretrained(model_id)


def tokenize_data(batch):
    """Tokenize the ``text`` and ``community`` columns of a batch.

    Uses the module-level ``tokenizer``. Text is truncated to 512 tokens and
    the community keyword to 64; both are padded with ``padding=True`` and
    returned as PyTorch tensors. Each item is assumed to carry a single
    community keyword.

    Returns:
        Dict with ``input_ids``/``attention_mask`` for the text and
        ``community_input_ids``/``community_attention_mask`` for the keyword.
    """

    def encode(column, limit):
        # Same tokenizer settings for both columns; only the length cap differs.
        return tokenizer(
            batch[column],
            padding=True,
            truncation=True,
            max_length=limit,
            return_tensors="pt",
        )

    text_encoding = encode("text", 512)
    community_encoding = encode("community", 64)

    return {
        "input_ids": text_encoding["input_ids"],
        "attention_mask": text_encoding["attention_mask"],
        "community_input_ids": community_encoding["input_ids"],
        "community_attention_mask": community_encoding["attention_mask"],
    }


# Tokenize both splits and expose the model inputs as torch tensors.
model_columns = [
    "input_ids",
    "attention_mask",
    "label",
    "community_input_ids",
    "community_attention_mask",
]
for split in ("train", "valid"):
    dataset[split] = dataset[split].map(tokenize_data, batched=True)
    dataset[split].set_format("torch", columns=model_columns)

num_labels = 2
class_names = [0, 1]
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Map each class index to its label and store the mapping on the config.
id2label = dict(enumerate(class_names))

config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})

model = RobertaClassifier(num_labels=num_labels, config=config)

In [None]:
# Trainer configuration: checkpoints and logs live under `repository_id`,
# metrics are reported to W&B, and the shared hyperparameters are unpacked
# from `training_config`.
training_args = TrainingArguments(
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    logging_steps=10,
    save_total_limit=3,
    push_to_hub=True,
    report_to="wandb",
    **training_config,
)

# Start the W&B run for this experiment.
run = wandb.init(
    project="nlp_cw",
    name=run_name,
    # Track hyperparameters and run metadata
    config=training_config,
)


# The custom collator also batches the community-keyword inputs.
data_collator = CustomDataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["valid"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# Evaluate on the validation split after training.
trainer.evaluate()

# Save the tokenizer and create a model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()

# Push the results to the hub
trainer.push_to_hub()

## Experiment 8 - Unprocessed Text + Back Translation


In [None]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login
from tqdm import tqdm

import wandb
import os
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    """Score predictions against the gold labels.

    The predicted class is the argmax over the last axis of ``p.predictions``;
    it is compared with ``p.label_ids``.

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    predicted = p.predictions.argmax(-1)
    gold = p.label_ids
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted),
        "precision": precision_score(gold, predicted),
        "recall": recall_score(gold, predicted),
    }


# Credentials: replace both placeholders before running.
HF_TOKEN = "put your token here"
WANDB_TOKEN = "put your token here"

# WANDB_TOKEN = userdata.get('WANDB_TOKEN')
wandb.login(key=WANDB_TOKEN)

# Experiment switches selecting the dataset variant.
model_id = "roberta-base"
username = "ImperialIndians23"
downsampled = False
processed = False
augmented = True

# Derive the Hub dataset id and the W&B run name from the switches.
variant = "processed" if processed else "unprocessed"
dataset_id = f"ImperialIndians23/nlp_cw_data_{variant}"
run_name = f"roberta - keyword {variant} "
for enabled, suffix in ((downsampled, "downsampled"), (augmented, "augmented")):
    if enabled:
        dataset_id += f"_{suffix}"
        run_name += f"{suffix} "


# Hyperparameters; unpacked into TrainingArguments and logged to W&B below.
training_config = dict(
    learning_rate=1e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    lr_scheduler_type="inverse_sqrt",
    warmup_steps=500,
    load_best_model_at_end=True,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    metric_for_best_model="f1",
)

run_name = f"{run_name}- {training_config['num_train_epochs']} epochs"

# Output directory / Hub repository for this experiment.
repository_id = "ImperialIndians23/RobertaBaseUnprocessedAugmented"

dataset = load_dataset(dataset_id)

print("Processing the dataset...")

# Training and validation splits
train_dataset, val_dataset = dataset["train"], dataset["valid"]

# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)


def tokenize(batch):
    """Tokenize a batch's ``text`` column with the module-level RoBERTa tokenizer.

    Sequences are truncated to at most 512 tokens and padded with
    ``padding=True``, i.e. to the longest sequence in the batch that is passed
    in rather than to a fixed 512 tokens.
    """
    texts = batch["text"]
    return tokenizer(texts, max_length=512, truncation=True, padding=True)


# Tokenize each split as one single batch (batch_size = split size) and expose
# the model inputs as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "label"]

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
train_dataset.set_format("torch", columns=tensor_columns)

val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))
val_dataset.set_format("torch", columns=tensor_columns)

num_labels = 2
class_names = [0, 1]
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Map each class index to its label and store the mapping on the config.
id2label = dict(enumerate(class_names))

config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})


model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

In [None]:
# Trainer configuration: checkpoints and logs live under `repository_id`,
# metrics are reported to W&B, and the shared hyperparameters are unpacked
# from `training_config`.
training_args = TrainingArguments(
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    logging_steps=10,
    save_total_limit=3,
    push_to_hub=True,
    report_to="wandb",
    **training_config,
)

# Start the W&B run for this experiment.
run = wandb.init(
    project="nlp_cw",
    name=run_name,
    # Track hyperparameters and run metadata
    config=training_config,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Evaluate on the validation split after training.
trainer.evaluate()

# Save the tokenizer and create a model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()

# Push the results to the hub
trainer.push_to_hub()

## Experiment 9 - Unprocessed + Synonym Aug


In [None]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login
from tqdm import tqdm

import wandb
import os
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    """Score predictions against the gold labels.

    The predicted class is the argmax over the last axis of ``p.predictions``;
    it is compared with ``p.label_ids``.

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    predicted = p.predictions.argmax(-1)
    gold = p.label_ids
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted),
        "precision": precision_score(gold, predicted),
        "recall": recall_score(gold, predicted),
    }


# Credentials: replace both placeholders before running.
HF_TOKEN = "put your token here"
WANDB_TOKEN = "put your token here"

# WANDB_TOKEN = userdata.get('WANDB_TOKEN')
wandb.login(key=WANDB_TOKEN)

# Experiment switches selecting the dataset variant.
model_id = "roberta-base"
username = "ImperialIndians23"
downsampled = False
processed = False
augmented = True
synonym_augmented = True

# Derive the Hub dataset id and the W&B run name from the switches.
variant = "processed" if processed else "unprocessed"
dataset_id = f"ImperialIndians23/nlp_cw_data_{variant}"
run_name = f"roberta - keyword {variant} "
for enabled, suffix in (
    (downsampled, "downsampled"),
    (augmented, "augmented"),
    (synonym_augmented, "synonym"),
):
    if enabled:
        dataset_id += f"_{suffix}"
        run_name += f"{suffix} "

print("Dataset:", dataset_id)

# Hyperparameters; unpacked into TrainingArguments and logged to W&B below.
training_config = dict(
    learning_rate=1e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    lr_scheduler_type="inverse_sqrt",
    warmup_steps=500,
    load_best_model_at_end=True,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    metric_for_best_model="f1",
)

run_name = f"{run_name}- {training_config['num_train_epochs']} epochs"

# Output directory / Hub repository for this experiment.
repository_id = "ImperialIndians23/RobertaBaseUnprocessedAugmentedSynonym"

dataset = load_dataset(dataset_id)

print("Processing the dataset...")

# Training and validation splits
train_dataset, val_dataset = dataset["train"], dataset["valid"]

# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)


def tokenize(batch):
    """Tokenize a batch's ``text`` column with the module-level RoBERTa tokenizer.

    Sequences are truncated to at most 512 tokens and padded with
    ``padding=True``, i.e. to the longest sequence in the batch that is passed
    in rather than to a fixed 512 tokens.
    """
    texts = batch["text"]
    return tokenizer(texts, max_length=512, truncation=True, padding=True)


# Tokenize each split as one single batch (batch_size = split size) and expose
# the model inputs as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "label"]

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
train_dataset.set_format("torch", columns=tensor_columns)

val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))
val_dataset.set_format("torch", columns=tensor_columns)

num_labels = 2
class_names = [0, 1]
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Map each class index to its label and store the mapping on the config.
id2label = dict(enumerate(class_names))

config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})

In [None]:
# Load the pretrained sequence classifier with the config prepared above.
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

# Trainer configuration: checkpoints and logs live under `repository_id`,
# metrics are reported to W&B, and the shared hyperparameters are unpacked
# from `training_config`.
training_args = TrainingArguments(
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    logging_steps=10,
    save_total_limit=3,
    push_to_hub=True,
    report_to="wandb",
    **training_config,
)

# Start the W&B run for this experiment.
run = wandb.init(
    project="nlp_cw",
    name=run_name,
    # Track hyperparameters and run metadata
    config=training_config,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Evaluate on the validation split after training.
trainer.evaluate()

# Save the tokenizer and create a model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()

# Push the results to the hub
trainer.push_to_hub()

## Experiment 10 - Unprocessed + Back Translation + Synonym Aug


In [None]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login
from tqdm import tqdm

import wandb
import os
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    """Score predictions against the gold labels.

    The predicted class is the argmax over the last axis of ``p.predictions``;
    it is compared with ``p.label_ids``.

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    predicted = p.predictions.argmax(-1)
    gold = p.label_ids
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted),
        "precision": precision_score(gold, predicted),
        "recall": recall_score(gold, predicted),
    }


# Credentials: replace both placeholders before running.
HF_TOKEN = "put your token here"
WANDB_TOKEN = "put your token here"

# WANDB_TOKEN = userdata.get('WANDB_TOKEN')
wandb.login(key=WANDB_TOKEN)

# Experiment switches selecting the dataset variant.
model_id = "roberta-base"
username = "ImperialIndians23"
downsampled = False
processed = False
augmented = True
double_augmented = True

# Derive the Hub dataset id and the W&B run name from the switches.
variant = "processed" if processed else "unprocessed"
dataset_id = f"ImperialIndians23/nlp_cw_data_{variant}"
run_name = f"roberta - keyword {variant} "
for enabled, suffix in (
    (downsampled, "downsampled"),
    (augmented, "augmented"),
    (double_augmented, "both"),
):
    if enabled:
        dataset_id += f"_{suffix}"
        run_name += f"{suffix} "


# Hyperparameters; unpacked into TrainingArguments and logged to W&B below.
training_config = dict(
    learning_rate=1e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    lr_scheduler_type="inverse_sqrt",
    warmup_steps=500,
    load_best_model_at_end=True,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    metric_for_best_model="f1",
)

run_name = f"{run_name}- {training_config['num_train_epochs']} epochs"

# Output directory / Hub repository for this experiment.
repository_id = "ImperialIndians23/RobertaBaseUnprocessedAugmentedBoth"

dataset = load_dataset(dataset_id)

print("Processing the dataset...")

# Training and validation splits
train_dataset, val_dataset = dataset["train"], dataset["valid"]

# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)


def tokenize(batch):
    """Tokenize a batch's ``text`` column with the module-level RoBERTa tokenizer.

    Sequences are truncated to at most 512 tokens and padded with
    ``padding=True``, i.e. to the longest sequence in the batch that is passed
    in rather than to a fixed 512 tokens.
    """
    texts = batch["text"]
    return tokenizer(texts, max_length=512, truncation=True, padding=True)


# Tokenize each split as one single batch (batch_size = split size) and expose
# the model inputs as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "label"]

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
train_dataset.set_format("torch", columns=tensor_columns)

val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))
val_dataset.set_format("torch", columns=tensor_columns)

num_labels = 2
class_names = [0, 1]
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Map each class index to its label and store the mapping on the config.
id2label = dict(enumerate(class_names))

config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})

In [None]:
# Load the pretrained sequence classifier with the config prepared above.
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

# Trainer configuration: checkpoints and logs live under `repository_id`,
# metrics are reported to W&B, and the shared hyperparameters are unpacked
# from `training_config`.
training_args = TrainingArguments(
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    logging_steps=10,
    save_total_limit=3,
    push_to_hub=True,
    report_to="wandb",
    **training_config,
)

# Start the W&B run for this experiment.
run = wandb.init(
    project="nlp_cw",
    name=run_name,
    # Track hyperparameters and run metadata
    config=training_config,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Evaluate on the validation split after training.
trainer.evaluate()

# Save the tokenizer and create a model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()

# Push the results to the hub
trainer.push_to_hub()

## Evaluation


Generate the final model's predictions on the `dev` set.


### Score 1 model


In [None]:
def labels2file(p, outf_path):
    """Write every item of ``p`` to ``outf_path``, one ``str(item)`` per line.

    The file is opened in write mode, so existing content is replaced.
    """
    rows = (str(pi) + "\n" for pi in p)
    with open(outf_path, "w") as outf:
        outf.writelines(rows)

In [None]:
from datasets import load_dataset

# Load the unprocessed dataset and keep its validation split for scoring.
dataset = load_dataset("ImperialIndians23/nlp_cw_data_unprocessed")
valid_dataset = dataset["valid"]

In [None]:
from transformers import pipeline
from tqdm import tqdm

# Predict the validation split with the model stored under `repository_id`
# and write one predicted label per line to res/task1.txt.
classifier = pipeline("text-classification", repository_id)

batch_size = 8
predictions = []

# Retrieve all texts from the dataset
texts = [example["text"] for example in valid_dataset]

# Process texts in batches
for i in range(0, len(texts), batch_size):
    batch_texts = texts[i : i + batch_size]
    # Truncate exactly as during training (512 tokens); without this an
    # over-long paragraph exceeds the model's maximum input length.
    batch_results = classifier(batch_texts, truncation=True, max_length=512)

    # Extract and store predictions from results
    predictions.extend(result["label"] for result in batch_results)

# open(..., "w") does not create the directory, so make sure it exists.
os.makedirs("res/", exist_ok=True)
labels2file(predictions, os.path.join("res/", "task1.txt"))

In [None]:
from collections import Counter

# Show how many times each label was predicted.
Counter(predictions)

In [None]:
# Write the gold labels to ref/task1.txt, then score res/task1.txt against
# them and store precision / recall / F1 in scores.txt.
gold = dataset["valid"]["label"]
# open(..., "w") does not create the directory, so make sure it exists.
os.makedirs("ref/", exist_ok=True)
labels2file(gold, os.path.join("ref/", "task1.txt"))

input_dir = "./"
output_dir = "./"

# define gold data path
ref_dir = os.path.join(input_dir, "ref")

# define submission data path
submission_dir = os.path.join(input_dir, "res")
files = os.listdir(submission_dir)

# The context manager closes scores.txt even if scoring raises.
with open(os.path.join(output_dir, "scores.txt"), "w") as outf:
    # evaluating on task 1
    if "task1.txt" in files:
        # Blank lines (e.g. a trailing newline) are skipped instead of
        # crashing int().
        with open(os.path.join(submission_dir, "task1.txt")) as f:
            task1_res = [int(line.strip()) for line in f if line.strip()]
        with open(os.path.join(ref_dir, "task1.txt")) as f:
            task1_gold = [int(line.strip()) for line in f if line.strip()]
        # task 1 scores
        t1p = precision_score(task1_gold, task1_res)
        t1r = recall_score(task1_gold, task1_res)
        t1f = f1_score(task1_gold, task1_res)
        # task1
        outf.write("task1_precision:" + str(t1p) + "\n")
        outf.write("task1_recall:" + str(t1r) + "\n")
        outf.write("task1_f1:" + str(t1f) + "\n")

### Score all models


In [None]:
from transformers import pipeline
from tqdm import tqdm
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)


def scorer(repository_id, task1_gold, valid_dataset, batch_size=8):
    """Evaluate one text-classification model and print its scores.

    Args:
        repository_id: Model identifier handed to ``pipeline``.
        task1_gold: Gold labels, in the same order as ``valid_dataset``.
        valid_dataset: Iterable of examples that each have a ``"text"`` field.
        batch_size: Number of texts sent to the pipeline per call.

    Prints precision, recall and F1 of the predicted labels against
    ``task1_gold``. Returns nothing.
    """
    print("-" * 40)
    print(f"Evaluating {repository_id.removeprefix('ImperialIndians23/')}")
    classifier = pipeline("text-classification", repository_id)

    task1_res = []

    # Retrieve all texts from the dataset
    texts = [example["text"] for example in valid_dataset]

    # Process texts in batches
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i : i + batch_size]
        # Truncate exactly as during training (512 tokens); without this an
        # over-long paragraph exceeds the model's maximum input length.
        batch_results = classifier(batch_texts, truncation=True, max_length=512)

        # Extract and store predictions from results
        task1_res.extend(result["label"] for result in batch_results)

    t1p = precision_score(task1_gold, task1_res)
    t1r = recall_score(task1_gold, task1_res)
    t1f = f1_score(task1_gold, task1_res)
    print("Precision:", t1p)
    print("Recall:", t1r)
    print("F1:", t1f)
    print("-" * 40)

#### Processed


In [None]:
from datasets import load_dataset

# Load the processed dataset and keep its validation split for scoring.
dataset = load_dataset("ImperialIndians23/nlp_cw_data_processed")
valid_dataset = dataset["valid"]

In [None]:
# Models to score on the processed validation split.
repository_ids = [
    "ImperialIndians23/RobertaBaseProcessedDownsampled",
    "ImperialIndians23/RobertaBaseProcessed",
]

In [None]:
# Score every listed model against the gold validation labels.
gold = dataset["valid"]["label"]

for repository_id in repository_ids:
    scorer(repository_id, gold, valid_dataset)

#### Unprocessed


In [None]:
from datasets import load_dataset

# Load the unprocessed dataset and keep its validation split for scoring.
dataset = load_dataset("ImperialIndians23/nlp_cw_data_unprocessed")
valid_dataset = dataset["valid"]

In [None]:
# Models to score on the unprocessed validation split.
repository_ids = [
    "ImperialIndians23/RobertaBaseUnprocessedAugmented",
    "ImperialIndians23/RobertaBaseUnprocessedDownsampled",
    "ImperialIndians23/RobertaBaseUnprocessed",
]

In [None]:
# Score every listed model against the gold validation labels.
gold = valid_dataset["label"]

for repository_id in repository_ids:
    scorer(repository_id, gold, valid_dataset)

#### Augmented


In [None]:
# Models trained on the augmented datasets; scored on the validation split
# that is currently loaded in `valid_dataset`.
repository_ids = [
    "ImperialIndians23/RobertaBaseUnprocessedAugmentedSynonym",
    "ImperialIndians23/RobertaBaseUnprocessedAugmented",
    "ImperialIndians23/RobertaBaseUnprocessedAugmentedBoth",
]
gold = valid_dataset["label"]

for repository_id in repository_ids:
    scorer(repository_id, gold, valid_dataset)

#


### Check for tough sentences


In [None]:
from transformers import pipeline
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score


def scorer(repository_id, task1_gold, valid_dataset):
    """Evaluate a Hub checkpoint and collect the samples it misclassifies.

    The checkpoint is wrapped in a "text-classification" pipeline on the
    "cuda" device and run over the texts of ``valid_dataset`` eight at a time.
    Precision, recall and F1 (scikit-learn defaults) are printed, followed by
    up to five of the misclassified samples.

    Args:
        repository_id: Hub model id; the 'ImperialIndians23/' prefix is
            stripped for display only.
        task1_gold: Gold labels, sliceable and aligned index-for-index with
            ``valid_dataset``.
        valid_dataset: Iterable of examples exposing "text" and "par_id".

    Returns:
        A list of ``(par_id, text, gold_label)`` tuples, one per sample whose
        predicted label differs from its gold label.
    """
    rule = "-" * 40
    print(rule)
    print(f"Evaluating {repository_id.removeprefix('ImperialIndians23/')}")
    classifier = pipeline("text-classification", model=repository_id, device="cuda")

    step = 8
    predictions = []
    misclassified = []

    all_texts = [row["text"] for row in valid_dataset]
    all_parids = [row["par_id"] for row in valid_dataset]

    # Walk the validation set in fixed-size windows.
    for start in tqdm(range(0, len(all_texts), step)):
        window = slice(start, start + step)
        window_texts = all_texts[window]
        window_parids = all_parids[window]
        window_gold = task1_gold[window]

        window_predictions = [
            output["label"] for output in classifier(window_texts)
        ]
        predictions.extend(window_predictions)

        # Remember every sample whose prediction disagrees with the gold label.
        for parid, text, predicted, expected in zip(
            window_parids, window_texts, window_predictions, window_gold
        ):
            if predicted != expected:
                misclassified.append((parid, text, expected))

    # Metrics over the whole validation set.
    print("Precision:", precision_score(task1_gold, predictions))
    print("Recall:", recall_score(task1_gold, predictions))
    print("F1:", f1_score(task1_gold, predictions))
    print(rule)

    print("Some wrongly classified samples:")
    for parid, text, expected in misclassified[:5]:
        print(f"ParID: {parid} Text: {text}, Real Label: {expected}")
    return misclassified

In [None]:
from datasets import load_dataset

# Evaluate the augmented checkpoint on the unprocessed validation split.
dataset = load_dataset(path="ImperialIndians23/nlp_cw_data_unprocessed")
valid_dataset = dataset["valid"]
gold = valid_dataset["label"]
repository_id = "ImperialIndians23/RobertaBaseUnprocessedAugmented"

# ws holds the (par_id, text, gold label) tuples the model got wrong.
ws = scorer(
    repository_id=repository_id,
    task1_gold=gold,
    valid_dataset=valid_dataset,
)

In [None]:
import matplotlib.pyplot as plt

# Character length and gold label of every misclassified sample.
lengths = [len(text) for _, text, _ in ws]
labels = [label for _, _, label in ws]

# Group the lengths by gold label so that each class is its own histogram
# series. Passing the per-sample `labels` list as `label=` for a single
# series does not produce a per-class legend.
lengths_by_label = {}
for length, label in zip(lengths, labels):
    lengths_by_label.setdefault(label, []).append(length)
legend_labels = sorted(lengths_by_label)

plt.figure(figsize=(10, 6))
plt.hist(
    [lengths_by_label[label] for label in legend_labels],
    bins=20,
    alpha=0.7,
    stacked=True,  # stacked bars keep the overall length distribution visible
    label=[str(label) for label in legend_labels],
)
plt.title("Histogram of Text Lengths of Wrong Detections")
plt.xlabel("Length of Text")
plt.ylabel("Frequency")
plt.legend(title="Label")
plt.show()

In [None]:
import matplotlib.pyplot as plt

import numpy as np

# Character lengths of every validation text and of the misclassified ones.
all_texts_lengths = [len(example["text"]) for example in valid_dataset]
wrong_texts_lengths = [len(text) for _, text, _ in ws]

# Use one set of bin edges for both histograms. With `bins=50` passed to each
# call separately, every series gets its own bin width and the bar heights
# are not comparable.
shared_bins = np.histogram_bin_edges(
    all_texts_lengths + wrong_texts_lengths, bins=50
)

plt.figure(figsize=(12, 6))
plt.hist(all_texts_lengths, bins=shared_bins, alpha=0.5, label="All Texts")
plt.hist(
    wrong_texts_lengths,
    bins=shared_bins,
    alpha=0.5,
    label="Wrongly Classified Texts",
)
plt.xlabel("Text Length")
plt.ylabel("Frequency")
plt.title("Distribution of Text Lengths vs. Wrongly Classified Texts")
plt.legend()
plt.show()

In [None]:
import numpy as np

# Summary statistics of the misclassified texts' character lengths:
# median, mean, and the two quartiles (computed in a single percentile call).
average_length = np.mean(wrong_texts_lengths)
median_length = np.median(wrong_texts_lengths)
percentile_25, percentile_75 = np.percentile(wrong_texts_lengths, [25, 75])
print(median_length, average_length, percentile_25, percentile_75)

### Analysis of Q1


In [None]:
import pandas as pd
import os
from datasets import Dataset

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
from transformers import pipeline
from tqdm import tqdm
import random


def scorer(repository_id, task1_gold, valid_dataset):
    """Score a binary classifier and estimate an F1 per original label level.

    The checkpoint is wrapped in a "text-classification" pipeline on device 0
    and run over the texts of ``valid_dataset`` eight at a time. Binary
    precision, recall and F1 (positive class = 1) are printed, followed by a
    one-vs-rest F1 for every distinct ``orig_label`` value.

    The model only predicts a binary label, so a per-sample "original label
    prediction" is imputed:

    * correct binary prediction -> the sample's true ``orig_label``;
    * wrong prediction of 1 (patronizing) -> a random level from [2, 3, 4];
    * wrong prediction of 0 (not patronizing) -> a random level from [0, 1].

    The random draws use the global ``random`` state, so the per-level scores
    vary between runs unless the caller seeds it.

    Args:
        repository_id: Hub model id; the 'ImperialIndians23/' prefix is
            stripped for display only.
        task1_gold: Binary gold labels, sliceable and aligned index-for-index
            with ``valid_dataset``.
        valid_dataset: Iterable of examples exposing "text", "par_id" and
            "orig_label".

    Returns:
        ``(wrong_samples, orig_f1_scores)``: a list of
        ``(par_id, text, gold_label)`` tuples for the misclassified samples,
        and a dict mapping each original label to its one-vs-rest F1.
    """
    print("-" * 40)
    print(f"Evaluating {repository_id.removeprefix('ImperialIndians23/')}")
    classifier = pipeline("text-classification", model=repository_id, device=0)

    batch_size = 8
    task1_res = []
    wrong_samples = []
    orig_label_preds = []

    texts = [example["text"] for example in valid_dataset]
    parids = [example["par_id"] for example in valid_dataset]
    labels = task1_gold
    orig_labels_list = [example["orig_label"] for example in valid_dataset]

    # Process texts in batches
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i : i + batch_size]
        batch_parids = parids[i : i + batch_size]
        batch_labels = labels[i : i + batch_size]
        batch_orig_labels = orig_labels_list[i : i + batch_size]
        batch_results = classifier(batch_texts)

        # Normalise predictions to int so they can be compared with the gold
        # labels and fed to the sklearn metrics.
        batch_predictions = [int(result["label"]) for result in batch_results]
        task1_res.extend(batch_predictions)

        for parid, text, pred, real, orig_label in zip(
            batch_parids,
            batch_texts,
            batch_predictions,
            batch_labels,
            batch_orig_labels,
        ):
            # Compare as strings so int and str gold labels both work.
            if str(pred) == str(real):
                orig_label_preds.append(orig_label)
                continue

            wrong_samples.append((parid, text, real))
            # `pred` is an int here; the previous comparison against the
            # string "1" was never true, so every error was imputed from
            # [0, 1] regardless of the predicted class.
            if pred == 1:  # Incorrectly predicted as patronizing
                incorrect_orig_label = random.choice([2, 3, 4])
            else:  # Incorrectly predicted as not patronizing
                incorrect_orig_label = random.choice([0, 1])
            orig_label_preds.append(incorrect_orig_label)

    # Metrics
    t1p = precision_score(labels, task1_res, pos_label=1, average="binary")
    t1r = recall_score(labels, task1_res, pos_label=1, average="binary")
    t1f = f1_score(labels, task1_res, pos_label=1, average="binary")

    print("Binary Classification Metrics:")
    print("Precision:", t1p)
    print("Recall:", t1r)
    print("F1:", t1f)
    print("-" * 40)

    # One-vs-rest F1 for each original label level.
    orig_f1_scores = {}
    for label in set(orig_labels_list):
        bin_labels = [1 if l == label else 0 for l in orig_labels_list]
        bin_preds = [1 if p == label else 0 for p in orig_label_preds]
        orig_f1_scores[label] = f1_score(bin_labels, bin_preds)

    print("Original Label F1 Scores:", orig_f1_scores)

    return wrong_samples, orig_f1_scores

In [None]:
from datasets import load_from_disk

# Validation split stored locally with the extra "orig_label" column needed
# for the per-level analysis.
valid_dataset = load_from_disk("./nlp_cw_data_valid_with_orig_labels")
gold = valid_dataset["label"]
repository_id = "ImperialIndians23/RobertaBaseUnprocessedAugmented"

# ws: misclassified samples; f1_scores: one-vs-rest F1 per original label.
ws, f1_scores = scorer(
    repository_id=repository_id,
    task1_gold=gold,
    valid_dataset=valid_dataset,
)

In [None]:
# Bar colours, matched to the original labels in ascending order.
colors = ["green", "blue", "red", "purple", "orange"]

# Sort by label so that bars, colours and legend entries line up.
labels, scores = zip(*sorted(f1_scores.items()))

plt.figure(figsize=(10, 6))
bars = plt.bar(labels, scores, color=colors)

# Axis decoration.
plt.title("F1 Scores for Original Labels")
plt.xlabel("Original Labels")
plt.ylabel("F1 Score")
plt.xticks(labels)
plt.ylim(0, 1)

# Build the legend by hand: one coloured line per original label level.
legend_elements = [
    plt.Line2D([0], [0], color=color, lw=4, label=f"Level {label}")
    for label, color in zip(labels, colors)
]
plt.legend(handles=legend_elements, title="Original Labels")

plt.show()

In [None]:
# Display the raw per-original-label F1 dictionary returned by scorer.
f1_scores

### Analysis of Q3


In [None]:
from collections import defaultdict
from transformers import pipeline
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score


def scorer(repository_id, task1_gold, valid_dataset):
    """Evaluate a Hub checkpoint and report a weighted F1 per community.

    The checkpoint is wrapped in a "text-classification" pipeline on the
    "cuda" device and run over the texts of ``valid_dataset`` eight at a time.
    Predictions and gold labels are grouped by each example's "community"
    value, and a weighted-average F1 is computed and printed for every group.

    Args:
        repository_id: Hub model id; the 'ImperialIndians23/' prefix is
            stripped for display only.
        task1_gold: Gold labels, sliceable and aligned index-for-index with
            ``valid_dataset``.
        valid_dataset: Iterable of examples exposing "text", "par_id" and
            "community".

    Returns:
        ``(community_f1_scores, wrong_samples)``: a dict mapping each
        community to its weighted F1, and a list of
        ``(par_id, text, gold_label)`` tuples for the misclassified samples.
    """
    rule = "-" * 40
    print(rule)
    print(f"Evaluating {repository_id.removeprefix('ImperialIndians23/')}")
    classifier = pipeline("text-classification", model=repository_id, device="cuda")

    step = 8
    per_community = defaultdict(lambda: {"predictions": [], "labels": []})
    misclassified = []

    all_texts = [row["text"] for row in valid_dataset]
    all_parids = [row["par_id"] for row in valid_dataset]
    all_communities = [row["community"] for row in valid_dataset]

    # Walk the validation set in fixed-size windows.
    for start in tqdm(range(0, len(all_texts), step)):
        window = slice(start, start + step)
        window_texts = all_texts[window]
        window_parids = all_parids[window]
        window_gold = task1_gold[window]
        window_communities = all_communities[window]

        window_predictions = [
            output["label"] for output in classifier(window_texts)
        ]

        for parid, text, predicted, expected, community in zip(
            window_parids,
            window_texts,
            window_predictions,
            window_gold,
            window_communities,
        ):
            # File the pair under its community for the per-group metric.
            bucket = per_community[community]
            bucket["predictions"].append(predicted)
            bucket["labels"].append(expected)

            # Keep track of every misclassified sample as well.
            if predicted != expected:
                misclassified.append((parid, text, expected))

    # Community-wise Metrics
    community_f1_scores = {}
    for community, bucket in per_community.items():
        score = f1_score(
            bucket["labels"], bucket["predictions"], average="weighted"
        )
        community_f1_scores[community] = score
        print(f"Community: {community}")
        print("F1:", score)

    print(rule)

    return community_f1_scores, misclassified

In [None]:
from datasets import load_from_disk

# Validation split stored locally; the community-wise scorer reads each
# example's "community" field from it.
valid_dataset = load_from_disk("./nlp_cw_data_valid_with_orig_labels")
gold = valid_dataset["label"]
repository_id = "ImperialIndians23/RobertaBaseUnprocessedAugmented"

# f1_scores: weighted F1 per community; ws: misclassified samples.
f1_scores, ws = scorer(
    repository_id=repository_id,
    task1_gold=gold,
    valid_dataset=valid_dataset,
)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Two-column frame (community name, F1) built from the scorer's dictionary.
score_rows = list(f1_scores.items())
community_f1_data = pd.DataFrame(score_rows, columns=["Community", "F1 Score"])

plt.figure(figsize=(10, 6))
sns.barplot(
    data=community_f1_data,
    x="Community",
    y="F1 Score",
    palette="viridis",
)

plt.title("Community-wise F1 Scores")
plt.xlabel("Community")
plt.ylabel("F1 Score")
# Rotate the x tick labels by 45 degrees.
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()