In [None]:
import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
)



In [None]:
# Load the "dair-ai/emotion" dataset; it comes with train/validation/test splits (see the repr below).
dataset = load_dataset("dair-ai/emotion")

In [None]:
# Show the split names, their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [None]:
# Convert the train split to a pandas DataFrame for exploration and display it.
df = dataset['train'].to_pandas()
df

In [None]:
# Inspect the feature schema of the train split, then print the class names
# stored on its `label` feature.
label_name = dataset['train'].features
display(label_name)
print(f" these are labels :\n {label_name['label'].names}")

{'text': Value('string'),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'])}
 these are labels :
 ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

In [None]:
# Ordered list of class names: position i is the name of integer label i.
# Later cells index into this list to translate label ids into names.
label_names = dataset['train'].features['label'].names
label_names

['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

In [None]:
# Add a human-readable class name next to every integer label, then preview the frame.
df['label_text'] = df['label'].map(lambda label_id: label_names[label_id])
display(df.head())

In [None]:
# Number of examples per class name, smallest class first (ascending=True).
label_counts = df['label_text'].value_counts(ascending=True)
label_counts

In [None]:
# Checkpoint name shared by the tokenizer, config and model cells below.
model_checkpoint = 'roberta-base'
# Tokenizer matching the checkpoint; used by `tokenize` and handed to the Trainer later.
tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Re-split the pandas frame into 70% train / 20% test / 10% validation.
# A fixed seed keeps the split (and every metric computed on it) identical
# across Restart & Run All. Both calls stratify on the integer `label`
# column so each subset keeps the original class proportions.
SPLIT_SEED = 42

# First cut: 70% train, 30% held out.
train, holdout = train_test_split(
    df, test_size=0.3, stratify=df['label'], random_state=SPLIT_SEED
)
# Second cut: the held-out 30% becomes 2/3 test and 1/3 validation.
test, validation = train_test_split(
    holdout, test_size=1/3, stratify=holdout['label'], random_state=SPLIT_SEED
)

validation.shape, test.shape, train.shape

((1600, 3), (3200, 3), (11200, 3))

In [None]:
# Show the DatasetDict once more; the repr below still has the original 16000/2000/2000 splits.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [None]:
# Rebuild `dataset` from the pandas splits, dropping the pandas index
# (preserve_index=False) so it does not become an extra column.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in (
        ('train', train),
        ('test', test),
        ('validation', validation),
    )
})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 11200
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 3200
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 1600
    })
})

In [None]:
# Spot-check one training example: its text, integer label and label name.
dataset['train'][10]

{'text': 'i feel very contented just sitting beside him without even uttering a single word',
 'label': 1,
 'label_text': 'joy'}

In [None]:
def tokenize(batch):
  """Encode the ``'text'`` entries of a batch with the notebook's tokenizer.

  Args:
    batch: mapping whose ``'text'`` entry holds the strings to encode.

  Returns:
    The tokenizer's output for those texts, produced with ``padding=True``
    and ``truncation=True``.
  """
  texts = batch['text']
  return tokenizer(texts, padding=True, truncation=True)

# Sanity-check `tokenize` on the first three training rows.
tokenize(dataset['train'][:3])

{'input_ids': [[0, 118, 619, 14, 5, 521, 40, 28, 55, 1800, 11, 5, 8171, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 118, 67, 619, 5800, 8, 7758, 8, 10513, 142, 52, 3486, 1268, 197, 33, 7, 109, 24, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 118, 2813, 939, 56, 10, 18236, 8847, 14, 2771, 5, 38769, 8, 29841, 22887, 21862, 3245, 25283, 14215, 118, 8, 5, 12343, 18236, 939, 619, 101, 209, 80, 10230, 35734, 5, 1423, 179, 8, 1423, 1097, 50, 11, 42, 403, 5, 4045, 8, 14065, 4405, 2380, 9, 5, 1969, 1035, 375, 281, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [None]:
# Tokenize every split in batched mode.
# NOTE(review): batch_size=None should hand each split to `tokenize` as one
# batch, so padding=True pads to the longest text of that split — confirm
# against the `datasets` `Dataset.map` documentation.
encoded = dataset.map(tokenize, batched=True, batch_size=None)

In [None]:
# Show which columns tokenization added to each split.
encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11200
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3200
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1600
    })
})

In [None]:
# Load the pretrained checkpoint through `AutoModel` and display its architecture.
# NOTE(review): `model` is rebound to a RobertaForSequenceClassification in a
# later cell, so this instance is only used for inspection here.
model = AutoModel.from_pretrained(model_checkpoint)
model

In [None]:
# Show the checkpoint's default label mappings (generic LABEL_0/LABEL_1 placeholders in the output below).
model.config.id2label, model.config.label2id

({0: 'LABEL_0', 1: 'LABEL_1'}, {'LABEL_0': 0, 'LABEL_1': 1})


In [None]:
# Build both directions of the class mapping from the ordered name list:
# integer id -> name, and name -> integer id.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}
label2id, id2label

({'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5},
 {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'})

In [None]:
# Start from the checkpoint's config, overriding the label mappings with the
# emotion classes, then load the sequence-classification model with that config.
config = AutoConfig.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)
model = RobertaForSequenceClassification.from_pretrained(
    model_checkpoint,
    config=config,
)

In [None]:
# Hyper-parameters for fine-tuning. The same batch size is used for training
# and evaluation; checkpoints and logs go to `training_dir`.
batch_size = 64
training_dir = 'roberta_base'
training_args = TrainingArguments(output_dir=training_dir,
                                  overwrite_output_dir=True,
                                  num_train_epochs=5,
                                  learning_rate=2e-5,
                                  per_device_eval_batch_size=batch_size,
                                  per_device_train_batch_size=batch_size,
                                  weight_decay=0.01,
                                  # Evaluate on the validation set after every epoch.
                                  eval_strategy='epoch',
                                  # Log the training loss once per epoch as well. With the
                                  # default step-based logging the results table showed
                                  # "No log" for the first epochs and then repeated one
                                  # stale loss value for the rest.
                                  logging_strategy='epoch',
                                  disable_tqdm=False)

In [None]:
def compute_metrics(pred):
  """Compute accuracy and weighted F1 for one evaluation pass.

  Args:
    pred: object exposing ``label_ids`` (reference labels) and
      ``predictions`` (per-class scores; argmax is taken over the last axis).

  Returns:
    dict with the keys ``"accuracy"`` and ``'f1 score'``.
  """
  y_true = pred.label_ids
  # The highest-scoring class along the last axis is the predicted label.
  y_pred = pred.predictions.argmax(-1)
  return {
      "accuracy": accuracy_score(y_true, y_pred),
      'f1 score': f1_score(y_true, y_pred, average='weighted'),
  }

In [None]:
# Wire the model, hyper-parameters, tokenized splits, tokenizer and metric
# function into a Trainer. Training uses the train split; evaluation uses
# the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded['train'],
    eval_dataset=encoded['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer

In [None]:
# Fine-tune the model; the validation metrics are reported once per epoch (see the table below).
trainer.train()

## Epoch	Training Loss	Validation Loss	Accuracy	F1 score
1	No log	0.329966	0.880000	0.877516
2	No log	0.222665	0.913750	0.912874
3	0.449900	0.207226	0.921250	0.922132
4	0.449900	0.185305	0.925625	0.926028
5	0.449900	0.184957	0.928750	0.928974
TrainOutput(global_step=875, training_loss=0.31117589024135045, metrics={'train_runtime': 934.5234, 'train_samples_per_second': 59.924, 'train_steps_per_second': 0.936, 'total_flos': 2532534859008000.0, 'train_loss': 0.31117589024135045, 'epoch': 5.0})

In [None]:
# Save the fine-tuned model under 'roberta_classification'; the inference cell below loads it from there.
trainer.save_model('roberta_classification')

In [None]:
# Reload the fine-tuned model and tokenizer from the 'roberta_classification'
# directory and run a quick smoke test on a few hand-written sentences.
# `pipeline` is imported together with the other transformers names in the
# imports cell at the top of the notebook.
classifier = pipeline("text-classification", model="roberta_classification", tokenizer="roberta_classification")
texts = ["This is great!", "I feel lonely", "I hate it."]
results = classifier(texts)
print(results)


[{'label': 'joy', 'score': 0.9964421391487122}, {'label': 'sadness', 'score': 0.997150719165802}, {'label': 'anger', 'score': 0.9469196796417236}]