In [2]:
# Import dependencies
import os
import numpy as np
import matplotlib.pyplot as plt

import torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer

from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from datasets import load_dataset

In [None]:
# Install dependencies
%pip install evaluate
%pip install transformers[torch]
%pip install accelarte
%pip install datasets

In [10]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') # To train our model on GPU if available

torch.cuda.empty_cache()

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

In [4]:
id2label = {
"POLITICS": 0,
"WELLNESS": 1,
"ENTERTAINMENT": 2,
"PARENTING": 3,
"STYLE & BEAUTY" : 4,
"OTHER": 5,
"TRAVEL": 6,
"WORLD NEWS": 7,
"FOOD & DRINK": 8,
"FINANCE": 9,
"SPORTS": 10,
"SCIENCE & TECH": 11,
"ENVIRONMENT": 12,
"ARTS & CULTURE" : 13,
"CRIME" : 14,
"RELIGION": 15
}

label2id = {}
for key in id2label.keys():
  label2id[id2label[key]] = key

In [5]:
def compute_metrics(logits_and_labels):
  '''
    The Trainer class uses this function to compute the metrics like Accuracy, F1 score, etc. for the test set
  '''
  logits, labels = logits_and_labels
  predictions = np.argmax(logits, axis=-1)
  acc = accuracy_score(labels, predictions)
  f1 = f1_score(labels, predictions, average = 'micro')

  conv_labels = [label2id[x] for x in labels]
  conv_predicions = [label2id[x] for x in predictions]
 
  print("*************** Confusion Matrix **************************")
  cm = confusion_matrix(conv_labels, conv_predicions)
  disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=id2label.keys())
  disp.plot(xticks_rotation=90, cmap="")
  plt.show()

  print("*************** Classification Report **************************")
  print(classification_report(conv_labels, conv_predicions))

  return {'accuracy': acc, 'f1_score': f1}

In [6]:
# Using the pre-trained XLNet Tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

def tokenize_fn(batch):
  return tokenizer(batch['full_review'], truncation = True)

In [7]:
# Loading the pre-trained xlnet-base-cased model for predicting 10 classes
classes = 10
model = AutoModelForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=classes)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Loading pre-processed data
raw_dataset = load_dataset('csv', data_files = "./top_9.csv")

split = raw_dataset['train'].train_test_split(test_size=0.2, seed=42)

# Tokenizing the dataset
tokenized_dataset = split.map(tokenize_fn, batched = True)

Map:   0%|          | 0/27430 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [12]:
# Defining Training Arguments for model
training_args = TrainingArguments(
    output_dir="XLNet_Top_10",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    use_cpu=True
)

# Initializing the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Training the model
trainer.train()
# trainer.train("./XLNet_Top_10/checkpoint-65000") #To load the model from a checkpoint and resume training

In [None]:
# To make evaluation on specific checkpoint
trainer._load_from_checkpoint("./XLNet_Top_10/checkpoint-65000")
trainer.evaluate(tokenized_dataset["test"])