In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments, get_scheduler
import evaluate
from datasets import load_dataset
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Choosing distilbert due to its smaller size while maintaining high accuracy
model_checkpoint = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
transformer_backbone = AutoModel.from_pretrained(model_checkpoint)

In [5]:
# Custom NLP Model class
# Contains multiple heads for different learning tasks with a forward pass to run them both

class CustomModelNLP(nn.Module):
    def __init__(self, transformer_backbone):
        super(CustomModelNLP, self).__init__()
        # Number of labels for sentence classification
        self.nLabels = 6
        #Number of labels for sentiment analysis
        self.nSentinment = 2

        #Model Layers
        self.transformer_backbone = transformer_backbone
        self.classifier = nn.Linear(transformer_backbone.config.hidden_size, self.nLabels)
        self.sentiment = nn.Linear(transformer_backbone.config.hidden_size, self.nSentinment)

        # Freeze transformer
        for param in self.transformer_backbone.parameters():
            param.requires_grad = False


    def forward(self, input_ids, attention_mask, task_id):
        out = self.transformer_backbone(input_ids=input_ids, attention_mask=attention_mask)
        embedding = out.last_hidden_state[:, 0, :]

        if task_id == 0:
            out = self.classifier(embedding)
        elif task_id == 1:
            out = self.sentiment(embedding)
        else:
            assert False, 'Bad Task ID'
        
        return out

#Tokenization function
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding='max_length', max_length=512)

customModel = CustomModelNLP(transformer_backbone)  

In [6]:
# Tokenize and get embeddings
sentences = ["The old clock tower chimed, echoing through the quiet city streets.", 
             "A vibrant tapestry of colors adorned the window, catching the afternoon sunlight.", 
             "The children giggled as the playful puppy chased its tail in the park."]
task_ids = [0,1,0]

print("="*40)
print("Sample tokenization output. Tensor output hidden due to length")
print("="*40)
print("\n")


for sentence in sentences:
    tokenized_sentence = tokenizer(sentence, return_tensors="pt")
    output = transformer_backbone(input_ids=tokenized_sentence.input_ids, attention_mask=tokenized_sentence.attention_mask)
    embedding = output.last_hidden_state[:, 0, :] 
    print(embedding.shape)
#print(embedding)

print("\n")
print("="*40)
print("Sample model output depending on task id")
print("="*40)
print("\n")

for sentence, task_id in zip(sentences, task_ids):
    inputs = tokenizer(sentence, return_tensors="pt")
    print(inputs)
    with torch.no_grad():
        output = customModel(input_ids = inputs.input_ids, attention_mask = inputs.attention_mask, task_id = task_id)
        
    print(f"Sentence: {sentence}")
    print("Output:", output)

Sample tokenization output. Tensor output hidden due to length


torch.Size([1, 768])
torch.Size([1, 768])
torch.Size([1, 768])


Sample model output depending on task id


{'input_ids': tensor([[  101,  1996,  2214,  5119,  3578, 27460,  1010, 17142,  2083,  1996,
          4251,  2103,  4534,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Sentence: The old clock tower chimed, echoing through the quiet city streets.
Output: tensor([[ 0.4417, -0.1121, -0.0081,  0.1835, -0.0344, -0.0517]])
{'input_ids': tensor([[  101,  1037, 17026, 25213,  1997,  6087, 19189,  1996,  3332,  1010,
          9105,  1996,  5027,  9325,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Sentence: A vibrant tapestry of colors adorned the window, catching the afternoon sunlight.
Output: tensor([[ 0.0098, -0.1250]])
{'input_ids': tensor([[  101,  1996,  2336, 15889,  2004,  1996, 18378, 17022, 13303,  2049,
          5725

In [7]:
# Load Data & Tokenize
emotion_data_split = load_dataset("dair-ai/emotion", "split")
sentiment_data = load_dataset("gxb912/large-twitter-tweets-sentiment")
tokenized_emotion_data = emotion_data_split.map(tokenize_function, batched=True)
tokenized_sentiment_data = sentiment_data.map(tokenize_function, batched=True)
print(tokenized_emotion_data["train"].shape)
print(tokenized_sentiment_data["train"].features)

(16000, 4)
{'sentiment': Value(dtype='int64', id=None), 'text': Value(dtype='string', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


In [8]:
# Put data into a dataloader object
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

emotion_train_dataloader = DataLoader(
    tokenized_emotion_data['train'].remove_columns(["text"]), shuffle = True, batch_size = 32, collate_fn = data_collator
)
emotion_validation_dataloader = DataLoader(
    tokenized_emotion_data['validation'].remove_columns(["text"]), shuffle = True, batch_size = 32, collate_fn = data_collator
)

sentiment_train_dataloader = DataLoader(
    tokenized_sentiment_data['train'].remove_columns(["text"]), shuffle = True, batch_size = 32, collate_fn = data_collator
)

sentiment_validation_dataloader = DataLoader(
    tokenized_sentiment_data['test'].remove_columns(["text"]), shuffle = True, batch_size = 32, collate_fn = data_collator
)

In [9]:
# Training loop parameters
learning_rate = 1e-6
epochs = 5
steps = epochs * len(emotion_train_dataloader)
optimizer = torch.optim.Adam(customModel.parameters(), lr = learning_rate)

# Loss & Metric Functions
emotion_loss_fn = nn.CrossEntropyLoss()
sentiment_loss_fn = nn.CrossEntropyLoss()
metric = evaluate.load("accuracy")

In [13]:
# Set up cuda device if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
customModel = customModel.to(device)

cuda


In [11]:
# Progress bars
progress_bar_train = tqdm(range(steps))
progress_bar_eval = tqdm(range(epochs * len(emotion_validation_dataloader) ))

# Training Loop
for epoch in range(epochs):
    # Zip both data sets
    train_zip_dl = zip(emotion_train_dataloader, sentiment_train_dataloader)
    validation_zip_dl = zip(emotion_validation_dataloader, sentiment_validation_dataloader)

    # Train
    customModel.train()
    for emotion_batch, sentiment_batch in train_zip_dl:
        emotion_batch.to(device)
        sentiment_batch.to(device)
        emotions_predicitons = customModel(emotion_batch['input_ids'], emotion_batch['attention_mask'], task_id = 0)
        emotions_loss = emotion_loss_fn(emotions_predicitons, emotion_batch['labels'])
        sentiment_predicitons = customModel(sentiment_batch['input_ids'], sentiment_batch['attention_mask'], task_id = 1)
        sentiment_loss = sentiment_loss_fn(torch.argmax(sentiment_predicitons, dim = -1).float(), sentiment_batch['sentiment'].float())
        loss = emotions_loss + sentiment_loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        progress_bar_train.update(1)

    # Evaluate
    customModel.eval()
    for emotion_batch, sentiment_batch in validation_zip_dl:
        emotion_batch.to(device)
        sentiment_batch.to(device)
        emotions_predicitons = customModel(emotion_batch['input_ids'], emotion_batch['attention_mask'], task_id = 0)
        sentiment_predicitons = customModel(sentiment_batch['input_ids'], sentiment_batch['attention_mask'], task_id = 1)

        # Add predictions to metric
        metric.add_batch(predictions = torch.argmax(emotions_predicitons, dim=-1), references = emotion_batch['labels'])   
        metric.add_batch(predictions = torch.argmax(sentiment_predicitons, dim=-1), references = sentiment_batch['sentiment']) 
        progress_bar_eval.update(1)

  0%|                                                                                          | 0/2500 [00:00<?, ?it/s]
 20%|████████████████                                                                | 500/2500 [01:35<06:17,  5.29it/s][A
  0%|▎                                                                                | 1/315 [01:35<8:19:57, 95.53s/it][A
  1%|▌                                                                                | 2/315 [01:35<3:25:51, 39.46s/it][A
  1%|▊                                                                                | 3/315 [01:35<1:51:57, 21.53s/it][A
  1%|█                                                                                | 4/315 [01:36<1:07:57, 13.11s/it][A
  2%|█▎                                                                                 | 5/315 [01:36<43:40,  8.45s/it][A
  2%|█▌                                                                                 | 6/315 [01:36<29:04,  5.65s/it][A
  2%|█▊    

In [12]:
print(metric.compute())

{'accuracy': 0.4097609561752988}
