In [1]:
!pip install neattext

Collecting neattext
  Downloading neattext-0.1.3-py3-none-any.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.7/114.7 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neattext
Successfully installed neattext-0.1.3


In [2]:
import pandas as pd
import numpy as np
import joblib
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import os

In [4]:
df = pd.read_csv("/content/emotion_dataset_raw.csv")

In [5]:
import neattext.functions as nfx
df['Clean_Text'] = df['Text'].apply(nfx.remove_userhandles)
df['Clean_Text'] = df['Clean_Text'].apply(nfx.remove_stopwords)

In [6]:
Xfeatures = df['Clean_Text']
ylabels = df['Emotion']
x_train, x_test, y_train, y_test = train_test_split(Xfeatures, ylabels, test_size=0.3, random_state=42)


In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
max_seq_length = 128

def tokenize_texts(texts, max_length):
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            truncation=True,
                            return_attention_mask=True,
                            return_tensors='pt'
                       )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    return input_ids, attention_masks

x_train_ids, x_train_masks = tokenize_texts(x_train, max_seq_length)
x_test_ids, x_test_masks = tokenize_texts(x_test, max_seq_length)

In [9]:
class BERTbiLSTMModel(nn.Module):
    def __init__(self, bert_model, hidden_size, num_classes):
        super(BERTbiLSTMModel, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        self.bilstm = nn.LSTM(input_size=bert_model.config.hidden_size,
                              hidden_size=hidden_size,
                              num_layers=1,
                              batch_first=True,
                              bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        bilstm_output, _ = self.bilstm(pooled_output.unsqueeze(1))  # Add an additional dimension
        logits = self.fc(bilstm_output[:, -1, :])  # Take the last hidden state
        return logits


bert_model = BertModel.from_pretrained('bert-base-uncased')
num_classes = len(df['Emotion'].unique())
hidden_size = 128
model = BERTbiLSTMModel(bert_model, hidden_size, num_classes)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# Convert emotion labels to numerical format
emotion_to_label = {emotion: label for label, emotion in enumerate(df['Emotion'].unique())}
y_train_tensor = torch.tensor([emotion_to_label[emotion] for emotion in y_train.values])
y_test_tensor = torch.tensor([emotion_to_label[emotion] for emotion in y_test.values])

batch_size = 32
train_dataset = TensorDataset(x_train_ids, x_train_masks, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = TensorDataset(x_test_ids, x_test_masks, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for input_ids, attention_mask, labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_loader)
    print(f'Average training loss: {avg_train_loss:.4f}')

torch.save(model.state_dict(), 'emotion_classifier_bert_bilstm.pt')


Epoch 1/3: 100%|██████████| 486/486 [10:02<00:00,  1.24s/it]


Average training loss: 1.3932


Epoch 2/3: 100%|██████████| 486/486 [09:53<00:00,  1.22s/it]


Average training loss: 0.9575


Epoch 3/3: 100%|██████████| 486/486 [09:50<00:00,  1.21s/it]


Average training loss: 0.6985


In [12]:
import torch
from transformers import BertTokenizer, BertModel
import torch.nn as nn

# Load pre-trained BERT model and tokenizer
bert_model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

model.load_state_dict(torch.load('/content/emotion_classifier_bert_bilstm.pt'))
model.eval()

# Function to predict emotion for a given sentence
def predict_emotion(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", max_length=128, truncation=True, padding='max_length')
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        probabilities = torch.softmax(logits, dim=1)
        print(probabilities)
        predicted_class = torch.argmax(probabilities, dim=1).item()
    return predicted_class

# Example usage
sentence = "I am feeling annoying today"
predicted_emotion = predict_emotion(sentence)
print("Predicted Emotion:", predicted_emotion)


tensor([[0.0152, 0.6036, 0.1534, 0.0196, 0.0314, 0.1512, 0.0021, 0.0235]])
Predicted Emotion: 1


In [13]:
emotion_to_label

{'neutral': 0,
 'joy': 1,
 'sadness': 2,
 'fear': 3,
 'surprise': 4,
 'anger': 5,
 'shame': 6,
 'disgust': 7}