In [3]:
# https://huggingface.co/datasets/Sp1786/multiclass-sentiment-analysis-dataset 
import numpy as np
import pandas as pd
import torch
torch.manual_seed(42) 
from transformers import logging
logging.set_verbosity_error()

In [13]:

splits = {'train': 'train_df.csv', 'validation': 'val_df.csv', 'test': 'test_df.csv'}
df = pd.read_csv("hf://datasets/Sp1786/multiclass-sentiment-analysis-dataset/" + splits["train"])
df_test = pd.read_csv("hf://datasets/Sp1786/multiclass-sentiment-analysis-dataset/" + splits["test"])
df.head

<bound method NDFrame.head of           id                                               text  label  \
0       9536                    Cooking microwave pizzas, yummy      2   
1       6135  Any plans of allowing sub tasks to show up in ...      1   
2      17697   I love the humor, I just reworded it. Like sa...      2   
3      14182                       naw idk what ur talkin about      1   
4      17840          That sucks to hear. I hate days like that      0   
...      ...                                                ...    ...   
31227   6265   Grrrr....I got the wrong size coat for the sheep      0   
31228  11284                              4 cases of swine flu!      1   
31229   6436                                          excellent      2   
31230    860  is sitting thru the boring bits in Titanic wai...      1   
31231  15795                                    Missed the play      0   

      sentiment  
0      positive  
1       neutral  
2      positive  
3       n

In [23]:
df.label.value_counts()

label
1    11649
2    10478
0     9105
Name: count, dtype: int64

In [31]:
print('Number of labels:', df.label.nunique())

Number of labels: 3


In [33]:
train_texts = df['text'].tolist()
train_labels = df['label'].tolist()

In [37]:
import re
def tokenize(text):
    tokenized_review_text = re.findall(r'\b\w+\b', text.lower())
    return tokenized_review_text

tokenized_corpus = [tokenize(review_text) for review_text in train_texts]
tokenized_corpus[0]

['cooking', 'microwave', 'pizzas', 'yummy']

In [39]:
from collections import Counter
combined_corpus = []
for text in tokenized_corpus:
    for token in text:
        combined_corpus.append(token)

word_freqs = Counter(combined_corpus)

In [41]:
MAX_VOCAB_SIZE = 1000
most_common_words = word_freqs.most_common(MAX_VOCAB_SIZE)
print("Top 10 Most Common Words: ", most_common_words[0:10])

Top 10 Most Common Words:  [('i', 24290), ('to', 18191), ('the', 18108), ('it', 12243), ('a', 11488), ('and', 10254), ('my', 7651), ('is', 7306), ('you', 7123), ('for', 6898)]


In [43]:
vocab = {word: idx + 2 for idx, (word, freq) in enumerate(most_common_words)}
vocab['<unk>'] = 0
vocab['<pad>'] = 1 

In [45]:
def encode_text(text, vocab):
    tokenized_text = tokenize(text)
    encoded_text = [vocab.get(word, vocab['<unk>']) for word in tokenized_text]
    return encoded_text

In [47]:
def pad_or_truncate(encoded_text, max_len):
    if len(encoded_text) > max_len:
        return encoded_text[:max_len]
    else:
        return encoded_text + [vocab['<pad>']] * (max_len - len(encoded_text)) 

In [49]:
MAX_SEQ_LENGTH = 128
padded_text_seqs = [pad_or_truncate(encode_text(text, vocab), max_len=MAX_SEQ_LENGTH) for text in train_texts]

In [55]:
import torch
X_tensor = torch.tensor(padded_text_seqs)
y_tensor = torch.tensor(train_labels, dtype=torch.long)

from torch.utils.data import DataLoader, TensorDataset
batch_size = 16
train_dataset = TensorDataset(X_tensor, y_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [57]:
import torch.nn as nn
torch.manual_seed(42) 

class SimpleNNWithEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size):
        super(SimpleNNWithEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.fc1 = nn.Linear(embed_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        x = torch.mean(x, dim=1)
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        return x

In [59]:
vocab_size = len(vocab)
embed_size = 50 
hidden_size = 100
output_size = 3

text_classifier_nn = SimpleNNWithEmbedding(vocab_size, embed_size, hidden_size, output_size)
print(text_classifier_nn)

SimpleNNWithEmbedding(
  (embedding): Embedding(1002, 50)
  (fc1): Linear(in_features=50, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=3, bias=True)
)


In [61]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(text_classifier_nn.parameters(), lr=0.005)

In [63]:
def train_model(model, train_loader, criterion, optimizer, num_epochs=5):
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            output = model(batch_X)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        avg_loss = epoch_loss / len(train_loader)
        if (epoch + 1) % 5 == 0:
            print(f"[Epoch {epoch + 1}/{num_epochs}], Average CE Loss: {avg_loss:.4f}")

train_model(text_classifier_nn, train_dataloader, criterion, optimizer, num_epochs=50)

[Epoch 5/50], Average CE Loss: 0.7451
[Epoch 10/50], Average CE Loss: 0.6842
[Epoch 15/50], Average CE Loss: 0.6340
[Epoch 20/50], Average CE Loss: 0.5839
[Epoch 25/50], Average CE Loss: 0.5389
[Epoch 30/50], Average CE Loss: 0.5020
[Epoch 35/50], Average CE Loss: 0.4679
[Epoch 40/50], Average CE Loss: 0.4371
[Epoch 45/50], Average CE Loss: 0.4192
[Epoch 50/50], Average CE Loss: 0.3960


In [67]:
test_texts = df['text'].tolist()
test_labels = df['label'].tolist()
padded_text_seqs_test = [pad_or_truncate(encode_text(test_seq, vocab), MAX_SEQ_LENGTH) for test_seq in test_texts]

In [69]:
X_tensor_test = torch.tensor(padded_text_seqs_test)
y_tensor_test = torch.tensor(test_labels, dtype=torch.long)

from torch.utils.data import DataLoader, TensorDataset
test_dataset = TensorDataset(X_tensor_test, y_tensor_test)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [71]:
import torch.nn.functional as F

def get_predictions_and_probabilities(model, test_loader):
    model.eval()

    all_probs = [] 
    all_labels = []

    with torch.no_grad(): 
        for batch_X, batch_y in test_loader:
            outputs = model(batch_X)
            probs = F.softmax(outputs, dim=1)  
            all_probs.extend(probs.cpu().numpy())
            predicted_labels = torch.argmax(outputs, dim=1)
            all_labels.extend(predicted_labels.cpu().numpy())

    return all_probs, all_labels

pred_probs, pred_labels = get_predictions_and_probabilities(text_classifier_nn, test_loader)

In [73]:
from sklearn.metrics import confusion_matrix, classification_report
conf_matrix = confusion_matrix(test_labels, pred_labels)
report = classification_report(test_labels, pred_labels)

print(conf_matrix)
print(report)

[[7379 1264  462]
 [ 955 9637 1057]
 [ 235  922 9321]]
              precision    recall  f1-score   support

           0       0.86      0.81      0.84      9105
           1       0.82      0.83      0.82     11649
           2       0.86      0.89      0.87     10478

    accuracy                           0.84     31232
   macro avg       0.85      0.84      0.84     31232
weighted avg       0.84      0.84      0.84     31232

