In [None]:
# Colab-only step: expose Google Drive under /content/gdrive, the directory the
# dataset path used later in this notebook points into.
from google.colab import drive
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted (useful when re-running this cell) — confirm against Colab docs.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from tqdm import tqdm

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Define a simple dataset class
class EmotionDataset(Dataset):
    """Map-style dataset pairing whitespace-tokenised texts with integer labels.

    Args:
        texts: Sequence of raw text strings; each one is split on whitespace.
        labels: Sequence (parallel to ``texts``) of values parseable by ``int``.
        word_to_idx: Mapping from token string to integer vocabulary index.
    """

    def __init__(self, texts, labels, word_to_idx):
        self.texts = texts
        self.labels = labels
        self.word_to_idx = word_to_idx

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return one sample as ``{'text': LongTensor, 'label': 0-dim int64 tensor}``.

        Tokens that are missing from ``word_to_idx`` are skipped instead of
        raising ``KeyError``; this is the same filtering the notebook's
        prediction loop applies to its inputs.
        """
        text = [
            self.word_to_idx[word]
            for word in self.texts[idx].split()
            if word in self.word_to_idx
        ]

        try:
            label = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        except ValueError:
            # Best-effort fallback kept from the original behaviour: a label
            # that cannot be parsed as an integer is mapped to class 0.
            label = torch.tensor(0, dtype=torch.long)

        return {'text': torch.LongTensor(text), 'label': label}

def collate_batch(batch):
    """Merge a list of dataset samples into a single padded batch.

    Args:
        batch: List of dicts, each holding a 1-D ``'text'`` tensor and a
            ``'label'`` tensor.

    Returns:
        Dict with ``'text'`` (sequences right-padded with 0 to the longest one,
        batch dimension first) and ``'label'`` (the labels stacked together).
    """
    sequences, targets = [], []
    for sample in batch:
        sequences.append(sample['text'])
        targets.append(sample['label'])

    return {
        'text': pad_sequence(sequences, batch_first=True, padding_value=0),
        'label': torch.stack(targets),
    }

def read_data_from_csv(file_path):
    """Read ``(texts, labels)`` from a CSV file whose first row is a header.

    The first column is taken as the text and the second as the label. Rows
    with fewer than two columns (including blank lines) are skipped. Parsing
    is done with the ``csv`` module so that quoted fields containing commas
    stay in one piece and their surrounding quotes are removed.

    Args:
        file_path: Path of the CSV file to read (decoded as UTF-8).

    Returns:
        Tuple ``(texts, labels)`` of two equally long lists of strings.
    """
    import csv  # local import so this function is self-contained

    texts = []
    labels = []
    # newline='' is the csv module's recommended way to open files so that
    # newlines embedded in quoted fields are handled by the parser.
    with open(file_path, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        # Skip the header row; the default avoids StopIteration on an empty file.
        next(reader, None)
        for row in reader:
            if len(row) >= 2:
                texts.append(row[0])
                # Drop stray whitespace/quotes so equal labels compare equal.
                labels.append(row[1].strip().strip('"'))
    return texts, labels

# Improved Emotion Analysis Model
class ImprovedEmotionAnalysisModel(nn.Module):
    """Bidirectional-LSTM classifier over zero-padded token-index sequences.

    Index 0 is treated as padding: it must only appear as trailing padding
    and must not be used for a real token.

    Args:
        vocab_size: Number of rows in the embedding table (including index 0).
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of each LSTM direction.
        output_dim: Number of output classes (logits).
        pretrained_embeddings: Optional NumPy array of shape
            ``(vocab_size, embedding_dim)``; when given it is copied into the
            embedding table, which is then frozen.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pretrained_embeddings=None):
        super().__init__()
        # padding_idx=0 keeps the padding row out of the gradient updates.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
            self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return class logits of shape ``(batch, output_dim)``.

        Args:
            x: LongTensor of shape ``(batch, seq_len)``, right-padded with 0.
        """
        embedded = self.embedding(x)

        # True (unpadded) length of every sequence. Clamp to 1 because packing
        # rejects zero-length sequences; lengths must live on the CPU.
        lengths = (x != 0).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )

        # Packing stops the LSTM at each sequence's real end, so the result no
        # longer depends on how much padding the batch happens to contain.
        _, (h_n, _) = self.lstm(packed)

        # h_n[-2] / h_n[-1] are the final states of the forward and backward
        # directions; each has read the whole sequence, unlike the backward
        # output at the last timestep, which has seen a single token.
        features = torch.cat((h_n[-2], h_n[-1]), dim=1)
        features = self.dropout(features)
        return self.fc(features)

# Read data from CSV file
file_path = '/content/gdrive/MyDrive/DS677 - Fall 23 - DL Project - Paresh, Ojaswi, Dinesh/TextAndEmotions.csv'
texts, labels = read_data_from_csv(file_path)

# Create vocabulary and word_to_idx mapping.
# Indices start at 1 because collate_batch pads sequences with 0.
# The vocabulary is sorted before numbering: iterating a set of strings yields
# a different order from one interpreter run to the next (hash randomisation),
# which would make the token -> index mapping non-reproducible.
vocab = set(' '.join(texts).split())
word_to_idx = {word: idx + 1 for idx, word in enumerate(sorted(vocab))}

# Split into train/test sets (fixed random_state for a repeatable split)
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

train_dataset = EmotionDataset(train_texts, train_labels, word_to_idx)
test_dataset = EmotionDataset(test_texts, test_labels, word_to_idx)

# Batches are padded to a common length by collate_batch
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

# Model parameters
embedding_dim = 50
hidden_dim = 100
# One output logit per distinct label string found in the CSV
output_dim = len(set(labels))

# Initialize the model and choose the appropriate pre-trained embeddings
# Download GloVe embeddings from https://nlp.stanford.edu/projects/glove/ and adjust the path
# Example code to load GloVe embeddings:
# embeddings_index = {}
# with open('path_to_glove/glove.6B.50d.txt', 'r', encoding='utf-8') as f:
#     for line in f:
#         values = line.split()
#         word = values[0]
#         coefs = np.asarray(values[1:], dtype='float32')
#         embeddings_index[word] = coefs
# embedding_matrix = np.zeros((len(word_to_idx) + 1, embedding_dim))
# for word, i in word_to_idx.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector
# model = ImprovedEmotionAnalysisModel(
#     vocab_size=len(word_to_idx) + 1,
#     embedding_dim=embedding_dim,
#     hidden_dim=hidden_dim,
#     output_dim=output_dim,
#     pretrained_embeddings=embedding_matrix,
# ).to(device)

# Seed PyTorch's global RNG (same value as the train/test split) so weight
# initialisation, DataLoader shuffling and dropout are repeatable between runs,
# as far as the compute backend allows.
torch.manual_seed(42)

# Default: randomly initialised, trainable embeddings.
# vocab_size is len(word_to_idx) + 1 because index 0 is reserved for padding.
model = ImprovedEmotionAnalysisModel(
    vocab_size=len(word_to_idx) + 1,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
).to(device)

# Training parameters: cross-entropy over class logits, Adam optimiser
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 50

# Training loop: one pass over train_loader per epoch, reporting the mean batch loss
for epoch in range(epochs):
    model.train()
    total_loss = 0
    progress = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}', leave=False)
    for batch in progress:
        text = batch['text'].to(device)
        label = batch['label'].to(device)

        optimizer.zero_grad()
        output = model(text)
        loss = criterion(output, label)
        loss.backward()

        # Clip the global gradient norm to 1 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')

# Evaluation: collect predictions over the whole test set, then score them
model.eval()
all_predictions, all_labels = [], []

# No gradients are needed for inference
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Evaluating', leave=False):
        text = batch['text'].to(device)
        label = batch['label'].to(device)

        output = model(text)
        # Predicted class = index of the largest logit
        predictions = output.argmax(dim=1)

        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(label.cpu().numpy())

accuracy = accuracy_score(all_labels, all_predictions)
print(f'Test Accuracy: {accuracy * 100:.2f}%')




Epoch 1/50, Loss: 4.298942142062717




Epoch 2/50, Loss: 2.4066073894500732




Epoch 3/50, Loss: 0.8778071800867716




Epoch 4/50, Loss: 0.5014093981848823




Epoch 5/50, Loss: 0.4190385606553819




Epoch 6/50, Loss: 0.4098073012299008




Epoch 7/50, Loss: 0.39207568764686584




Epoch 8/50, Loss: 0.4053659869564904




Epoch 9/50, Loss: 0.38669821951124406




Epoch 10/50, Loss: 0.38769100109736127




Epoch 11/50, Loss: 0.39474110470877755




Epoch 12/50, Loss: 0.39178109500143266




Epoch 13/50, Loss: 0.3780011369122399




Epoch 14/50, Loss: 0.384607646200392




Epoch 15/50, Loss: 0.34878357582622105




Epoch 16/50, Loss: 0.33234446081850266




Epoch 17/50, Loss: 0.3111840420299106




Epoch 18/50, Loss: 0.2962263855669234




Epoch 19/50, Loss: 0.2789573238955604




Epoch 20/50, Loss: 0.23815032177501255




Epoch 21/50, Loss: 0.21328215135468376




Epoch 22/50, Loss: 0.1740485429763794




Epoch 23/50, Loss: 0.15266337659623888




Epoch 24/50, Loss: 0.13207165648539862




Epoch 25/50, Loss: 0.11312453986869918




Epoch 26/50, Loss: 0.11571580585506228




Epoch 27/50, Loss: 0.09975932165980339




Epoch 28/50, Loss: 0.09674576297402382




Epoch 29/50, Loss: 0.08749340826438533




Epoch 30/50, Loss: 0.08410173861516847




Epoch 31/50, Loss: 0.08670817501842976




Epoch 32/50, Loss: 0.07847022472156419




Epoch 33/50, Loss: 0.07839789179464181




Epoch 34/50, Loss: 0.08933944130937259




Epoch 35/50, Loss: 0.07552709989249706




Epoch 36/50, Loss: 0.0832259746061431




Epoch 37/50, Loss: 0.08482471501661672




Epoch 38/50, Loss: 0.077828049659729




Epoch 39/50, Loss: 0.08589657851391369




Epoch 40/50, Loss: 0.06929195848190123




Epoch 41/50, Loss: 0.07019840718971358




Epoch 42/50, Loss: 0.07724801844192876




Epoch 43/50, Loss: 0.04984689628084501




Epoch 44/50, Loss: 0.04687118778626124




Epoch 45/50, Loss: 0.03701224115987619




Epoch 46/50, Loss: 0.02200316648102469




Epoch 47/50, Loss: 0.02892321317146222




Epoch 48/50, Loss: 0.025212829052988026




Epoch 49/50, Loss: 0.02620111669724186




Epoch 50/50, Loss: 0.029320076832340822




Test Accuracy: 93.75%


In [None]:
# Sample inputs
sample_inputs = [
    "I love the new feature!",
    "I am satisfied with the customer service.",
    "This movie is not good!",
    "This software have lots of features.",
    "Incredible customer experience!",
    "I am anxious"
]

# Map the predicted class to the corresponding emotion label.
# Built once here rather than on every loop iteration.
emotion_labels = {0: "Negative", 1: "Positive"}  # Update with your specific labels

# Inference mode (disables dropout); only needs to be set once, not per sample
model.eval()

# Predict emotions for each sample input
for sample_input in sample_inputs:
    # Preprocess the sample input: whitespace tokens, dropping out-of-vocabulary words.
    # NOTE: punctuation stays attached to words (e.g. "feature!"), so such tokens
    # only match if the same form occurs in the vocabulary.
    sample_input_indices = [word_to_idx[word] for word in sample_input.split() if word in word_to_idx]

    # Nothing left to feed the model after filtering
    if not sample_input_indices:
        print(f"No valid words found in the sample input: {sample_input}")
        continue

    # Convert to tensor and add batch dimension
    sample_input_tensor = torch.LongTensor(sample_input_indices).unsqueeze(0).to(device)

    # Pass through the trained model
    with torch.no_grad():
        output = model(sample_input_tensor)

    # Interpret the model's output: class with the largest logit
    predicted_class = torch.argmax(output, dim=1).item()
    predicted_emotion = emotion_labels.get(predicted_class, "Unknown")

    print(f"The predicted emotion for the sample input '{sample_input}' is: {predicted_emotion}")


The predicted emotion for the sample input 'I love the new feature!' is: Positive
The predicted emotion for the sample input 'I am satisfied with the customer service.' is: Positive
The predicted emotion for the sample input 'This movie is not good!' is: Positive
The predicted emotion for the sample input 'This software have lots of features.' is: Negative
The predicted emotion for the sample input 'Incredible customer experience!' is: Positive
The predicted emotion for the sample input 'I am anxious' is: Negative
