# Data Loading and Processing

In [2]:
import torch
import torch.nn as nn
import csv

# `reviews` is a list of review strings, and `labels` is a list of polarity values
reviews = []
labels = []

# Open the CSV file
with open('EcoPreprocessed.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)  # Using DictReader to read CSV as dictionaries
    for row in reader:
        reviews.append(row['review'])  # Extract the 'review' column
        labels.append(float(row['polarity']))  # Convert the 'polarity' column to float and append

In [None]:
import torch
import torch.nn as nn

# Create a set of all unique words in the dataset
words = {word for sentence in reviews for word in sentence.split()}

# Build a mapping from words to integers based on lexicographical ordering
word_to_int = {word: idx + 1 for idx, word in enumerate(sorted(words))}

# Encode each sentence to a list of integers based on the word_to_int mapping
def encode(sentence):
    return [word_to_int.get(word, 0) for word in sentence.split()]  # Default to 0 if word not found

# Convert each sentence into a tensor of integers
var_len_tensors = [torch.tensor(encode(sentence)) for sentence in reviews]

# Pad the sequences to the same length
training_dataset = nn.utils.rnn.pad_sequence(var_len_tensors, batch_first=True)

# Get the vocabulary size
vocab_size = len(words) + 1  # Adding 1 to account for the padding token (0)

# Convert labels to tensor
training_labels = torch.unsqueeze(torch.tensor(labels), dim=-1)

# Now we have vocabulary_size, training_dataset, word_to_int, and training_labels

# Model Definition

In [4]:
class EmotionPredictor(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int):
        super().__init__()
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)
        self.linear_layer = nn.Linear(embedding_dim, 1) # Fully connected layer into a single neuron
        self.tanh = nn.Tanh()
    
    def forward(self, x):
        # Lookup embeddings for the input tokens
        embedded = self.embedding_layer(x)
        # Compute the mean of the embeddings across the sequence length (dim=1)
        pooled = embedded.mean(dim=1)
        # Project the pooled representation to a single output score
        output = self.linear_layer(pooled)
        # Apply activation function Tanh to output between -1 and 1
        return self.tanh(output)

In [5]:
# Set hyperparameters
embedding_dimension = 256
batch_size = 64
num_epochs = 1000

# Initialize model, loss function, and optimizer
model = EmotionPredictor(vocab_size, embedding_dimension)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

# Training loop
for epoch in range(num_epochs):
    # Shuffle dataset at the start of each epoch
    randperm = torch.randperm(len(training_dataset))
    training_dataset, training_labels = training_dataset[randperm], training_labels[randperm]
    # Take a batch of the first `batch_size` samples
    mini_batch = training_dataset[:batch_size]
    mini_batch_labels = training_labels[:batch_size]

    # Forward pass
    prediction = model(mini_batch)
    # Compute loss
    loss = loss_function(prediction, mini_batch_labels)
    # Backpropagation and optimizer step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print the loss every 100 epochs
    if epoch % 100 == 0:
        print(f'Epoch [{epoch}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [0/1000], Loss: 1.2384
Epoch [100/1000], Loss: 0.1590
Epoch [200/1000], Loss: 0.0953
Epoch [300/1000], Loss: 0.1296
Epoch [400/1000], Loss: 0.1100
Epoch [500/1000], Loss: 0.0843
Epoch [600/1000], Loss: 0.0829
Epoch [700/1000], Loss: 0.1094
Epoch [800/1000], Loss: 0.0918
Epoch [900/1000], Loss: 0.0838


# Testing

In [None]:
# Define the examples
examples = [
    "worst movie ever",
    "best movie ever",
    "weird but funny movie"
]

# Encode the examples as integer tensors using the word_to_int mapping
encoded_examples = [
    torch.tensor([word_to_int.get(word, 0) for word in example.split()])
    for example in examples
]

# Pad the sequences to make them the same length (for batching)
testing_tensor = torch.nn.utils.rnn.pad_sequence(encoded_examples, batch_first=True)

# Set the model to evaluation mode
model.eval()

# Perform the prediction
predictions = model(testing_tensor)

# Print the predictions as a list
print(predictions.tolist())


tensor([[5090, 2955, 1530]])
[[-0.9994620084762573]]


# ONNX Export

In [None]:
# Set the model to evaluation mode
model.eval()

# Create a dummy input tensor
dummy_input = torch.randint(0, vocab_size, (1, 50))

# Export the model to ONNX
torch.onnx.export(
    model,                    # model to export
    dummy_input,              # dummy input (shape must match your model's input)
    "emotion_predictor.onnx",          # path to save the ONNX file
    verbose=True
)

Exported graph: graph(%input : Long(1, *, strides=[50, 1], requires_grad=0, device=cpu),
      %embedding_layer.weight : Float(5159, 256, strides=[256, 1], requires_grad=1, device=cpu),
      %linear_layer.weight : Float(1, 256, strides=[256, 1], requires_grad=1, device=cpu),
      %linear_layer.bias : Float(1, strides=[1], requires_grad=1, device=cpu)):
  %/embedding_layer/Gather_output_0 : Float(1, *, 256, strides=[12800, 256, 1], requires_grad=1, device=cpu) = onnx::Gather[onnx_name="/embedding_layer/Gather"](%embedding_layer.weight, %input), scope: __main__.EmotionPredictor::/torch.nn.modules.sparse.Embedding::embedding_layer # /Users/tushar/VSCodeProjects/Jupyter Notebooks/sentiment-analysis/.venv/lib/python3.13/site-packages/torch/nn/functional.py:2551:0
  %/ReduceMean_output_0 : Float(1, 256, strides=[256, 1], requires_grad=1, device=cpu) = onnx::ReduceMean[axes=[1], keepdims=0, onnx_name="/ReduceMean"](%/embedding_layer/Gather_output_0), scope: __main__.EmotionPredictor:: # /va

# `word_to_int` Export

In [14]:
import json

# Save the dictionary to a JSON file
with open("word_to_int.json", "w") as json_file:
    json.dump(word_to_int, json_file)