In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from collections import Counter
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence

# Custom Dataset class for handling text
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        self.vocab = self.build_vocab(self.texts)

    @staticmethod
    def build_vocab(texts):
        tokens = [word for text in texts for word in text.split()]
        vocab = {word: i+1 for i, word in enumerate(set(tokens))}  # +1 for padding index
        return vocab
    
    def encode_text(self, text):
        return [self.vocab[word] for word in text.split()]

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, index):
        encoded_text = self.encode_text(self.texts[index])
        return torch.tensor(encoded_text, dtype=torch.long), torch.tensor(self.labels[index], dtype=torch.float32)

    @staticmethod
    def collate_fn(batch):
        texts, labels = zip(*batch)
        texts = pad_sequence(texts, batch_first=True, padding_value=0)  # Padding the sequences
        labels = torch.tensor(labels, dtype=torch.float32)
        return texts, labels

class SimpleNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim=10, hidden_dim=10):
        super(SimpleNN, self).__init__()
        # An embedding layer that converts input data (indices of words) into embeddings
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=0)
        
        # A fully connected layer that maps embeddings to hidden_dim space
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        
        # Output layer that maps from hidden space to 1 output (for binary classification)
        self.fc2 = nn.Linear(hidden_dim, 1)
    
    def forward(self, x):
        # Pass data through the embedding layer
        # The input x should be of shape (batch_size, sequence_length)
        embedded = self.embedding(x)
        
        # Take the mean of the embeddings (an example of simple pooling)
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1)
        
        # Pass the pooled embeddings through the fully connected layer with ReLU activation
        hidden = F.relu(self.fc1(pooled))
        
        # Pass the result through the output layer and apply the sigmoid activation function
        # The output will be a batch of single numbers (probabilities)
        output = torch.sigmoid(self.fc2(hidden))
        
        return output.squeeze()  # Squeeze to remove any extra dimensions if output is for single example



In [52]:
# Assume you have downloaded the Sentiment140 dataset
# and it is in a CSV file called 'sentiment140.csv'
import pandas as pd

# Load the dataset
df = pd.read_csv('../../data-sets/sentiment140.csv', encoding='latin1', usecols=[0, 5], names=['sentiment', 'text'])

# Preprocess the tweets
# ... here you would add your preprocessing steps, like removing URLs, Twitter handles, etc.

# Encode the sentiments (0 for negative, 1 for positive)
df['sentiment'] = df['sentiment'].replace(4, 1)

df1 = df[df["sentiment"] == 1][0:5000]
df2 = df[df["sentiment"] == 0][0:5000]
combined_df = pd.concat([df1, df2], ignore_index=True)

# Split the dataset
train_df, test_df = train_test_split(combined_df, test_size=0.2, random_state=42)

# Proceed with creating your TextDataset instances and DataLoaders
train_data = TextDataset(train_df['text'].tolist(), train_df['sentiment'].tolist())
test_data = TextDataset(test_df['text'].tolist(), test_df['sentiment'].tolist())

In [55]:
df1

Unnamed: 0,sentiment,text
800000,1,I LOVE @Health4UandPets u guys r the best!!
800001,1,im meeting up with one of my besties tonight! ...
800002,1,"@DaRealSunisaKim Thanks for the Twitter add, S..."
800003,1,Being sick can be really cheap when it hurts t...
800004,1,@LovesBrooklyn2 he has that effect on everyone
...,...,...
804995,1,"@lbran, thanks for sending us the package - go..."
804996,1,@ickleoriental hahahha.. U obviously don't hv ...
804997,1,"@juliekoh It's an internet term, but it's spil..."
804998,1,new day.... NEW TRACK!!!!


In [27]:
len(train_data.labels)

800

In [53]:
vocab_size = len(train_data.vocab) + 1# +1 for padding index

# Data loaders
train_loader = DataLoader(train_data, batch_size=2, shuffle=True, collate_fn=TextDataset.collate_fn)
test_loader = DataLoader(test_data, batch_size=2, collate_fn=TextDataset.collate_fn)

# Initialize the neural network
model = SimpleNN(vocab_size)
criterion = nn.BCELoss()  # Binary cross-entropy loss for binary classification
optimizer = Adam(model.parameters(), lr=0.001)  # Using Adam optimizer

# Training the model
for epoch in range(5):  # Loop over the dataset multiple times
    for i, (inputs, labels) in enumerate(train_loader, 0):
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs).squeeze()
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

# Testing the model
correct = 0
total = 0
with torch.no_grad():  # Inference mode, no gradients
    for inputs, labels in test_loader:
        outputs = model(inputs).squeeze()
        predicted = outputs.round()  # Threshold predictions to get binary classification
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Accuracy: {accuracy}')

Accuracy: 0.502


In [56]:
def predict_sentiment(model, vocab, texts):
    # Tokenize and encode the new text
    tokens = [[vocab.get(word, 0) for word in text.split()] for text in texts]
    lengths = [len(token) for token in tokens]

    # Pad the sequences
    padded_tokens = torch.zeros(len(tokens), max(lengths)).long()
    for i, token in enumerate(tokens):
        padded_tokens[i, :lengths[i]] = torch.tensor(token)
    
    # Make predictions
    with torch.no_grad():
        outputs = model(padded_tokens).squeeze()
    
    # Debugging prints
    print("Encoded texts:", tokens)
    print("Model raw outputs:", outputs)
    
    predictions = outputs.round().numpy()  # Convert to numpy array
    
    # Convert predictions to text labels
    labels = ['Positive' if pred == 1 else 'Negative' for pred in predictions]
    return labels


# Example usage:
new_texts = ["This product is really very good", "I'm not happy with this bad service"]
predictions = predict_sentiment(model, train_data.vocab, new_texts)
print(predictions)


Encoded texts: [[10356, 19682, 1894, 16613, 21461, 1983], [7004, 12282, 24033, 12737, 19106, 16153, 12429]]
Model raw outputs: tensor([0.9169, 0.2497])
['Positive', 'Negative']


In [None]:
xtr

In [35]:
sum(1 for lab in train_data.labels if lab == 0)

800