<a href="https://colab.research.google.com/github/vidyasagarcrj/Codedirect/blob/main/Sentiment_Analysis_basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import nltk

# Fetch the Punkt tokenizer data that word_tokenize depends on.
# Recent NLTK releases (3.9+) load the 'punkt_tab' resource instead of the
# older pickled 'punkt' models, so request both to work on old and new versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Step 1: Load and preprocess your dataset
# Your dataset should have two columns: 'text' and 'label'
data = pd.read_csv("path_to_your_data.csv")  # Replace with your data file

# Keep only the two required columns (a KeyError here means one is missing).
# Rows with a missing text or label are dropped: word_tokenize cannot handle
# NaN and LabelEncoder cannot sort NaN alongside string labels. The .copy()
# makes the frame independent of the original so the column assignments below
# cannot trigger pandas' SettingWithCopyWarning.
data = data[['text', 'label']].dropna().copy()

# Encode labels to integers
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])  # Converts categorical labels to integers

# Tokenize the text (coerce to str first so numeric-looking cells do not crash)
data['text'] = data['text'].astype(str).apply(word_tokenize)

# Build vocabulary
vocab = {word for sentence in data['text'] for word in sentence}

# Number the tokens in sorted order. Iterating the set directly would give a
# different word -> id mapping on every interpreter run (string hashes are
# randomised per process), so a saved checkpoint's embedding rows could never
# be matched to a rebuilt vocabulary. Ids start at 1; 0 is reserved for padding.
word_to_idx = {word: idx for idx, word in enumerate(sorted(vocab), start=1)}
word_to_idx['<PAD>'] = 0  # Add padding token

# Convert text to sequences of integers
data['text'] = data['text'].apply(lambda tokens: [word_to_idx[token] for token in tokens])

# Pad sequences to the same length (right-padding with the <PAD> id, 0)
max_len = max(len(seq) for seq in data['text'])  # Find the maximum sequence length
data['text'] = data['text'].apply(lambda seq: seq + [0] * (max_len - len(seq)))

# Step 2: Split the data
# Materialise the padded id sequences and the encoded labels as NumPy arrays.
X = np.asarray(data['text'].tolist())
y = np.asarray(data['label'].tolist())

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 3: Create a PyTorch dataset
class SentimentDataset(Dataset):
    """Map-style dataset pairing each input row with its integer label."""

    def __init__(self, X, y):
        # Copy both inputs into int64 tensors once, at construction time.
        self.X, self.y = (
            torch.tensor(X, dtype=torch.long),
            torch.tensor(y, dtype=torch.long),
        )

    def __len__(self):
        """Return the number of samples (rows of ``X``)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        features = self.X[idx]
        label = self.y[idx]
        return features, label

# Wrap each split in a dataset object.
train_dataset, test_dataset = (
    SentimentDataset(X_train, y_train),
    SentimentDataset(X_test, y_test),
)

# Batches of 32; only the training data is reshuffled, evaluation order stays fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

# Step 4: Define the LSTM-based sentiment analysis model
class SentimentModel(nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier over token ids.

    Args:
        vocab_size: Number of rows in the embedding table, including the
            padding row at index 0.
        embed_dim: Size of each token embedding.
        hidden_dim: Number of LSTM hidden units.
        output_dim: Number of classes (size of the returned logit vector).
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        # padding_idx=0 keeps the <PAD> embedding at zero and out of the gradients.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return class logits of shape ``(batch, output_dim)``.

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq_len)``.
                Assumes sequences are right-padded with the padding index (0)
                and that 0 never occurs as a real token id.
        """
        embedded = self.embedding(x)  # Convert input to embeddings
        lstm_out, _ = self.lstm(embedded)  # Pass embeddings through LSTM

        # With right-padded input, the final time step of a short sequence is
        # the LSTM's state after consuming padding, not after the last word.
        # Pick each row's output at its last real token instead. The LSTM is
        # unidirectional, so that output is unaffected by the trailing pads.
        lengths = (x != self.embedding.padding_idx).sum(dim=1)
        last_token = lengths.clamp(min=1) - 1  # all-padding rows fall back to step 0
        batch_idx = torch.arange(x.size(0), device=x.device)
        hidden_state = lstm_out[batch_idx, last_token]

        output = self.fc(hidden_state)  # Pass through fully connected layer
        return output

# Model hyperparameters
vocab_size = len(word_to_idx)  # one embedding row per entry in word_to_idx
embed_dim, hidden_dim = 128, 256  # embedding width, LSTM hidden units
output_dim = len(label_encoder.classes_)  # one logit per distinct label

model = SentimentModel(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)

# Step 5: Define loss function and optimizer
# CrossEntropyLoss takes raw logits plus integer class targets.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Step 6: Train the model
epochs = 5
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    model.train()  # re-enter training mode at the start of every epoch
    epoch_loss = 0.0  # running sum of the per-batch mean losses

    for X_batch, y_batch in tqdm(train_loader):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()  # clear gradients left over from the previous step
        predictions = model(X_batch)
        loss = criterion(predictions, y_batch)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Report the average of the batch losses for this epoch.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/len(train_loader):.4f}")

# Step 7: Save the model
# Only the learned parameters (state_dict) are written to disk.
torch.save(model.state_dict(), "sentiment_model.pth")
print("Model trained and saved successfully!")

# Step 8: Test the model
model.eval()
correct, total = 0, 0
with torch.no_grad():  # no gradients are needed for evaluation
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        predictions = model(X_batch)
        predicted = predictions.argmax(dim=1)  # highest-scoring class per row
        correct += (predicted == y_batch).sum().item()
        total += y_batch.size(0)

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")
