## Setup and Installation

In [None]:
# The cells below use the legacy `torchtext.data` API (Field, LabelField,
# Example, Dataset, BucketIterator), which is presumably why torchtext is
# pinned to 0.4.0 here; `datasets` provides the Hugging Face IMDB loader.
!pip install torchtext==0.4.0 datasets

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext import data
import time
import pandas as pd
import numpy as np
from collections import defaultdict

# Fix the NumPy and PyTorch RNG seeds so repeated runs draw the same numbers
SEED = 1234
np.random.seed(SEED)
torch.manual_seed(SEED)
# Request deterministic cuDNN kernels
torch.backends.cudnn.deterministic = True

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

## Data Preparation

In [None]:
# Field definitions
# TEXT: spaCy tokenisation with the en_core_web_sm model. Batches are
# returned as a (token ids, lengths) pair and padding goes in front of
# the shorter reviews.
TEXT = data.Field(
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
    pad_first=True,
    include_lengths=True,
)
# LABEL: one float target per review
LABEL = data.LabelField(dtype=torch.float)

In [None]:
# Pull IMDB from the Hugging Face hub and re-wrap it as torchtext datasets
from datasets import load_dataset
from torchtext.data import Example, Dataset

print("Loading IMDB dataset from Hugging Face...")
imdb = load_dataset("imdb")


def _to_examples(records):
    """Wrap each Hugging Face record as a torchtext Example with `text` and `label`."""
    return [
        Example.fromlist([record['text'], record['label']],
                         [('text', TEXT), ('label', LABEL)])
        for record in records
    ]


# One Example list per split, then one torchtext Dataset per list
train_examples = _to_examples(imdb['train'])
test_examples = _to_examples(imdb['test'])

train_data = Dataset(train_examples, [('text', TEXT), ('label', LABEL)])
test_data = Dataset(test_examples, [('text', TEXT), ('label', LABEL)])

print(f"Loaded {len(train_data)} training examples and {len(test_data)} test examples")

In [None]:
# Split training data into train and validation
import random

# `random.seed()` returns None, so passing its return value as
# `random_state` only produced a reproducible split through the side effect
# of seeding the global generator. Seed explicitly and hand the resulting
# generator state to `split`, which is what `random_state` expects.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

In [None]:
# Build both vocabularies from the training split only
MAX_VOCAB_SIZE = 25000

LABEL.build_vocab(train_data)
# Cap the token vocabulary at MAX_VOCAB_SIZE entries
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)

print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

In [None]:
# Batch iterators for the three splits
BATCH_SIZE = 64

# Examples are bucketed by token count, and each batch is sorted by that
# same key before it is returned.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    device=device,
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=lambda example: len(example.text),
)

print(f'Created iterators with batch size: {BATCH_SIZE}')

## Model Definitions

We'll define all required model architectures:

In [None]:
# 1. Basic RNN Model (for Task 1 and 2)
class RNN(nn.Module):
    """Embedding -> single-layer vanilla RNN -> linear head.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the RNN hidden state.
        output_dim: number of output logits per example.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim,
                                      embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text, text_lengths):
        """Return logits computed from the RNN's final hidden state.

        `text_lengths` is accepted so every model shares one call signature;
        it is not used here.
        """
        _, final_hidden = self.rnn(self.embedding(text))
        # final_hidden carries a leading layer axis of size 1; drop it
        return self.fc(final_hidden.squeeze(0))

In [None]:
# 2. Feed-Forward Neural Networks
class FeedForwardNN(nn.Module):
    """Mean-pooled embeddings followed by a stack of fully connected layers.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dims: hidden layer widths, one entry per hidden layer,
            e.g. [500] for 1-layer, [500, 300] for 2-layer,
            [500, 300, 200] for 3-layer.
        output_dim: number of output logits per example.
        dropout: dropout probability applied after every hidden ReLU.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dims, output_dim, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)

        # widths[i] -> widths[i + 1] describes the i-th hidden Linear layer
        widths = [embedding_dim, *hidden_dims]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)]
        # Output projection from the last hidden width (or the embedding
        # width when there are no hidden layers)
        stack.append(nn.Linear(widths[-1], output_dim))

        self.layers = nn.Sequential(*stack)

    def forward(self, text, text_lengths):
        """Average the token embeddings over the sequence axis and classify.

        `text_lengths` is not used: the mean runs over every position in
        `text`, padding positions included.
        """
        token_vectors = self.embedding(text)  # [sent len, batch size, emb dim]
        sentence_vector = torch.mean(token_vectors, dim=0)  # [batch size, emb dim]
        return self.layers(sentence_vector)

In [None]:
# 3. CNN Model
class CNN(nn.Module):
    """Text CNN: parallel convolutions over n-grams, max-over-time pooling, linear head.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        n_filters: number of filters for each kernel size.
        filter_sizes: list of kernel heights (tokens per window), e.g. [1, 2, 3].
        output_dim: number of output logits per example.
        dropout: dropout probability applied to the pooled features.
    """

    def __init__(self, input_dim, embedding_dim, n_filters, filter_sizes, output_dim, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)

        # One Conv2d per window size; each kernel spans the full embedding width
        branches = []
        for window in filter_sizes:
            branches.append(nn.Conv2d(in_channels=1, out_channels=n_filters,
                                      kernel_size=(window, embedding_dim)))
        self.convs = nn.ModuleList(branches)

        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return logits for a batch of token ids; `text_lengths` is not used."""
        # [sent len, batch size, emb dim] -> [batch size, 1, sent len, emb dim]
        images = self.embedding(text).transpose(0, 1).unsqueeze(1)

        features = []
        for conv in self.convs:
            # [batch size, n_filters, positions] after dropping the width-1 axis
            activation = torch.relu(conv(images)).squeeze(3)
            # Max over every position: [batch size, n_filters]
            features.append(
                nn.functional.max_pool1d(activation, activation.shape[2]).squeeze(2)
            )

        return self.fc(self.dropout(torch.cat(features, dim=1)))

In [None]:
# 4. LSTM Model
class LSTM(nn.Module):
    """Embedding -> (optionally stacked / bidirectional) LSTM -> dropout -> linear head.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state per direction.
        output_dim: number of output logits per example.
        n_layers: number of stacked LSTM layers.
        bidirectional: run the LSTM in both directions when true.
        dropout: dropout probability on the final hidden state and, when
            n_layers > 1, inside the LSTM stack.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers=1,
                 bidirectional=False, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)

        # The LSTM's own dropout is only requested for stacked configurations
        stacked_dropout = dropout if n_layers > 1 else 0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                            bidirectional=bidirectional, dropout=stacked_dropout)

        # The head sees one hidden vector per direction
        n_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * n_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return logits computed from the last layer's final hidden state(s)."""
        # NOTE(review): pack_padded_sequence treats the first text_lengths[i]
        # time steps of column i as the real tokens, while the TEXT field in
        # this notebook is created with pad_first=True. Confirm the packed
        # steps are the intended ones.
        packed = nn.utils.rnn.pack_padded_sequence(self.embedding(text), text_lengths.cpu())
        _, (hidden, _) = self.lstm(packed)

        # hidden = [num layers * num directions, batch size, hidden dim]
        if self.lstm.bidirectional:
            # Last two rows are the top layer's forward and backward states
            summary = torch.cat([hidden[-2], hidden[-1]], dim=1)
        else:
            summary = hidden[-1]

        return self.fc(self.dropout(summary))

## Training and Evaluation Functions

In [None]:
def binary_accuracy(preds, y):
    """Return the fraction of logits in `preds` whose rounded sigmoid equals `y`."""
    predicted_labels = torch.sigmoid(preds).round()
    hits = torch.eq(predicted_labels, y).float()
    return hits.sum() / len(hits)

def train(model, iterator, optimizer, criterion):
    """Run one training pass over `iterator`.

    Returns:
        (mean batch loss, mean batch accuracy), each averaged over the
        number of batches in `iterator`.
    """
    model.train()
    total_loss, total_acc = 0, 0

    for batch in iterator:
        # batch.text is a (token ids, lengths) pair
        tokens, lengths = batch.text

        optimizer.zero_grad()
        logits = model(tokens, lengths).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
        total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` in eval mode with gradients disabled.

    Returns:
        (mean batch loss, mean batch accuracy), each averaged over the
        number of batches in `iterator`.
    """
    model.eval()
    total_loss, total_acc = 0, 0

    with torch.no_grad():
        for batch in iterator:
            # batch.text is a (token ids, lengths) pair
            tokens, lengths = batch.text
            logits = model(tokens, lengths).squeeze(1)
            total_loss += criterion(logits, batch.label).item()
            total_acc += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole (minutes, seconds)."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

def count_parameters(model):
    """Return the total number of elements across `model`'s trainable parameters."""
    total = 0
    for param in model.parameters():
        # Parameters with requires_grad=False are left out of the count
        if param.requires_grad:
            total += param.numel()
    return total

## Experiment Runner

Function to run experiments and track results:

In [None]:
def run_experiment(model, optimizer, n_epochs, experiment_name):
    """
    Run a complete experiment and return results.

    Trains `model` for `n_epochs` on the module-level `train_iterator`,
    validates after every epoch on `valid_iterator`, writes the weights with
    the lowest validation loss to 'best-model.pt', and finally scores that
    checkpoint on `test_iterator`.

    Args:
        model: network called as `model(text, text_lengths)`; moved to `device`.
        optimizer: optimizer already constructed over `model.parameters()`.
        n_epochs: number of training epochs.
        experiment_name: label printed in the log and stored in the result.

    Returns:
        dict with keys 'experiment', 'best_valid_loss', 'best_valid_acc' (%),
        'test_loss', 'test_acc' (%), 'training_time' (minutes) and 'n_params'.
    """
    checkpoint_path = 'best-model.pt'
    criterion = nn.BCEWithLogitsLoss().to(device)
    model = model.to(device)

    print(f"\n{'='*70}")
    print(f"Experiment: {experiment_name}")
    print(f"Model parameters: {count_parameters(model):,}")
    print(f"{'='*70}")

    best_valid_loss = float('inf')
    best_valid_acc = 0
    training_time = 0
    # The checkpoint file is shared by every experiment, so remember whether
    # THIS run wrote it before trusting its contents.
    checkpoint_saved = False

    for epoch in range(n_epochs):
        start_time = time.time()
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        training_time += (end_time - start_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            best_valid_acc = valid_acc
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\tVal. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

    # Load best model and test. If no epoch improved on the initial loss
    # (e.g. every validation loss was NaN, or n_epochs == 0) the file on disk
    # belongs to an earlier experiment, so keep the current weights instead.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    else:
        print('Warning: no checkpoint was saved in this run; '
              'testing the current model weights.')
    test_loss, test_acc = evaluate(model, test_iterator, criterion)

    print(f'\nTest Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
    print(f'Total Training Time: {training_time/60:.2f} minutes')

    return {
        'experiment': experiment_name,
        'best_valid_loss': best_valid_loss,
        'best_valid_acc': best_valid_acc * 100,
        'test_loss': test_loss,
        'test_acc': test_acc * 100,
        'training_time': training_time / 60,  # in minutes
        'n_params': count_parameters(model)
    }

## Results Storage

In [None]:
# List that collects one result dict per experiment (the return value of
# run_experiment); it is converted to a DataFrame in the summary section.
all_results = []

## Task 1: Warmup - Baseline RNN with SGD

This is the baseline experiment from the original notebook.

In [None]:
# Model hyperparameters (the later experiment cells reuse these names)
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

# Baseline: vanilla RNN optimised with plain SGD
model = RNN(input_dim=INPUT_DIM, embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM, output_dim=OUTPUT_DIM)
optimizer = optim.SGD(model.parameters(), lr=1e-3)

# Train for 20 epochs and record the metrics
result = run_experiment(model=model, optimizer=optimizer,
                        experiment_name='Task 1: RNN with SGD (Baseline)',
                        n_epochs=20)
all_results.append(result)

## Task 2: Different Optimizers (SGD, Adam, Adagrad)

Compare performance with different optimizers using the same RNN architecture.

In [None]:
# Task 2a: same RNN, optimised with Adam (default settings)
model = RNN(input_dim=INPUT_DIM, embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM, output_dim=OUTPUT_DIM)
optimizer = optim.Adam(model.parameters())

result = run_experiment(model=model, optimizer=optimizer,
                        experiment_name='Task 2: RNN with Adam',
                        n_epochs=20)
all_results.append(result)

In [None]:
# Task 2b: same RNN, optimised with Adagrad (default settings)
# The model is moved to the target device first so the optimizer is built
# from parameters that already live there.
model = RNN(input_dim=INPUT_DIM, embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM, output_dim=OUTPUT_DIM).to(device)
optimizer = optim.Adagrad(model.parameters())

result = run_experiment(model=model, optimizer=optimizer,
                        experiment_name='Task 2: RNN with Adagrad',
                        n_epochs=20)
all_results.append(result)

## Task 3: Different Number of Epochs with Adam

Train the same RNN for 5, 10, 20, and 50 epochs using the Adam optimizer.

In [None]:
# Task 3a: RNN + Adam, 5 epochs
model = RNN(input_dim=INPUT_DIM, embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM, output_dim=OUTPUT_DIM)
optimizer = optim.Adam(model.parameters())

result = run_experiment(model=model, optimizer=optimizer,
                        experiment_name='Task 3: RNN Adam - 5 epochs',
                        n_epochs=5)
all_results.append(result)

In [None]:
# Task 3b: RNN + Adam, 10 epochs
model = RNN(input_dim=INPUT_DIM, embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM, output_dim=OUTPUT_DIM)
optimizer = optim.Adam(model.parameters())

result = run_experiment(model=model, optimizer=optimizer,
                        experiment_name='Task 3: RNN Adam - 10 epochs',
                        n_epochs=10)
all_results.append(result)

In [None]:
# Task 3c: RNN + Adam, 20 epochs
# Same configuration as the Task 2 Adam run; repeated here so the epoch
# sweep is complete on its own.
model = RNN(input_dim=INPUT_DIM, embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM, output_dim=OUTPUT_DIM)
optimizer = optim.Adam(model.parameters())

result = run_experiment(model=model, optimizer=optimizer,
                        experiment_name='Task 3: RNN Adam - 20 epochs',
                        n_epochs=20)
all_results.append(result)

In [None]:
# Task 3d: RNN + Adam, 50 epochs
model = RNN(input_dim=INPUT_DIM, embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM, output_dim=OUTPUT_DIM)
optimizer = optim.Adam(model.parameters())

result = run_experiment(model=model, optimizer=optimizer,
                        experiment_name='Task 3: RNN Adam - 50 epochs',
                        n_epochs=50)
all_results.append(result)

## Task 4: Different Model Architectures

All experiments in this section use the Adam optimizer, train for 50 epochs, and start from randomly initialized embeddings.

In [None]:
# Task 4a: feed-forward network with one hidden layer of width 500
model = FeedForwardNN(input_dim=INPUT_DIM, embedding_dim=EMBEDDING_DIM,
                      hidden_dims=[500], output_dim=OUTPUT_DIM)
optimizer = optim.Adam(model.parameters())

result = run_experiment(model=model, optimizer=optimizer,
                        experiment_name='Task 4: 1-Layer FFN (500)',
                        n_epochs=50)
all_results.append(result)

In [None]:
# Task 4b: feed-forward network with hidden layers of width 500 and 300
model = FeedForwardNN(input_dim=INPUT_DIM, embedding_dim=EMBEDDING_DIM,
                      hidden_dims=[500, 300], output_dim=OUTPUT_DIM)
optimizer = optim.Adam(model.parameters())

result = run_experiment(model=model, optimizer=optimizer,
                        experiment_name='Task 4: 2-Layer FFN (500, 300)',
                        n_epochs=50)
all_results.append(result)

In [None]:
# Task 4c: feed-forward network with hidden layers of width 500, 300 and 200
model = FeedForwardNN(input_dim=INPUT_DIM, embedding_dim=EMBEDDING_DIM,
                      hidden_dims=[500, 300, 200], output_dim=OUTPUT_DIM)
optimizer = optim.Adam(model.parameters())

result = run_experiment(model=model, optimizer=optimizer,
                        experiment_name='Task 4: 3-Layer FFN (500, 300, 200)',
                        n_epochs=50)
all_results.append(result)

In [None]:
# Task 4d: CNN with 100 filters for each of the window sizes 1, 2 and 3
N_FILTERS = 100
FILTER_SIZES = [1, 2, 3]

model = CNN(input_dim=INPUT_DIM, embedding_dim=EMBEDDING_DIM,
            n_filters=N_FILTERS, filter_sizes=FILTER_SIZES,
            output_dim=OUTPUT_DIM)
optimizer = optim.Adam(model.parameters())

result = run_experiment(model=model, optimizer=optimizer,
                        experiment_name='Task 4: CNN (filters 1,2,3)',
                        n_epochs=50)
all_results.append(result)

In [None]:
# Task 4e: two-layer unidirectional LSTM
model = LSTM(input_dim=INPUT_DIM, embedding_dim=EMBEDDING_DIM,
             hidden_dim=HIDDEN_DIM, output_dim=OUTPUT_DIM,
             n_layers=2, bidirectional=False)
optimizer = optim.Adam(model.parameters())

result = run_experiment(model=model, optimizer=optimizer,
                        experiment_name='Task 4: LSTM',
                        n_epochs=50)
all_results.append(result)

In [None]:
# Task 4f: two-layer bidirectional LSTM
model = LSTM(input_dim=INPUT_DIM, embedding_dim=EMBEDDING_DIM,
             hidden_dim=HIDDEN_DIM, output_dim=OUTPUT_DIM,
             n_layers=2, bidirectional=True)
optimizer = optim.Adam(model.parameters())

result = run_experiment(model=model, optimizer=optimizer,
                        experiment_name='Task 4: Bi-LSTM',
                        n_epochs=50)
all_results.append(result)

## Results Summary and Analysis

Let's create comprehensive tables for the report.

In [None]:
# One row per experiment, one column per key of the result dicts
results_df = pd.DataFrame(all_results)

# Print the full table between two banner lines
print(
    "\n" + "="*100,
    "COMPLETE EXPERIMENTAL RESULTS",
    "="*100,
    results_df.to_string(index=False),
    sep="\n",
)

# Persist the table next to the notebook
results_df.to_csv('experiment_results.csv', index=False)
print("\nResults saved to 'experiment_results.csv'")

### Table 1: Optimizer Comparison (Task 2)

In [None]:
# Rows for the SGD baseline (Task 1) and the Adam / Adagrad runs (Task 2)
optimizer_results = results_df.loc[
    results_df['experiment'].str.contains('Task 2|Task 1')
].copy()
# The optimizer name is the word that follows "with" in the experiment label
optimizer_results['Optimizer'] = optimizer_results['experiment'].str.extract(r'with (\w+)')

table1 = (
    optimizer_results[['Optimizer', 'test_acc', 'test_loss', 'training_time']]
    .rename(columns={
        'test_acc': 'Test Accuracy (%)',
        'test_loss': 'Test Loss',
        'training_time': 'Training Time (min)',
    })
    .round(2)
)

print("\nTable 1: Optimizer Comparison (RNN, 20 epochs)")
print("="*60)
print(table1.to_string(index=False))

### Table 2: Epoch Comparison (Task 3)

In [None]:
# Rows for the Task 3 epoch sweep
epoch_results = results_df.loc[
    results_df['experiment'].str.contains('Task 3')
].copy()
# The epoch count is the number in front of "epochs" in the experiment label
epoch_results['Epochs'] = epoch_results['experiment'].str.extract(r'(\d+) epochs')

table2 = (
    epoch_results[['Epochs', 'test_acc', 'test_loss', 'training_time']]
    .rename(columns={
        'test_acc': 'Test Accuracy (%)',
        'test_loss': 'Test Loss',
        'training_time': 'Training Time (min)',
    })
    .round(2)
)

print("\nTable 2: Epoch Comparison (RNN with Adam)")
print("="*60)
print(table2.to_string(index=False))

### Table 3: Model Architecture Comparison (Task 4)

In [None]:
# Rows for the Task 4 architecture comparison
model_results = results_df.loc[
    results_df['experiment'].str.contains('Task 4')
].copy()
# Drop the task prefix so only the architecture name remains
model_results['Model'] = model_results['experiment'].str.replace('Task 4: ', '')

table3 = model_results[['Model', 'n_params', 'test_acc', 'test_loss', 'training_time']].rename(
    columns={
        'Model': 'Model Architecture',
        'n_params': 'Parameters',
        'test_acc': 'Test Accuracy (%)',
        'test_loss': 'Test Loss',
        'training_time': 'Training Time (min)',
    }
)
# Show parameter counts with thousands separators
table3['Parameters'] = table3['Parameters'].apply(lambda count: f"{count:,}")
table3 = table3.round(2)

print("\nTable 3: Model Architecture Comparison (Adam, 50 epochs, Random Embeddings)")
print("="*100)
print(table3.to_string(index=False))

## Visualization

In [None]:
import matplotlib.pyplot as plt

# Plot 1: Optimizer Comparison
# Rows for the SGD baseline (Task 1) and the Adam / Adagrad runs (Task 2)
optimizer_data = results_df[results_df['experiment'].str.contains('Task 2|Task 1')]
optimizers = optimizer_data['experiment'].str.extract(r'with (\w+)')[0].values

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 3))

# Left panel: test accuracy per optimizer
axes[0].bar(optimizers, optimizer_data['test_acc'])
axes[0].set_title('Optimizer Comparison - Test Accuracy')
axes[0].set_ylabel('Test Accuracy (%)')
axes[0].grid(alpha=0.3, axis='y')

# Right panel: wall-clock training time per optimizer
axes[1].bar(optimizers, optimizer_data['training_time'])
axes[1].set_title('Optimizer Comparison - Training Time')
axes[1].set_ylabel('Training Time (minutes)')
axes[1].grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('optimizer_comparison.png', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Plot 2: test accuracy against the number of training epochs (Task 3 runs)
epoch_data = results_df[results_df['experiment'].str.contains('Task 3')]
# Epoch counts parsed from the experiment labels, as integers for the x axis
epochs = epoch_data['experiment'].str.extract(r'(\d+) epochs')[0].astype(int).values

fig, ax = plt.subplots(figsize=(7, 3.5))
ax.plot(epochs, epoch_data['test_acc'], 'o-',
        label='Test Accuracy', markersize=8, linewidth=2)
ax.set_title('Impact of Training Epochs on Test Accuracy (RNN with Adam)')
ax.set_xlabel('Number of Epochs')
ax.set_ylabel('Test Accuracy (%)')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('epoch_comparison.png', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Plot 3: Model Architecture Comparison
model_data = results_df[results_df['experiment'].str.contains('Task 4')]
model_names = model_data['experiment'].str.replace('Task 4: ', '').values

fig, ax = plt.subplots(figsize=(8, 3.5))
bars = ax.bar(range(len(model_names)), model_data['test_acc'])
ax.set_xticks(range(len(model_names)))
ax.set_xticklabels(model_names, rotation=45, ha='right')
ax.set_ylabel('Test Accuracy (%)')
ax.set_title('Model Architecture Comparison (Adam, 50 epochs)')
ax.grid(axis='y', alpha=0.3)

# Color the best model. `bars` is indexed by position, so take the
# positional argmax of the accuracy column rather than deriving a position
# from index labels (which is only correct when the filtered index happens
# to be contiguous). Skipped when there are no Task 4 rows to highlight.
if len(model_data) > 0:
    best_idx = int(model_data['test_acc'].values.argmax())
    bars[best_idx].set_color('green')

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## Key Findings and Analysis

**For your report, consider these analysis points:**

### 1. Optimizer Comparison (Task 2)
- Compare SGD vs Adam vs Adagrad
- Discuss convergence speed and final accuracy
- Explain why Adam typically performs better (adaptive learning rates)

### 2. Effect of Training Epochs (Task 3)
- Show how accuracy changes as the number of epochs increases
- Discuss diminishing returns and potential overfitting
- Identify the optimal number of epochs

### 3. Model Architecture Comparison (Task 4)
- Compare simple FFNs vs sequential models (LSTM, Bi-LSTM)
- Discuss why LSTM/Bi-LSTM capture sequential information better
- Analyze the trade-off between model complexity and performance
- Explain CNN's effectiveness in capturing local patterns

### 4. General Observations
- All models use randomly initialized embeddings (not Word2Vec)
- If you had Word2Vec results, you could compare the two approaches:
  - Pre-trained embeddings capture general semantic relationships between words
  - Randomly initialized embeddings learn task-specific representations during training
  - Pre-trained embeddings usually give better performance when training data is limited

### 5. Recommendations
- Best optimizer for this task
- Optimal training duration
- Most effective model architecture

In [None]:
# Report the highest test accuracy overall and within each task
print("\n" + "="*80, "BEST PERFORMING CONFIGURATIONS", "="*80, sep="\n")

# Row with the highest test accuracy across every experiment
best_overall = results_df.loc[results_df['test_acc'].idxmax()]
print("\nBest Overall Performance:")
print(f"  Experiment: {best_overall['experiment']}")
print(f"  Test Accuracy: {best_overall['test_acc']:.2f}%")
print(f"  Test Loss: {best_overall['test_loss']:.3f}")
print(f"  Training Time: {best_overall['training_time']:.2f} minutes")

# Highest test accuracy among the experiments whose label mentions each task
print("\nBest by Task:")
for task in ['Task 1', 'Task 2', 'Task 3', 'Task 4']:
    task_data = results_df[results_df['experiment'].str.contains(task)]
    if task_data.empty:
        # Nothing was run for this task; skip it
        continue
    best = task_data.loc[task_data['test_acc'].idxmax()]
    print(f"\n  {task}: {best['experiment']}")
    print(f"    Test Accuracy: {best['test_acc']:.2f}%")

## Report Writing Tips

Use the tables and plots above to create your report. Here's a suggested structure:

### 1. Introduction (0.5 pages)
- Brief overview of sentiment analysis task
- Description of IMDB dataset
- Objectives of the experiments

### 2. Methodology (1 page)
- Data preprocessing
- Model architectures (briefly describe each)
- Training procedure and hyperparameters
- Evaluation metrics

### 3. Results (1.5 pages)
- Present Tables 1, 2, and 3
- Include the comparison plots
- Report best performing configurations

### 4. Analysis and Discussion (1 page)
- **Optimizer comparison**: Why does Adam outperform SGD?
- **Epoch analysis**: Does more training always help?
- **Architecture comparison**: Why do LSTMs work better for sequences?
- **Random vs Pre-trained embeddings**: What would be the expected difference?

### 5. Conclusion (0.5 pages)
- Summary of key findings
- Recommendations for sentiment analysis tasks
- Future work suggestions

**Note**: Keep it concise and focus on insights rather than just reporting numbers!