### Sentiment analysis on movie reviews using Python

In [2]:
import torch
from datasets import load_dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# torchtext's "basic_english" tokenizer, shared by vocabulary building and batching
tokenizer = get_tokenizer("basic_english")

# Load the IMDb dataset and keep a handle on each of its two splits
dataset = load_dataset("imdb")
train_iter, test_iter = dataset["train"], dataset["test"]

# Tokenize the text of each review (feeds the vocabulary builder)
def tokenize_and_numericalize(iterator):
    """Lazily yield the token list of every review in ``iterator``.

    Each element of ``iterator`` is indexed with ``'text'`` and the result is
    passed to the module-level ``tokenizer``. Despite the function's name, no
    numericalization happens here: tokens are yielded as produced by the
    tokenizer, not as vocabulary indices.
    """
    yield from (tokenizer(example['text']) for example in iterator)

# Build vocabulary
# "<pad>" is listed first so it takes index 0, which is the value
# torch.nn.utils.rnn.pad_sequence pads with by default; padding positions are
# therefore no longer indistinguishable from unknown words. "<unk>" stands in
# for out-of-vocabulary tokens.
vocab = build_vocab_from_iterator(
    tokenize_and_numericalize(train_iter),
    specials=["<pad>", "<unk>"],
)
# Without a default index, looking up a token that never appeared in the
# training split (e.g. a word seen only in the test reviews) raises
# RuntimeError instead of mapping to "<unk>".
vocab.set_default_index(vocab["<unk>"])


# Define text transformation functions
def text_transform(x):
    """Tokenize the string ``x`` and return the list of vocabulary indices."""
    return [vocab[token] for token in tokenizer(x)]


def label_transform(x):
    """Convert a dataset label to a plain ``int``."""
    return int(x)

# Convert text data to tensors
def collate_batch(batch):
    """Collate a list of dataset examples into a padded batch on ``device``.

    Args:
        batch: sequence of examples, each indexable by ``'text'`` and
            ``'label'``.

    Returns:
        A ``(texts, labels)`` tuple. ``texts`` is an int64 tensor of token
        indices, batch-first and zero-padded to the longest review in the
        batch. ``labels`` is a float32 tensor with one entry per example.
        Both tensors are placed on the module-level ``device``.
    """
    # float32 (not float64) so the targets share the dtype of the model's
    # logits when they reach the loss function.
    labels = torch.tensor(
        [label_transform(item['label']) for item in batch],
        dtype=torch.float32,
    )
    texts = [
        torch.tensor(text_transform(item['text']), dtype=torch.int64)
        for item in batch
    ]
    texts = torch.nn.utils.rnn.pad_sequence(texts, batch_first=True)
    # The inputs must live on the same device as the model and the labels;
    # leaving them on the CPU fails as soon as the model runs on CUDA.
    return texts.to(device), labels.to(device)

# Batch both splits; only the training split is reshuffled each epoch
train_loader = torch.utils.data.DataLoader(
    list(train_iter),
    batch_size=32,
    shuffle=True,
    collate_fn=collate_batch,
)
test_loader = torch.utils.data.DataLoader(
    list(test_iter),
    batch_size=32,
    shuffle=False,
    collate_fn=collate_batch,
)


  from .autonotebook import tqdm as notebook_tqdm


### Training

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

# Embedding -> LSTM -> linear classifier
class SentimentClassifier(nn.Module):
    """Recurrent classifier: token embedding, one LSTM, one linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding (LSTM input size).
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of values produced per input sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Map a batch-first tensor of token indices to ``output_dim`` values.

        The LSTM output at the final time step of each sequence is fed to the
        linear layer; earlier time steps are discarded.
        """
        sequence_out, _state = self.rnn(self.embedding(text))
        final_step = sequence_out[:, -1, :]
        return self.fc(final_step)

# Model hyperparameters
vocab_size = len(vocab)  # one embedding row per vocabulary entry
embedding_dim = 100
hidden_dim = 256
output_dim = 1  # a single logit per review

# Build the classifier, then move it to the selected device
model = SentimentClassifier(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)
model = model.to(device)

# Binary cross-entropy on raw logits, optimized with Adam
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# One optimization pass over the data
def train(model, iterator, optimizer, criterion):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: module called on each batch's inputs; dimension 1 of its
            output is squeezed before the loss is computed.
        iterator: sized iterable of ``(text, labels)`` batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss callable taking ``(predictions, labels)``.

    Returns:
        The sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    batch_losses = []
    for text, labels in iterator:
        optimizer.zero_grad()
        logits = model(text).squeeze(1)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(iterator)

# Loss-only pass over the data (no parameter updates)
def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss of ``model`` over ``iterator``.

    The model is switched to eval mode and gradient tracking is disabled for
    the whole pass.

    Args:
        model: module called on each batch's inputs; dimension 1 of its
            output is squeezed before the loss is computed.
        iterator: sized iterable of ``(text, labels)`` batches.
        criterion: loss callable taking ``(predictions, labels)``.

    Returns:
        The sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for text, labels in iterator:
            logits = model(text).squeeze(1)
            batch_losses.append(criterion(logits, labels).item())
    return sum(batch_losses) / len(iterator)

# Run the epochs, reporting the train and test loss after each one
N_EPOCHS = 5
for epoch in range(N_EPOCHS):
    train_loss = train(
        model=model,
        iterator=train_loader,
        optimizer=optimizer,
        criterion=criterion,
    )
    valid_loss = evaluate(model=model, iterator=test_loader, criterion=criterion)
    print(
        f'Epoch: {epoch+1:02} | '
        f'Train Loss: {train_loss:.3f} | '
        f'Test Loss: {valid_loss:.3f}'
    )
