In [None]:
from torch import nn
from torchvision import transforms, datasets
from torch.utils.data import random_split, DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence,pack_padded_sequence
import numpy as np
import pandas as pd
import torch
import tiktoken

In [None]:
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()

import os

In [None]:
def collate_fn_imdb(data, pad_value=50257, use_embedding=True):
    """Collate ``(token_ids, score)`` pairs into one length-sorted, padded batch.

    Args:
        data: Sequence of ``(token_ids, score)`` pairs where ``token_ids`` is a
            1-D tensor. The sequence itself is never modified.
        pad_value: Value written into the padded positions.
        use_embedding: If True, return the padded tensor as produced by
            ``pad_sequence`` (same dtype as the inputs). If False, return a
            ``PackedSequence`` of float values with a trailing axis of size 1.

    Returns:
        ``(batch, scores)``. Both are ordered from the longest sequence to the
        shortest; ``scores`` is a float32 tensor.
    """
    # sorted() rather than list.sort(): the caller's list keeps its order and
    # any sequence type (e.g. a tuple) is accepted. Both sorts are stable, so
    # ties keep their relative order exactly as before.
    ordered = sorted(data, key=lambda pair: len(pair[0]), reverse=True)
    sequences = [pair[0] for pair in ordered]
    scores = torch.tensor([pair[1] for pair in ordered], dtype=torch.float32)

    padded_seqs_long = pad_sequence(sequences, batch_first=True, padding_value=pad_value)

    if use_embedding:
        return padded_seqs_long, scores

    # Lengths are only needed for packing; the batch is already sorted in
    # descending length, which is what enforce_sorted=True requires.
    seq_len = [s.size(0) for s in sequences]
    padded_seqs_float = padded_seqs_long.unsqueeze(-1).float()
    packed = pack_padded_sequence(padded_seqs_float, seq_len, batch_first=True, enforce_sorted=True)

    return packed, scores

def tokenize_text(text_list, tokenizer):
    """Encode every text into a 1-D tensor of token ids.

    Args:
        text_list: Iterable of texts to encode.
        tokenizer: Object exposing ``encode(text)`` that returns a list of
            integer token ids.

    Returns:
        A list with one ``torch.long`` tensor per input text, in input order.
    """
    # The dtype is set explicitly because torch.tensor([]) infers float32 for
    # an empty encoding, which would give that sample a different dtype from
    # every other id tensor.
    return [
        torch.tensor(tokenizer.encode(text), dtype=torch.long)
        for text in text_list
    ]

class IMDBDataset(Dataset):
    """Map-style dataset pairing tokenized comments with their scores.

    The sentiment labels are kept on the instance but are not part of the
    items returned by indexing.
    """

    def __init__(self, comments_token_ids, sentiments, scores):
        """Store references to the given collections (nothing is copied)."""
        self.comments_token_ids, self.sentiments, self.scores = (
            comments_token_ids,
            sentiments,
            scores,
        )

    def __len__(self):
        """Return the number of tokenized comments."""
        n_comments = len(self.comments_token_ids)
        return n_comments

    def __getitem__(self, idx):
        """Return the ``(token_ids, score)`` pair stored at ``idx``."""
        token_ids = self.comments_token_ids[idx]
        score = self.scores[idx]
        return token_ids, score
    
def create_IMDB_dataloader(comments_token_ids, sentiment, scores, batch_size=32, shuffle=True, num_workers=0, use_embedding=True, pad_value=50257):
    """Wrap tokenized IMDB comments in a ``DataLoader`` that pads each batch.

    Args:
        comments_token_ids: Per-comment token id tensors.
        sentiment: Per-comment sentiment labels (stored on the dataset only).
        scores: Per-comment scores, returned as the batch targets.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the samples each epoch.
        num_workers: Number of worker processes used by the loader.
        use_embedding: Forwarded to ``collate_fn_imdb``; selects padded ids
            (True) or a packed float sequence (False).
        pad_value: Padding value forwarded to ``collate_fn_imdb``. Defaults to
            50257, the value that was previously hard-coded here.

    Returns:
        The configured ``DataLoader``.
    """
    # Imported here so the function does not depend on an edit to the import cell.
    from functools import partial

    dataset = IMDBDataset(comments_token_ids, sentiment, scores)
    # A partial of a named function can be pickled, unlike a lambda, so the
    # collate callable can be shipped to worker processes when num_workers > 0.
    collate = partial(collate_fn_imdb, pad_value=pad_value, use_embedding=use_embedding)

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=collate,
    )

In [None]:
# Normalisation constants taken from the original train transform (the
# commonly quoted MNIST training-set mean / std). They are shared by every
# split so that the model sees identically scaled inputs at train and test time.
MNIST_MEAN = (0.1307,)
MNIST_STD = (0.3081,)
# Fixed seed so the train/validation split is the same after a kernel restart.
SPLIT_SEED = 42

train_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(MNIST_MEAN, MNIST_STD),
])

# Test images are normalised with the *training* statistics: statistics
# computed on the test set would leak information about it and shift the
# inputs relative to what the model was trained on.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(MNIST_MEAN, MNIST_STD),
])

mnist_train = datasets.MNIST(root='data', train=True, download=True, transform=train_transform)
mnist_test = datasets.MNIST(root='data', train=False, download=True, transform=test_transform)

# Hold out 10% of the training images for validation.
train_size = int(0.9 * len(mnist_train))
val_size = len(mnist_train) - train_size
mnist_train_set, mnist_val_set = random_split(
    mnist_train,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Quick sanity check on one example and on the split sizes.
sample, target = mnist_train[0]
print(f"Sample count: {len(mnist_train)}")
print(f"Sample shape: {sample.shape}, Target: {target}")
print(f"Sample type: {sample.dtype}, Target type: {type(target)}")
print(f"Sample min: {sample.min()}, Sample max: {sample.max()}")
print(f"Sample training set size: {len(mnist_train_set)}")
print(f"Sample validation set size: {len(mnist_val_set)}")

In [None]:
# Read the two IMDB splits from disk.
IMDB_train = pd.read_csv("data/IMDB_train.csv")
IMDB_test = pd.read_csv("data/IMDB_test.csv")

# Extract the three columns used below from each split as plain Python lists.
imdb_columns = ("preprocessed_comments", "sentiment", "score")
train_comments, train_sentiments, train_scores = [
    IMDB_train[column].to_list() for column in imdb_columns
]
test_comments, test_sentiments, test_scores = [
    IMDB_test[column].to_list() for column in imdb_columns
]

# Turn every comment into a tensor of token ids using tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")
tokenized_train_comments = tokenize_text(train_comments, tokenizer)
tokenized_test_comments = tokenize_text(test_comments, tokenizer)

# Inspect the first training example and decode its ids back to text.
sample, sample_sentiment, sample_score = (
    train_comments[0],
    train_sentiments[0],
    train_scores[0],
)
token_ids = tokenized_train_comments[0]
reconstructed = tokenizer.decode(token_ids.tolist())

print(f"Sample: {sample}")
print(f"Sentiment: {sample_sentiment}")
print(f"Score: {sample_score}")
print(f"Token IDs: {token_ids}")
print(f"Reconstructed: {reconstructed}")

In [None]:
vocab_size = 50257
embedding_dim = 64
# The padding id is the first id past the vocabulary, so an embedding table
# fed with these batches needs vocab_size + 1 rows.
padding_token_id = vocab_size

train_data = create_IMDB_dataloader(tokenized_train_comments, train_sentiments, train_scores)
# The test split is iterated in a fixed order: shuffling brings nothing for
# evaluation and makes the batches differ from run to run.
test_data = create_IMDB_dataloader(tokenized_test_comments, test_sentiments, test_scores, shuffle=False)

# Pull one training batch to eyeball the padded ids and their targets.
sample = next(iter(train_data))
print(f"Sample: {sample[0]}, Target: {sample[1]}")

In [None]:
class torch_RNN_MNIST(nn.Module):
    """Stacked ``nn.RNN`` followed by a linear layer on the last time step.

    The input is batch-first: ``(batch, seq_len, input_size)``.

    Args:
        input_size: Number of features per time step.
        hiddin_dim: Hidden state size of every RNN layer (the spelling is kept
            because callers pass it by keyword).
        out_dim: Number of output logits.
        num_layers: Number of stacked RNN layers (default 3, the value that
            was previously hard-coded).
    """

    def __init__(self, input_size=28, hiddin_dim=14, out_dim=10, num_layers=3):
        super().__init__()
        # batch_first=True matches both the (batch, seq, feature) batches this
        # model is fed and the output[:, -1, :] indexing in forward().
        self.rnn = nn.RNN(input_size, hiddin_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hiddin_dim, out_dim)

    def forward(self, x):
        """Return logits of shape ``(batch, out_dim)`` for a batch-first input."""
        # No explicit h0: nn.RNN then starts from zeros with the right number
        # of layers, hidden size, dtype and device, so the model also works
        # after .to(device) and with non-default constructor arguments.
        output, _ = self.rnn(x)

        # Classify from the hidden state of the top layer at the last time step.
        return self.fc(output[:, -1, :])
    
def collate_fn_mnist(x):
    """Collate ``(image, label)`` samples into image and label batches.

    Args:
        x: List of ``(image, label)`` pairs. Each image is expected to carry a
            leading channel axis, e.g. ``(1, H, W)``.

    Returns:
        ``(images, labels)``: ``images`` has shape ``(batch, H, W)`` when the
        channel axis has size 1 (the axis is kept otherwise); ``labels`` is an
        int64 tensor of shape ``(batch,)``.
    """
    # stack() adds a new batch axis, so axis 1 really is the channel axis that
    # squeeze(1) is meant to drop. The previous vstack() merged channels into
    # the batch axis, leaving squeeze(1) to act on the height axis instead.
    images = torch.stack([sample[0] for sample in x])
    images = images.squeeze(1)

    labels = torch.tensor([sample[1] for sample in x], dtype=torch.long)

    return images, labels

def train_mnist(model, train_dataset, val_dataset, lr=1e-3, epochs=50, batch=256, weight_decay=2e-4):
    """Train ``model`` with Adam and cross-entropy, validating after each epoch.

    Uses CUDA when available, otherwise the CPU. Progress is reported with
    ``print``; nothing is returned.

    Args:
        model: Module mapping an image batch to class logits.
        train_dataset: Dataset of ``(image, label)`` samples used for training.
        val_dataset: Dataset of ``(image, label)`` samples used for validation.
        lr: Adam learning rate.
        epochs: Number of passes over ``train_dataset``.
        batch: Batch size for both loaders.
        weight_decay: Adam weight decay.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Move the model first so the optimizer is built from the parameters that
    # live on the training device.
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    loss_fn = nn.CrossEntropyLoss()

    train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True, collate_fn=collate_fn_mnist)
    val_loader = DataLoader(val_dataset, batch_size=batch, shuffle=False, collate_fn=collate_fn_mnist)

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0
        for x, y in tqdm(train_loader):
            x = x.to(device)
            y = y.to(device)

            optimizer.zero_grad()
            output = model(x)
            loss = loss_fn(output, y)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss/len(train_loader)}")

        # ---- validation pass (no gradients, eval-mode layers) ----
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for x, y in val_loader:
                x = x.to(device)
                y = y.to(device)

                output = model(x)
                loss = loss_fn(output, y)
                val_loss += loss.item()

                # Predicted class = index of the largest logit.
                predicted = output.argmax(dim=1)
                total += y.size(0)
                correct += (predicted == y).sum().item()

        if total == 0:
            # An empty validation set would otherwise divide by zero below.
            print("Validation set is empty; skipping validation metrics.")
            continue

        print(f"Validation Loss: {val_loss/len(val_loader)}, Accuracy: {100 * correct / total}%")



In [None]:
# Build the MNIST RNN classifier with its sizes spelled out explicitly.
rnn_mnist = torch_RNN_MNIST(
    input_size=28,
    hiddin_dim=14,
    out_dim=10,
)

# Train on the training subset and report metrics on the validation subset.
train_mnist(
    model=rnn_mnist,
    train_dataset=mnist_train_set,
    val_dataset=mnist_val_set,
)