In [149]:
from torch import nn
from torchvision import transforms, datasets
from torch.utils.data import random_split, DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence,pack_padded_sequence
import numpy as np
import pandas as pd
import torch
import tiktoken

In [150]:
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()

import os

In [151]:
def collate_fn(data, pad_value=50257, use_embedding=True):
    """Collate ``(token_ids, score)`` pairs into one length-sorted, padded batch.

    Args:
        data: Sequence of ``(token_ids, score)`` pairs. ``token_ids`` is
            assumed to be a 1-D integer tensor (TODO confirm against the
            dataset feeding this function). The sequence is not modified.
        pad_value: Fill value written after the end of every sequence shorter
            than the longest one in the batch.
        use_embedding: When True, return the padded tensor as-is. When False,
            return a ``PackedSequence`` built from the padded tensor cast to
            float with a trailing dimension of size 1 appended.

    Returns:
        A ``(batch, scores)`` tuple. Both are ordered longest sequence first;
        ``scores`` is a float32 tensor.
    """
    # sorted() returns a new list, so the caller's batch keeps its order and
    # tuples (or any other sequence) are accepted as well as lists.
    ordered = sorted(data, key=lambda pair: len(pair[0]), reverse=True)
    sequences = [pair[0] for pair in ordered]
    scores = torch.tensor([pair[1] for pair in ordered], dtype=torch.float32)

    padded = pad_sequence(sequences, batch_first=True, padding_value=pad_value)
    if use_embedding:
        return padded, scores

    # Packing needs the true (unpadded) lengths; they are already in
    # descending order, which is what enforce_sorted=True requires.
    lengths = [seq.size(0) for seq in sequences]
    packed = pack_padded_sequence(
        padded.unsqueeze(-1).float(), lengths, batch_first=True, enforce_sorted=True
    )
    return packed, scores

def tokenize_text(text_list, tokenizer):
    """Encode every text into a 1-D tensor of token ids.

    Args:
        text_list: Iterable of strings to encode.
        tokenizer: Object exposing ``encode(text)`` that returns a list of
            integer token ids.

    Returns:
        A list with one ``torch.long`` tensor per input text, in input order.
    """
    # dtype is pinned because torch.tensor([]) infers float32: a text that
    # encodes to zero tokens would otherwise differ in dtype from the rest.
    return [
        torch.tensor(tokenizer.encode(text), dtype=torch.long)
        for text in text_list
    ]

class IMDBDataset(Dataset):
    """Map-style dataset pairing tokenized comments with their scores.

    Each item is ``(comments_token_ids[idx], scores[idx])``. ``sentiments`` is
    kept on the instance but is not part of the returned item.
    """

    def __init__(self, comments_token_ids, sentiments, scores):
        """Store the parallel sequences.

        Args:
            comments_token_ids: Indexable collection of tokenized comments.
            sentiments: Sentiment labels; stored unchanged.
            scores: Indexable collection of scores, one per comment.

        Raises:
            ValueError: If ``comments_token_ids`` and ``scores`` differ in length.
        """
        # __len__ follows the comments while __getitem__ also indexes scores,
        # so a mismatch would only show up as a late IndexError or as silently
        # unused scores. Fail at construction instead.
        if len(comments_token_ids) != len(scores):
            raise ValueError(
                f"comments_token_ids and scores must have the same length, "
                f"got {len(comments_token_ids)} and {len(scores)}"
            )
        self.comments_token_ids = comments_token_ids
        self.sentiments = sentiments
        self.scores = scores

    def __len__(self):
        """Return the number of comments."""
        return len(self.comments_token_ids)

    def __getitem__(self, idx):
        """Return the ``(token_ids, score)`` pair stored at ``idx``."""
        return self.comments_token_ids[idx], self.scores[idx]
    
def create_IMDB_dataloader(comments_token_ids, sentiment, scores, batch_size=32, shuffle=True, num_workers=0, use_embedding=True, pad_value=50257):
    """Wrap tokenized IMDB data in a ``DataLoader`` that pads every batch.

    Args:
        comments_token_ids: Tokenized comments, forwarded to ``IMDBDataset``.
        sentiment: Sentiment labels, forwarded to ``IMDBDataset``.
        scores: Scores, forwarded to ``IMDBDataset``.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the data every epoch.
        num_workers: Number of worker processes used for loading.
        use_embedding: Forwarded to ``collate_fn``; selects between a padded
            tensor (True) and a packed sequence (False).
        pad_value: Fill value forwarded to ``collate_fn``.

    Returns:
        The configured ``DataLoader``.
    """
    from functools import partial

    dataset = IMDBDataset(comments_token_ids, sentiment, scores)
    # A partial over the named collate function can be pickled by reference,
    # unlike a local lambda; the loader pickles collate_fn when worker
    # processes are started with the spawn method (num_workers > 0).
    batch_collate = partial(collate_fn, pad_value=pad_value, use_embedding=use_embedding)

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=batch_collate,
    )

In [152]:
# Convert images to tensors, then standardize the single channel.
# NOTE(review): 0.1307 / 0.3081 are presumably the MNIST pixel mean and std; confirm.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

mnist_train = datasets.MNIST(root='data', train=True, download=True, transform=transform)
mnist_test = datasets.MNIST(root='data', train=False, download=True, transform=transform)

# Hold out 10% of the training images for validation.
train_size = int(0.9 * len(mnist_train))
val_size = len(mnist_train) - train_size

# A dedicated seeded generator makes the train/validation membership identical
# on every run without touching the global RNG state.
MNIST_SPLIT_SEED = 42
mnist_split_generator = torch.Generator().manual_seed(MNIST_SPLIT_SEED)
mnist_train_set, mnist_val_set = random_split(
    mnist_train, [train_size, val_size], generator=mnist_split_generator
)

# Quick sanity check on one example and on the split sizes.
sample, target = mnist_train[0]
print(f"Sample count: {len(mnist_train)}")
print(f"Sample shape: {sample.shape}, Target: {target}")
print(f"Sample type: {sample.dtype}, Target type: {type(target)}")
print(f"Sample min: {sample.min()}, Sample max: {sample.max()}")
print(f"Sample training set size: {len(mnist_train_set)}")
print(f"Sample validation set size: {len(mnist_val_set)}")

Sample count: 60000
Sample shape: torch.Size([1, 28, 28]), Target: 5
Sample type: torch.float32, Target type: <class 'int'>
Sample min: -0.4242129623889923, Sample max: 2.821486711502075
Sample training set size: 54000
Sample validation set size: 6000


In [153]:
# Read the two IMDB tables (train first, then test).
IMDB_train, IMDB_test = (
    pd.read_csv(csv_path) for csv_path in ("data/IMDB_train.csv", "data/IMDB_test.csv")
)

# Pull the three columns used below out of each table as plain Python lists.
train_comments, train_sentiments, train_scores = (
    IMDB_train[column].to_list() for column in ("preprocessed_comments", "sentiment", "score")
)
test_comments, test_sentiments, test_scores = (
    IMDB_test[column].to_list() for column in ("preprocessed_comments", "sentiment", "score")
)

# Encode every comment with the GPT-2 encoding from tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")
tokenized_train_comments, tokenized_test_comments = (
    tokenize_text(comments, tokenizer) for comments in (train_comments, test_comments)
)

# Inspect the first training example and decode its ids back to text.
sample, sample_sentiment, sample_score, token_ids = (
    values[0]
    for values in (train_comments, train_sentiments, train_scores, tokenized_train_comments)
)
reconstructed = tokenizer.decode(token_ids.tolist())

print(
    f"Sample: {sample}",
    f"Sentiment: {sample_sentiment}",
    f"Score: {sample_score}",
    f"Token IDs: {token_ids}",
    f"Reconstructed: {reconstructed}",
    sep="\n",
)

Sample: For movie get respect sure lot memorable quote listed gem . Imagine movie Joe Piscopo actually funny ! Maureen Stapleton scene stealer . The Moroni character absolute scream . Watch Alan The Skipper Hale jr . police Sgt .
Sentiment: 1
Score: 9
Token IDs: tensor([ 1890,  3807,   651,  2461,  1654,  1256, 18078,  9577,  5610, 16840,
          764, 18450,  3807,  5689,   350,  2304,   404,    78,  1682,  8258,
         5145,  6669, 49851,   520,   499, 10565,  3715,  8711,   263,   764,
          383,  3461, 14651,  2095,  4112,  8196,   764,  6305, 12246,   383,
         3661, 14710, 35056,   474,    81,   764,  1644, 22925,   764])
Reconstructed: For movie get respect sure lot memorable quote listed gem . Imagine movie Joe Piscopo actually funny ! Maureen Stapleton scene stealer . The Moroni character absolute scream . Watch Alan The Skipper Hale jr . police Sgt .


In [154]:
# The padding id is set to vocab_size, i.e. one past the last id in
# [0, vocab_size); it equals the 50257 default that the loader factory pads with.
vocab_size = 50257
embedding_dim = 64
padding_token_id = vocab_size

train_data = create_IMDB_dataloader(tokenized_train_comments, train_sentiments, train_scores)
# The test loader is only iterated for evaluation, so keep its order fixed
# instead of inheriting the factory's shuffle=True default.
test_data = create_IMDB_dataloader(tokenized_test_comments, test_sentiments, test_scores, shuffle=False)

#sample = next(iter(train_data))
#print(f"Sample: {sample[0]}, Target: {sample[1]}")