# IMDB Classification Training

Based on https://www.kaggle.com/code/hishamkhdair/nlp-imdb-classification-pytorch-embeddings

In [5]:
# Print one positive training review to see what the raw text files contain
!cat aclImdb/train/pos/10001_10.txt

Brilliant over-acting by Lesley Ann Warren. Best dramatic hobo lady I have ever seen, and love scenes in clothes warehouse are second to none. The corn on face is a classic, as good as anything in Blazing Saddles. The take on lawyers is also superb. After being accused of being a turncoat, selling out his boss, and being dishonest the lawyer of Pepto Bolt shrugs indifferently "I'm a lawyer" he says. Three funny words. Jeffrey Tambor, a favorite from the later Larry Sanders show, is fantastic here too as a mad millionaire who wants to crush the ghetto. His character is more malevolent than usual. The hospital scene, and the scene where the homeless invade a demolition site, are all-time classics. Look for the legs scene and the two big diggers fighting (one bleeds). This movie gets better each time I see it (which is quite often).

In [6]:
import os
import pathlib

def read_text_data(data_path):
    """Read every review stored under ``data_path/pos`` and ``data_path/neg``.

    Files are visited in sorted filename order (all of ``pos`` first, then all
    of ``neg``), so the returned lists are identical on every run. Plain
    ``os.listdir`` returns entries in arbitrary order.

    Args:
        data_path: Directory containing the ``pos`` and ``neg`` sub-directories
            (a ``str`` or ``pathlib.Path``).

    Returns:
        A ``(texts, labels)`` tuple of equally long lists, where ``labels[i]``
        is 1 for a review read from ``pos`` and 0 for one read from ``neg``.
    """
    texts = []
    labels = []
    for label in ['pos', 'neg']:
        label_path = os.path.join(data_path, label)
        label_value = 1 if label == 'pos' else 0
        # sorted() pins the sample order so downstream splits are reproducible
        for text_file in sorted(os.listdir(label_path)):
            with open(os.path.join(label_path, text_file), 'r', encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label_value)
    return texts, labels

# Root directory of the saved dataset
data_path = pathlib.Path("aclImdb")

# Load every training review together with its label
train_dir = data_path / 'train'
texts, labels = read_text_data(train_dir)

print(f'Successfully read {len(texts)} texts, and {len(labels)} labels from training dataset')

Successfully read 25000 texts, and 25000 labels from training dataset


In [32]:
import pickle
# Persist the training texts to text.pkl. The context manager closes the file
# even if pickle.dump raises, which a manual open()/close() pair does not.
with open("text.pkl", "wb") as filehandler:
    pickle.dump(texts, filehandler)


In [9]:
from torchtext.data.utils import get_tokenizer

# Tokenizer used for all text preprocessing: torchtext's built-in
# 'basic_english' tokenizer, called with a string and returning a list of tokens
tokenizer = get_tokenizer('basic_english')

In [10]:
# Sanity check: the tokenizer lower-cases the words and drops the ';'
tokenizer('HERE Is an Example ;')


['here', 'is', 'an', 'example']

In [11]:
from torchtext.vocab import build_vocab_from_iterator

# Build the vocabulary from the tokenized training texts; '<unk>' is the only
# reserved special token
token_stream = (tokenizer(raw_text) for raw_text in texts)
vocab = build_vocab_from_iterator(token_stream, specials=['<unk>'])

# Tokens missing from the vocabulary are looked up as '<unk>'
unk_index = vocab['<unk>']
vocab.set_default_index(unk_index)

def numericalize_text(text):
    """Convert raw text into the list of vocabulary indices of its tokens."""
    tokens = tokenizer(text)
    # One vocabulary lookup per token, in token order
    return list(map(vocab.__getitem__, tokens))

In [13]:
# Size of the vocabulary built from the text set (100683 unique tokens)
print(len(vocab))

# Indices of words that are present in the vocabulary
sample_tokens = ['here', 'is', 'an', 'example']
print(vocab(sample_tokens))

# A word that is not in the vocabulary falls back to 0, the index of <unk>
print(vocab['biblioklept'])

100683
[131, 9, 40, 464]
0


In [14]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split

class CustomTextDataset(Dataset):
    """Dataset pairing each raw text with its label.

    Texts are stored as given and converted to vocabulary indices lazily, each
    time an item is accessed.
    """

    def __init__(self, texts, labels, vocab, numericalize_text):
        # Plain attribute storage; nothing is numericalized up front
        self.texts, self.labels = texts, labels
        self.vocab = vocab
        self.numericalize_text = numericalize_text

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return ``(numericalized_text, label)`` for the sample at ``index``."""
        sample_label = self.labels[index]
        return self.numericalize_text(self.texts[index]), sample_label

In [15]:
# Create train and validation datasets (80% / 20% of the samples)
dataset = CustomTextDataset(texts, labels, vocab, numericalize_text)
train_size = int(len(dataset) * 0.8)
val_size = len(dataset) - train_size

# Seed the split explicitly: without a fixed generator the train/validation
# partition is drawn from the global RNG and changes on every run, so
# validation metrics are not comparable between runs.
SPLIT_SEED = 42
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    """Collate ``(token_ids, label)`` samples into padded batch tensors.

    Args:
        batch: Iterable of ``(token_ids, label)`` pairs, where ``token_ids`` is
            a list of vocabulary indices.

    Returns:
        ``(labels, padded_text)``, both moved to ``device``. ``labels`` is a
        float64 tensor with one entry per sample. ``padded_text`` is an int64
        tensor of shape ``(max_len, batch)`` (sequence-first, because
        ``batch_first=False``) in which shorter sequences are right-padded
        with index 1.
    """
    label_list, text_list = [], []
    for (_text, _label) in batch:
        label_list.append(_label)
        # Force int64: torch.tensor([]) defaults to float32 for a sample with
        # no tokens, which can turn the padded batch into a float tensor that
        # an embedding layer cannot be indexed with.
        text_list.append(torch.tensor(_text, dtype=torch.long))
    # NOTE(review): the vocabulary reserves only '<unk>' (index 0), so the
    # padding index 1 is an ordinary vocabulary token rather than a dedicated
    # padding token -- confirm whether a '<pad>' special should be added.
    padded_text = pad_sequence(text_list, batch_first=False, padding_value=1.0)
    return torch.tensor(label_list, dtype=torch.float64).to(device), padded_text.to(device)

# Batch size shared by the training and validation loaders
batch_size = 32

# Both loaders collate with collate_batch; only the training loader shuffles
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
)

In [17]:
# Peek at a single training batch: labels first, then the padded token indices
sample_batch = next(iter(train_loader))
label, text = sample_batch
print(label.shape, text.shape)
print(label, text)

torch.Size([32]) torch.Size([662, 32])
tensor([0., 0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 1.,
        1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0.],
       dtype=torch.float64) tensor([[   25,    13, 30712,  ...,    59,    93,   578],
        [  237,     9,    16,  ...,    85,     6,    11],
        [   33,    34,     1,  ...,    84,    36,     1],
        ...,
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1]])


In [18]:
from torch import nn
import torch.nn.functional as F

class TextClassificationModel(nn.Module):
    """Embedding lookup, average over the sequence, then one linear layer."""

    def __init__(self, vocab_size, embedding_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text):
        """Return one row of ``output_dim`` raw scores per sequence.

        ``text`` is indexed as (sequence position, batch): the first two axes
        of the embedded tensor are swapped before pooling over axis 1.
        """
        token_vectors = self.embedding(text)
        # (seq, batch, emb) -> (batch, seq, emb)
        batch_major = token_vectors.permute(1, 0, 2)
        seq_len = batch_major.shape[1]
        # A (seq_len, 1) window averages over every position of a sequence
        # (padded positions included) and leaves the embedding axis untouched,
        # giving (batch, 1, emb); squeeze(1) drops the singleton axis.
        averaged = F.avg_pool2d(batch_major, kernel_size=(seq_len, 1))
        sentence_vectors = averaged.squeeze(1)
        return self.fc(sentence_vectors)

In [19]:
# Create the classifier: a 100-dimensional embedding per vocabulary entry and a
# single output logit
model = TextClassificationModel(vocab_size = len(vocab), embedding_dim = 100, output_dim = 1)

# Move the model to the device (CPU or GPU) *before* building the optimizer, as
# the torch.optim documentation recommends, so the optimizer is constructed
# from the parameters as they exist on the target device
model = model.to(device)

# Binary cross entropy computed on raw logits (the sigmoid is part of the loss)
criterion = nn.BCEWithLogitsLoss()
# Adam with its default hyper-parameters
optimizer = torch.optim.Adam(model.parameters())

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
# Number of full passes over the training set
NUM_EPOCHS = 10

for epoch in range(NUM_EPOCHS):
    # ---- training pass ----
    # Metrics are accumulated per sample (not per batch) so that a shorter
    # final batch does not get the same weight as a full one.
    loss_sum = 0.0
    correct_count = 0
    sample_count = 0

    model.train()
    for label, text in train_loader:
        optimizer.zero_grad()
        predictions = model(text).squeeze(1)
        loss = criterion(predictions, label)
        loss.backward()
        optimizer.step()

        batch_len = label.size(0)
        # A sigmoid output rounded to 0/1 is the predicted class
        rounded_preds = torch.round(torch.sigmoid(predictions.detach()))
        # criterion returns the batch mean, so scale it back to a batch sum
        loss_sum += loss.item() * batch_len
        correct_count += (rounded_preds == label).sum().item()
        sample_count += batch_len

    print("Epoch %d Train: Loss: %.4f Acc: %.4f" % (epoch + 1, loss_sum / sample_count,
                                                    correct_count / sample_count))

    # ---- validation pass (no gradient tracking, eval mode) ----
    loss_sum = 0.0
    correct_count = 0
    sample_count = 0

    model.eval()
    with torch.no_grad():
        for label, text in val_loader:
            predictions = model(text).squeeze(1)
            loss = criterion(predictions, label)

            batch_len = label.size(0)
            rounded_preds = torch.round(torch.sigmoid(predictions))
            loss_sum += loss.item() * batch_len
            correct_count += (rounded_preds == label).sum().item()
            sample_count += batch_len

    print("Epoch %d Valid: Loss: %.4f Acc: %.4f" % (epoch + 1, loss_sum / sample_count,
                                                    correct_count / sample_count))

Epoch 1 Train: Loss: 0.6837 Acc: 0.5776
Epoch 1 Valid: Loss: 0.6696 Acc: 0.5661
Epoch 2 Train: Loss: 0.6322 Acc: 0.7256
Epoch 2 Valid: Loss: 0.5932 Acc: 0.7400
Epoch 3 Train: Loss: 0.5476 Acc: 0.7864
Epoch 3 Valid: Loss: 0.5134 Acc: 0.8027
Epoch 4 Train: Loss: 0.4677 Acc: 0.8385
Epoch 4 Valid: Loss: 0.4520 Acc: 0.8336
Epoch 5 Train: Loss: 0.4078 Acc: 0.8688
Epoch 5 Valid: Loss: 0.4058 Acc: 0.8475
Epoch 6 Train: Loss: 0.3580 Acc: 0.8877
Epoch 6 Valid: Loss: 0.3701 Acc: 0.8682
Epoch 7 Train: Loss: 0.3210 Acc: 0.9007
Epoch 7 Valid: Loss: 0.3451 Acc: 0.8768
Epoch 8 Train: Loss: 0.2913 Acc: 0.9102
Epoch 8 Valid: Loss: 0.3280 Acc: 0.8832
Epoch 9 Train: Loss: 0.2689 Acc: 0.9155
Epoch 9 Valid: Loss: 0.3160 Acc: 0.8814
Epoch 10 Train: Loss: 0.2482 Acc: 0.9237
Epoch 10 Valid: Loss: 0.3037 Acc: 0.8875


In [21]:
# Read the test split. read_text_data returns (texts, labels) in that order, so
# the names are unpacked in the same order to match what they actually hold.
test_texts, test_labels = read_text_data(data_path/'test')

# Wrap the test data in the same dataset class, reusing the training vocabulary
# and numericalize function (CustomTextDataset takes texts first, then labels)
test_dataset = CustomTextDataset(test_texts, test_labels, vocab, numericalize_text)

# Data loader for the test dataset; order is irrelevant for evaluation, so no shuffling
test_loader = DataLoader(test_dataset, collate_fn=collate_batch, batch_size=batch_size, shuffle=False)

In [22]:
# Evaluate on the test set. Metrics are accumulated per sample (not per batch)
# so that a shorter final batch is not over-weighted in the averages.
test_loss_sum = 0.0
test_correct = 0
test_seen = 0

model.eval()
with torch.no_grad():
    for label, text in test_loader:
        predictions = model(text).squeeze(1)
        loss = criterion(predictions, label)

        batch_len = label.size(0)
        # A sigmoid output rounded to 0/1 is the predicted class
        rounded_preds = torch.round(torch.sigmoid(predictions))
        # criterion returns the batch mean, so scale it back to a batch sum
        test_loss_sum += loss.item() * batch_len
        test_correct += (rounded_preds == label).sum().item()
        test_seen += batch_len

test_loss = test_loss_sum / test_seen
test_acc = test_correct / test_seen

print("Test: Loss: %.4f Acc: %.4f" % (test_loss, test_acc))

Test: Loss: 0.3210 Acc: 0.8817


In [33]:
# Save only the model's state_dict (not the model object) to sentiment-model.pt
torch.save(model.state_dict(), 'sentiment-model.pt')



In [24]:
def text_pipeline(x):
    """Tokenize a sentence and map its tokens to vocabulary indices."""
    return vocab(tokenizer(x))

def predict_sentiment(model, sentence):
    """Return the sigmoid of the model's output for a single sentence.

    Args:
        model: Model called with an int64 tensor of shape ``(seq_len, 1)``
            holding the sentence's vocabulary indices; it must produce a
            single value. It is switched to eval mode as a side effect.
        sentence: Raw text to score.

    Returns:
        The sigmoid of the model output as a Python float.

    Raises:
        ValueError: If ``sentence`` yields no tokens, since there would be
            nothing to pool over.
    """
    model.eval()
    token_ids = text_pipeline(sentence)
    if not token_ids:
        raise ValueError("sentence must contain at least one token")
    # Explicit int64 so the tensor is always a valid embedding index
    text = torch.tensor(token_ids, dtype=torch.long).unsqueeze(1).to(device)
    # Inference only: skip building the autograd graph
    with torch.no_grad():
        prediction = model(text)
    return torch.sigmoid(prediction).item()

In [25]:
# Spot-check on a clearly negative sentence; the score should be close to 0
sentiment = predict_sentiment(model, "Very bad movie")
sentiment

9.9899903979298e-35

In [26]:
# Spot-check on a clearly positive sentence; the score should be close to 1
sentiment = predict_sentiment(model, "This movie is awesome")
sentiment

1.0

In [36]:
# Rebuild the architecture and restore the saved weights. map_location remaps
# the stored tensors onto the current device when the checkpoint is read.
loaded_model = TextClassificationModel(vocab_size = len(vocab), embedding_dim = 100, output_dim = 1)
loaded_model.load_state_dict(torch.load('sentiment-model.pt', map_location=device))
# predict_sentiment moves its input to `device`, so the model must live there too
loaded_model = loaded_model.to(device)

# Score with the *reloaded* model (not the in-memory `model`) so this cell
# actually exercises the save/load round trip
for sentence in ("Very bad movie", "This movie is awesome"):
    sentiment = predict_sentiment(loaded_model, sentence)
    # Scores above 0.5 are reported as positive
    if sentiment > 0.5:
        print("positive")
    else:
        print("negativ")

negativ
