In [1]:
import torch
import spacy
import tqdm
import torch.nn as nn
from torchtext import data, datasets

In [2]:
# English spaCy pipeline; predict_sentiment() uses its tokenizer to split raw text at inference time.
# NOTE(review): the "en" shortcut name looks spaCy-2.x specific - confirm against the pinned spaCy version.
spacy_en = spacy.load("en")

In [3]:
# TEXT tokenizes reviews with spaCy. include_lengths=True makes each batch's `text`
# attribute a (token_ids, lengths) pair, which the training loop unpacks and passes to the model.
TEXT = data.Field(tokenize="spacy", include_lengths=True)
# Labels are stored as floats so they can be fed straight into BCEWithLogitsLoss.
LABEL = data.LabelField(dtype=torch.float)

In [4]:
# Load the IMDB reviews as train/test splits, processed through the TEXT and LABEL fields
# (the archive is downloaded when it is not already present, as the output below shows).
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

aclImdb_v1.tar.gz:   0%|          | 147k/84.1M [00:00<01:05, 1.28MB/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:01<00:00, 64.0MB/s]


In [5]:
len(train_data), len(test_data)

(25000, 25000)

In [6]:
# Show the tokens and label of the first training example, then stop.
for example in train_data:
    fields = vars(example)
    print(fields["text"])
    print(fields["label"])
    break

['Not', 'sure', 'one', 'can', 'call', 'this', 'an', 'anti', '-', 'war', 'film', ',', 'it', 'shows', 'war', 'at', 'an', 'elite', 'level', '.', 'These', 'are', 'elite', 'troops', 'that', 'know', 'what', 'they', 'are', 'doing', 'and', 'take', 'great', 'pride', 'in', 'it', '.', 'Even', 'when', 'they', 'are', 'pacifist', ',', 'they', 'still', 'enjoy', 'the', 'skill', 'level', 'and', 'defeating', 'their', 'foes', ',', 'even', 'if', 'it', 'does', 'go', 'against', 'being', 'a', 'pacifist', '.', 'The', 'movies', 'is', 'slow', 'and', 'rather', 'uneventful', 'and', 'in', 'many', 'ways', 'is', 'rather', 'tame', 'as', 'war', 'movies', 'go', '-', 'more', 'so', 'by', 'todays', 'standards', ',', 'no', 'body', 'parts', 'flying', 'off', 'as', 'in', 'modern', 'movies', '.', 'It', 'is', 'brutal', 'in', 'other', 'ways', 'though', 'as', 'you', 'see', 'killing', 'at', 'a', 'personal', 'level', '.', 'This', 'is', 'more', 'of', 'a', 'thinking', 'man', "'s", 'movie', '.', 'Once', 'you', 'start', 'to', 'watch', 

In [7]:
# Build the token vocabulary from the training split only, capped at 25,000 entries.
# The model summary further down reports 25,002 embedding rows, i.e. the cap plus two
# special tokens (presumably <unk> and <pad> - confirm against the Field defaults).
TEXT.build_vocab(train_data, max_size=25000)
# Build the label vocabulary from the training split.
LABEL.build_vocab(train_data)

In [8]:
class Net(nn.Module):
    """LSTM sentiment classifier: embedding -> (bi)LSTM -> dropout -> linear head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Dimensionality of each token embedding.
        hidden_size: Hidden size of the LSTM, per direction.
        output_size: Number of output logits per example.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM also reads the sequence backwards.
        p: Dropout probability, applied to the embeddings, between stacked
            LSTM layers, and to the final hidden state.
        pad_idx: Vocabulary index of the padding token (passed to
            ``nn.Embedding`` as ``padding_idx``).
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, output_size, num_layers, bidirectional, p, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_idx)
        # nn.LSTM applies dropout only between stacked layers, so with a single
        # layer a non-zero value has no effect and only emits a warning.
        lstm_dropout = p if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers=num_layers,
                            bidirectional=bidirectional, dropout=lstm_dropout)
        # The head consumes one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_size * num_directions, output_size)
        self.dropout = nn.Dropout(p)

    def forward(self, texts, text_lengths):
        """Return one row of raw logits per sequence in the batch.

        Args:
            texts: Token-id tensor laid out sequence-first, i.e.
                ``(seq_len, batch)``; sequences must be ordered longest first
                because packing is done with the default ``enforce_sorted``.
            text_lengths: Unpadded length of every sequence in ``texts``
                (tensor or list).

        Returns:
            Tensor of shape ``(batch, output_size)`` holding unnormalised
            logits (no sigmoid applied).
        """
        embedded = self.dropout(self.embedding(texts))

        # pack_padded_sequence wants the lengths on the CPU regardless of
        # where the embeddings live.
        lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths)

        # Only the final hidden state is needed; the per-step outputs are
        # discarded, so there is no reason to unpack them.
        _, (h, _) = self.lstm(packed_embedded)

        if self.lstm.bidirectional:
            # Last layer's forward (h[-2]) and backward (h[-1]) final states.
            hidden = torch.cat((h[-2, :, :], h[-1, :, :]), dim=1)
        else:
            hidden = h[-1, :, :]

        return self.fc(self.dropout(hidden))

In [9]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Training settings.
epochs = 10
batch_size = 64
# Model hyperparameters, passed positionally to Net(...) further down.
vocab_size = len(TEXT.vocab)
embedding_size = 100
hidden_size = 256
output_size = 1  # a single logit per review, scored with BCEWithLogitsLoss
num_layers = 2
bidirectional = True
p = 0.5  # dropout probability used throughout Net
# Index of the padding token; handed to nn.Embedding as padding_idx.
pad_idx = TEXT.vocab.stoi["<pad>"]

In [10]:
device

device(type='cuda')

In [11]:
# Batch iterators over both splits, with tensors created on `device`.
# sort_within_batch=True orders each batch longest-first (see the printed lengths
# in the next cell), which Net.forward relies on when packing the padded sequences.
train_batches, test_batches = data.BucketIterator.splits((train_data, test_data), batch_size=batch_size,
                                                         sort_within_batch=True, device=device)

In [12]:
# Inspect the first training batch: token ids, per-example lengths, and the label shape.
for batch in train_batches:
    token_ids, lengths = batch.text
    for tensor in (token_ids, lengths):
        print(tensor)
        print(tensor.shape)
    print(batch.label.shape)
    break

tensor([[ 4631,  1388,    66,  ...,   171,    25,  3246],
        [   16,    95,  3844,  ...,    31,   106,    33],
        [   22,   248,    24,  ...,   145,    68,   323],
        ...,
        [   12,     2,    80,  ...,     1,     1,     1],
        [ 1036, 21347,    16,  ...,     1,     1,     1],
        [    4,     4,     4,  ...,     1,     1,     1]], device='cuda:0')
torch.Size([58, 64])
tensor([58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 57, 57, 57, 57, 57, 57, 57, 57,
        56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 55,
        55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 53, 53, 53, 53, 53, 53, 53,
        53, 52, 52, 52, 52, 52, 52, 51, 51, 51], device='cuda:0')
torch.Size([64])
torch.Size([64])


In [13]:
# Build the classifier from the hyperparameters above and move it to the selected device.
net = Net(vocab_size, embedding_size, hidden_size, output_size, num_layers, bidirectional, p, pad_idx).to(device)
# Bare expression: displays the layer summary as the cell output.
net

Net(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (lstm): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [14]:
# Adam over every model parameter.
opt = torch.optim.Adam(net.parameters(), lr=1e-3)
# Sigmoid and binary cross-entropy fused into one op, so Net emits raw logits.
loss_fn = nn.BCEWithLogitsLoss()

In [15]:
def get_accuracy(preds, y):
    """Fraction of examples whose thresholded prediction matches the target.

    Args:
        preds: Raw logits; they are squashed with a sigmoid and rounded to
            0/1 before comparison.
        y: Target labels, compared element-wise against the rounded predictions.

    Returns:
        A zero-dimensional tensor: number of matches divided by ``len(y)``.
    """
    hard_labels = torch.sigmoid(preds).round()
    hits = hard_labels.eq(y).float()
    return hits.sum() / len(hits)

In [16]:
def loop(net, batches, train):
    """Run one full pass over ``batches``, either training or evaluating ``net``.

    Uses the notebook-level ``device``, ``loss_fn``, ``opt`` and
    ``get_accuracy``. Training and evaluation share a single batch body; the
    only differences are the module mode, whether autograd is enabled, and
    whether an optimizer step is taken.

    Args:
        net: Model called as ``net(texts, text_lengths)``.
        batches: Sized iterable of batches exposing ``.text`` as a
            ``(token_ids, lengths)`` pair and ``.label``.
        train: ``True`` to update the weights, ``False`` for inference only.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over batches, both as Python
        floats.

    Raises:
        ZeroDivisionError: If ``batches`` yields no batches.
    """
    print("Train Loop:" if train else "Inference Loop:")
    # train(True) is train(); train(False) is eval().
    net.train(train)

    batch_losses = []
    batch_accs = []

    # Gradients are only tracked while training.
    with torch.set_grad_enabled(train):
        for batch in tqdm.tqdm(batches, total=len(batches)):
            texts, text_lengths = batch.text
            texts = texts.to(device)
            # Sequence packing inside the model needs the lengths on the CPU.
            text_lengths = text_lengths.to(torch.device("cpu"))
            labels = batch.label.to(device)

            # (batch, 1) logits -> (batch,) to line up with the labels.
            preds = net(texts, text_lengths).squeeze(1)

            loss = loss_fn(preds, labels)
            acc = get_accuracy(preds, labels)

            if train:
                opt.zero_grad()
                loss.backward()
                opt.step()

            # Store plain floats so no device tensors are kept alive per batch.
            batch_losses.append(loss.item())
            batch_accs.append(acc.item())

    print("")
    print("")

    return sum(batch_losses) / len(batch_losses), sum(batch_accs) / len(batch_accs)

In [17]:
def predict_sentiment(net, text):
    """Print the model's sentiment probability for a raw text string.

    Uses the notebook-level ``spacy_en`` tokenizer, ``TEXT`` vocabulary and
    ``device``. The model is put in eval mode with autograd disabled for the
    duration of the call, so dropout never perturbs the score; its previous
    train/eval mode is restored afterwards.

    Args:
        net: Model called as ``net(token_ids, lengths)`` returning one logit.
        text: Text to score.

    Raises:
        ValueError: If ``text`` produces no tokens.
    """
    tokens = [t.text for t in spacy_en.tokenizer(text)]
    if not tokens:
        # A zero-length sequence cannot be packed; fail with a clear message.
        raise ValueError("text must contain at least one token")

    indices = [TEXT.vocab.stoi[t] for t in tokens]
    # Shape (seq_len, 1): a batch holding a single sequence.
    indices = torch.LongTensor(indices).unsqueeze(1).to(device)

    # The length stays on the CPU, as sequence packing requires.
    text_len = torch.LongTensor([len(tokens)])

    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            preds = torch.sigmoid(net(indices, text_len))
    finally:
        # Leave the model in whatever mode the caller had it in.
        net.train(was_training)

    print(f"sentiment: {preds.item()}")

In [18]:
# Fixed probe sentence; the training cell scores it after every epoch.
text = "this is a very good idea"

In [19]:
# Train for `epochs` passes. After each pass, evaluate on the test iterator,
# print the epoch metrics, and score the fixed probe sentence.
# NOTE(review): the test split is used as the validation set here, so the val_*
# numbers are not from a separate held-out set - confirm this is intended.
for epoch in range(epochs):
    train_loss, train_acc = loop(net, train_batches, True)
    val_loss, val_acc = loop(net, test_batches, False)
    
    print(f"epoch: {epoch} | train_loss: {train_loss:.4f} | train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} | val_acc: {val_acc:.4f}")
    predict_sentiment(net, text)
    print("")

  0%|          | 0/391 [00:00<?, ?it/s]

Train Loop:


100%|██████████| 391/391 [00:49<00:00,  7.83it/s]
  3%|▎         | 11/391 [00:00<00:03, 101.69it/s]



Inference Loop:


100%|██████████| 391/391 [00:15<00:00, 25.41it/s]
  0%|          | 0/391 [00:00<?, ?it/s]



epoch: 0 | train_loss: 0.6826 | train_acc: 0.5582 | val_loss: 0.6584 | val_acc: 0.6224
sentiment: 0.6014764308929443

Train Loop:


100%|██████████| 391/391 [00:53<00:00,  7.37it/s]
  3%|▎         | 10/391 [00:00<00:04, 94.73it/s]



Inference Loop:


100%|██████████| 391/391 [00:15<00:00, 25.54it/s]
  0%|          | 1/391 [00:00<00:41,  9.37it/s]



epoch: 1 | train_loss: 0.6370 | train_acc: 0.6390 | val_loss: 0.5765 | val_acc: 0.7085
sentiment: 0.674282431602478

Train Loop:


100%|██████████| 391/391 [00:52<00:00,  7.40it/s]
  3%|▎         | 11/391 [00:00<00:03, 102.69it/s]



Inference Loop:


100%|██████████| 391/391 [00:15<00:00, 25.29it/s]
  1%|          | 2/391 [00:00<00:22, 17.56it/s]



epoch: 2 | train_loss: 0.5703 | train_acc: 0.7070 | val_loss: 0.5690 | val_acc: 0.6911
sentiment: 0.7461382150650024

Train Loop:


100%|██████████| 391/391 [00:52<00:00,  7.43it/s]
  3%|▎         | 10/391 [00:00<00:03, 97.52it/s]



Inference Loop:


100%|██████████| 391/391 [00:15<00:00, 25.40it/s]
  0%|          | 0/391 [00:00<?, ?it/s]



epoch: 3 | train_loss: 0.4950 | train_acc: 0.7634 | val_loss: 0.4155 | val_acc: 0.8111
sentiment: 0.9088044762611389

Train Loop:


100%|██████████| 391/391 [00:52<00:00,  7.42it/s]
  3%|▎         | 11/391 [00:00<00:03, 101.48it/s]



Inference Loop:


100%|██████████| 391/391 [00:15<00:00, 25.38it/s]
  0%|          | 1/391 [00:00<00:40,  9.57it/s]



epoch: 4 | train_loss: 0.4073 | train_acc: 0.8176 | val_loss: 0.3880 | val_acc: 0.8380
sentiment: 0.9769014716148376

Train Loop:


100%|██████████| 391/391 [00:52<00:00,  7.43it/s]
  3%|▎         | 10/391 [00:00<00:03, 97.61it/s]



Inference Loop:


100%|██████████| 391/391 [00:15<00:00, 25.46it/s]
  0%|          | 1/391 [00:00<00:53,  7.36it/s]



epoch: 5 | train_loss: 0.3389 | train_acc: 0.8543 | val_loss: 0.3444 | val_acc: 0.8664
sentiment: 0.984970211982727

Train Loop:


100%|██████████| 391/391 [00:52<00:00,  7.43it/s]
  3%|▎         | 10/391 [00:00<00:03, 96.67it/s]



Inference Loop:


100%|██████████| 391/391 [00:15<00:00, 25.52it/s]
  0%|          | 1/391 [00:00<00:46,  8.33it/s]



epoch: 6 | train_loss: 0.3074 | train_acc: 0.8734 | val_loss: 0.3025 | val_acc: 0.8773
sentiment: 0.9739989638328552

Train Loop:


100%|██████████| 391/391 [00:52<00:00,  7.42it/s]
  3%|▎         | 10/391 [00:00<00:03, 98.88it/s]



Inference Loop:


100%|██████████| 391/391 [00:15<00:00, 25.40it/s]
  1%|          | 2/391 [00:00<00:31, 12.46it/s]



epoch: 7 | train_loss: 0.2789 | train_acc: 0.8836 | val_loss: 0.3116 | val_acc: 0.8793
sentiment: 0.9846983551979065

Train Loop:


100%|██████████| 391/391 [00:52<00:00,  7.46it/s]
  3%|▎         | 11/391 [00:00<00:03, 104.65it/s]



Inference Loop:


100%|██████████| 391/391 [00:15<00:00, 25.50it/s]
  0%|          | 0/391 [00:00<?, ?it/s]



epoch: 8 | train_loss: 0.2610 | train_acc: 0.8936 | val_loss: 0.2776 | val_acc: 0.8875
sentiment: 0.9836331009864807

Train Loop:


100%|██████████| 391/391 [00:52<00:00,  7.41it/s]
  3%|▎         | 10/391 [00:00<00:03, 99.26it/s]



Inference Loop:


100%|██████████| 391/391 [00:15<00:00, 25.34it/s]



epoch: 9 | train_loss: 0.2403 | train_acc: 0.9057 | val_loss: 0.2717 | val_acc: 0.8938
sentiment: 0.9774786233901978






In [21]:
def save_checkpoint(net, opt, filename):
    """Serialize the model and optimizer state to ``filename``.

    The checkpoint is a dict with two entries: ``"net_dict"`` (the model's
    ``state_dict()``) and ``"opt_dict"`` (the optimizer's ``state_dict()``).

    Args:
        net: Model whose parameters are saved.
        opt: Optimizer whose state is saved.
        filename: Destination handed to ``torch.save``.
    """
    torch.save(dict(net_dict=net.state_dict(), opt_dict=opt.state_dict()), filename)
    print("Checkpoint Saved!")

def load_checkpoint(net, opt, filename):
    """Restore model and optimizer state written by ``save_checkpoint``.

    Reads only the ``"net_dict"`` and ``"opt_dict"`` entries, which are the
    two keys ``save_checkpoint`` stores.

    Args:
        net: Model to load the saved parameters into (modified in place).
        opt: Optimizer to load the saved state into (modified in place).
        filename: Source handed to ``torch.load``.

    Raises:
        KeyError: If the checkpoint lacks ``"net_dict"`` or ``"opt_dict"``.
    """
    # Deserialize onto the CPU so a checkpoint written on a GPU machine can
    # still be opened without one; load_state_dict then copies the values into
    # the existing parameters wherever `net` already lives.
    check_point = torch.load(filename, map_location="cpu")
    net.load_state_dict(check_point["net_dict"])
    opt.load_state_dict(check_point["opt_dict"])
    print("Checkpoint Loaded!")

In [22]:
save_checkpoint(net, opt, "checkpoint.pth.tar")

Checkpoint Saved!


In [23]:
predict_sentiment(net, "this is a very bad idea")

sentiment: 0.1433289498090744


In [24]:
predict_sentiment(net, "this film is terrible")

sentiment: 0.007531510200351477


In [25]:
predict_sentiment(net, "you are terrific")

sentiment: 0.9466769099235535


In [26]:
predict_sentiment(net, "that is horrible")

sentiment: 0.0610734187066555


In [27]:
predict_sentiment(net, "yeet!!")

sentiment: 0.3996625244617462


In [28]:
predict_sentiment(net, "what are you doing?")

sentiment: 0.5638638138771057
