In [1]:
import torch
import spacy
import tqdm
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data, datasets

In [2]:
spacy_en = spacy.load("en")

In [3]:
TEXT = data.Field(tokenize="spacy", batch_first=True)
LABEL = data.LabelField(dtype=torch.float)

In [4]:
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

aclImdb_v1.tar.gz:   0%|          | 147k/84.1M [00:00<01:05, 1.28MB/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:01<00:00, 68.2MB/s]


In [5]:
len(train_data), len(test_data)

(25000, 25000)

In [6]:
for d in train_data:
    print(vars(d)["text"])
    print(vars(d)["label"])
    break

['Exquisite', 'comedy', 'starring', 'Marian', 'Davies', '(', 'with', 'the', 'affable', 'William', 'Haines', ')', '.', 'Young', 'Peggy', 'arrives', 'in', 'Hollywood', 'seeking', 'stardom', '.', 'Cameo', 'performances', 'showcase', '"', 'all', 'the', 'stars', 'in', 'MGM', "'s", 'heaven', '"', 'in', 'the', 'famous', 'commissary', 'scene', ',', 'plus', 'lots', 'of', 'vintage', 'film', 'making', 'detail', 'for', 'the', 'scholar', '.', 'Pic', 'also', 'captures', 'for', 'posterity', 'Davies', "'", 'famous', ',', 'wickedly', 'sarcastic', 'impersonations', 'of', 'the', 'top', 'stars', 'of', 'the', 'day', '(', 'her', 'Swanson', 'is', 'a', 'beaut!).<br', '/><br', '/>"Peggy', ',', '"', 'even', 'catches', 'herself', 'as', 'she', 'encounters', 'the', 'famous', 'star', 'Marian', 'Davies', 'at', 'tennis', ',', 'turns', 'up', 'her', 'nose', 'and', 'comments', ',', '"', 'Ohh', ',', 'I', 'do', "n't", 'like', 'her!"<br', '/><br', '/>My', 'print', 'was', 'perfect', '.', 'Story', ',', 'direction', ',', 'act

In [7]:
TEXT.build_vocab(train_data, max_size=25000)
LABEL.build_vocab(train_data)

In [8]:
class Net(nn.Module):
    def __init__(self, vocab_size, embedding_size, num_filters, filter_sizes, output_size, p, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.conv_layers = nn.ModuleList([nn.Conv1d(in_channels=embedding_size, out_channels=num_filters, 
                                                    kernel_size=fs) for fs in filter_sizes])
        
        self.fc = nn.Linear(num_filters * len(filter_sizes), output_size)

        self.dropout = nn.Dropout(p)

    def forward(self, text):
        embedded = self.embedding(text).permute(0, 2, 1)

        conved_n = [F.relu(conv(embedded)) for conv in self.conv_layers]
        pooled_n = [F.max_pool1d(conved, kernel_size=conved.shape[2]).squeeze(2) for conved in conved_n]

        pooled = self.dropout(torch.cat(pooled_n, dim=1))
        output = self.fc(pooled)

        return output

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
epochs = 10
batch_size = 64
vocab_size = len(TEXT.vocab)
embedding_size = 100
num_filters = 100
filter_sizes = [3, 4, 5]
output_size = 1
p = 0.5
pad_idx = TEXT.vocab.stoi["<pad>"]

In [10]:
device

device(type='cuda')

In [11]:
train_batches, test_batches = data.BucketIterator.splits((train_data, test_data), batch_size=batch_size, device = device)

In [12]:
net = Net(vocab_size, embedding_size, num_filters, filter_sizes, output_size, p, pad_idx).to(device)
net

Net(
  (embedding): Embedding(25002, 100)
  (conv_layers): ModuleList(
    (0): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(100, 100, kernel_size=(4,), stride=(1,))
    (2): Conv1d(100, 100, kernel_size=(5,), stride=(1,))
  )
  (fc): Linear(in_features=300, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [13]:
opt = torch.optim.Adam(net.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

In [14]:
def get_accuracy(preds, y):
    preds = torch.round(torch.sigmoid(preds))
    correct = (preds == y).float()
    acc = correct.sum() / len(correct)

    return acc

In [15]:
def loop(net, batches, train):
    batch_losses = []
    batch_accs = []

    if train:
        print("Train Loop:")
        net.train()
        for batch in tqdm.tqdm(batches, total=len(batches)):
            texts = batch.text.to(device)
            labels = batch.label.to(device)

            preds = net(texts)
            preds = preds.squeeze(1)

            loss = loss_fn(preds, labels)
            acc = get_accuracy(preds, labels)

            opt.zero_grad()
            loss.backward()
            opt.step()

            batch_losses.append(loss.item())
            batch_accs.append(acc)

    else:
        print("Inference Loop:")
        net.eval()
        with torch.no_grad():
            for batch in tqdm.tqdm(batches, total=len(batches)):
                texts = batch.text.to(device)
                labels = batch.label.to(device)

                preds = net(texts)
                preds = preds.squeeze(1)

                loss = loss_fn(preds, labels)
                acc = get_accuracy(preds, labels)

                batch_losses.append(loss.item())
                batch_accs.append(acc) 

    print("")
    print("")
    
    return sum(batch_losses) / len(batch_losses), sum(batch_accs) / len(batch_accs)

In [16]:
def predict_sentiment(net, text, min_len=5):
    net.eval()
    tokens = [t.text for t in spacy_en.tokenizer(text)]
    if len(tokens) < min_len:
        tokens += ["<pad>"] * (min_len - len(tokens))

    indices = [TEXT.vocab.stoi[t] for t in tokens]
    indices = torch.LongTensor(indices).unsqueeze(0).to(device)
    
    preds = net(indices)
    preds = torch.sigmoid(preds)
    
    print(f"sentiment: {preds.item()}")

In [17]:
text = "this is a very good idea"

In [18]:
for epoch in range(epochs):
    train_loss, train_acc = loop(net, train_batches, True)
    val_loss, val_acc = loop(net, test_batches, False)
    
    print(f"epoch: {epoch} | train_loss: {train_loss:.4f} | train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} | val_acc: {val_acc:.4f}")
    predict_sentiment(net, text)
    print("")

  0%|          | 0/391 [00:00<?, ?it/s]

Train Loop:


100%|██████████| 391/391 [00:13<00:00, 29.71it/s]
  7%|▋         | 29/391 [00:00<00:01, 286.69it/s]



Inference Loop:


100%|██████████| 391/391 [00:02<00:00, 152.71it/s]
  1%|          | 2/391 [00:00<00:19, 19.69it/s]



epoch: 0 | train_loss: 0.6295 | train_acc: 0.6480 | val_loss: 0.4681 | val_acc: 0.7892
sentiment: 0.7283974289894104

Train Loop:


100%|██████████| 391/391 [00:12<00:00, 30.24it/s]
  9%|▉         | 35/391 [00:00<00:01, 344.32it/s]



Inference Loop:


100%|██████████| 391/391 [00:02<00:00, 160.71it/s]
  1%|          | 3/391 [00:00<00:16, 23.59it/s]



epoch: 1 | train_loss: 0.4759 | train_acc: 0.7688 | val_loss: 0.3873 | val_acc: 0.8316
sentiment: 0.8933433890342712

Train Loop:


100%|██████████| 391/391 [00:12<00:00, 30.24it/s]
  9%|▉         | 35/391 [00:00<00:01, 342.61it/s]



Inference Loop:


100%|██████████| 391/391 [00:02<00:00, 158.75it/s]
  1%|          | 2/391 [00:00<00:19, 19.93it/s]



epoch: 2 | train_loss: 0.4024 | train_acc: 0.8144 | val_loss: 0.3328 | val_acc: 0.8586
sentiment: 0.9277836680412292

Train Loop:


100%|██████████| 391/391 [00:13<00:00, 29.48it/s]
  9%|▊         | 34/391 [00:00<00:01, 336.76it/s]



Inference Loop:


100%|██████████| 391/391 [00:02<00:00, 159.28it/s]
  1%|          | 3/391 [00:00<00:15, 24.66it/s]



epoch: 3 | train_loss: 0.3354 | train_acc: 0.8533 | val_loss: 0.3038 | val_acc: 0.8682
sentiment: 0.9294244050979614

Train Loop:


100%|██████████| 391/391 [00:13<00:00, 29.96it/s]
  7%|▋         | 29/391 [00:00<00:01, 287.56it/s]



Inference Loop:


100%|██████████| 391/391 [00:02<00:00, 159.11it/s]
  1%|          | 3/391 [00:00<00:14, 26.08it/s]



epoch: 4 | train_loss: 0.2812 | train_acc: 0.8825 | val_loss: 0.2877 | val_acc: 0.8777
sentiment: 0.9237751364707947

Train Loop:


100%|██████████| 391/391 [00:12<00:00, 30.14it/s]
  7%|▋         | 29/391 [00:00<00:01, 284.65it/s]



Inference Loop:


100%|██████████| 391/391 [00:02<00:00, 158.10it/s]
  1%|          | 3/391 [00:00<00:16, 23.91it/s]



epoch: 5 | train_loss: 0.2293 | train_acc: 0.9063 | val_loss: 0.2859 | val_acc: 0.8799
sentiment: 0.9109876751899719

Train Loop:


100%|██████████| 391/391 [00:13<00:00, 29.94it/s]
  8%|▊         | 33/391 [00:00<00:01, 325.94it/s]



Inference Loop:


100%|██████████| 391/391 [00:02<00:00, 157.54it/s]
  1%|          | 3/391 [00:00<00:15, 25.03it/s]



epoch: 6 | train_loss: 0.1832 | train_acc: 0.9268 | val_loss: 0.2972 | val_acc: 0.8802
sentiment: 0.9263855218887329

Train Loop:


100%|██████████| 391/391 [00:13<00:00, 29.87it/s]
  9%|▉         | 36/391 [00:00<00:01, 351.43it/s]



Inference Loop:


100%|██████████| 391/391 [00:02<00:00, 159.68it/s]
  1%|          | 3/391 [00:00<00:15, 24.36it/s]



epoch: 7 | train_loss: 0.1403 | train_acc: 0.9453 | val_loss: 0.3200 | val_acc: 0.8763
sentiment: 0.8898522853851318

Train Loop:


100%|██████████| 391/391 [00:13<00:00, 29.87it/s]
  8%|▊         | 30/391 [00:00<00:01, 299.81it/s]



Inference Loop:


100%|██████████| 391/391 [00:02<00:00, 157.52it/s]
  0%|          | 1/391 [00:00<00:42,  9.14it/s]



epoch: 8 | train_loss: 0.1052 | train_acc: 0.9601 | val_loss: 0.3471 | val_acc: 0.8738
sentiment: 0.9050530195236206

Train Loop:


100%|██████████| 391/391 [00:13<00:00, 30.02it/s]
  9%|▉         | 35/391 [00:00<00:01, 341.22it/s]



Inference Loop:


100%|██████████| 391/391 [00:02<00:00, 160.96it/s]



epoch: 9 | train_loss: 0.0789 | train_acc: 0.9705 | val_loss: 0.3788 | val_acc: 0.8735
sentiment: 0.8994780778884888






In [19]:
predict_sentiment(net, "this is a very bad idea")

sentiment: 0.08899673819541931


In [20]:
predict_sentiment(net, "this film is terrible")

sentiment: 0.034502360969781876


In [21]:
predict_sentiment(net, "you are terrific")

sentiment: 0.9294814467430115


In [22]:
predict_sentiment(net, "that is horrible")

sentiment: 0.06200595572590828


In [23]:
predict_sentiment(net, "yeet!!")

sentiment: 0.6045075058937073


In [24]:
predict_sentiment(net, "what are you doing?")

sentiment: 0.49868667125701904
