In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

SEED = 515
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Simple Sentiment Analysis
This notebook follows this tutorial: https://github.com/bentrevett/pytorch-sentiment-analysis.

# Build a Dataset Manually

Among the main concepts of TorchText, `Field` is the one that defines how data should be processed. 

In [2]:
from torchtext.data import Field, LabelField, Example, Dataset, BucketIterator

TEXT = Field(tokenize='spacy')
GENDER = Field()
LABEL = LabelField(dtype=torch.float)

In [3]:
# Example from dict
fields = {'T': ('text', TEXT), 
          'G': ('gender', GENDER), 
          'L': ('label', LABEL)}
raw_ex = {'T': "I like this film.", 
          'G': "f", 
          'L': "pos"}

ex = Example.fromdict(raw_ex, fields)
print(ex)
print(ex.text)
print(ex.gender)
print(ex.label)

<torchtext.data.example.Example object at 0x000001B8E8BD2E88>
['I', 'like', 'this', 'film', '.']
['f']
pos


In [4]:
# Example from list
fields = [('text', TEXT), ('gender', GENDER), ('label', LABEL)]
raw_ex = ["I like this film.", "f", "pos"]

ex = Example.fromlist(raw_ex, fields)
print(ex)
print(ex.text)
print(ex.gender)
print(ex.label)

<torchtext.data.example.Example object at 0x000001B8E8BD3848>
['I', 'like', 'this', 'film', '.']
['f']
pos


In [5]:
# Create a Dataset
fields = [('text', TEXT), ('gender', GENDER), ('label', LABEL)]
raw_data = [["I like this film.", "f", "pos"], 
            ["I hate it.", "f", "neg"], 
            ["I have no feelings about it.", "m", "neg"], 
            ["It is my best.", "m", "pos"], 
            ["My father loves it so much and I do think so.", "f", "pos"]]

examples = [Example.fromlist(d, fields) for d in raw_data]
data = Dataset(examples, fields)
print(data)
print(data[1])
print(data[1].text)
print(data[1].gender)
print(data[1].label)

<torchtext.data.dataset.Dataset object at 0x000001B8E8BD4088>
<torchtext.data.example.Example object at 0x000001B8E8BD4248>
['I', 'hate', 'it', '.']
['f']
neg


In [6]:
TEXT.build_vocab(data)
GENDER.build_vocab(data)
LABEL.build_vocab(data)
len(TEXT.vocab), len(GENDER.vocab), len(LABEL.vocab)

(25, 4, 2)

In [7]:
BATCH_SIZE = 2

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
iterator = BucketIterator(data, batch_size=BATCH_SIZE, device=device, shuffle=True)

for batch in iterator:
    print(batch.text)

tensor([[ 3,  7],
        [16, 12],
        [22, 19],
        [13,  4],
        [ 8,  5],
        [ 4, 20],
        [ 2,  9],
        [ 1,  3],
        [ 1, 11],
        [ 1, 23],
        [ 1,  5],
        [ 1,  2]])
tensor([[ 6],
        [17],
        [21],
        [10],
        [ 2]])
tensor([[ 3,  3],
        [18, 15],
        [24,  4],
        [14,  2],
        [ 2,  1]])


# Preparing Data

In [10]:
import torchtext
train_data, test_data = torchtext.datasets.IMDB.splits(TEXT, LABEL, root='data')

In [11]:
print(train_data[0].text)
print(train_data[0].label)

['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '"', 'Teachers', '"', '.', 'My', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'Bromwell', 'High', "'s", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"', 'Teachers', '"', '.', 'The', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', "'", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'I', 'knew', 'and', 'their', 'students', '.', 'When', 'I', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'I', 'immediately', 'recalled', '.........', 'at', '..........', 'High', '.', 'A', 'classic', 'line', ':'

In [12]:
train_data, valid_data = train_data.split()
len(train_data), len(valid_data), len(test_data)

(17500, 7500, 25000)

In [16]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

print(TEXT.vocab.itos[:10])
print(TEXT.vocab.freqs.most_common(10))

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']
[('the', 202309), (',', 192501), ('.', 164945), ('and', 109135), ('a', 108723), ('of', 100313), ('to', 93471), ('is', 75949), ('in', 60970), ('I', 54448)]


In [17]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE, device=device)

# Build Model

In [21]:
class Classifier(nn.Module):
    def __init__(self, in_dim, emb_dim, hid_dim, out_dim):
        super().__init__()
        self.emb = nn.Embedding(in_dim, emb_dim)
        self.rnn = nn.RNN(emb_dim, hid_dim)
        self.fc = nn.Linear(hid_dim, out_dim)

    def forward(self, text):
        # text: (step, batch)
        embedded = self.emb(text)
        # hidden: (1, batch, hid_dim)
        outs, hidden = self.rnn(embedded)
        return self.fc(hidden.squeeze(0))

In [27]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

IN_DIM = len(TEXT.vocab)
EMB_DIM = 100
HID_DIM = 256
OUT_DIM = 1

classifer = Classifier(IN_DIM, EMB_DIM, HID_DIM, OUT_DIM).to(device)
count_parameters(classifer)

2592105

In [28]:
optimizer = optim.SGD(classifer.parameters(), lr=1e-13)
# Binary cross entropy with logits. 
# The binary version of cross entropy loss. 
loss_func = nn.BCEWithLogitsLoss().to(device)

In [30]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    epoch_loss = 0
    for batch in train_iterator:
        preds = classifer(batch.text).squeeze(-1)
        loss = loss_func(preds, batch.label)
        epoch_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(epoch, epoch_loss/len(train_iterator))

0 0.6934661349675951


KeyboardInterrupt: 