<a href="https://colab.research.google.com/github/xiaohai-AI/bootstrap/blob/master/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install dependencies via shell commands. Versions are not pinned, so a fresh
# run installs whatever is current.
# NOTE(review): later cells rely on torchtext's `data.Field` /
# `datasets.IMDB.splits` API -- confirm the installed torchtext still ships it.
!pip install torch
!pip install torchtext
# Download the spaCy English model; the text field below tokenizes with spaCy.
!python -m spacy download en


# K80 gpu for 12 hours
import torch
from torch import nn, optim
from torchtext import data, datasets

# Report whether a CUDA device is visible to PyTorch.
print('GPU:', torch.cuda.is_available())

# Seed PyTorch's global RNG. As the last expression of the cell, the returned
# Generator object is what gets displayed as the cell output.
torch.manual_seed(123)

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
GPU: True


<torch._C.Generator at 0x7f609a9b2cd0>

In [2]:


# Field describing the review text; raw strings are tokenized with spaCy.
TEXT = data.Field(tokenize='spacy')
# Field describing the sentiment label; stored as float so it can be fed
# directly to the BCE-with-logits loss used later in the notebook.
LABEL = data.LabelField(dtype=torch.float)
# Load the IMDB train/test splits (the archive is downloaded on first use).
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:07<00:00, 10.8MB/s]


In [3]:
# Sanity check: number of examples in each split.
print('len of train data:', len(train_data))
print('len of test data:', len(test_data))

len of train data: 25000
len of test data: 25000


In [4]:
# Inspect one training example: its token list and its label.
print(train_data.examples[15].text)
print(train_data.examples[15].label)

['This', 'is', 'just', 'as', 'good', 'as', 'the', 'original', '101', 'if', 'not', 'better', '.', 'Of', 'course', ',', 'Cruella', 'steals', 'the', 'show', 'with', 'her', 'outrageous', 'behaviour', 'and', 'outfits', ',', 'and', 'the', 'movie', 'was', 'probably', 'made', 'because', 'the', 'public', 'wanted', 'to', 'see', 'more', 'of', 'Cruella', '.', 'We', 'see', 'a', 'lot', 'more', 'of', 'her', 'this', 'time', 'round', '.', 'I', 'also', 'like', 'Ioan', 'Gruffudd', 'as', 'Kevin', ',', 'the', 'rather', 'bumbling', 'male', 'lead', '.', 'To', 'use', 'Paris', 'as', 'the', 'climax', 'of', 'the', 'movie', 'was', 'a', 'clever', 'idea', '.', 'The', 'movie', 'is', 'well', 'worth', 'watching', 'whatever', 'your', 'age', ',', 'provided', 'you', 'like', 'animals', '.']
pos


In [5]:
# Build the vocabularies from the training split only.
# The text vocabulary keeps at most the 10000 most frequent tokens (plus the
# field's special tokens) and attaches pretrained 100-d GloVe vectors to them.
TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d')
LABEL.build_vocab(train_data)


batchsz = 30
# Fall back to the CPU when no CUDA device is available so the notebook still
# runs (slowly) on a CPU-only runtime instead of crashing on the first batch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# BucketIterator groups examples of similar length to reduce padding and
# places every batch directly on `device`.
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=batchsz,
    device=device
)

.vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s]                           
 99%|█████████▉| 397411/400000 [00:13<00:00, 29596.83it/s]

In [0]:
class RNN(nn.Module):
    """
    Bidirectional stacked-LSTM classifier that emits one logit per sequence.

    Pipeline: token indices -> embedding -> dropout -> bidirectional LSTM ->
    concatenated final hidden states of the top layer -> dropout -> linear.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 num_layers=2, dropout=0.5):
        """
        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each token embedding.
            hidden_dim: LSTM hidden size per direction.
            num_layers: number of stacked LSTM layers (default 2).
            dropout: dropout probability used on the embeddings, between
                LSTM layers, and before the final linear layer (default 0.5).
        """
        super(RNN, self).__init__()

        # token index => embedding vector: [vocab_size] => [embedding_dim]
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # [embedding_dim] => [hidden_dim] per direction.
        # nn.LSTM only applies dropout *between* stacked layers, and warns if
        # a non-zero value is given with a single layer, so disable it there.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                           bidirectional=True,
                           dropout=dropout if num_layers > 1 else 0.0)
        # forward + backward hidden states: [hidden_dim*2] => [1]
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Args:
            x: token-index tensor of shape [seq_len, b] (sequence-first,
               since the LSTM is built with the default batch_first=False).

        Returns:
            Tensor of shape [b, 1] holding raw logits (no sigmoid applied).
        """
        # [seq, b] => [seq, b, embedding_dim]
        embedding = self.dropout(self.embedding(x))

        # output: [seq, b, hid_dim*2]
        # hidden/h: [num_layers*2, b, hid_dim]
        # cell/c: [num_layers*2, b, hid_dim]
        output, (hidden, cell) = self.rnn(embedding)

        # The last two entries of `hidden` are the top layer's forward and
        # backward final states: 2 of [b, hid_dim] => [b, hid_dim*2]
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)

        # [b, hid_dim*2] => [b, 1]
        hidden = self.dropout(hidden)
        out = self.fc(hidden)

        return out

In [7]:
HIDDEN_DIM = 256

pretrained_embedding = TEXT.vocab.vectors
print('pretrained_embedding:', pretrained_embedding.shape)

# Take the embedding width from the pretrained vectors instead of repeating a
# literal 100, so the layer can never disagree with the vectors it is about
# to receive.
rnn = RNN(len(TEXT.vocab), pretrained_embedding.shape[1], HIDDEN_DIM)

# Initialise the embedding table with the pretrained vectors; it remains
# trainable and is fine-tuned along with the rest of the model.
rnn.embedding.weight.data.copy_(pretrained_embedding)
print('embedding layer inited.')

# Move the model to its device *before* constructing the optimizer, as the
# PyTorch optimizer documentation recommends.
rnn.to(device)
criteon = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(rnn.parameters(), lr=1e-3)

# Last expression of the cell: display the model architecture.
rnn


pretrained_embedding: torch.Size([10002, 100])
embedding layer inited.


RNN(
  (embedding): Embedding(10002, 100)
  (rnn): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [0]:
import numpy as np

def binary_acc(preds, y):
    """
    Compute binary classification accuracy from raw logits.

    `preds` are passed through a sigmoid and rounded to 0/1, then compared
    element-wise with the targets `y`. Returns a 0-dim tensor holding the
    fraction of matching entries.
    """
    probabilities = torch.sigmoid(preds)
    hard_labels = probabilities.round()
    matches = (hard_labels == y).float()
    return matches.sum() / len(matches)

def train(rnn, iterator, optimizer, criteon):
    """
    Run one training epoch over `iterator`.

    Args:
        rnn: model mapping `batch.text` to logits of shape [b, 1].
        iterator: yields batches exposing `.text` and `.label`.
        optimizer: optimizer over `rnn`'s parameters.
        criteon: loss taking (logits [b], labels [b]).

    Returns:
        Example-weighted accuracy over the epoch as a float (NaN when the
        iterator yields no batches). Accuracy is measured in train mode, so
        dropout is active while it is computed.
    """
    # Accumulate correct-example counts rather than averaging per-batch
    # accuracies: the last batch is usually smaller, and an unweighted mean
    # of batch accuracies would over-weight it.
    correct_total = 0.0
    seen = 0
    rnn.train()

    for i, batch in enumerate(iterator):

        # [seq, b] => [b, 1] => [b]
        pred = rnn(batch.text).squeeze(1)
        loss = criteon(pred, batch.label)
        acc = binary_acc(pred, batch.label).item()

        batch_len = len(batch.label)
        correct_total += acc * batch_len
        seen += batch_len

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Lightweight progress report: accuracy of every 10th batch.
        if i%10 == 0:
            print(i, acc)

    avg_acc = correct_total / seen if seen else float('nan')
    print('avg acc:', avg_acc)
    return avg_acc
    
    
def eval(rnn, iterator, criteon):
    """
    Evaluate `rnn` on every batch of `iterator` and report its accuracy.

    NOTE: the name shadows Python's builtin `eval`; it is kept unchanged so
    existing callers continue to work.

    Args:
        rnn: model mapping `batch.text` to logits of shape [b, 1].
        iterator: yields batches exposing `.text` and `.label`.
        criteon: accepted for signature compatibility; the loss is not
            needed to compute the reported accuracy and is not evaluated.

    Returns:
        Example-weighted accuracy as a float (NaN when the iterator yields
        no batches).
    """
    # Accumulate correct-example counts rather than averaging per-batch
    # accuracies: the last batch is usually smaller, and an unweighted mean
    # of batch accuracies would over-weight it.
    correct_total = 0.0
    seen = 0

    rnn.eval()

    with torch.no_grad():
        for batch in iterator:

            # [b, 1] => [b]
            pred = rnn(batch.text).squeeze(1)

            batch_len = len(batch.label)
            correct_total += binary_acc(pred, batch.label).item() * batch_len
            seen += batch_len

    avg_acc = correct_total / seen if seen else float('nan')

    print('>>test:', avg_acc)
    return avg_acc
        
    
    

In [9]:
# Each iteration first evaluates the current model (so the very first report
# is the untrained baseline) and then trains it for one epoch.
for epoch in range(10):

    eval(rnn, test_iterator, criteon)
    train(rnn, train_iterator, optimizer, criteon)

# Evaluate once more after the loop; otherwise the model produced by the
# final training epoch would never be scored on the test set.
eval(rnn, test_iterator, criteon)

 99%|█████████▉| 397411/400000 [00:30<00:00, 29596.83it/s]

>>test: 0.49976021376588076
0 0.40000003576278687
10 0.40000003576278687
20 0.46666669845581055
30 0.5333333611488342
40 0.40000003576278687
50 0.46666669845581055
60 0.5
70 0.4333333671092987
80 0.4333333671092987
90 0.40000003576278687
100 0.6666666865348816
110 0.5333333611488342
120 0.6333333849906921
130 0.6000000238418579
140 0.5666667222976685
150 0.5
160 0.6000000238418579
170 0.6000000238418579
180 0.7000000476837158
190 0.5333333611488342
200 0.6000000238418579
210 0.5666667222976685
220 0.40000003576278687
230 0.7000000476837158
240 0.6666666865348816
250 0.6333333849906921
260 0.7666667103767395
270 0.6333333849906921
280 0.6666666865348816
290 0.6000000238418579
300 0.6333333849906921
310 0.5666667222976685
320 0.7333333492279053
330 0.6000000238418579
340 0.7666667103767395
350 0.7000000476837158
360 0.4333333671092987
370 0.6333333849906921
380 0.6333333849906921
390 0.7000000476837158
400 0.7000000476837158
410 0.7666667103767395
420 0.5
430 0.5666667222976685
440 0.600