In [1]:
import torch
import torch.nn as nn
from torchtext import data

In [283]:
def tokenize(x):
    """Split a raw string into tokens on runs of whitespace."""
    return x.split()


# Text field: tokens are lower-cased and each sequence is wrapped in
# the <SOS> / <EOS> markers.
TEXT = data.Field(
    tokenize=tokenize,
    lower=True,
    init_token='<SOS>',
    eos_token='<EOS>',
)
# Field for single, non-sequential values that are used as-is
# (no vocabulary lookup).
LABEL = data.Field(sequential=False, use_vocab=False)

In [284]:
def raw_info(vocab, ID_list):
    """Print every sequence of a batch as readable tokens, one per line.

    Args:
        vocab: object whose ``itos`` attribute maps a token id to its string.
        ID_list: 2-D array/tensor of token ids with shape (length, batch size);
            each column is one sequence.

    Each token is followed by a single space, so every printed line ends
    with a trailing space before the newline.
    """
    seq_len, batch_size = ID_list.shape
    for col in range(batch_size):
        tokens = (vocab.itos[ID_list[row, col]] for row in range(seq_len))
        print(''.join(f'{tok} ' for tok in tokens))

In [323]:
data_path = '/home/zhaoyu/Datasets/NLPBasics/sentiment/train.tsv'

# The header row is skipped. The two id columns and the sentiment column
# go through the plain LABEL field; only 'Phrase' is tokenized via TEXT.
dataset = data.TabularDataset(
    path=data_path,
    format='TSV',
    skip_header=True,
    fields=[
        ('PhraseId', LABEL),
        ('SentenceId', LABEL),
        ('Phrase', TEXT),
        ('Sentiment', LABEL),
    ],
)

# Build the token vocabulary from the loaded dataset and keep a short alias.
TEXT.build_vocab(dataset)
vocab = TEXT.vocab

In [329]:
# NOTE(review): the batches printed further down all share one sequence
# length, which suggests sort=True takes precedence over shuffle=True --
# confirm against the torchtext Iterator implementation.
train_iter = data.BucketIterator(
    dataset=dataset,
    batch_size=4,
    sort_key=lambda x: len(x.Phrase),  # order examples by phrase length
    train=True,
    repeat=False,
    shuffle=True,
    sort=True,
    sort_within_batch=False,
)

# Echo the flags as stored on the iterator.
print('train:', train_iter.train, '\nsort:', train_iter.sort,
      '\nshuffle:', train_iter.shuffle)

train: True 
sort: True 
shuffle: True


In [330]:
# Peek at a window of batches: indices 10001..10099 are printed, and
# iteration stops at the first index above 10100.
for i, batch in enumerate(train_iter):
    if i > 10100:
        break
    if 10000 < i < 10100:
        print(i, batch.Sentiment, batch.Phrase.shape)
        raw_info(vocab, batch.Phrase)

10001 tensor([2, 2, 3, 2]) torch.Size([4, 4])
<SOS> have never <EOS> 
<SOS> a lock <EOS> 
<SOS> expert fighters <EOS> 
<SOS> few weeks <EOS> 
10002 tensor([2, 3, 1, 2]) torch.Size([4, 4])
<SOS> actually shocked <EOS> 
<SOS> it succeeds <EOS> 
<SOS> many improbabilities <EOS> 
<SOS> rose-colored situations <EOS> 
10003 tensor([3, 3, 2, 2]) torch.Size([4, 4])
<SOS> impacting film <EOS> 
<SOS> great documentaries <EOS> 
<SOS> one truth <EOS> 
<SOS> administration 's <EOS> 
10004 tensor([2, 2, 2, 2]) torch.Size([4, 4])
<SOS> tearing ` <EOS> 
<SOS> their mothers <EOS> 
<SOS> have noticed <EOS> 
<SOS> take nothing <EOS> 
10005 tensor([2, 2, 1, 2]) torch.Size([4, 4])
<SOS> seems -rrb- <EOS> 
<SOS> uselessly redundant <EOS> 
<SOS> shamelessly money-grubbing <EOS> 
<SOS> horror sequels <EOS> 
10006 tensor([2, 4, 2, 2]) torch.Size([4, 4])
<SOS> apocalypse movies <EOS> 
<SOS> confidently orchestrated <EOS> 
<SOS> aesthetically and <EOS> 
<SOS> deeply and <EOS> 
10007 tensor([3, 2, 2, 2]) torch.Si

In [136]:
class Classifier(nn.Module):
    """Embedding -> multi-layer GRU -> linear head producing per-step logits.

    Args:
        vocab_size: number of rows in the embedding table.
        nemb: embedding dimension (also the GRU input size).
        nhid: GRU hidden size (also the input width of the linear head).
        nclass: number of output classes.
        nlayer: number of stacked GRU layers (default 2).
    """

    def __init__(self, vocab_size, nemb, nhid, nclass, nlayer=2):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=nemb)
        # The GRU is built with the default batch_first=False, so it consumes
        # inputs laid out as (seq_len, batch, nemb).
        self.rnn = nn.GRU(input_size=nemb, hidden_size=nhid, num_layers=nlayer)
        self.fc = nn.Linear(nhid, nclass)

    def forward(self, x, hidden_state=None):
        """Compute class logits for every position of the input.

        Args:
            x: integer tensor of token ids, laid out as (seq_len, batch).
            hidden_state: optional initial GRU state of shape
                (nlayer, batch, nhid). When omitted (``None``) the GRU
                starts from an all-zero state.

        Returns:
            Tensor of shape (seq_len, batch, nclass): the linear head applied
            to the GRU output at every time step. The final GRU hidden state
            is not returned.
        """
        emb = self.emb(x)
        # Only the per-step outputs feed the head; the final state is dropped.
        rnn_output, _ = self.rnn(emb, hidden_state)
        logits = self.fc(rnn_output)
        return logits

In [9]:
# Embedding width 300, GRU hidden size 512, 5 output classes; the
# vocabulary size is taken from `vocab`.
rnn = Classifier(vocab_size=len(vocab), nemb=300, nhid=512, nclass=5)