In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch.nn as nn

import torch

import nltk

In [3]:
doc = "They think he's a good president because he's done things they like: appointing conservatives to the court and cutting taxes, for example. But every other normal Republican would have done the exact same things, made actual deals to get much more, and they'd have left out all the ridiculous drama that keeps Trump's approval so low and his accomplishments so meager." 

In [4]:
sentences = nltk.sent_tokenize(doc)

In [5]:
import pandas as pd

df = pd.DataFrame(columns = ["sentence"])

df["sentence"] = sentences
df["sentence"] = df.sentence.map(lambda s :  s.lower())

In [6]:
df['words'] = df.sentence.map(nltk.word_tokenize)

In [7]:
df.words

0    [they, think, he, 's, a, good, president, beca...
1    [but, every, other, normal, republican, would,...
Name: words, dtype: object

In [8]:
max_sentence_len = df.words.map(len).max()

In [9]:
import itertools

vocab = list(set(itertools.chain.from_iterable(df.words.tolist())))

In [10]:
vocab

['to',
 'all',
 'done',
 'the',
 'deals',
 'republican',
 'so',
 'but',
 'made',
 'same',
 'think',
 'much',
 'court',
 'because',
 ':',
 'actual',
 "'d",
 'they',
 'good',
 'have',
 'out',
 'low',
 'conservatives',
 "'s",
 'example',
 'more',
 'that',
 'keeps',
 'meager',
 'drama',
 'for',
 'a',
 'would',
 'he',
 'things',
 'appointing',
 'other',
 'exact',
 'left',
 'accomplishments',
 'normal',
 'like',
 'president',
 'get',
 'approval',
 'every',
 '.',
 'ridiculous',
 ',',
 'taxes',
 'cutting',
 'trump',
 'his',
 'and']

In [11]:
import re
?re.match

In [12]:
re.match("\w+",vocab[1])

<_sre.SRE_Match object; span=(0, 3), match='all'>

In [13]:
def matcher(word):
    return re.match("\w+", word)

vocab = list(filter(matcher, itertools.chain.from_iterable(df.words)))

In [14]:
vocab += ["<unk>"]

In [15]:
vocab

['they',
 'think',
 'he',
 'a',
 'good',
 'president',
 'because',
 'he',
 'done',
 'things',
 'they',
 'like',
 'appointing',
 'conservatives',
 'to',
 'the',
 'court',
 'and',
 'cutting',
 'taxes',
 'for',
 'example',
 'but',
 'every',
 'other',
 'normal',
 'republican',
 'would',
 'have',
 'done',
 'the',
 'exact',
 'same',
 'things',
 'made',
 'actual',
 'deals',
 'to',
 'get',
 'much',
 'more',
 'and',
 'they',
 'have',
 'left',
 'out',
 'all',
 'the',
 'ridiculous',
 'drama',
 'that',
 'keeps',
 'trump',
 'approval',
 'so',
 'low',
 'and',
 'his',
 'accomplishments',
 'so',
 'meager',
 '<unk>']

In [16]:
index2vocab = {
    index: word
    for index, word in enumerate(vocab)
}

vocab2index = {
    word: index
    for index, word in enumerate(vocab)
}

In [17]:
vocab2index

{'they': 42,
 'think': 1,
 'he': 7,
 'a': 3,
 'good': 4,
 'president': 5,
 'because': 6,
 'done': 29,
 'things': 33,
 'like': 11,
 'appointing': 12,
 'conservatives': 13,
 'to': 37,
 'the': 47,
 'court': 16,
 'and': 56,
 'cutting': 18,
 'taxes': 19,
 'for': 20,
 'example': 21,
 'but': 22,
 'every': 23,
 'other': 24,
 'normal': 25,
 'republican': 26,
 'would': 27,
 'have': 43,
 'exact': 31,
 'same': 32,
 'made': 34,
 'actual': 35,
 'deals': 36,
 'get': 38,
 'much': 39,
 'more': 40,
 'left': 44,
 'out': 45,
 'all': 46,
 'ridiculous': 48,
 'drama': 49,
 'that': 50,
 'keeps': 51,
 'trump': 52,
 'approval': 53,
 'so': 59,
 'low': 55,
 'his': 57,
 'accomplishments': 58,
 'meager': 60,
 '<unk>': 61}

In [18]:
def get_word_index(word):
    index = vocab2index.get(
        word,
        vocab2index["<unk>"]
    )
    return index

df["word_indices"] = df.words.map(
    lambda words: list(map(get_word_index, words))
)

In [19]:
df.word_indices

0    [42, 1, 7, 61, 3, 4, 5, 6, 7, 61, 29, 33, 42, ...
1    [22, 23, 24, 25, 26, 27, 43, 29, 47, 31, 32, 3...
Name: word_indices, dtype: object

In [20]:
from torch.autograd import Variable

def list2var(l):
    print(l)
    tensor = torch.LongTensor(l)
    return Variable(tensor)

variables = df.word_indices.map(list2var).tolist()

[42, 1, 7, 61, 3, 4, 5, 6, 7, 61, 29, 33, 42, 11, 61, 12, 13, 37, 47, 16, 56, 18, 19, 61, 20, 21, 61]
[22, 23, 24, 25, 26, 27, 43, 29, 47, 31, 32, 33, 61, 34, 35, 36, 37, 38, 39, 40, 61, 56, 42, 61, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 61, 53, 59, 55, 56, 57, 58, 59, 60, 61]


In [21]:
from torch.nn.utils.rnn import pad_sequence

seq = pad_sequence(variables, batch_first=True)

In [22]:
embedding = nn.Embedding(num_embeddings=len(vocab), embedding_dim=100)

In [23]:
a = embedding(seq[0])
b = embedding(seq[1])

In [24]:
c = torch.stack([a,b], dim=0)
c.shape

torch.Size([2, 44, 100])

In [25]:
seq

tensor([[42,  1,  7, 61,  3,  4,  5,  6,  7, 61, 29, 33, 42, 11, 61, 12, 13, 37,
         47, 16, 56, 18, 19, 61, 20, 21, 61,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0],
        [22, 23, 24, 25, 26, 27, 43, 29, 47, 31, 32, 33, 61, 34, 35, 36, 37, 38,
         39, 40, 61, 56, 42, 61, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 61, 53,
         59, 55, 56, 57, 58, 59, 60, 61]])

In [27]:
from models import *

gru = WordGRU(100, len(vocab), bidirectional=True)

In [28]:
h_0 = torch.zeros(2, 2, 100)
o, h = gru(seq, h_0)
o.shape

torch.Size([2, 44, 200])

- 2 sentences
- 44 words in each
- 100 dim of each word
- PyTorch LSTM is only concerned with the last dimension (100)
- For word attention all sentences( each sentence is a batch ) is padded

## Word Attention

In [29]:
attn = WordAttention(200)
s_vec = attn(o)

In [30]:
s_vec.shape

torch.Size([2, 200])

- 2 Sentences
- Each sentence of size 200
- A batch of documents would have shape: `[batch_size, max_sent_len, max_word_len]`

In [31]:
s_vec

tensor([[ 2.0163e+01,  8.7290e+00,  3.4055e+01,  2.6654e+01,  2.5068e+01,
          4.6261e+01, -3.5101e+01,  2.9657e+00,  1.8693e+01, -1.6703e+00,
         -9.4082e+00, -3.0112e+01, -3.8391e+01, -3.8499e+01,  1.8975e+01,
         -4.0459e+01,  1.1163e+01, -1.1295e+01, -3.2604e+01, -5.6510e+00,
          3.4529e+01, -1.4267e+01, -7.2485e+00,  4.2787e+00, -2.5971e+01,
         -1.1791e+01,  4.7805e+01, -3.7395e+01, -2.2510e+01,  2.1934e+01,
          5.5025e+01, -1.8491e+01,  2.4144e+01,  1.3930e+01, -3.4532e+01,
          2.4406e+01,  5.7057e+00,  2.7735e+01,  3.4079e+01, -2.8117e+01,
         -8.7659e+00,  9.9341e+00, -1.1719e+01,  2.3641e+01, -2.2787e+01,
          2.2190e+01, -2.9935e+01,  1.9158e+01, -5.9563e-01,  1.4123e+01,
          1.3187e+01,  5.4976e+00,  2.8214e+01, -3.3311e+01,  5.7860e-01,
         -1.3092e+01,  3.6241e+01,  2.0336e+00, -4.4440e+00, -1.2177e+01,
         -1.3722e+01, -2.9056e+01,  1.8751e+01,  5.1652e+00,  1.4427e+00,
          1.3212e+01, -1.4711e+01, -3.

## Sentence LSTM

In [32]:
sentence_gru = SentenceGRU(200, 100)

In [33]:
s_vec.unsqueeze(dim=0).shape

torch.Size([1, 2, 200])

In [34]:
h_0_s = torch.zeros(2,1,100)
sentence_output, h_s = sentence_gru(s_vec.unsqueeze(dim=0), h_0_s)

In [35]:
sentence_output.shape

torch.Size([1, 2, 200])

## Sentence Attention

In [36]:
s_attn = SentenceAttention(200)

In [37]:
d_vec = s_attn(sentence_output)

In [38]:
d_vec.shape

torch.Size([1, 200])

In [39]:
output_layer = OutputLayer(200,2)
output = output_layer(d_vec)

  o = self.softmax(o)


In [40]:
output.squeeze(dim=1)

tensor([[-0.1239, -2.1495]], grad_fn=<SqueezeBackward1>)

In [41]:
loss = nn.NLLLoss()

In [42]:
l = loss(output, torch.LongTensor([1]))

In [43]:
l.backward()