In [2]:
import torch.nn as nn

import torch

import nltk

In [3]:
doc = "They think he's a good president because he's done things they like: appointing conservatives to the court and cutting taxes, for example. But every other normal Republican would have done the exact same things, made actual deals to get much more, and they'd have left out all the ridiculous drama that keeps Trump's approval so low and his accomplishments so meager." 

In [4]:
sentences = nltk.sent_tokenize(doc)

In [5]:
import pandas as pd

df = pd.DataFrame(columns = ["sentence"])

df["sentence"] = sentences
df["sentence"] = df.sentence.map(lambda s :  s.lower())

In [6]:
df['words'] = df.sentence.map(nltk.word_tokenize)

In [7]:
df.words

0    [they, think, he, 's, a, good, president, beca...
1    [but, every, other, normal, republican, would,...
Name: words, dtype: object

In [8]:
max_sentence_len = df.words.map(len).max()

In [9]:
import itertools

vocab = list(set(itertools.chain.from_iterable(df.words.tolist())))

In [10]:
vocab

['appointing',
 ',',
 'would',
 'keeps',
 'court',
 'same',
 'out',
 'all',
 'a',
 'made',
 ':',
 'taxes',
 'more',
 'low',
 'the',
 'republican',
 'much',
 'done',
 'they',
 'because',
 'have',
 'so',
 "'d",
 'every',
 'other',
 'approval',
 'and',
 'actual',
 'things',
 'cutting',
 'example',
 'exact',
 'like',
 'president',
 'ridiculous',
 'trump',
 'accomplishments',
 "'s",
 'his',
 'deals',
 'but',
 'that',
 'he',
 'meager',
 'to',
 'for',
 'conservatives',
 '.',
 'get',
 'normal',
 'think',
 'left',
 'good',
 'drama']

In [11]:
import re
?re.match

In [12]:
re.match("\w+",vocab[1])

In [13]:
def matcher(word):
    return re.match("\w+", word)

vocab = list(filter(matcher, itertools.chain.from_iterable(df.words)))

In [14]:
vocab += ["<unk>"]

In [15]:
vocab

['they',
 'think',
 'he',
 'a',
 'good',
 'president',
 'because',
 'he',
 'done',
 'things',
 'they',
 'like',
 'appointing',
 'conservatives',
 'to',
 'the',
 'court',
 'and',
 'cutting',
 'taxes',
 'for',
 'example',
 'but',
 'every',
 'other',
 'normal',
 'republican',
 'would',
 'have',
 'done',
 'the',
 'exact',
 'same',
 'things',
 'made',
 'actual',
 'deals',
 'to',
 'get',
 'much',
 'more',
 'and',
 'they',
 'have',
 'left',
 'out',
 'all',
 'the',
 'ridiculous',
 'drama',
 'that',
 'keeps',
 'trump',
 'approval',
 'so',
 'low',
 'and',
 'his',
 'accomplishments',
 'so',
 'meager',
 '<unk>']

In [16]:
index2vocab = {
    index: word
    for index, word in enumerate(vocab)
}

vocab2index = {
    word: index
    for index, word in enumerate(vocab)
}

In [17]:
vocab2index

{'they': 42,
 'think': 1,
 'he': 7,
 'a': 3,
 'good': 4,
 'president': 5,
 'because': 6,
 'done': 29,
 'things': 33,
 'like': 11,
 'appointing': 12,
 'conservatives': 13,
 'to': 37,
 'the': 47,
 'court': 16,
 'and': 56,
 'cutting': 18,
 'taxes': 19,
 'for': 20,
 'example': 21,
 'but': 22,
 'every': 23,
 'other': 24,
 'normal': 25,
 'republican': 26,
 'would': 27,
 'have': 43,
 'exact': 31,
 'same': 32,
 'made': 34,
 'actual': 35,
 'deals': 36,
 'get': 38,
 'much': 39,
 'more': 40,
 'left': 44,
 'out': 45,
 'all': 46,
 'ridiculous': 48,
 'drama': 49,
 'that': 50,
 'keeps': 51,
 'trump': 52,
 'approval': 53,
 'so': 59,
 'low': 55,
 'his': 57,
 'accomplishments': 58,
 'meager': 60,
 '<unk>': 61}

In [18]:
def get_word_index(word):
    index = vocab2index.get(
        word,
        vocab2index["<unk>"]
    )
    return index

df["word_indices"] = df.words.map(
    lambda words: list(map(get_word_index, words))
)

In [19]:
df.word_indices

0    [42, 1, 7, 61, 3, 4, 5, 6, 7, 61, 29, 33, 42, ...
1    [22, 23, 24, 25, 26, 27, 43, 29, 47, 31, 32, 3...
Name: word_indices, dtype: object

In [20]:
from torch.autograd import Variable

def list2var(l):
    print(l)
    tensor = torch.LongTensor(l)
    return Variable(tensor)

variables = df.word_indices.map(list2var).tolist()

[42, 1, 7, 61, 3, 4, 5, 6, 7, 61, 29, 33, 42, 11, 61, 12, 13, 37, 47, 16, 56, 18, 19, 61, 20, 21, 61]
[22, 23, 24, 25, 26, 27, 43, 29, 47, 31, 32, 33, 61, 34, 35, 36, 37, 38, 39, 40, 61, 56, 42, 61, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 61, 53, 59, 55, 56, 57, 58, 59, 60, 61]


In [21]:
from torch.nn.utils.rnn import pad_sequence

seq = pad_sequence(variables, batch_first=True)

In [22]:
embedding = nn.Embedding(num_embeddings=len(vocab), embedding_dim=100)

In [52]:
a = embedding(seq[0])
b = embedding(seq[1])

In [53]:
c = torch.stack([a,b], dim=0)
c.shape

torch.Size([2, 44, 100])

In [60]:
seq

tensor([[42,  1,  7, 61,  3,  4,  5,  6,  7, 61, 29, 33, 42, 11, 61, 12, 13, 37,
         47, 16, 56, 18, 19, 61, 20, 21, 61,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0],
        [22, 23, 24, 25, 26, 27, 43, 29, 47, 31, 32, 33, 61, 34, 35, 36, 37, 38,
         39, 40, 61, 56, 42, 61, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 61, 53,
         59, 55, 56, 57, 58, 59, 60, 61]])

In [77]:
class WordLSTM(nn.Module):
    
    def __init__(self, embedding_dim, vocab_size, lstm_size=100, bidirectional=False):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, lstm_size, bidirectional=bidirectional, batch_first=True)
    
    def seq_to_embedding(self, seq):
        '''
        Use the padded sequence to get embeddings
        '''
        embeds = []
        
        for s in seq:
            embeds.append(self.embedding(s))
            
        return torch.stack(embeds, dim=0)
    
    def forward(self, input, hidden, cell):
        batch = self.seq_to_embedding(input)
        output, (hidden,cell) = self.lstm(batch, (hidden, cell))
        
        return output, hidden, cell
        

In [78]:
lstm = WordLSTM(100, len(vocab), bidirectional=True)

In [82]:
h_0 = torch.zeros(2, 2, 100)
c_0 = torch.zeros(2, 2, 100)
o, h, cs = lstm(seq, h_0,c_0)
o.shape

torch.Size([2, 44, 200])

- 2 sentences
- 44 words in each
- 100 dim of each word
- PyTorch LSTM is only concerned with the last dimension (100)
- For word attention all sentences( each sentence is a batch ) is padded

## Word Attention

In [114]:
class WordAttn(nn.Module):
    
    def __init__(self, hidden_size):
        super().__init__()
        
        self.linear = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.functional.tanh
        self.word_context = nn.Parameter(torch.randn(hidden_size,1))
    
    def forward(self, word_outputs):
        
        o = self.linear(word_outputs)
        o = self.activation(o)
        o = torch.matmul(o, self.word_context)
        o = torch.mul(o, word_outputs)
        o = torch.sum(o, dim=0)
        
        return o
        

In [119]:
attn = WordAttn(200)
s_vec = attn(o)



In [120]:
s_vec

tensor([[-0.0057, -0.1097,  0.0328,  ..., -0.3123,  0.2545, -0.2326],
        [ 0.0394, -0.0855,  0.0560,  ..., -0.2262,  0.2011, -0.0985],
        [-0.0079, -0.0233,  0.0218,  ...,  0.1382, -0.1077,  0.0372],
        ...,
        [-0.0120, -0.1128, -0.0554,  ..., -0.0325, -0.1447,  0.0488],
        [-0.1920, -0.1302,  0.1134,  ...,  0.1166,  0.0937, -0.0985],
        [-0.3153,  0.0516, -0.1962,  ..., -0.0024,  0.2650, -0.2625]],
       grad_fn=<SumBackward2>)