In [33]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
import torch
import torch.nn.functional as F
from torch import optim
from datetime import datetime
from torch import nn
from torch.utils.data import DataLoader

import matplotlib.pyplot as plt
%matplotlib inline

In [34]:
vocabulary_size = 4000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"
sentence_extra = "SENTENCE_EXTRA"
max_len = 30

print("Reading CSV file...")
with open('reddit-comments-2015-08.csv', 'rt', encoding= 'utf-8') as f:
    reader = csv.reader(f, skipinitialspace = True)
    next(reader)
    sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower()) for x in reader])
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print("Parsed %d sentences." % (len(sentences)))

Reading CSV file...
Parsed 79170 sentences.


In [35]:
max_len = 30
ts = [nltk.word_tokenize(sent) for sent in sentences]
tokenized_sentences = []
end_token = [sentence_extra]*40
for x in ts:
    if len(x) > max_len:
        continue
    tokenized_sentences.append(x + end_token[:max_len-len(x)])
print("Parsed %d sentences with len %d." % (len(tokenized_sentences),len(tokenized_sentences[0])))

Parsed 64109 sentences with len 30.


In [36]:
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print("Found %d unique words tokens." % len(list(word_freq.items())))

Found 45260 unique words tokens.


In [37]:
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

In [38]:
print("Using vocabulary size %d." % vocabulary_size)
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

Using vocabulary size 4000.
The least frequent word in our vocabulary is 'impression' and appeared 14 times.


In [39]:
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
print("\nExample sentence: '%s'" % sentences[0])
print("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])


Example sentence: 'SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END'

Example sentence after Pre-processing: '['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'UNKNOWN_TOKEN', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END', 'SENTENCE_EXTRA', 'SENTENCE_EXTRA', 'SENTENCE_EXTRA', 'SENTENCE_EXTRA', 'SENTENCE_EXTRA', 'SENTENCE_EXTRA', 'SENTENCE_EXTRA', 'SENTENCE_EXTRA', 'SENTENCE_EXTRA']'


In [40]:
x_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

In [41]:
x_example, y_example = x_train[17], y_train[17]
print("x:\n%s\n%s" % (" ".join([index_to_word[x] for x in x_example]), x_example))
print("\ny:\n%s\n%s" % (" ".join([index_to_word[x] for x in y_example]), y_example))

x:
SENTENCE_START i 'd like to make it available , but there 's no point in making it mandatory . SENTENCE_END SENTENCE_EXTRA SENTENCE_EXTRA SENTENCE_EXTRA SENTENCE_EXTRA SENTENCE_EXTRA SENTENCE_EXTRA SENTENCE_EXTRA SENTENCE_EXTRA SENTENCE_EXTRA
[   1    6  144   41    7  101   11  724    5   25   48   17   66  174
   15  290   11 2653    3    2    0    0    0    0    0    0    0    0
    0]

y:
i 'd like to make it available , but there 's no point in making it mandatory . SENTENCE_END SENTENCE_EXTRA SENTENCE_EXTRA SENTENCE_EXTRA SENTENCE_EXTRA SENTENCE_EXTRA SENTENCE_EXTRA SENTENCE_EXTRA SENTENCE_EXTRA SENTENCE_EXTRA SENTENCE_EXTRA
[   6  144   41    7  101   11  724    5   25   48   17   66  174   15
  290   11 2653    3    2    0    0    0    0    0    0    0    0    0
    0]


In [42]:
class GRUpytorch(nn.Module):
    def __init__(self, vocab_size, hidden_dim=100, bptt_truncate=4, batch_size=3):
        super().__init__()
        self.word_dim = vocab_size
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        self.batch_size = 3
        
        self.gru = nn.GRU(vocab_size, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim,vocab_size)
    
    def forward(self, x):
        x_train = F.one_hot(x, num_classes=self.word_dim)
        ht,_ = self.gru(x_train.float())
        st = self.linear(ht.contiguous().view(ht.shape[0]*ht.shape[1],self.hidden_dim))
        yt = F.log_softmax(st, dim=1)
        return yt
    def pre(self, x):
        x_train = F.one_hot(x, num_classes=self.word_dim)
        ht,_ = self.gru(x_train.view(1,len(x_train),-1).float())
        st = self.linear(ht.view(len(x_train),-1))
        yt = F.log_softmax(st, dim=1)
        return yt

In [59]:
from torch.utils.data import TensorDataset

def train_with_sgd(model, x_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5,batch_size=64):
    num_examples_seen = 0
    calculate_loss=nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.005)
    N = np.sum((len(y_i) for y_i in y_train))
    train_ds = TensorDataset(x_train,y_train)
    for epoch in range(nepoch):
        loss_total=0
        for i in range(batch_size-1,len(y_train),batch_size):
            optimizer.zero_grad()
            
            xb,yb=train_ds[i-batch_size+1:i+1]
            y_hat=model.forward(xb)
            loss=calculate_loss(y_hat,yb.view(-1))

            loss.backward()
            loss_total+=loss
            optimizer.step()
            
            num_examples_seen += batch_size
        if(epoch % evaluate_loss_after == 0):
            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (time, num_examples_seen, epoch, loss_total/N))
            sys.stdout.flush()
        

In [58]:
model=GRUpytorch(vocabulary_size)
#dev='cpu'
dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(dev)
xx = torch.tensor(x_train,device=dev).long()
yy = torch.tensor(y_train,device=dev).long()
losses = train_with_sgd(model, xx[:10000], yy[:10000], nepoch=10, evaluate_loss_after=1)

2020-05-24 10:30:38: Loss after num_examples_seen=936 epoch=0: 0.004317
2020-05-24 10:30:40: Loss after num_examples_seen=1872 epoch=1: 0.003885
2020-05-24 10:30:42: Loss after num_examples_seen=2808 epoch=2: 0.002753
2020-05-24 10:30:44: Loss after num_examples_seen=3744 epoch=3: 0.002460
2020-05-24 10:30:47: Loss after num_examples_seen=4680 epoch=4: 0.002421
2020-05-24 10:30:49: Loss after num_examples_seen=5616 epoch=5: 0.002386
2020-05-24 10:30:51: Loss after num_examples_seen=6552 epoch=6: 0.002354
2020-05-24 10:30:53: Loss after num_examples_seen=7488 epoch=7: 0.002322
2020-05-24 10:30:55: Loss after num_examples_seen=8424 epoch=8: 0.002290
2020-05-24 10:30:57: Loss after num_examples_seen=9360 epoch=9: 0.002258


In [60]:
#model.to(dev)
losses = train_with_sgd(model, xx[:10000], yy[:10000], nepoch=100, evaluate_loss_after=1)

2020-05-24 10:31:28: Loss after num_examples_seen=9984 epoch=0: 0.002225
2020-05-24 10:31:31: Loss after num_examples_seen=19968 epoch=1: 0.002194
2020-05-24 10:31:33: Loss after num_examples_seen=29952 epoch=2: 0.002169
2020-05-24 10:31:35: Loss after num_examples_seen=39936 epoch=3: 0.002148
2020-05-24 10:31:37: Loss after num_examples_seen=49920 epoch=4: 0.002129
2020-05-24 10:31:39: Loss after num_examples_seen=59904 epoch=5: 0.002113
2020-05-24 10:31:41: Loss after num_examples_seen=69888 epoch=6: 0.002097
2020-05-24 10:31:44: Loss after num_examples_seen=79872 epoch=7: 0.002082
2020-05-24 10:31:46: Loss after num_examples_seen=89856 epoch=8: 0.002067
2020-05-24 10:31:48: Loss after num_examples_seen=99840 epoch=9: 0.002052
2020-05-24 10:31:50: Loss after num_examples_seen=109824 epoch=10: 0.002037
2020-05-24 10:31:52: Loss after num_examples_seen=119808 epoch=11: 0.002022
2020-05-24 10:31:54: Loss after num_examples_seen=129792 epoch=12: 0.002007
2020-05-24 10:31:57: Loss after n

In [61]:
def generate_sentence(model):
    # We start the sentence with the start token
  with torch.no_grad():
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token
    while (not new_sentence[-1] == word_to_index[sentence_end_token] and not new_sentence[-1] == word_to_index[sentence_extra]):
        next_word_probs = np.exp(model.pre(torch.tensor(new_sentence).long()))
        sampled_word = word_to_index[unknown_token]
        # We don't want to sample unknown words
        while sampled_word == word_to_index[unknown_token]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
    sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    return sentence_str

In [62]:
num_sentences = 10
senten_min_length = 1
model.to('cpu')
for i in range(num_sentences):
    sent = []
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
    print(" ".join(sent))

families israel and and , you a , the they , transfer billion , in for drag identify mode it n't 's and confidence sometimes the . but to i to to n't
manufacturing her of is the that as to batteries that thin
culture ugly examples the early dry you
apps other heavily with purpose
book it has confident is . in is poverty
involving replied soft yay ago what with decides to your of of showing ? already to made is the 's with am of bonus for .
normal ask a political enjoyable given ^^^have , any attacks on if do
bowl they he spent fastest wash microsoft , that player a
lacking system several statement 's accidentally town legs blade canada
sexual chest was most ahead fan hotel riding that pour san a and you
