In [1]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
import torch
import torch.nn.functional as F
from torch import optim
from datetime import datetime
from torch import nn
from torch.utils.data import DataLoader

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
vocabulary_size = 4000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"
max_len = 30

print("Reading CSV file...")
with open('reddit-comments-2015-08.csv', 'rt', encoding= 'utf-8') as f:
    reader = csv.reader(f, skipinitialspace = True)
    next(reader)
    sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower()) for x in reader])
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print("Parsed %d sentences." % (len(sentences)))

Reading CSV file...
Parsed 79170 sentences.


In [3]:
max_len = 30
ts = [nltk.word_tokenize(sent) for sent in sentences]
tokenized_sentences = []
end_token = [sentence_end_token]*40
for x in ts:
    if len(x) > max_len:
        continue
    tokenized_sentences.append(x + end_token[:max_len-len(x)])
print("Parsed %d sentences with len %d." % (len(tokenized_sentences),len(tokenized_sentences[0])))

Parsed 64109 sentences with len 30.


In [4]:
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print("Found %d unique words tokens." % len(list(word_freq.items())))

Found 45259 unique words tokens.


In [5]:
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

In [6]:
print("Using vocabulary size %d." % vocabulary_size)
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

Using vocabulary size 4000.
The least frequent word in our vocabulary is 'pipe' and appeared 14 times.


In [7]:
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
print("\nExample sentence: '%s'" % sentences[0])
print("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])


Example sentence: 'SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END'

Example sentence after Pre-processing: '['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'UNKNOWN_TOKEN', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END', 'SENTENCE_END', 'SENTENCE_END', 'SENTENCE_END', 'SENTENCE_END', 'SENTENCE_END', 'SENTENCE_END', 'SENTENCE_END', 'SENTENCE_END', 'SENTENCE_END']'


In [8]:
x_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

In [9]:
x_example, y_example = x_train[17], y_train[17]
print("x:\n%s\n%s" % (" ".join([index_to_word[x] for x in x_example]), x_example))
print("\ny:\n%s\n%s" % (" ".join([index_to_word[x] for x in y_example]), y_example))

x:
SENTENCE_START i 'd like to make it available , but there 's no point in making it mandatory . SENTENCE_END SENTENCE_END SENTENCE_END SENTENCE_END SENTENCE_END SENTENCE_END SENTENCE_END SENTENCE_END SENTENCE_END SENTENCE_END
[   1    5  143   40    6  100   10  723    4   24   47   16   65  173
   14  289   10 2652    2    0    0    0    0    0    0    0    0    0
    0]

y:
i 'd like to make it available , but there 's no point in making it mandatory . SENTENCE_END SENTENCE_END SENTENCE_END SENTENCE_END SENTENCE_END SENTENCE_END SENTENCE_END SENTENCE_END SENTENCE_END SENTENCE_END SENTENCE_END
[   5  143   40    6  100   10  723    4   24   47   16   65  173   14
  289   10 2652    2    0    0    0    0    0    0    0    0    0    0
    0]


In [10]:
dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
xx = torch.tensor(x_train,device=dev).long()
yy = torch.tensor(y_train,device=dev).long()

In [11]:
class GRUpytorch(nn.Module):
    def __init__(self, vocab_size, hidden_dim=100, bptt_truncate=4):
        super().__init__()
        self.word_dim = vocab_size
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        
        self.gru = nn.GRU(vocab_size, hidden_dim)
        self.linear = nn.Linear(hidden_dim,vocab_size)
    
    def forward(self, x):
        x_train = F.one_hot(x, num_classes=self.word_dim)
        ht,_ = self.gru(x_train.view(len(x_train),1,-1).float())
        st = self.linear(ht.view(len(x_train),-1))
        yt = F.log_softmax(st, dim=1)
        return yt
        

In [12]:
def train_with_sgd(model, x_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    num_examples_seen = 0
    calculate_loss=nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.005)
    N = np.sum((len(y_i) for y_i in y_train))
    for epoch in range(nepoch):
        loss_total=0
        for i in range(len(y_train)):
            optimizer.zero_grad()
            
            y_hat=model.forward(x_train[i])
            loss=calculate_loss(y_hat,y_train[i])

            loss.backward()
            loss_total+=loss
            optimizer.step()
            
            num_examples_seen += 1
        if(epoch % evaluate_loss_after == 0):
            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (time, num_examples_seen, epoch, loss_total/N))
            sys.stdout.flush()
        

In [18]:
model=GRUpytorch(vocabulary_size)
model.to(dev)
losses = train_with_sgd(model, xx[:1000], yy[:1000], nepoch=10, evaluate_loss_after=1)

2020-05-24 10:21:40: Loss after num_examples_seen=1000 epoch=0: 0.185995
2020-05-24 10:21:43: Loss after num_examples_seen=2000 epoch=1: 0.139535
2020-05-24 10:21:47: Loss after num_examples_seen=3000 epoch=2: 0.131527
2020-05-24 10:21:51: Loss after num_examples_seen=4000 epoch=3: 0.125992
2020-05-24 10:21:54: Loss after num_examples_seen=5000 epoch=4: 0.121228
2020-05-24 10:21:58: Loss after num_examples_seen=6000 epoch=5: 0.117645
2020-05-24 10:22:02: Loss after num_examples_seen=7000 epoch=6: 0.115003
2020-05-24 10:22:05: Loss after num_examples_seen=8000 epoch=7: 0.112906
2020-05-24 10:22:09: Loss after num_examples_seen=9000 epoch=8: 0.111184
2020-05-24 10:22:13: Loss after num_examples_seen=10000 epoch=9: 0.109741


In [15]:
model.to(dev)
losses = train_with_sgd(model, xx[:1000], yy[:1000], nepoch=1, evaluate_loss_after=1)

2020-05-24 10:07:05: Loss after num_examples_seen=1000 epoch=0: 0.108356


In [16]:
def generate_sentence(model):
    # We start the sentence with the start token
  with torch.no_grad():
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs = np.exp(model.forward(torch.tensor(new_sentence).long()))
        sampled_word = word_to_index[unknown_token]
        # We don't want to sample unknown words
        while sampled_word == word_to_index[unknown_token]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
    sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    return sentence_str

In [17]:
num_sentences = 10
senten_min_length = 5
model.to('cpu')
for i in range(num_sentences):
    sent = []
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
    print(" ".join(sent))

none `` sometimes among and society suppose
harassment near post crimes interviews under and to . ? a for
pace today awesome for president is .
bunch os my for you on in . blocks
damage filled vs gt , a , the it randomly lighter and see cell to , but suggesting haki you requests ,
help flair blocks to transition rock models .
wait google san . shadow to moment if in excited ,
grass question a concrete insanely .
hole be sad elder was provide place should instance the i based martin empire if assumed .
convince u.s. /r/writingprompts i similarly
