In [54]:
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.functional as F
from torch.nn.utils import clip_grad_norm_

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [55]:
import numpy as np
import nltk
import bs4 as bs
import re
import urllib.request
import warnings

from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')

class parse_and_prepare():
    def __init__(self, EOS=False):
        self.EOS = EOS

    def parse(self, sites):
        text = []
        for site in sites:
            scrapped_data = urllib.request.urlopen(site)
            article = scrapped_data.read()

            parsed_article = bs.BeautifulSoup(article, 'lxml')
            paragraphs = parsed_article.find_all('p')
            article_text = ""
            for p in paragraphs:
                article_text += p.text

            processed_article = article_text.lower()
            if self.EOS:
                processed_article = re.sub('[^a-zA-Z.]', ' ', processed_article)
            else:
                processed_article = re.sub('[^a-zA-Z]', ' ', processed_article)

            processed_article = re.sub(r'\s+', ' ', processed_article)
            all_sentences = nltk.sent_tokenize(processed_article)

            new_text = [nltk.word_tokenize(sent) for sent in all_sentences]

            for sentence in new_text:
                text.append(' '.join(sentence[:-1]))

        return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [56]:
sites =['https://breakingbad.fandom.com/wiki/Walter_White',
         'https://en.wikipedia.org/wiki/Breaking_Bad',
         'https://breakingbad.fandom.com/wiki/Jesse_Pinkman',
         'https://breakingbad.fandom.com/wiki/Gustavo_Fring', 
        'https://breakingbad.fandom.com/wiki/Jimmy_McGill',
        'https://breakingbad.fandom.com/wiki/Mike_Ehrmantraut',
        'https://breakingbad.fandom.com/wiki/Skyler_White',
        'https://breakingbad.fandom.com/wiki/Hank_Schrader',
        'https://en.wikiquote.org/wiki/Breaking_Bad',
        'https://en.wikipedia.beta.wmflabs.org/wiki/Breaking_Bad',
        'https://www.wikidata.org/wiki/Q1079',
        'https://marcelsadusbreakingbadwiki.wordpress.com/',
        'https://de.zxc.wiki/wiki/Breaking_Bad',
        'https://breakingbad.fandom.com/wiki/Lalo_Salamanca',
        'https://breakingbad.fandom.com/wiki/Lydia_Rodarte-Quayle',
        'https://breakingbad.fandom.com/wiki/Todd_Alquist',
        'https://breakingbad.fandom.com/wiki/Marie_Schrader',
        'https://breakingbad.fandom.com/wiki/Walter_White_Jr.',
        'https://breakingbad.fandom.com/wiki/Kim_Wexler',
        'https://breakingbad.fandom.com/wiki/Walter_White_Jr.',
        'https://breakingbad.fandom.com/wiki/Chuck_McGill',
        'https://breakingbad.fandom.com/wiki/Nacho_Varga',
        'https://breakingbad.fandom.com/wiki/Season_1_(Better_Call_Saul)'
         ]

parser = parse_and_prepare(EOS=True)
text = parser.parse(sites)

In [57]:
for sentence in text:
    print(sentence)
    break

main walter white portrayed by bryan cranston character information full name walter hartwell white sr. aliases waltheisenbergmr


In [58]:
len(text)

8081

In [59]:
class Dictionary():
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __len__(self):
        return self.idx

class Corpus():
    def __init__(self):
        self.dictionary = Dictionary()
    
    def get_data(self, text, batch_size):
        tokens_size = 0

        for sentence in text:
            words = sentence.split() + ['<EOS>']
            tokens_size += len(words)
            for word in words:
                self.dictionary.add_word(word)

        ids = torch.LongTensor(tokens_size)
        size = 0
        for sentence in text:
            words = sentence.split() + ['<EOS>']
            for word in words:
                ids[size] = self.dictionary.word2idx[word]
                size += 1
    
        num_batches = size // batch_size
        ids = ids[:num_batches*batch_size]

        return ids.view(batch_size, -1)

In [60]:
embedding_size = 300
hidden_size = 1024
num_layers = 1
max_epochs = 5
batch_size = 20
seq_length = 20
lr = 0.002

corpus = Corpus()
ids = corpus.get_data(text, batch_size)
vocab_size = corpus.dictionary.idx
num_batches = ids.size(1) // seq_length

In [61]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_size, 
                 hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, 
                                      embedding_size)
        self.lstm = nn.LSTM(embedding_size, 
                            hidden_size,
                            num_layers,
                            # bid
                            batch_first=True)
        self.fc1 = nn.Linear(hidden_size,
                             vocab_size)

    def forward(self, x, h):
        x = self.embedding(x)

        out, (h, c) = self.lstm(x, h)

        out = out.reshape(out.size(0)*out.size(1), out.size(2))

        out = self.fc1(out)

        return out, (h, c)

model = LSTM(vocab_size,
             embedding_size,
             hidden_size,
             num_layers).to(device)

In [62]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [63]:
def detach(states):
    return [state.detach() for state in states] 

In [64]:
for epoch in range(max_epochs):
    states = (torch.randn(num_layers, batch_size, hidden_size).to(device),
              torch.randn(num_layers, batch_size, hidden_size).to(device))
    
    for pos in range(0, ids.size(1) - seq_length, seq_length):
        seq = ids[:, pos:pos+seq_length].to(device)
        target = ids[:, (pos+1):(pos+1)+seq_length].to(device)

        states = detach(states)
        outputs, states = model.forward(seq, states)

        loss = criterion(outputs, target.reshape(-1))
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        step = (pos+1) // seq_length
        if step % 100 == 0:
            print ('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                   .format(epoch+1, max_epochs, step,
                           num_batches, loss.item(),
                           np.exp(loss.item())))

Epoch [1/5], Step[0/505], Loss: 9.3116, Perplexity: 11065.33
Epoch [1/5], Step[100/505], Loss: 6.1063, Perplexity: 448.69
Epoch [1/5], Step[200/505], Loss: 5.4447, Perplexity: 231.52
Epoch [1/5], Step[300/505], Loss: 5.7483, Perplexity: 313.65
Epoch [1/5], Step[400/505], Loss: 5.0267, Perplexity: 152.43
Epoch [1/5], Step[500/505], Loss: 5.3698, Perplexity: 214.81
Epoch [2/5], Step[0/505], Loss: 5.0580, Perplexity: 157.28
Epoch [2/5], Step[100/505], Loss: 4.6263, Perplexity: 102.13
Epoch [2/5], Step[200/505], Loss: 3.8878, Perplexity: 48.80
Epoch [2/5], Step[300/505], Loss: 4.0105, Perplexity: 55.18
Epoch [2/5], Step[400/505], Loss: 3.7000, Perplexity: 40.45
Epoch [2/5], Step[500/505], Loss: 3.9862, Perplexity: 53.85
Epoch [3/5], Step[0/505], Loss: 4.1631, Perplexity: 64.27
Epoch [3/5], Step[100/505], Loss: 3.2952, Perplexity: 26.98
Epoch [3/5], Step[200/505], Loss: 2.8212, Perplexity: 16.80
Epoch [3/5], Step[300/505], Loss: 2.7664, Perplexity: 15.90
Epoch [3/5], Step[400/505], Loss: 2.

In [65]:
num_samples = 200

with torch.no_grad():
    with open('sample.txt', 'w') as f:

        state = (torch.zeros(num_layers, 1, hidden_size).to(device),
                 torch.zeros(num_layers, 1, hidden_size).to(device))

        prob = torch.ones(vocab_size)
        input = torch.multinomial(prob, num_samples=1).unsqueeze(1).to(device)

        for i in range(num_samples):
            output, state = model(input, state)

            prob = output.exp()
            word_id = torch.multinomial(prob, num_samples=1).item()

            input.fill_(word_id)

            word = corpus.dictionary.idx2word[word_id]
            word = '\n' if word == '<eos>' else word + ' '
            print(word, end=' ')

pinning  that  the  trailer  <EOS>  gus  offers  t  trust  hector  s  deal  and  walt  made  increasingly  rodarte  for  eleven  class  late  <EOS>  his  face  remains  in  mexico  gus  death.the  dea  trackers  about  the  max  s  efforts  to  become  less  and  less  reluctant  to  resort  to  which  jesse  now  that  their  right  is  the  website  of  all  their  cases  tuco  <EOS>  walt  contained  and  jesse  in  the  position  that  walt  is  retired  in  town  <EOS>  knowing  that  walt  no  longer  calling  jesse  s  drug  money  and  needs  if  he  poisoned  them  to  his  dad  a  car  can  call  hank  schrader  <EOS>  a  waitress  later  gomez  and  walt  go  into  one  of  the  time  walter  white  and  jesse  pinkman  entered  the  drug  business  while  hank  and  walt  made  a  mistake  found  the  leader  of  a  year  old  boy  he  d  known  since  the  boy  was  so  anyway  could  receive  help  from  some  police  officers  stand  and  as  a  well  trained  warehouse 