In [18]:
import wiki
corpus = wiki.WikiCorpus.from_corpus_files()

100%|██████████| 26397/26397 [00:00<00:00, 161235.52it/s]

Opening corpus files...
Loading users...





Loading posts...


100%|██████████| 516766/516766 [00:09<00:00, 52634.84it/s]
  0%|          | 122/389121 [00:00<05:20, 1214.51it/s]

Setting up corpus...


100%|██████████| 389121/389121 [05:38<00:00, 1151.13it/s]


In [62]:
import torch
from collections import Counter
from tqdm import tqdm

class Tokenizer:                                                                                           
        
    END_OF_TEXT = '<END OF TEXT>'
    UNKNOWN_TOKEN = '<UNKNOWN TOKEN>'
    PAD_TOKEN = '<PAD TOKEN>'
        
    def __init__(self, tokens):                                                                          
        self._tokens = tokens                                                                            
        self._index_map = {t: i for i, t in enumerate(self._tokens)}
        self.num_tokens = len(self._tokens)
        self.UNKNOWN_TOKEN_INDEX = self._index_map[Tokenizer.UNKNOWN_TOKEN]
        self.END_OF_TEXT_INDEX = self._index_map[Tokenizer.END_OF_TEXT]
        self.PAD_TOKEN_INDEX = self._index_map[Tokenizer.PAD_TOKEN]
                                                                                                         
    @classmethod                                                                                         
    def fit(cls, texts, max_tokens):
        print("Fitting tokenizer to texts...")
        token_count = Counter()
        for text in tqdm(texts):
            for token in text:                                                                           
                token_count[token] += 1
        aux_tokens = [Tokenizer.END_OF_TEXT, Tokenizer.UNKNOWN_TOKEN, Tokenizer.PAD_TOKEN]
        tokens = token_count.most_common(max_tokens - len(aux_tokens)) + aux_tokens 
        return cls(tokens)                                     
                                                                                                         
    def token_to_index(self, t):                                                                               
        return self._index_map.get(t, self.UNKNOWN_TOKEN_INDEX)
                                                                                                         
    def index_to_token(self, i):                                                                               
        return self._tokens[i]

    
    def encode_texts(self, texts, sequence_length):
        data = []
        for text in texts:
            tensor = torch.LongTensor(sequence_length)
            i = 0
            text.append(Tokenizer.END_OF_TEXT)
            for token in text:
                tensor[i] = self.token_to_index(token)
                i += 1
                if i == sequence_length:
                    data.append(tensor)
                    tensor = torch.LongTensor(sequence_length)
                    i = 0
            for i in range(i, sequence_length):
                tensor[i] = self.PAD_TOKEN_INDEX
                
                                                                                                 

In [60]:
i = 10
for i in range(i, 14):
    print(i)

10
11
12
13


In [58]:
posts = [t for t in post.tokens for post in corpus.posts.values()]
tokenizer = Tokenizer.fit(posts, 50000)

  0%|          | 3405/778242 [00:00<00:22, 34036.07it/s]

Fitting tokenizer to texts...


 14%|█▍        | 109013/778242 [00:02<00:18, 37130.45it/s]


KeyboardInterrupt: 

In [49]:
max_tokens = 10000                                                                                   
embedding_dim = 200
hidden_dim = 200
num_layers = 2
dropout = 0.5
learning_rate = 0.005
batch_size = 128 
max_seq_len = 32

data = tokenizer.encode_texts(posts)
n_batches = data.size(0) // batch_size
data = data.narrow(0, 0, n_batches * batch_size)
data = data.view(batch_size, -1).t().contiguous()


In [54]:
from torch.autograd import Variable

def get_batch(source, i, evaluation=False):
    seq_len = min(max_seq_len, len(source) - 1 - i)
    data = Variable(source[i:i+seq_len], volatile=evaluation)
    target = Variable(source[i+1:i+1+seq_len].view(-1))
    return data, target

In [55]:
get_batch(data, 0)

(Variable containing:
    32    32    32  ...     32    32    32
    32    32    31  ...     32    32    32
    32    32    32  ...     32    32    32
        ...          ⋱          ...       
    32    32    32  ...     31    32    32
    32    32    32  ...     32    32    32
    32    32    32  ...     32    32    32
 [torch.LongTensor of size 32x128], Variable containing:
  32
  32
  31
  ⋮ 
  32
  32
  32
 [torch.LongTensor of size 4096])

In [56]:
data[:10]


   32    32    32  ...     32    32    32
   32    32    31  ...     32    32    32
   32    32    32  ...     32    32    32
       ...          ⋱          ...       
   32    32    32  ...     32    32    32
   32    32    32  ...     32    32    32
   32    32    32  ...     32    32    32
[torch.LongTensor of size 10x128]