In [1]:
from data import wiki
from data import corpus
import alignment

import nltk
import pandas as pd
import pickle
from tqdm import tqdm_notebook as tqdm

# Instantiate a notebook progress bar and hook tqdm into pandas
# (this is what makes `progress_apply` available on Series/DataFrames).
tqdm().pandas()

# Load posts, derive reply pairs and per-user data from them, and load the
# reply network (recreate=False is passed through to the project loader).
posts = wiki.load_posts()
pairs = corpus.get_reply_pairs(posts)
users = wiki.load_users(posts=posts)
network = wiki.load_network(reply_pairs=pairs, recreate=False)

# One row per post, widened with the columns of the post's author
# (posts.user is matched against the index of `users`).
df = pd.merge(posts, users, left_on='user', right_index=True)

# A user is "highly central" when their centrality exceeds the population
# mean by more than one standard deviation.
centrality = users['centrality']
threshold = centrality.mean() + centrality.std()
users['highly_central'] = centrality > threshold
df['highly_central'] = df['centrality'] > threshold

# Displayed as the cell output: how many users fall on each side of the cut.
users['highly_central'].value_counts()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




False    29007
True      1893
Name: highly_central, dtype: int64

In [3]:
# Tag every post's token list with nltk.pos_tag, then keep only the element at
# index 1 of each tagged item (the tag in nltk's (token, tag) pairs).
pos_tags = posts['tokens'].apply(nltk.pos_tag)
pos_tags_only = pos_tags.apply(lambda tagged: [pair[1] for pair in tagged])

In [18]:
import torch

class Dictionary(object):
    """Two-way mapping between words and consecutive integer ids.

    Ids are handed out in first-seen order, starting at 0.
    """

    def __init__(self):
        self.idx = 0          # next id to hand out
        self.idx2word = {}    # id -> word
        self.word2idx = {}    # word -> id

    def __len__(self):
        """Return the number of distinct words registered so far."""
        return len(self.word2idx)

    def add_word(self, word):
        """Register ``word`` under the next free id; known words are left untouched."""
        if word in self.word2idx:
            return
        new_id = self.idx
        self.word2idx[word] = new_id
        self.idx2word[new_id] = word
        self.idx = new_id + 1

class Corpus(object):
    """Turns sequences of tokens into a batched tensor of vocabulary ids."""

    def __init__(self):
        self.dictionary = Dictionary()

    def get_data(self, series, batch_size=20):
        """Encode ``series`` as ids and lay them out as ``batch_size`` rows.

        Every line is terminated with an ``'<eos>'`` token, all lines are
        concatenated into one id stream, the tail that does not fill a whole
        column is dropped, and the stream is cut into ``batch_size``
        contiguous rows.

        Args:
            series: iterable of token lists (each item must support
                ``line + ['<eos>']``). It is consumed in a single pass, so
                one-shot iterables such as generators are safe.
            batch_size: number of rows in the returned tensor.

        Returns:
            ``torch.int64`` tensor of shape
            ``(batch_size, total_tokens // batch_size)``. When there are
            fewer than ``batch_size`` tokens the result has zero columns.
        """
        dictionary = self.dictionary
        flat_ids = []
        for line in series:
            for word in line + ['<eos>']:
                # add_word is a no-op for known words, so ids stay stable.
                dictionary.add_word(word)
                flat_ids.append(dictionary.word2idx[word])

        # Build the tensor once from the Python list instead of writing into
        # an uninitialised buffer one element at a time.
        ids = torch.tensor(flat_ids, dtype=torch.long)

        row_length = ids.size(0) // batch_size
        ids = ids[:row_length * batch_size]
        # Explicit column count (rather than -1) keeps the empty case valid.
        return ids.view(batch_size, row_length)



In [42]:
import torch.nn as nn
import torch.optim as optim

class RNNLM(nn.Module):
    """LSTM language model: embedding -> LSTM -> linear projection to vocabulary scores."""

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.linear = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x, h):
        """Score the next token at every position of ``x``.

        Args:
            x: tensor of word ids; the LSTM is batch-first, so the first two
                axes of its output are (batch, sequence).
            h: ``(hidden, cell)`` state pair handed to the LSTM.

        Returns:
            ``(scores, (hidden, cell))`` where ``scores`` has one row per
            (batch, time step) position and one column per vocabulary entry.
        """
        embedded = self.embed(x)
        hidden_seq, (h_n, c_n) = self.lstm(embedded, h)

        # Fold batch and time into a single axis so the decoder sees one row
        # per position.
        n_rows = hidden_seq.size(0) * hidden_seq.size(1)
        flat = hidden_seq.reshape(n_rows, hidden_seq.size(2))

        scores = self.linear(flat)
        return scores, (h_n, c_n)

In [50]:
# Hyperparameters for the POS-tag language model.
batch_size = 20        # rows of the batched id tensor / sequences per step
hidden_size = 200      # LSTM hidden state width
embedding_size = 50    # tag embedding width
seq_length = 30        # time steps per truncated-BPTT window
num_epochs = 1

# Build the vocabulary and the batched id tensor BEFORE the model, because the
# model's vocabulary size is read from the populated dictionary.
# NOTE: this rebinds `corpus`, shadowing the `data.corpus` module imported in
# the first cell; re-run that cell if the module is needed again.
corpus = Corpus()
data = corpus.get_data(pos_tags_only, batch_size)

# Number of seq_length-wide windows along the time axis (used for logging).
num_batches = data.size(1) // seq_length

# Single-layer LSTM language model over the tag vocabulary.
model = RNNLM(len(corpus.dictionary), embedding_size, hidden_size, 1)

In [54]:
import numpy as np

# Loss and optimizer
# Cross-entropy between the model's per-position vocabulary scores and the
# target ids; Adam updates every parameter of `model` with learning rate 0.01.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=.01)

# Truncated backpropagation
def detach(states):
    """Return a list holding a detached version of every tensor in ``states``.

    The detached tensors carry the same values but no autograd history, so a
    backward pass started later stops at them.
    """
    detached = []
    for state in states:
        detached.append(state.detach())
    return detached

# Train the model
# Walks over `data` in consecutive seq_length-wide windows; the LSTM state is
# carried from one window to the next within an epoch, but gradients are cut
# at every window boundary.
for epoch in range(num_epochs):
    # Set initial hidden and cell states (all zeros) at the start of each epoch.
    # Shape is (1, batch_size, hidden_size); the leading 1 presumably matches
    # the single LSTM layer the model was built with -- keep them in sync.
    states = (torch.zeros(1, batch_size, hidden_size),
              torch.zeros(1, batch_size, hidden_size))

    # The range stops seq_length short of the end so the target slice, which
    # is shifted one column to the right, never runs past the last column.
    for i in range(0, data.size(1) - seq_length, seq_length):
        # Get mini-batch inputs and targets: targets are the inputs shifted by
        # one position, i.e. the next id at every time step.
        inputs = data[:, i:i+seq_length]
        targets = data[:, (i+1):(i+1)+seq_length]

        # Forward pass
        # Detach first so backward() below only covers the current window.
        states = detach(states)
        outputs, states = model(inputs, states)
        # outputs has one row per (batch, time) position, so the targets are
        # flattened to a matching 1-D vector.
        loss = criterion(outputs, targets.reshape(-1))

        # Backward and optimize
        model.zero_grad()
        loss.backward()
        # Gradient clipping is currently disabled:
        #clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        # Window counter derived from the column offset; progress is printed
        # every 100 windows. Perplexity is exp(loss).
        step = (i+1) // seq_length
        if step % 100 == 0:
            print ('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                   .format(epoch+1, num_epochs, step, num_batches, loss.item(), np.exp(loss.item())))


Epoch [1/1], Step[0/38059], Loss: 3.0300, Perplexity: 20.70
Epoch [1/1], Step[100/38059], Loss: 2.3599, Perplexity: 10.59
Epoch [1/1], Step[200/38059], Loss: 2.2038, Perplexity:  9.06
Epoch [1/1], Step[300/38059], Loss: 2.4060, Perplexity: 11.09
Epoch [1/1], Step[400/38059], Loss: 2.3193, Perplexity: 10.17
Epoch [1/1], Step[500/38059], Loss: 2.3512, Perplexity: 10.50
Epoch [1/1], Step[600/38059], Loss: 2.1364, Perplexity:  8.47
Epoch [1/1], Step[700/38059], Loss: 2.1374, Perplexity:  8.48
Epoch [1/1], Step[800/38059], Loss: 2.2399, Perplexity:  9.39
Epoch [1/1], Step[900/38059], Loss: 2.2031, Perplexity:  9.05
Epoch [1/1], Step[1000/38059], Loss: 2.2540, Perplexity:  9.53
Epoch [1/1], Step[1100/38059], Loss: 2.1650, Perplexity:  8.71
Epoch [1/1], Step[1200/38059], Loss: 2.2341, Perplexity:  9.34
Epoch [1/1], Step[1300/38059], Loss: 2.1859, Perplexity:  8.90
Epoch [1/1], Step[1400/38059], Loss: 2.1410, Perplexity:  8.51
Epoch [1/1], Step[1500/38059], Loss: 2.2121, Perplexity:  9.14
Epoc

Epoch [1/1], Step[13000/38059], Loss: 2.1132, Perplexity:  8.27
Epoch [1/1], Step[13100/38059], Loss: 2.1356, Perplexity:  8.46
Epoch [1/1], Step[13200/38059], Loss: 2.1821, Perplexity:  8.87
Epoch [1/1], Step[13300/38059], Loss: 2.1223, Perplexity:  8.35
Epoch [1/1], Step[13400/38059], Loss: 2.0002, Perplexity:  7.39
Epoch [1/1], Step[13500/38059], Loss: 2.1746, Perplexity:  8.80
Epoch [1/1], Step[13600/38059], Loss: 1.9798, Perplexity:  7.24
Epoch [1/1], Step[13700/38059], Loss: 2.2077, Perplexity:  9.10
Epoch [1/1], Step[13800/38059], Loss: 2.0405, Perplexity:  7.69
Epoch [1/1], Step[13900/38059], Loss: 2.1770, Perplexity:  8.82
Epoch [1/1], Step[14000/38059], Loss: 2.1220, Perplexity:  8.35
Epoch [1/1], Step[14100/38059], Loss: 2.1523, Perplexity:  8.60
Epoch [1/1], Step[14200/38059], Loss: 2.1695, Perplexity:  8.75
Epoch [1/1], Step[14300/38059], Loss: 2.2626, Perplexity:  9.61
Epoch [1/1], Step[14400/38059], Loss: 2.1097, Perplexity:  8.25
Epoch [1/1], Step[14500/38059], Loss: 2.

Epoch [1/1], Step[25900/38059], Loss: 2.2116, Perplexity:  9.13
Epoch [1/1], Step[26000/38059], Loss: 2.0234, Perplexity:  7.56
Epoch [1/1], Step[26100/38059], Loss: 2.0130, Perplexity:  7.49
Epoch [1/1], Step[26200/38059], Loss: 2.0342, Perplexity:  7.65
Epoch [1/1], Step[26300/38059], Loss: 2.1823, Perplexity:  8.87
Epoch [1/1], Step[26400/38059], Loss: 1.9995, Perplexity:  7.39
Epoch [1/1], Step[26500/38059], Loss: 2.1187, Perplexity:  8.32
Epoch [1/1], Step[26600/38059], Loss: 2.0233, Perplexity:  7.56
Epoch [1/1], Step[26700/38059], Loss: 2.0948, Perplexity:  8.12
Epoch [1/1], Step[26800/38059], Loss: 2.1766, Perplexity:  8.82
Epoch [1/1], Step[26900/38059], Loss: 2.0650, Perplexity:  7.89
Epoch [1/1], Step[27000/38059], Loss: 2.0980, Perplexity:  8.15
Epoch [1/1], Step[27100/38059], Loss: 2.1229, Perplexity:  8.36
Epoch [1/1], Step[27200/38059], Loss: 2.1182, Perplexity:  8.32
Epoch [1/1], Step[27300/38059], Loss: 2.1526, Perplexity:  8.61
Epoch [1/1], Step[27400/38059], Loss: 2.