In [1]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [2]:
from __future__ import division
from __future__ import unicode_literals

import logging
import os

import numpy as np
import pandas as pd
import torch as tc
import torch.utils.data as tc_data
import tqdm

import preprocessing
import model_utils as mu
import s2s_autoencoder as s2s
import tensorboard as tb

from pprint import pprint
from matplotlib import pyplot as plt
from IPython.display import display, HTML


# Plot defaults: shared style sheet and a wide default figure size.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (10, 5)

# Pandas display limits so wide text columns stay readable in the output.
pd.options.display.max_colwidth = 75
pd.options.display.width = 150
pd.options.display.max_columns = 40

# Read the table stylesheet inside a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open('/indocker/notebooks/style-table.css') as css_file:
    css = css_file.read()

# Last expression of the cell: injects the stylesheet into the notebook page.
HTML('<style>{}</style>'.format(css))

is_cuda: True


In [3]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the local .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2

## Setup logging

In [4]:
# Configure the root logger so that DEBUG-and-above records are echoed into
# the notebook output.
log = logging.getLogger()
log.setLevel(logging.DEBUG)

# Remove handlers left over from a previous run of this cell; otherwise every
# re-run would attach one more StreamHandler and each message would be printed
# several times. This replaces `reload(logging)`, which only exists as a
# builtin on Python 2 and rebinds the module's root logger, so loggers that
# were created before the reload may no longer reach the new handler.
for stale_handler in list(log.handlers):
    log.removeHandler(stale_handler)

# Single stream handler with a "time:logger:level: message" layout.
myhandler = logging.StreamHandler()
myformatter = logging.Formatter(fmt='%(asctime)s:%(name)s:%(levelname)s: %(message)s')
myhandler.setFormatter(myformatter)
log.addHandler(myhandler)

**The full Yelp review file contains around 4 million reviews.**

## Constants

In [12]:
# Root directory holding the datasets. Defaults to the path used inside the
# docker image; set the DATA_DIR environment variable to run elsewhere.
DATA_DIR = os.environ.get('DATA_DIR', '/indocker/data')

# Input files: the review dump (one JSON object per line) and the embeddings.
REVIEW_FL = os.path.join(DATA_DIR, 'yelpds11', 'review.json')
W2V_FILE = os.path.join(DATA_DIR, 'glove.6B', 'glove.6B.50d-min.txt')

N_REVIEWS = 25000  # passed as `n_reviews` when loading the reviews
MAX_LEN = 20       # passed as `max_len` to both the loader and the model config
NUM_CPUS = 4       # NOTE(review): not referenced by any cell visible in this notebook

## Load data

In [6]:
import codecs
import json

# Peek into the review data: decode the file as UTF-8 and pretty-print the
# first line, which is parsed as one JSON object.
with codecs.open(REVIEW_FL, encoding='utf-8') as infile:
    # The builtin next() works on Python 2.6+ and Python 3, whereas the
    # `.next()` method only exists on Python 2 file objects.
    first_review = json.loads(next(infile))
pprint(first_review)

{u'business_id': u'0W4lkclzZThpx3V65bVgig',
 u'cool': 0,
 u'date': u'2016-05-28',
 u'funny': 0,
 u'review_id': u'v0i_UHJMo_hPBq9bxWvW4w',
 u'stars': 5,
 u'text': u"Love the staff, love the meat, love the place. Prepare for a long line around lunch or dinner hours. \n\nThey ask you how you want you meat, lean or something maybe, I can't remember. Just say you don't want it too fatty. \n\nGet a half sour pickle and a hot pepper. Hand cut french fries too.",
 u'useful': 0,
 u'user_id': u'bv2nCi5Qv5vroFiqKGopiw'}


In [7]:
# Load the embeddings stored at W2V_FILE. Later cells use the returned
# object's `lang` and `embedding_matrix` attributes.
w2v_data = preprocessing.load_w2v_data(path=W2V_FILE)

Loading embeddings: 100%|██████████| 9633/9633 [00:00<00:00, 69370.01it/s]


In [8]:
# Split fractions and the minimum length handed to the loader, named here so
# they are easy to spot and tune.
DEV_FRAC = 0.2
TEST_FRAC = 0.2
MIN_LEN = 5

# Load the reviews and split them; the result exposes `.train` and `.dev`
# frames that the cells below rely on.
split_data = preprocessing.load_and_split_reviews(
    review_fl=REVIEW_FL,
    lang=w2v_data.lang,
    dev_frac=DEV_FRAC,
    test_frac=TEST_FRAC,
    n_reviews=N_REVIEWS,
    max_len=MAX_LEN,
    min_len=MIN_LEN,
)

# Show the first two training rows as a sanity check.
split_data.train.head(2)




Unnamed: 0,review_id,sent_num,sent_len,ixs,tokens,text
0,--vWmCoqCGZgcmsn7ELlUQ,0,20,"[0, 55, 115, 8, 164, 395, 29, 3012, 947, 25, 1976, 5770, 113, 9, 4274, ...","[<SOS>, we, like, to, get, away, from, vegas, sometimes, by, visiting, ...",We like to get away from Vegas sometimes by visiting Boulder City and c...
1,--yQsDsWtziQMDSjmCMXrQ,0,20,"[0, 39, 233, 175, 8, 32, 4, 3, 3, 112, 24, 13, 170, 3, 3, 8507, 934, 10...","[<SOS>, this, place, used, to, be, the, notorious, p.i.g., now, it, 's,...",This place used to be the Notorious P.I.G. now it 's called Hammered Ho...


In [9]:
# Summary statistics of the `sent_len` column of the training split.
split_data.train['sent_len'].describe()

count    22250.000000
mean        19.523506
std          2.075740
min          5.000000
25%         20.000000
50%         20.000000
75%         20.000000
max         20.000000
Name: sent_len, dtype: float64

## Training

In [23]:
# NOTE(review): `tb` is the `tensorboard` module imported at the top of the
# notebook, next to the project-local modules; presumably a local experiment
# wrapper rather than the TensorBoard package — confirm.
# `clear_expts()` presumably discards previously recorded experiments; verify
# before running against a shared server.
tb.clear_expts()
# Experiment handle that is later passed to the training call as `tb_expt`.
crayon_expt = tb.get_experiment(name='pytorch_ae')

### Create model

In [24]:
# Embedding matrix loaded earlier; its two leading dimensions are used as the
# vocabulary size and the embedding width of the model.
embedding_matrix = w2v_data.embedding_matrix

# Hyper-parameters of the autoencoder.
# NOTE(review): the saved repr under this cell shows LSTM hidden sizes of 25
# and 50, which may not correspond to hidden_dim=100 — re-run to confirm the
# output is current.
ae_config = s2s.Seq2SeqConfig(
    max_len=MAX_LEN,
    embed_dim=embedding_matrix.shape[1],
    hidden_dim=100,
    vocab_size=embedding_matrix.shape[0],
    bidirectional=True,
    num_layers=1,
    lr=0.001,
)

# Build the model and move it to the GPU.
s2s_ae = s2s.Seq2SeqAutoEncoder(
    lang=w2v_data.lang,
    config=ae_config,
    embedding_matrix=embedding_matrix,
    verbose=True,
)
s2s_ae = s2s_ae.cuda()

# Last expression of the cell, so its return value is displayed as the output.
s2s_ae.share_memory()

Seq2SeqAutoEncoder(
  (embedding): Embedding(9637, 50)
  (encoder): EncoderRNN(
    (embedding): Embedding(9637, 50)
    (lstm): LSTM(50, 25, batch_first=True, bidirectional=True)
  )
  (decoder): DecoderRNN(
    (embedding): Embedding(9637, 50)
    (lstm): LSTM(100, 50, batch_first=True)
    (vocab_softmax): Sequential(
      (0): Linear(in_features=50, out_features=9637, bias=True)
      (1): LogSoftmax()
    )
  )
  (loss_fn): NLLLoss(
  )
)

### Actually train

In [31]:
# Batching options, grouped so they can be tuned in one place.
loader_options = dict(
    batch_size=1024,
    num_workers=0,
    drop_last=True,
)

# Build the loader from the train and dev frames of the split.
dataloader = mu.to_dataloader(
    train_sent_df=split_data.train,
    dev_sent_df=split_data.dev,
    **loader_options
)

In [None]:
# Training settings for this run.
N_EPOCHS = 10
TEACHER_FORCING = 0.5
SAVE_EPOCH_FREQ = 1
MODEL_SAVE_PATH = 's2s.pkl'

# Fit the autoencoder; the log below reports train/dev metrics per epoch and
# a save to MODEL_SAVE_PATH after each reported epoch.
s2s.s2s_fit(
    s2s_model=s2s_ae,
    dataloader=dataloader,
    save_epoch_freq=SAVE_EPOCH_FREQ,
    model_save_path=MODEL_SAVE_PATH,
    epochs=N_EPOCHS,
    teacher_forcing=TEACHER_FORCING,
    tb_expt=crayon_expt,
)


2018-03-17 00:25:29,966:s2s_autoencoder:INFO: 

---------------------------------------------------------------------

EPOCH RESULT: 0

dev/loss: 187,018.8013
grad_by_wts/mean: 6.3550
train/loss: 187,965.2744
dev/evaluation/compute_accuracy: 0.0054
grad_by_wts/std: 938.6041
train/evaluation/compute_accuracy: 0.0054

---------------------------------------------------------------------


2018-03-17 00:25:29,968:s2s_autoencoder:INFO: Saving model to `s2s.pkl`...
2018-03-17 00:26:27,537:s2s_autoencoder:INFO: Saved model to `s2s.pkl`.


2018-03-17 00:41:15,988:s2s_autoencoder:INFO: 

---------------------------------------------------------------------

EPOCH RESULT: 1

dev/loss: 164,424.4854
grad_by_wts/mean: 8.1475
train/loss: 165,269.0688
dev/evaluation/compute_accuracy: 0.0096
grad_by_wts/std: 2,365.9916
train/evaluation/compute_accuracy: 0.0096

---------------------------------------------------------------------


2018-03-17 00:41:15,989:s2s_autoencoder:INFO: Saving model to `s2s.pkl`...
2018-03-17 00:42:14,313:s2s_autoencoder:INFO: Saved model to `s2s.pkl`.


In [None]:
# Round-trip check: encode one input with the model, then decode the result.
# NOTE(review): `train_sents` is not defined in any cell visible in this
# notebook (presumably left over from an earlier session) — define it or
# derive it from `split_data.train` before running on a fresh kernel.
s2s_ae.decode_sentences(
    s2s_ae.encode_sentences(sents=train_sents[0])
)

In [None]:
# Scratch check: wrap a small 2x4 LongTensor with `cudable_variable`.
# NOTE(review): `cudable_variable` is neither defined nor imported in the
# cells visible here; presumably it lives in one of the project modules —
# confirm and reference it through that module.
cudable_variable(
    tc.LongTensor([[1,2,3,5], [23,3,54,5]]),
    requires_grad=False,
)