In [1]:
%matplotlib inline

In [2]:
from __future__ import division
from __future__ import unicode_literals

import logging
import os

import numpy as np
import pandas as pd
import torch as tc
import torch.utils.data as tc_data
import tqdm

import preprocessing
import model_utils as mu
import s2s_autoencoder as s2s
import tensorboard as tb

from pprint import pprint
from matplotlib import pyplot as plt
from IPython.display import display, HTML


# Plot styling shared by every figure in this notebook.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (10, 5)

# Let pandas show wider cells / more columns before truncating.
pd.options.display.max_colwidth = 75
pd.options.display.width = 150
pd.options.display.max_columns = 40

# Optional table stylesheet. Try each known location in order and use the
# first one that can be read; the files are opened with `with` so the handle
# is always closed. The stylesheet is purely cosmetic, so when none of the
# locations is readable the notebook carries on with an empty <style> block
# instead of aborting.
css = ''
for css_path in ('/indocker/notebooks/style-table.css',
                 '/nail/home/visp/ipython/style-table.css'):
    try:
        with open(css_path) as css_file:
            css = css_file.read()
        break
    except IOError:
        continue

# Last expression of the cell: rendering it injects the stylesheet.
HTML('<style>{}</style>'.format(css))

is_cuda: False


In [3]:
%load_ext autoreload
%autoreload 2

## Setup logging

In [4]:
# Configure the root logger so that DEBUG-and-above records are echoed into
# the notebook output.
log = logging.getLogger()
log.setLevel(logging.DEBUG)

# Make the cell idempotent: drop handlers left over from a previous run so
# each record is printed once. This replaces `reload(logging)`, which is not
# a builtin on Python 3 and rebuilds the root logger, so loggers created
# earlier by already-imported modules would no longer reach this handler.
for stale_handler in list(log.handlers):
    log.removeHandler(stale_handler)

myhandler = logging.StreamHandler()
myformatter = logging.Formatter(fmt='%(asctime)s:%(name)s:%(levelname)s: %(message)s')
myhandler.setFormatter(myformatter)
log.addHandler(myhandler)

**The review dataset contains around 4 million reviews.**

## Constants

In [10]:
# Input files (relative paths).
W2V_FILE = 'data/glove.6B/glove.6B.50d-min.txt'  # word-vector text file
REVIEW_FL = 'data/yelpds11/review.json'          # review records

# Data-size knobs handed to the loading / model cells below.
MAX_LEN = 20    # passed as `max_len`
N_REVIEWS = 10  # passed as `n_reviews`

# Not referenced by any cell visible in this notebook.
NUM_CPUS = 4

## Load data

In [8]:
import codecs
import json

# Peek into the review data: read only the first line of the file, parse it
# as JSON and pretty-print the resulting record.
with codecs.open(REVIEW_FL, encoding='utf-8') as infile:
    # Built-in next() works on Python 2.6+ and Python 3, unlike the
    # Python-2-only `.next()` iterator method.
    pprint(json.loads(next(infile)))

{u'business_id': u'0W4lkclzZThpx3V65bVgig',
 u'cool': 0,
 u'date': u'2016-05-28',
 u'funny': 0,
 u'review_id': u'v0i_UHJMo_hPBq9bxWvW4w',
 u'stars': 5,
 u'text': u"Love the staff, love the meat, love the place. Prepare for a long line around lunch or dinner hours. \n\nThey ask you how you want you meat, lean or something maybe, I can't remember. Just say you don't want it too fatty. \n\nGet a half sour pickle and a hot pepper. Hand cut french fries too.",
 u'useful': 0,
 u'user_id': u'bv2nCi5Qv5vroFiqKGopiw'}


In [9]:
# Read the pre-trained word vectors from W2V_FILE; the result exposes the
# `lang` and `embedding_matrix` attributes used by the cells below.
w2v_data = preprocessing.load_w2v_data(
    path=W2V_FILE,
)

Loading embeddings: 100%|██████████| 9633/9633 [00:00<00:00, 58931.07it/s]
  size=n_special_tokens,


In [11]:
# Load the reviews and split them; `split_data` exposes at least the
# `.train` and `.dev` frames used later in this notebook.
split_data = preprocessing.load_and_split_reviews(
    review_fl=REVIEW_FL,
    n_reviews=N_REVIEWS,
    lang=w2v_data.lang,
    min_len=5,
    max_len=MAX_LEN,
    dev_frac=0.2,
    test_frac=0.2,
)

# Preview the first two training rows.
split_data.train.head(2)




Unnamed: 0,review_id,sent_num,sent_len,ixs,tokens,text
0,8UIishPUD92hXtScSga_gw,0,11,"[0, 632, 2420, 327, 39, 2565, 161, 9, 5067, 61, 24, 6, 1, 2, 2, 2, 2, 2...","[<SOS>, always, drove, past, this, coffee, house, and, wondered, about,...",Always drove past this coffee house and wondered about it .
1,8UIishPUD92hXtScSga_gw,1,14,"[0, 7877, 9, 43, 1078, 114, 4, 755, 8, 759, 39, 233, 68, 6, 1, 2, 2, 2,...","[<SOS>, bf, and, i, finally, made, the, stop, to, try, this, place, out...",BF and I finally made the stop to try this place out . \n\n


In [12]:
# Summary statistics of the `sent_len` column of the training frame.
split_data.train['sent_len'].describe()

count    32.00000
mean     14.93750
std       5.14899
min       6.00000
25%      10.75000
50%      15.50000
75%      20.00000
max      20.00000
Name: sent_len, dtype: float64

## Training

In [13]:
# NOTE(review): clear_expts() presumably removes previously recorded
# experiments before a fresh one is created — confirm against the local
# `tensorboard` helper module, since that would discard earlier runs.
tb.clear_expts()
# Experiment handle; handed to the training call below as `tb_expt`.
crayon_expt = tb.get_experiment(name='pytorch_ae')

### Create model

In [None]:
# Build the sequence-to-sequence auto-encoder. The embedding and vocabulary
# sizes are read off the loaded embedding matrix so they always agree with
# the vectors that are passed in.
s2s_ae = s2s.Seq2SeqAutoEncoder(
    lang=w2v_data.lang,
    config=s2s.Seq2SeqConfig(
        max_len=MAX_LEN,
        embed_dim=w2v_data.embedding_matrix.shape[1],
        hidden_dim=100,
        vocab_size=w2v_data.embedding_matrix.shape[0],
        bidirectional=True,
        num_layers=1,
        lr=0.001,
    ),
    embedding_matrix=w2v_data.embedding_matrix,
    verbose=True,
)

# Only move the model to the GPU when one is actually usable; an
# unconditional .cuda() raises on CPU-only machines (the import cell's
# captured output reports `is_cuda: False`).
if tc.cuda.is_available():
    s2s_ae = s2s_ae.cuda()

# Last expression of the cell, so its return value is displayed.
# NOTE(review): presumably torch's Module.share_memory() — confirm.
s2s_ae.share_memory()

### Actually train

In [None]:
# Small smoke-test slices of the train / dev frames.
train_subset = split_data.train[:10]
dev_subset = split_data.dev[:10]

# With drop_last=True only complete batches are kept, so a batch size larger
# than the slices would leave nothing to iterate over (assuming
# to_dataloader forwards these arguments to torch's DataLoader — TODO
# confirm). Cap the batch size to the smaller slice, keeping it at least 1.
batch_size = max(1, min(1024, len(train_subset), len(dev_subset)))

dataloader = mu.to_dataloader(
    train_sent_df=train_subset,
    dev_sent_df=dev_subset,
    batch_size=batch_size,
    num_workers=0,
    drop_last=True,
)

In [None]:
# Training options, collected in one place so they are easy to tweak.
fit_options = dict(
    s2s_model=s2s_ae,
    dataloader=dataloader,
    epochs=10,
    teacher_forcing=0.5,
    save_epoch_freq=1,
    model_save_path='s2s_2.pkl',
    tb_expt=crayon_expt,
)

# Run training; as the last expression, any return value is displayed.
s2s.s2s_fit(**fit_options)


In [None]:
# Grab the first batch for manual inspection. next(iter(...)) raises
# StopIteration right here when the loader yields nothing, instead of
# silently leaving `d` undefined (or stale from an earlier run) for the
# cells below.
d = next(iter(dataloader))

In [None]:
# Run one forward pass on the training part of the sampled batch, then show
# the sentences produced by the decoder.
fwd_result = s2s_ae.forward(batch_sent_ixs=d['train'])
fwd_result.dec_fwd.output_sents

In [None]:
# Encode sentences with the trained auto-encoder.
# NOTE(review): `train_tkns` is not defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel — define it
# explicitly (or replace it with the intended input) before relying on it.
encd = s2s_ae.encode_sentences(
    sents=train_tkns,
)

In [None]:
# Decode the encoder output (`encd`) back through the auto-encoder.
x = s2s_ae.decode_sentences(
    enc_hiddens=encd,
)

In [None]:
# Display the value of `x` (assigned from decode_sentences in the cell above).
x

In [None]:
# Scratch check: wrap a small 2x4 LongTensor with the helper.
# NOTE(review): `cudable_variable` is neither defined nor imported in any
# cell visible in this notebook (presumably it lives in one of the local
# modules — confirm and reference it through that module), so this cell
# raises NameError on a fresh kernel.
cudable_variable(
    tc.LongTensor([[1,2,3,5], [23,3,54,5]]),
    requires_grad=False,
)