In [1]:
%matplotlib inline

In [2]:
from __future__ import division
from __future__ import unicode_literals

import logging
import os

import numpy as np
import pandas as pd
import torch as tc
import torch.utils.data as tc_data
import tqdm

import preprocessing
import model_utils as mu
import s2s_autoencoder as s2s
import tensorboard as tb

from pprint import pprint
from matplotlib import pyplot as plt
from IPython.display import display, HTML


# ---- Notebook-wide configuration ---------------------------------------------
# Seed the numpy and torch RNGs so that "Restart & Run All" gives repeatable
# numbers for the stochastic steps further down.
# NOTE(review): if the project modules draw from Python's `random` module,
# that needs seeding as well -- confirm.
SEED = 42
np.random.seed(SEED)
tc.manual_seed(SEED)

# Plot appearance.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (10, 5)

# Make DataFrame previews wide enough to be readable.
pd.options.display.max_colwidth = 75
pd.options.display.width = 150
pd.options.display.max_columns = 40

# Optional table styling. The stylesheet lives in one user's home directory, so
# it is treated as purely cosmetic: when it is missing the notebook still runs,
# just with the default table look.
CSS_PATH = '/nail/home/visp/ipython/style-table.css'
if os.path.isfile(CSS_PATH):
    # `with` closes the handle; a bare open(...).read() leaves it open.
    with open(CSS_PATH) as css_file:
        css = css_file.read()
else:
    css = ''
# Last expression of the cell, so the <style> element is rendered as its output.
HTML('<style>{}</style>'.format(css))

is_cuda: False


In [3]:
#%load_ext autoreload
#%autoreload 2

## Setup logging

In [4]:
# Re-execute the logging module first, presumably so that re-running this cell
# starts from a fresh root logger instead of stacking another handler on it.
reload(logging)

# Stream handler that prefixes every record with time, logger name and level.
myformatter = logging.Formatter(
    fmt='%(asctime)s:%(name)s:%(levelname)s: %(message)s',
)
myhandler = logging.StreamHandler()
myhandler.setFormatter(myformatter)

# Root logger: attach the handler and let everything from DEBUG upwards through.
log = logging.getLogger()
log.addHandler(myhandler)
log.setLevel(logging.DEBUG)

In [5]:
!tree 

.
├── PyTorch-Tutorials-EncDec.ipynb
├── PyTorch-Tutorials.ipynb
├── Reviews_AutoEncoder-Pytorch-v2.ipynb
├── Reviews_AutoEncoder-Pytorch-v3.ipynb
├── Reviews_AutoEncoder-Pytorch.ipynb
├── Reviews_AutoEncoder_Keras.ipynb
├── data
│   ├── glove.6B
│   │   ├── glove.6B.100d.txt
│   │   ├── glove.6B.200d.txt
│   │   ├── glove.6B.50d-min.txt
│   │   └── glove.6B.50d.txt
│   └── yelpds11
│       ├── Dataset_Challenge_Dataset_Agreement.pdf
│       ├── Yelp_Dataset_Challenge_Round_11.pdf
│       ├── business.json
│       ├── checkin.json
│       ├── photos.json
│       ├── review.json
│       ├── tip.json
│       └── user.json
├── decoder.py
├── decoder.pyc
├── embedder.py
├── encoder.py
├── encoder.pyc
├── extract_review_text.py
├── model_utils.py
├── model_utils.pyc
├── most-common-words.ipynb
├── neural-model-report.pdf
├── preprocessing.py
├── preprocessing.pyc
├── s2s.pkl
├── s2s_autoencoder.py
├── s2s_autoencoder.pyc
├── tensorboard.py
├── tensorboa

**So we have around 4 million reviews**

## Constants

In [7]:
# Input files, relative to the notebook's working directory.
W2V_FILE = 'data/glove.6B/glove.6B.50d-min.txt'  # read by preprocessing.load_w2v_data
REVIEW_FL = 'data/yelpds11/review.json'          # read by the peek cell and load_and_split_reviews

# Size limits forwarded to the loader and the model config.
MAX_LEN = 20    # passed as max_len to load_and_split_reviews and Seq2SeqConfig
N_REVIEWS = 10  # passed as n_reviews to load_and_split_reviews

## Load data

In [8]:
import codecs
import json
# Peek into the review data: parse only the first line of the file as JSON
# and pretty-print the resulting record.
with codecs.open(REVIEW_FL, encoding='utf-8') as infile:
    # The builtin next() works on both Python 2.6+ and Python 3, whereas the
    # `.next()` method only exists on Python 2 file objects.
    pprint(json.loads(next(infile)))

{u'business_id': u'0W4lkclzZThpx3V65bVgig',
 u'cool': 0,
 u'date': u'2016-05-28',
 u'funny': 0,
 u'review_id': u'v0i_UHJMo_hPBq9bxWvW4w',
 u'stars': 5,
 u'text': u"Love the staff, love the meat, love the place. Prepare for a long line around lunch or dinner hours. \n\nThey ask you how you want you meat, lean or something maybe, I can't remember. Just say you don't want it too fatty. \n\nGet a half sour pickle and a hot pepper. Hand cut french fries too.",
 u'useful': 0,
 u'user_id': u'bv2nCi5Qv5vroFiqKGopiw'}


In [10]:
# Load the embeddings stored at W2V_FILE. Later cells read `w2v_data.lang`
# (passed to the review loader and the model) and `w2v_data.embedding_matrix`
# (its two shape entries give the model's vocab_size and embed_dim).
w2v_data = preprocessing.load_w2v_data(path=W2V_FILE)

Loading embeddings: 100%|██████████| 9633/9633 [00:00<00:00, 55397.05it/s]
  size=n_special_tokens,


In [11]:
# Read up to N_REVIEWS reviews and split them into train / dev / test frames
# (only .train and .dev are used in this notebook).
split_data = preprocessing.load_and_split_reviews(
    review_fl=REVIEW_FL,
    lang=w2v_data.lang,
    n_reviews=N_REVIEWS,
    min_len=5,
    max_len=MAX_LEN,
    dev_frac=0.2,
    test_frac=0.2,
)

# Take the two-row preview once: log its plain-text repr, then leave it as the
# cell's last expression so the notebook also renders it as a table.
train_preview = split_data.train.head(2)
log.info('Peek into the loaded dataset: `{df}`'.format(df=train_preview))
train_preview




2018-03-16 06:33:49,523:root:INFO: Peek into the loaded dataset: `                review_id  sent_num  sent_len                                                                         ixs  \
0  8UIishPUD92hXtScSga_gw         0        11  [0, 632, 2420, 327, 39, 2565, 161, 9, 5067, 61, 24, 6, 1, 2, 2, 2, 2, 2...   
1  8UIishPUD92hXtScSga_gw         1        14  [0, 7877, 9, 43, 1078, 114, 4, 755, 8, 759, 39, 233, 68, 6, 1, 2, 2, 2,...   

                                                                       tokens                                                         text  
0  [<SOS>, always, drove, past, this, coffee, house, and, wondered, about,...  Always drove past this coffee house and wondered about it .  
1  [<SOS>, bf, and, i, finally, made, the, stop, to, try, this, place, out...  BF and I finally made the stop to try this place out . \n\n  `


Unnamed: 0,review_id,sent_num,sent_len,ixs,tokens,text
0,8UIishPUD92hXtScSga_gw,0,11,"[0, 632, 2420, 327, 39, 2565, 161, 9, 5067, 61, 24, 6, 1, 2, 2, 2, 2, 2...","[<SOS>, always, drove, past, this, coffee, house, and, wondered, about,...",Always drove past this coffee house and wondered about it .
1,8UIishPUD92hXtScSga_gw,1,14,"[0, 7877, 9, 43, 1078, 114, 4, 755, 8, 759, 39, 233, 68, 6, 1, 2, 2, 2,...","[<SOS>, bf, and, i, finally, made, the, stop, to, try, this, place, out...",BF and I finally made the stop to try this place out . \n\n


In [12]:
# Summary statistics of the `sent_len` column in the training split.
split_data.train['sent_len'].describe()

count    32.00000
mean     14.18750
std       4.99314
min       6.00000
25%      10.75000
50%      12.50000
75%      20.00000
max      20.00000
Name: sent_len, dtype: float64

## Training

In [13]:
# `tb` comes from `import tensorboard as tb`; a tensorboard.py sits next to this
# notebook (see the `!tree` listing), so this is presumably that project helper
# rather than the TensorBoard package -- confirm.
# NOTE(review): clear_expts() looks like it removes previously recorded
# experiments before a fresh run -- confirm this is intended if the backing
# store is shared with other people.
tb.clear_expts()
# Experiment handle; it is handed to s2s.s2s_fit(tb_expt=...) in the training cell.
crayon_expt = tb.get_experiment(name='pytorch_ae')

### Create model

In [14]:
# The vocabulary size and embedding width are read off the loaded embedding
# matrix, so the model config always agrees with the embeddings in use.
pretrained_embeddings = w2v_data.embedding_matrix
s2s_config = s2s.Seq2SeqConfig(
    max_len=MAX_LEN,
    vocab_size=pretrained_embeddings.shape[0],
    embed_dim=pretrained_embeddings.shape[1],
    hidden_dim=40,
    num_layers=1,
    bidirectional=True,
    lr=0.001,
)

# Build the sequence-to-sequence autoencoder from the config and the embeddings.
s2s_ae = s2s.Seq2SeqAutoEncoder(
    lang=w2v_data.lang,
    config=s2s_config,
    embedding_matrix=pretrained_embeddings,
    verbose=True,
)

### Train the model

In [15]:
# Wrap the train and dev sentence frames for batched iteration; the result is
# passed to s2s.s2s_fit(dataloader=...) in the next cell.
# NOTE(review): the training split described above has only 32 rows, so
# batch_size=10 yields just a few batches. num_workers=4 is presumably the
# number of loader worker processes -- confirm in model_utils.
dataloader = mu.to_dataloader(
    train_sent_df=split_data.train,
    dev_sent_df=split_data.dev,
    batch_size=10,
    num_workers=4,
)

In [16]:
# Train the autoencoder. The call is the cell's last expression, so its return
# value is displayed: it appears to be a list with one entry per epoch holding
# gradient statistics ('grad_by_wts') and per-split loss / accuracy
# ('split_metrics_map') -- confirm against s2s_autoencoder.
# NOTE(review): save_epoch_freq=100 is larger than epochs=3. If it means
# "save every N epochs", this run may never write a checkpoint to 's2s.pkl'
# -- confirm.
# NOTE(review): teacher_forcing=0.5 is presumably the teacher-forcing ratio
# used during decoding -- confirm.
s2s.s2s_fit(
    s2s_model=s2s_ae,
    dataloader=dataloader,
    save_epoch_freq=100,
    model_save_path='s2s.pkl',
    epochs=3,
    teacher_forcing=0.5,
    tb_expt=crayon_expt,
)




[{'grad_by_wts': {'mean': 0.012478409655854991, 'std': 4.8680703734682691},
  'split_metrics_map': {'dev': {'evaluation/compute_accuracy': 0.0,
    'loss': 293.7275085449219},
   'train': {'evaluation/compute_accuracy': 0.0, 'loss': 293.5926742553711}}},
 {'grad_by_wts': {'mean': 0.012139479214713816, 'std': 4.2570152124016323},
  'split_metrics_map': {'dev': {'evaluation/compute_accuracy': 0.0,
    'loss': 293.3723945617676},
   'train': {'evaluation/compute_accuracy': 0.0, 'loss': 292.71355056762695}}},
 {'grad_by_wts': {'mean': 0.0091891739237921211, 'std': 1.0752866564693626},
  'split_metrics_map': {'dev': {'evaluation/compute_accuracy': 0.0,
    'loss': 293.1985454559326},
   'train': {'evaluation/compute_accuracy': 0.0, 'loss': 291.97930908203125}}}]

In [None]:
# Round trip: encode an input with the trained model, then decode it back.
# NOTE(review): `train_sents` is not defined in any cell visible in this
# notebook, and this cell has no execution count (In [None]). On a fresh kernel
# it would presumably fail with a NameError.
# TODO: define `train_sents` (presumably from split_data.train) or remove the cell.
s2s_ae.decode_sentences(
    s2s_ae.encode_sentences(sents=train_sents[0])
)