# im2latex

&copy; Copyright 2017 - 2018 Sumeet S Singh

    This file is part of im2latex solution by Sumeet S Singh.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the Affero GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    Affero GNU General Public License for more details.

    You should have received a copy of the Affero GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

----
## Create dataset files for inferencing

This notebook creates data files that can be used for inferencing.

-----

In [1]:
import os
import numpy as np
from IPython.display import display, Image as ipImage
import PIL
from PIL import Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves import cPickle as pickle
from mpl_toolkits.axes_grid1 import ImageGrid
import pandas as pd
import matplotlib.pyplot as plt

# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline

In [2]:
# Display settings for dataframes rendered in this notebook.
# The row limit is left at the pandas default (a max_rows = 600 override is disabled).
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 160)

### Notebook Arguments
Create a new 'raw_data_dir' - e.g. "inferencing" - in your data_dir (alongside the folders step1, step2 etc.). Copy the image files whose LaTeX you want to generate into the formula_images folder. Next, supply the names of the image files below and modify any other arguments as needed. For inferencing, execute run.py with the --test flag and with the --raw-data-folder option pointing to the above-mentioned folder.

You'll need to supply the correct per-gpu batch-size value (-b flag) to run.py. The overall batch-size HYPER_batch_size = per-gpu-batch-size * num_gpus. HYPER_batch_size is set to the number of images by default below, which means that when you execute run.py, you need to supply a per-gpu batch-size (using the -b option) equal to HYPER_batch_size/num_gpus or an integral factor of it. For example, if you had 50 images to infer and 1 GPU, you could supply "-b 50", "-b 25", "-b 10" etc. If you had 2 GPUs, you could specify "-b 25" or "-b 5" instead. You can also set HYPER_batch_size to a value smaller than len(images), as long as you ensure that it is a factor of len(images) - add, remove or duplicate images from/to your inferencing set to get a suitable total size. For example, if you had 2 GPUs and 998 images to infer, you could duplicate two images (giving them different names) to get a total of 1000 and then set HYPER_batch_size = 50 and supply the "-b 25" option to run.py.

Finally, after you're convinced that everything is working, switch the flag "dump" below to True and rerun the notebook. The flag defaults to False, meaning that it won't write files to disk.

In [3]:
# File names of the images to prepare for inferencing (looked up inside image_dir).
images = [
    '0000a586456794e_basic.png',
    '0000a8416b30429_basic.png',
    '0000ca7c3d3830b_basic.png',
    '00021a10c3d0ffc_basic.png',
]
# Overall batch size; by default a single batch holds every image.
HYPER_batch_size = len(images)

# Dataset root. Both spellings (*_dir / *_folder) are defined as aliases.
data_dir = '../data/dataset5'
data_folder = data_dir
image_dir = os.path.join(data_folder, 'formula_images')
image_folder = image_dir

# Output directory; its name carries the batch size as a suffix.
raw_data_dir = os.path.join(data_folder, 'inferencing_%d'%HYPER_batch_size)

# Nothing is written to disk while this is False.
dump = False

In [4]:
# Show the directory the generated files are written to when dump is True.
print('Output Dir = %s' % (raw_data_dir,))

Output Dir = ../data/dataset5/inferencing_4


In [5]:
# Column order shared by the train / valid / test dataframes.
df_columns = ['image', 'height', 'width', 'word2id_len', 'bin_len', 'word2id', 'padded_seq',
              'padded_seq_len', 'seq_len', 'squashed_len', 'squashed_seq']
data_dict = {column: [] for column in df_columns}

# Only the test split is populated in this notebook: train and valid are built
# from the still-empty data_dict and therefore have no rows.
df_train_squashed = pd.DataFrame(data_dict)[df_columns]
df_valid_squashed = pd.DataFrame(data_dict)[df_columns]

# Placeholder token sequence attached to every image: dummy_len ones, with a
# single trailing 0 in the padded form.
# NOTE(review): presumably 0 is the end-of-sequence / padding id - confirm against the vocabulary.
dummy_len = 150
word2id = [1] * dummy_len
padded_seq = word2id + [0]
padded_len = dummy_len + 1

for image_name in images:
    # Image.open keeps the underlying file open; the with-block releases the
    # handle as soon as the dimensions are read, so large image sets do not
    # exhaust file descriptors.
    with Image.open(os.path.join(image_folder, image_name)) as im:
        w, h = im.size
    data_dict['image'].append(image_name)
    data_dict['height'].append(h)
    data_dict['width'].append(w)
    data_dict['word2id_len'].append(dummy_len)
    # Every length column except word2id_len carries the padded length.
    data_dict['bin_len'].append(padded_len)
    data_dict['word2id'].append(word2id)
    data_dict['padded_seq'].append(padded_seq)
    data_dict['padded_seq_len'].append(padded_len)
    data_dict['seq_len'].append(padded_len)
    data_dict['squashed_len'].append(padded_len)
    data_dict['squashed_seq'].append(padded_seq)

df_test_squashed = pd.DataFrame(data_dict)[df_columns]
df_test_squashed

Unnamed: 0,image,height,width,word2id_len,bin_len,word2id,padded_seq,padded_seq_len,seq_len,squashed_len,squashed_seq
0,0000a586456794e_basic.png,82,615,150,151,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",151,151,151,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
1,0000a8416b30429_basic.png,30,264,150,151,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",151,151,151,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
2,0000ca7c3d3830b_basic.png,78,360,150,151,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",151,151,151,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
3,00021a10c3d0ffc_basic.png,79,882,150,151,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",151,151,151,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."


In [6]:
df_train_squashed

Unnamed: 0,image,height,width,word2id_len,bin_len,word2id,padded_seq,padded_seq_len,seq_len,squashed_len,squashed_seq


In [7]:
def make_seq_bins(df_):
    """
    Group the token sequences of a df_*_squashed / df_*_padded frame by length.

    For every distinct value in df_.bin_len, the rows whose padded_seq_len
    equals that value are selected and their padded_seq and squashed_seq
    columns are each stacked into an int32 ndarray, which is then wrapped
    in a DataFrame that keeps the selected rows' index.

    Returns:
        (bins, bins_squashed): two dicts keyed by bin length, holding the
        padded_seq frames and the squashed_seq frames respectively. Both are
        empty when df_ has no rows.

    Raises:
        AssertionError: if the stacked padded_seq array does not have one row
        per selected dataframe row and exactly bin-length columns.
    """
    padded_by_len = {}
    squashed_by_len = {}

    for bin_len in df_.bin_len.unique():
        rows = df_[df_.padded_seq_len == bin_len]
        padded = np.array(rows.padded_seq.values.tolist(), dtype=np.int32)
        squashed = np.array(rows.squashed_seq.values.tolist(), dtype=np.int32)

        # Sanity checks on the padded array only (the squashed one is not checked).
        assert padded.shape[1] == bin_len
        assert padded.shape[0] == rows.shape[0]

        padded_by_len[bin_len] = pd.DataFrame(padded, index=rows.index)
        squashed_by_len[bin_len] = pd.DataFrame(squashed, index=rows.index)

    return padded_by_len, squashed_by_len

In [8]:
# Build the (padded, squashed) sequence bins for each split.
# The train and valid frames have no rows, so their bins come back as empty dicts.
bins_train, bins_sq_train = make_seq_bins(df_train_squashed)
bins_valid, bins_sq_valid = make_seq_bins(df_valid_squashed)
bins_test, bins_sq_test = make_seq_bins(df_test_squashed)

### Persist to Disk

In [9]:
if dump:
    # Create the output directory on first use.
    if not os.path.exists(raw_data_dir):
        os.makedirs(raw_data_dir)

    # Write the batch size, then read it back as a sanity check.
    batch_size_path = os.path.join(raw_data_dir, 'batch_size.pkl')
    with open(batch_size_path, 'wb') as fh:
        pickle.dump(HYPER_batch_size, fh, pickle.HIGHEST_PROTOCOL)
    with open(batch_size_path, 'rb') as fh:
        assert pickle.load(fh) == HYPER_batch_size

    # One pickled dataframe per split.
    for file_name, frame in [
        ('df_train.pkl', df_train_squashed),
        ('df_test.pkl', df_test_squashed),
        ('df_valid.pkl', df_valid_squashed),
    ]:
        frame.to_pickle(os.path.join(raw_data_dir, file_name))

    # Two pickled bin dictionaries (padded and squashed) per split.
    for file_name, seq_bins in [
        ('raw_seq_train.pkl', bins_train),
        ('raw_seq_sq_train.pkl', bins_sq_train),
        ('raw_seq_test.pkl', bins_test),
        ('raw_seq_sq_test.pkl', bins_sq_test),
        ('raw_seq_valid.pkl', bins_valid),
        ('raw_seq_sq_valid.pkl', bins_sq_valid),
    ]:
        with open(os.path.join(raw_data_dir, file_name), 'wb') as fh:
            pickle.dump(seq_bins, fh, pickle.HIGHEST_PROTOCOL)

    # Vocabulary and image-size properties are read from the step2 / step3
    # outputs under data_dir and bundled into data_props.pkl.
    dict_vocab = pd.read_pickle(os.path.join(data_dir, 'step2', 'dict_vocab.pkl'))
    word2id = dict_vocab['id']
    id2word = pd.read_pickle(os.path.join(data_dir, 'step2', 'dict_id2word.pkl'))
    data_props = {
        'id2word': id2word,
        'word2id': word2id,
        # One more than the largest token id.
        'K': max(id2word.keys()) + 1,
        # None when the vocabulary has no space token.
        'SpaceTokenID': word2id[' '] if ' ' in word2id else None,
        'NullTokenID': word2id[r'\eos'],
        'StartTokenID': word2id[r'\bos'],
        'MaxSeqLen': df_test_squashed.padded_seq_len.max(),
    }
    padded_image_dim = pd.read_pickle(os.path.join(data_dir, 'step3', 'padded_image_dim.pkl'))
    data_props['padded_image_dim'] = {dim: padded_image_dim[dim] for dim in ('height', 'width')}

    with open(os.path.join(raw_data_dir, 'data_props.pkl'), 'wb') as fh:
        pickle.dump(data_props, fh, pickle.HIGHEST_PROTOCOL)

### Running an inferencing cycle
After you've generated the dataset as above and have a trained model snapshot, execute run.py using the --test flag, with --raw-data-folder pointing to the above-mentioned directory and the model snapshot supplied via the --restore option. Also remember to adjust the per-gpu batch-size (-b flag) appropriately, e.g.: ./run.py -a 0.0001 -e -1 -b 50 -v -1 -i 2 --r-lambda 0.00005 --raw-data-folder ../data/dataset5/inferencing_100 --test --save-all-eval --restore "./tb_metrics/2017-12-25 21-04-15 PST 140K_noRegroup_score89.09/test_runs/step_00167526_temp".

See the notebook visualize.ipynb for examples of how to extract the predictions from the underlying h5py files.

# END