# im2latex(S): Tokenizer

&copy; Copyright 2017 Sumeet S Singh

    This file is part of im2latex solution by Sumeet S Singh.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the Affero GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    Affero GNU General Public License for more details.

    You should have received a copy of the Affero GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

In [1]:
from IPython.display import display, HTML, Math, Image, Latex
# Inject a CSS rule so the notebook's .container element is rendered 70% wide.
display(HTML("<style>.container { width:70% !important; }</style>"))

In [2]:
import pandas as pd
import os
import re
import codecs
from six.moves import cPickle as pickle
import string
from PIL import Image

In [3]:
# Raise pandas' display limits for rows, columns, cell width and total line width.
for _opt_name, _opt_value in (
        ('display.max_rows', 999),
        ('display.max_columns', 20),
        ('display.max_colwidth', 100),
        ('display.width', 160)):
    pd.set_option(_opt_name, _opt_value)

In [4]:
# Root folder of the dataset (both names refer to the same path).
data_dir = '../data/dataset5'
data_folder = data_dir
# Folder holding the rendered formula images (both names refer to the same path).
image_dir = os.path.join(data_dir, 'formula_images')
image_folder = image_dir
# Name of the sub-folder (joined onto data_dir) that this notebook reads and writes its pickles in.
output_dir = 'step2'
# set this to True to dump data to disk, otherwise set it to False.
dump = True

#### Character Set Chart for Reference
##### ASCII Control Characters

                        CTRL   (^D means to hold the CTRL key and hit d)
    Oct  Dec Char  Hex  Key     Comments
    \000   0  NUL  \x00  ^@ \0 (Null byte)
    \001   1  SOH  \x01  ^A    (Start of heading)
    \002   2  STX  \x02  ^B    (Start of text)
    \003   3  ETX  \x03  ^C    (End of text) (see: UNIX keyboard CTRL)
    \004   4  EOT  \x04  ^D    (End of transmission) (see: UNIX keyboard CTRL)
    \005   5  ENQ  \x05  ^E    (Enquiry)
    \006   6  ACK  \x06  ^F    (Acknowledge)
    \007   7  BEL  \x07  ^G    (Ring terminal bell)
    \010   8   BS  \x08  ^H \b (Backspace)  (\b matches backspace inside [] only)
                                            (see: UNIX keyboard CTRL)
    \011   9   HT  \x09  ^I \t (Horizontal tab)
    \012  10   LF  \x0A  ^J \n (Line feed)  (Default UNIX NL) (see End of Line below)
    \013  11   VT  \x0B  ^K    (Vertical tab)
    \014  12   FF  \x0C  ^L \f (Form feed)
    \015  13   CR  \x0D  ^M \r (Carriage return)  (see: End of Line below)
    \016  14   SO  \x0E  ^N    (Shift out)
    \017  15   SI  \x0F  ^O    (Shift in)
    \020  16  DLE  \x10  ^P    (Data link escape)
    \021  17  DC1  \x11  ^Q    (Device control 1) (XON) (Default UNIX START char.)
    \022  18  DC2  \x12  ^R    (Device control 2)
    \023  19  DC3  \x13  ^S    (Device control 3) (XOFF)  (Default UNIX STOP char.)
    \024  20  DC4  \x14  ^T    (Device control 4)
    \025  21  NAK  \x15  ^U    (Negative acknowledge)  (see: UNIX keyboard CTRL)
    \026  22  SYN  \x16  ^V    (Synchronous idle)
    \027  23  ETB  \x17  ^W    (End of transmission block)
    \030  24  CAN  \x18  ^X    (Cancel)
    \031  25  EM   \x19  ^Y    (End of medium)
    \032  26  SUB  \x1A  ^Z    (Substitute character)
    \033  27  ESC  \x1B  ^[    (Escape)
    \034  28  FS   \x1C  ^\    (File separator, Information separator four)
    \035  29  GS   \x1D  ^]    (Group separator, Information separator three)
    \036  30  RS   \x1E  ^^    (Record separator, Information separator two)
    \037  31  US   \x1F  ^_    (Unit separator, Information separator one)
    \177 127  DEL  \x7F  ^?    (Delete)  (see: UNIX keyboard CTRL)
    
    string.printable = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'
    string.whitespace = '\t\n\x0b\x0c\r '

### Collect data into a data-frame

In [5]:
def makeDatasetDetails(data_dir):
    """Build the dataset-details frame: the step-1 map plus image sizes and formula lengths.

    Reads ``step1/im2latex_dataset_map.df.pkl`` under ``data_dir``, opens every
    image named in its ``image`` column (looked up in the module-level
    ``image_folder``) and adds ``width``, ``height`` and ``formula_len``
    columns. When the module-level ``dump`` flag is set, the result is pickled
    to ``<data_dir>/<output_dir>/df1_dataset_details.pkl``.

    Args:
        data_dir: root folder of the dataset.

    Returns:
        The augmented pandas DataFrame.

    Raises:
        Exception: if ``dump`` is False and the output pickle already exists.
    """
    out_folder = os.path.join(data_dir, output_dir)
    pickle_path = os.path.join(out_folder, 'df1_dataset_details.pkl')
    # NOTE(review): this guard only fires when we are NOT dumping, i.e. an existing
    # pickle is silently overwritten when dump is True - confirm this is intended.
    if (not dump) and os.path.exists(pickle_path):
        raise Exception('File %s already exists'%pickle_path)

    widths = []
    heights = []
    formula_lens = []
    # step1/im2latex_dataset_map.df.pkl is generated by formula_list.py in the repo.
    # untrix/im2latex-dataset
    datasetDF = pd.read_pickle(os.path.join(data_dir, 'step1', 'im2latex_dataset_map.df.pkl'))
    for _, row in datasetDF.iterrows():
        # Close each image as soon as its size is read so file handles are not
        # left open while iterating over the whole dataset.
        with Image.open(os.path.join(image_folder, row.image)) as im:
            im_width, im_height = im.size[0], im.size[1]
        widths.append(im_width)
        heights.append(im_height)
        formula_lens.append(len(row.latex))
    print(len(widths), len(heights))
    datasetDF = datasetDF.assign(width=widths, height=heights, formula_len=formula_lens)
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)
    if dump:
        datasetDF.to_pickle(pickle_path)
    return datasetDF
    
def getDatasetDetails(data_dir):
    """Return the dataset-details frame, loading the cached pickle when possible.

    Tries to read ``<data_dir>/<output_dir>/df1_dataset_details.pkl``; if that
    fails for any ordinary error the frame is rebuilt with
    ``makeDatasetDetails``. The frame's shape is printed and its first row is
    displayed before returning.

    Args:
        data_dir: root folder of the dataset.

    Returns:
        The dataset-details pandas DataFrame.
    """
    try:
        df = pd.read_pickle(os.path.join(data_dir, output_dir, 'df1_dataset_details.pkl'))
    except Exception:
        # Deliberately broad (missing or unreadable pickle both mean "rebuild"), but
        # unlike a bare except this lets KeyboardInterrupt / SystemExit propagate.
        df = makeDatasetDetails(data_dir)
    print(df.shape)
    display(df.iloc[:1])
    return df

In [6]:
# df1_dataset_details = makeDatasetDetails(data_dir)
# Load the cached details frame; getDatasetDetails() rebuilds it when the pickle cannot be read.
df1_dataset_details = getDatasetDetails(data_dir)

(154944, 6)


Unnamed: 0,image,formula_name,latex,formula_len,height,width
0,23be72ded29e9b4_basic.png,23be72ded29e9b4_basic,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,331,78,738


### Remove Duplicate Samples
Some formulas hash to the same image name. Only one image was created for each such group, so these rows are treated as duplicates and all but one formula per image is discarded.

TODO: Perhaps we should altogether remove all samples mapping to the same hash-value since they may be different formulas and may not generate the same picture although the hash value was the same.

In [7]:
def count_dupe_images(df_):
    """Print and return the number of image names that occur on more than one row.

    Rows are counted per image through the ``latex`` column of the grouped frame.
    """
    per_image = df_.groupby('image').count()
    num_dupes = int((per_image.latex > 1).sum())
    print(num_dupes)
    return num_dupes

def discard_dupe_images(df_):
    """Return only the rows whose image name occurs exactly once (all copies of a duplicate are dropped)."""
    per_image = df_.groupby('image').count()
    duplicated_names = per_image.loc[per_image.latex > 1].index.values
    is_duplicate = df_.image.isin(duplicated_names)
    return df_[~is_duplicate]

def unintersect_images(df1_, df2_):
    """Return the rows of ``df2_`` whose image name does not appear in ``df1_``.

    Also prints how many image names the two frames share.
    """
    shared = set(df1_.image).intersection(set(df2_.image))
    print('%d images are intersecting'%len(shared))
    in_both = df2_.image.isin(shared)
    return df2_[~in_both]

def assert_dupes_are_identical(df_):
    """Assert that all rows sharing an image name carry the same ``latex`` string.

    Args:
        df_: frame with ``image`` and ``latex`` columns.

    Raises:
        AssertionError: if two rows with the same image have different latex;
            the message shows the two differing formulas.
    """
    d_c = df_.groupby('image').count()
    dupe_images = d_c[d_c.latex>1].index.values
    for image in dupe_images:
        d_ = df_[df_.image == image]
        latex = d_.latex.iloc[0]
        for other in d_.latex:
            # The message previously referenced an undefined name, which turned a
            # genuine mismatch into a NameError instead of an AssertionError.
            assert other == latex, '%s \n!=\n %s'%(other, latex)

In [8]:
# Keep only the first row for every image name; later rows mapping to the same image are dropped.
df2_dataset_details = df1_dataset_details
num_samples = df2_dataset_details.shape[0]
num_unique_images = df2_dataset_details.image.unique().shape[0]
if num_samples != num_unique_images:
    final_shape = (num_unique_images,) + df2_dataset_details.shape[1:]
    print('Removing duplicates will reduce num_samples from %d to %d ... '%(num_samples, num_unique_images))
    # One pass over the frame instead of re-scanning it once per duplicated image.
    df2_dataset_details = df2_dataset_details.drop_duplicates(subset='image', keep='first')
    assert df2_dataset_details.shape == final_shape
    print('Final shape = %s'%(final_shape,))
else:
    print('No duplicates found')

Removing duplicates will reduce num_samples from 154944 to 154384 ... 
Final shape = (154384, 6)


### Clean the formula text

In [9]:
def load_df_clean(data_dir_, df_image_details_):
    """Load the previously pickled clean frame from ``<data_dir_>/<output_dir>/df3_clean.pkl``.

    Args:
        data_dir_: root folder of the dataset.
        df_image_details_: not used; the signature mirrors ``make_df_clean``.

    Returns:
        The unpickled pandas DataFrame.
    """
    return pd.read_pickle(os.path.join(data_dir_, output_dir, 'df3_clean.pkl'))
    
def make_df_clean(data_dir_, df_image_details_):
    """Add whitespace-normalized ASCII copies of the formulas and drop unusable rows.

    Adds two columns: ``latex_ascii`` (the formula with runs of whitespace
    collapsed to one space and the ends stripped) and ``latex_ascii_len``.
    Rows whose original ``latex`` contains a character outside
    ``string.printable`` or a '%' sign are removed. The number of rows hit by
    each filter and the number of surviving rows are printed.

    Args:
        data_dir_: not used by this function.
        df_image_details_: frame with a ``latex`` column.

    Returns:
        The filtered DataFrame with the two extra columns.
    """
    # re.escape keeps ']', '^', '-' and the backslash in string.printable from being
    # interpreted as character-class syntax.
    NOT_PRINTABLE_CHARS_RE = r'[^' + re.escape(string.printable) + r']'
    PERCENTS_RE = r'%'

    df = df_image_details_
    # Ensure everything's ascii. str.decode will throw an exception if any non-ascii character is found.
    # If an exception does get thrown, then you'll have to write code to filter out the non-ascii rows.
    # Possibly the code below that filters out all but non-printable ascii chars will suffice, but has
    # not been tested for this purpose.
    cleaned = df.latex.str.decode('ascii').str.encode('ascii')
    # Coalesce whitespace to a single space
    cleaned = cleaned.str.replace(r"\s+", ' ')
    # Strip whitespace from the sides
    cleaned = cleaned.str.strip()
    # Discard strings with non-printable characters
    bad1 = df.latex.str.contains(NOT_PRINTABLE_CHARS_RE)
    print('nonprintables #:  %s' % bad1.sum())
    # Discard strings with embedded percent signs
    bad2 = df.latex.str.contains(PERCENTS_RE)
    # Report the percent-sign count (this line used to repeat the non-printables count).
    print('percents #:  %s' % bad2.sum())
    good = ~(bad1 | bad2)
    df = df.assign(latex_ascii=cleaned, latex_ascii_len=cleaned.str.len())
    print('good #:  %s' % good.sum())
    df = df[good]
    return df


In [10]:
# Build the cleaned frame from the de-duplicated details and show its shape.
df3_clean = make_df_clean(data_dir, df2_dataset_details)
display(df3_clean.shape)

nonprintables #:  0
percents #:  0
good #:  154384


(154384, 8)

In [11]:
# Peek at the first two cleaned rows.
df3_clean.iloc[:2]

Unnamed: 0,image,formula_name,latex,formula_len,height,width,latex_ascii,latex_ascii_len
0,23be72ded29e9b4_basic.png,23be72ded29e9b4_basic,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,331,78,738,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,331
1,8c904c5d9c7bd63_basic.png,8c904c5d9c7bd63_basic,d s ^ { 2 } = ( 1 - { \frac { q c o s \theta } { r } } ) ^ { \frac { 2 } { 1 + \alpha ^ { 2 } } ...,326,94,962,d s ^ { 2 } = ( 1 - { \frac { q c o s \theta } { r } } ) ^ { \frac { 2 } { 1 + \alpha ^ { 2 } } ...,326


In [12]:
# Sanity checks: cleaning must not have changed the length or the text of any formula.
# NOTE(review): the masks compare columns of df2_dataset_details with columns of df3_clean,
# so this presumably relies on both frames having the same index - confirm.
assert df3_clean[df2_dataset_details.formula_len != df3_clean.latex_ascii_len].shape[0] == 0
assert df3_clean[df2_dataset_details.latex != df3_clean.latex_ascii].shape[0] == 0
# Alias used by later cells.
df_clean = df3_clean

### Create the token dictionary

In [13]:
class TokenDict(object):
    """Running frequency table of tokens."""

    def __init__(self):
        # Maps token -> number of times it has been counted so far.
        self._tokens = {}

    def account(self, token_list):
        """Count every entry of ``token_list``, once per occurrence."""
        for tok in token_list:
            self._count(tok)

    def _count(self, token):
        """Increment the tally for ``token``; always returns 1."""
        self._tokens[token] = self._tokens.get(token, 0) + 1
        return 1

    @property
    def dict(self):
        """The underlying token -> count dictionary (the live object, not a copy)."""
        return self._tokens

    @property
    def tokens(self):
        """All counted tokens as a sorted list."""
        return sorted(self._tokens)

def append_special_words(df_vocab_, freq_):
    r"""Return a copy of the vocabulary with the special words appended.

    Adds the rows ``\eos`` (id 0) and ``\bos`` (id 1), each with frequency
    ``freq_``. ``pd.concat`` is used instead of ``DataFrame.append`` because
    the latter no longer exists in pandas 2.x; the result is the same.

    Args:
        df_vocab_: vocabulary frame indexed by word, with ``id`` and ``freq`` columns.
        freq_: frequency value to record for both special words.

    Returns:
        A new DataFrame; ``df_vocab_`` itself is not modified.

    Raises:
        AssertionError: if id 0 or id 1 is already in use.
        ValueError: if a special word is already present in the index
            (raised by ``verify_integrity=True``).
    """
    for special_id, special_word in ((0, r'\eos'), (1, r'\bos')):
        assert special_id not in df_vocab_.id.values
        special_row = pd.DataFrame({'id': special_id, 'freq': freq_}, index=[special_word])
        df_vocab_ = pd.concat([df_vocab_, special_row], verify_integrity=True)
    return df_vocab_

def remove_special_words(df_vocab_):
    r"""Return the vocabulary without its ``\eos`` and ``\bos`` rows."""
    special_words = [r'\eos', r'\bos']
    return df_vocab_.drop(labels=special_words)
    
def make_vocabulary(df_, data_dir_, already_tokenized=False):
    """Tokenize every formula and build the token vocabulary.

    Args:
        df_: frame of formulas. The ``latex_ascii`` column is tokenized with a
            regex when ``already_tokenized`` is False; otherwise the ``latex``
            column is split on single spaces.
        data_dir_: not used by this function.
        already_tokenized: True when the formulas are already space-separated tokens.

    Returns:
        ``(df_vocab, df_tokenized)``. ``df_vocab`` is indexed by token with
        ``id`` and ``freq`` columns and includes the special words;
        ``df_tokenized`` is ``df_`` plus ``latex_tokenized`` and
        ``tokenized_len`` columns.
    """
    ## Split latex into tokens.
    if not already_tokenized:
        ## Isolate latex commands first - i.e.
        ## (optionally even number of backslashes) followed by one backslash followed by letters.
        ## Everything else is a one-character token in itself.
        LATEX_RE = re.compile(r"(?:(?<=\\\\\\\\\\\\)\\[a-zA-Z]+)|(?:(?<=\\\\\\\\)\\[a-zA-Z]+)|(?:(?<=\\\\)\\[a-zA-Z]+)|(?:(?<!\\)\\[a-zA-Z]+)|.")
        sr_token = df_.latex_ascii.str.findall(LATEX_RE)
    else:
        ## Assume that the latex formula strings are already tokenized into string-tokens separated by whitespace
        ## Hence we just need to split the string by whitespace.
        sr_token = df_.latex.str.split(' ')

    sr_tokenized_len = sr_token.str.len()
    df_tokenized = df_.assign(latex_tokenized=sr_token, tokenized_len=sr_tokenized_len)
    ## Aggregate the tokens. A plain loop counts each formula exactly once and does not
    ## depend on how Series.agg dispatches a callable.
    vocab = TokenDict()
    for token_list in sr_token:
        vocab.account(token_list)
    ## Sort and save
    tokens = vocab.tokens
    count = [vocab.dict[t] for t in tokens]
    ## Assign token-ids. Start with 2: ids 0 and 1 are taken by the special words
    ## \eos and \bos that append_special_words adds.
    df_vocab = pd.DataFrame({'id':range(2,len(tokens)+2), 'freq':count}, index=tokens, columns=['id', 'freq'])
    df_vocab = append_special_words(df_vocab, df_.shape[0])
    print('Vocab Size =  %s' % df_vocab.shape[0])
    max_id = df_vocab.id.max()
    print('Max TokenID =  %s %s' % (max_id, type(max_id)))

    if not already_tokenized:
        ## Now ensure that space is the last ID.
        ## This is required by the CTC decoder if we wanted to use space as blank-token for CTC
        max_idx = df_vocab[df_vocab.id == max_id].index[0]
        space_id = df_vocab.loc[' ', 'id']
        ## Write through a single .loc[row, col] call; chained `df.loc[row].id = ...`
        ## may assign to a temporary copy and leave df_vocab unchanged.
        df_vocab.loc[' ', 'id'] = max_id
        df_vocab.loc[max_idx, 'id'] = space_id
        print('swapped ids %d and %d'%(max_id, space_id))
        print('SpaceTokenID = ', df_vocab.loc[' '])

    display(df_tokenized.iloc[:1])
    return df_vocab, df_tokenized

def make_vocabulary2(df_dataset_details_, data_dir_):
    """
    Build the vocabulary directly from the raw formulas, shortcutting the make_df_clean steps.

    This assumes the original latex formulas have already been normalized (e.g. using katex)
    and tokenized, with tokens separated by a single space character, so the cleaning step
    above is not needed. This is roughly how the harvardnlp im2latex preprocessor creates
    its vocabulary; the procedure is included here in order to compare this vocabulary
    with theirs.
    """
    df_with_ascii = df_dataset_details_.assign(latex_ascii=df_dataset_details_['latex'])
    return make_vocabulary(df_with_ascii, data_dir_, already_tokenized=True)

def load_vocabulary(df_, data_dir_):
    """Load the pickled vocabulary and tokenized frames from ``<data_dir_>/<output_dir>``.

    ``df_`` is accepted but not used. Returns ``(df_vocab, df_tokenized)``.
    """
    folder = os.path.join(data_dir_, output_dir)
    df_vocab, df_tokenized = [
        pd.read_pickle(os.path.join(folder, pickle_name))
        for pickle_name in ('df_vocab.pkl', 'df_tokenized.pkl')
    ]
    return df_vocab, df_tokenized

In [14]:
# Vocabulary built from the de-duplicated formulas by splitting on spaces (no cleaning step).
df_vocab, df_tokenized = make_vocabulary2(df2_dataset_details, data_dir)

Vocab Size =  581
Max TokenID =  580 <type 'numpy.int64'>


Unnamed: 0,image,formula_name,latex,formula_len,height,width,latex_ascii,latex_tokenized,tokenized_len
0,23be72ded29e9b4_basic.png,23be72ded29e9b4_basic,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,331,78,738,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,"[\int, _, {, -, \epsilon, }, ^, {, \infty, }, d, l, \:, \mathrm, {, e, }, ^, {, -, l, \zeta, }, ...",112


In [15]:
# Vocabulary built from the cleaned formulas with the regex tokenizer, for comparison.
df_vocab2, df_tokenized2 = make_vocabulary(df_clean, data_dir)

Vocab Size =  505
Max TokenID =  504 <type 'numpy.int64'>
swapped ids 504 and 2
('SpaceTokenID = ', freq    10460053
id           504
Name:  , dtype: int64)


Unnamed: 0,image,formula_name,latex,formula_len,height,width,latex_ascii,latex_ascii_len,latex_tokenized,tokenized_len
0,23be72ded29e9b4_basic.png,23be72ded29e9b4_basic,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,331,78,738,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,331,"[\int, , _, , {, , -, , \epsilon, , }, , ^, , {, , \infty, , }, , d, , l, , \, :, ,...",228


In [16]:
# Space-split vocabulary ordered by token id.
df_vocab.sort_values(by='id')

Unnamed: 0,freq,id
\eos,154384,0
\bos,154384,1
!,3111,2
"""",127,3
&,47450,4
',987,5
(,276055,6
),275817,7
*,10711,8
+,151305,9


In [17]:
# Regex-tokenized vocabulary ordered by token id.
df_vocab2.sort_values(by='id')

Unnamed: 0,freq,id
\eos,154384,0
\bos,154384,1
~,47217,2
!,12666,3
"""",127,4
#,116,5
&,47465,6
',988,7
(,321264,8
),321033,9


In [18]:
# Word sets of the two vocabularies, compared in the next cells.
set1 = set(df_vocab.index.tolist())
set2 = set(df_vocab2.index.tolist())

In [19]:
# Words that occur only in the regex-tokenized vocabulary.
pd.Series(list(set2-set1))

0          
1         #
2    \rceil
3    \right
4     \left
5    \begin
6      \end
dtype: object

In [20]:
# Words that occur only in the space-split vocabulary, least frequent first.
df_vocab.loc[(list(set1-set2))].sort_values(by='freq')

Unnamed: 0,freq,id
1.7,1,24
15,1,27
\right\rceil,1,421
10,1,25
\left>,1,294
\^,1,130
\right<,1,414
\right\Vert,1,417
\left\Vert,1,296
\',1,76


In [21]:
## Erroneous tokens in the normalized latex code
## (select every vocabulary entry whose text contains 'bject')
df_vocab[df_vocab.index.str.contains('bject')]

Unnamed: 0,freq,id
Object],29,58
[object,29,71


In [22]:
# Remove low-frequency words. Also, remove the words '[object' and 'Object]' which are probably an artifact of
# a bug in harvardnlp latex normalization code because they are not valid latex commands. Also a few others
# that do not produce an output.
def prune_vocab(df_data_, df_vocab_, remove_words, min_freq=24):
    """Drop unwanted and rare words from the vocabulary, and every sample that uses them.

    Args:
        df_data_: frame with a ``latex_tokenized`` column holding one token list per sample.
        df_vocab_: vocabulary frame indexed by word with ``id`` and ``freq`` columns;
            it must contain the special words, which are stripped and re-added here.
        remove_words: iterable of words to remove regardless of their frequency.
        min_freq: words whose ``freq`` is below this value are removed as well.

    Returns:
        ``(df_pruned, df_vocab_keep)``: the samples that contain none of the
        removed words, and the surviving vocabulary with ids reassigned from 2
        upwards plus the special words (whose ``freq`` is ``df_data_.shape[0]``).
    """
    df_vocab_keep = df_vocab_.drop(list(remove_words))
    df_vocab_keep = df_vocab_keep[df_vocab_keep.freq >= min_freq]
    # Every word of the *given* vocabulary that did not survive. This used to read the
    # module-level df_vocab, which only worked when that same frame was passed in.
    remove_words = (set(df_vocab_.index.values.tolist()) - set(df_vocab_keep.index.values.tolist()))
#     remove_words |= set(['[object', 'Object]', r'\llap', r'\rlap'])
    print('Removing the following %d words from the vocabulary: %s'%(len(remove_words), remove_words))
    # Keep a sample only when none of its tokens is a removed word.
    sr_keep = df_data_.latex_tokenized.map(remove_words.isdisjoint)
    df_pruned = df_data_[sr_keep]
    kept = df_pruned.shape[0]
    removed = df_data_.shape[0] - kept
    print('%d samples (%.1f%%) removed'%(removed, removed*100./df_data_.shape[0]))
    print('df_pruned.shape = %s'%(df_pruned.shape,))
    display(df_pruned[:1])

    ## Prune vocabulary: renumber the surviving regular words, then re-add the special words.
    df_vocab_keep = remove_special_words(df_vocab_keep)
    num_words_keep = df_vocab_keep.shape[0]
    df_vocab_keep = df_vocab_keep.assign(id=range(2, num_words_keep+2))
    df_vocab_keep = append_special_words(df_vocab_keep, df_data_.shape[0])
#     df_vocab_unk = df_vocab_[df_vocab_.index.isin(remove_words)]
#     num_words_unk = df_vocab_unk.shape[0]
#     df_vocab_unk = df_vocab_unk.assign(id=([UnkID]*num_words_unk))
    print('Vocabulary size reduced from %d to %d'%(df_vocab_.shape[0], df_vocab_keep.shape[0]))
    print('Pruned vocab shape = %s'%(df_vocab_keep.shape,))
    return df_pruned, df_vocab_keep

In [23]:
# Prune the space-split vocabulary with min_freq=50, additionally removing the listed words.
df_data_pruned, df_vocab_pruned = prune_vocab(df_tokenized, 
                                              df_vocab, 
                                              set(['[object', 'Object]', r'\llap', r'\rlap', r'\rule', r'\tag']), 
                                              50)

Removing the following 243 words from the vocabulary: set(['\\vline', '\\varUpsilon', '\\vrule', '\\exists', '\\searrow', '\\longmapsto', '\\makebox', '\\right\\|', '\\setminus', '\\bigtriangledown', '\\mathrel', '\\medskip', '\\oslash', '\\joinrel', '\\mod', '\\scshape', '\\&', "\\'", '\\left\\Vert', '\\underrightarrow', '\\hfil', '\\smallint', '\\nearrow', '\\surd', '\\hspace', '\\-', '\\*', '\\lfloor', '\\left]', '\\atopwithdelims', '\\Tilde', '@', '\\]', '\\textcircled', '\\rlap', '\\boldsymbol', '\\supseteq', '\\^', '\\ref', '\\bigwedge', '\\left/', '\\[', '\\Vec', '\\Leftarrow', '\\negthickspace', '\\of', '\\ddag', '\\verb', '\\succ', '\\arrowvert', '\\left>', '\\pmb', '\\tag', '\\right\\rceil', '\\fbox', '\\normalsize', '\\Huge', '\\def', '\\SS', '\\framebox', '\\b', '\\special', '\\a', '\\bigcirc', '\\j', '\\AA', '\\overleftrightarrow', '\\allowbreak', '\\lceil', '\\index', '\\pounds', '\\asymp', '\\ss', '\\left\\lbrack', '\\mathnormal', '\\footnotemark', '\\left\\lceil', '\\sc

Unnamed: 0,image,formula_name,latex,formula_len,height,width,latex_ascii,latex_tokenized,tokenized_len
0,23be72ded29e9b4_basic.png,23be72ded29e9b4_basic,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,331,78,738,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,"[\int, _, {, -, \epsilon, }, ^, {, \infty, }, d, l, \:, \mathrm, {, e, }, ^, {, -, l, \zeta, }, ...",112


Vocabulary size reduced from 581 to 338
Pruned vocab shape = (338, 2)


In [24]:
# i=1
# patt = r'\\hfill '
# formula = (df_data_pruned[df_data_pruned.latex.str.contains(patt)].latex.values[i])
# image = (df_data_pruned[df_data_pruned.latex.str.contains(patt)].image.values[i])
# print formula
# display(Math(formula))
# display(Image(filename='../data/dataset5/formula_images/%s'%image, format='png', unconfined=True))

In [25]:
# Pruned vocabulary, most frequent words first.
df_vocab_pruned.sort_values(by='freq', ascending=False)  # (338, 2)

Unnamed: 0,freq,id
{,1819361,334
},1819361,336
_,593759,305
^,485778,304
2,328261,17
(,276055,6
),275817,7
1,224635,16
-,197544,11
=,186207,28


In [26]:
# Nested dict keyed by column name: dict_vocab['id'] maps word -> id and
# dict_vocab['freq'] maps word -> frequency.
dict_vocab = df_vocab_pruned.to_dict()

In [27]:
def reverse_dict(d):
    """Return a new dict mapping each value of ``d`` back to its key.

    When several keys share a value, the key visited last in iteration order wins.
    """
    return {value: key for key, value in d.items()}
# Inverse lookup: token id -> word.
dict_id2word = reverse_dict(dict_vocab['id'])

In [28]:
# Show the word -> id mapping.
dict_vocab['id']

{'!': 2,
 '"': 3,
 '&': 4,
 "'": 5,
 '(': 6,
 ')': 7,
 '*': 8,
 '+': 9,
 ',': 10,
 '-': 11,
 '--': 12,
 '.': 13,
 '/': 14,
 '0': 15,
 '1': 16,
 '2': 17,
 '3': 18,
 '4': 19,
 '5': 20,
 '6': 21,
 '7': 22,
 '8': 23,
 '9': 24,
 ':': 25,
 ';': 26,
 '<': 27,
 '=': 28,
 '>': 29,
 'A': 30,
 'B': 31,
 'C': 32,
 'D': 33,
 'E': 34,
 'F': 35,
 'G': 36,
 'H': 37,
 'I': 38,
 'J': 39,
 'K': 40,
 'L': 41,
 'M': 42,
 'N': 43,
 'O': 44,
 'P': 45,
 'Q': 46,
 'R': 47,
 'S': 48,
 'T': 49,
 'U': 50,
 'V': 51,
 'W': 52,
 'X': 53,
 'Y': 54,
 'Z': 55,
 '[': 56,
 '\\': 57,
 '\\!': 58,
 '\\#': 59,
 '\\,': 60,
 '\\/': 61,
 '\\:': 62,
 '\\;': 63,
 '\\Big': 64,
 '\\Bigg': 65,
 '\\Biggl': 66,
 '\\Biggr': 67,
 '\\Bigl': 68,
 '\\Bigr': 69,
 '\\Delta': 70,
 '\\Gamma': 71,
 '\\Im': 72,
 '\\L': 73,
 '\\Lambda': 74,
 '\\Large': 75,
 '\\Leftrightarrow': 76,
 '\\Longleftrightarrow': 77,
 '\\Longrightarrow': 78,
 '\\O': 79,
 '\\Omega': 80,
 '\\P': 81,
 '\\Phi': 82,
 '\\Pi': 83,
 '\\Psi': 84,
 '\\Re': 85,
 '\\Rightarrow': 86,

In [29]:
# Show the id -> word mapping.
dict_id2word

{0: '\\eos',
 1: '\\bos',
 2: '!',
 3: '"',
 4: '&',
 5: "'",
 6: '(',
 7: ')',
 8: '*',
 9: '+',
 10: ',',
 11: '-',
 12: '--',
 13: '.',
 14: '/',
 15: '0',
 16: '1',
 17: '2',
 18: '3',
 19: '4',
 20: '5',
 21: '6',
 22: '7',
 23: '8',
 24: '9',
 25: ':',
 26: ';',
 27: '<',
 28: '=',
 29: '>',
 30: 'A',
 31: 'B',
 32: 'C',
 33: 'D',
 34: 'E',
 35: 'F',
 36: 'G',
 37: 'H',
 38: 'I',
 39: 'J',
 40: 'K',
 41: 'L',
 42: 'M',
 43: 'N',
 44: 'O',
 45: 'P',
 46: 'Q',
 47: 'R',
 48: 'S',
 49: 'T',
 50: 'U',
 51: 'V',
 52: 'W',
 53: 'X',
 54: 'Y',
 55: 'Z',
 56: '[',
 57: '\\',
 58: '\\!',
 59: '\\#',
 60: '\\,',
 61: '\\/',
 62: '\\:',
 63: '\\;',
 64: '\\Big',
 65: '\\Bigg',
 66: '\\Biggl',
 67: '\\Biggr',
 68: '\\Bigl',
 69: '\\Bigr',
 70: '\\Delta',
 71: '\\Gamma',
 72: '\\Im',
 73: '\\L',
 74: '\\Lambda',
 75: '\\Large',
 76: '\\Leftrightarrow',
 77: '\\Longleftrightarrow',
 78: '\\Longrightarrow',
 79: '\\O',
 80: '\\Omega',
 81: '\\P',
 82: '\\Phi',
 83: '\\Pi',
 84: '\\Psi',
 85: '\

In [30]:
# Pickle both lookup dictionaries into the output folder when dumping is enabled.
if dump:
    for pickle_name, payload in (('dict_vocab.pkl', dict_vocab),
                                 ('dict_id2word.pkl', dict_id2word)):
        with open(os.path.join(data_dir, output_dir, pickle_name), 'wb') as f:
            pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)

In [31]:
# Count pruned samples whose formula contains a long run of consecutive backslashes.
# NOTE(review): the boolean mask is computed on df_clean but applied to df_data_pruned - confirm the indexes line up as intended.
print df_data_pruned.latex_ascii[df_clean.latex.str.contains(r'\\\\\\\\\\\\\\\\\\\\')].count()

0


In [32]:
def make_word2id(df_tokenized_, df_vocab_):
    """Encode every tokenized formula as a list of token ids.

    Args:
        df_tokenized_: frame with ``latex_tokenized`` (a list of tokens per row)
            and ``tokenized_len`` columns.
        df_vocab_: vocabulary frame indexed by word with an ``id`` column.

    Returns:
        ``df_tokenized_`` with ``word2id`` and ``word2id_len`` columns added and
        ``tokenized_len`` dropped. The shape is printed and the first row displayed.

    Raises:
        KeyError: if a token is missing from the vocabulary.
        AssertionError: if an encoded length differs from ``tokenized_len``.
    """
    word2id = df_vocab_.id.to_dict()
    # Build a real list per row: map() would yield a lazy iterator on Python 3,
    # which breaks the .str.len() call below.
    sr_word2id = df_tokenized_.latex_tokenized.apply(lambda tokens: [word2id[t] for t in tokens])
    df_ = df_tokenized_.assign(word2id=sr_word2id, word2id_len=sr_word2id.str.len())
    assert df_.word2id_len.equals(df_.tokenized_len)
    df_ = df_.drop(labels=['tokenized_len'], axis=1)
    print(df_.shape)
    display(df_[:1])
    return df_

In [33]:
# Encode the pruned samples with the pruned vocabulary.
df_word2id = make_word2id(df_data_pruned, df_vocab_pruned)  # 152840

(152840, 10)


Unnamed: 0,image,formula_name,latex,formula_len,height,width,latex_ascii,latex_tokenized,word2id,word2id_len
0,23be72ded29e9b4_basic.png,23be72ded29e9b4_basic,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,331,78,738,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,"[\int, _, {, -, \epsilon, }, ^, {, \infty, }, d, l, \:, \mathrm, {, e, }, ^, {, -, l, \zeta, }, ...","[163, 305, 334, 11, 145, 336, 304, 334, 162, 336, 311, 319, 62, 195, 334, 312, 336, 304, 334, 11...",112


In [34]:
# Persist the id-encoded dataset when dumping is enabled.
if dump:
    df_word2id.to_pickle(os.path.join(data_dir, output_dir, 'df_word2id.pkl'))

### END

In [35]:
# Vocabulary dict saved under dataset3, loaded for comparison with the current one.
old_dict_vocab = pd.read_pickle('../data/dataset3/step2/dict_vocab.pkl')

In [36]:
# Words in the current vocabulary that are missing from the dataset3 one.
set(dict_vocab['id'].keys()) - set(old_dict_vocab['id'].keys())

{'\\varDelta'}

In [37]:
# Words in the dataset3 vocabulary that are missing from the current one.
set(old_dict_vocab['id'].keys()) - set(dict_vocab['id'].keys())

{'---',
 '\\aleph',
 '\\backslash',
 '\\bigcup',
 '\\c',
 '\\cdotp',
 '\\colon',
 '\\diamond',
 '\\emptyset',
 '\\flat',
 '\\left\\|',
 '\\longmapsto',
 '\\ni',
 '\\ref',
 '\\rfloor',
 '\\right\\|',
 '\\sharp',
 '\\subseteq',
 '\\thinspace',
 '\\vline'}

In [38]:
# Hard-coded copies of the two set differences shown by the preceding cells.
one = ['\\varDelta']
two = ['---',
 '\\aleph',
 '\\backslash',
 '\\bigcup',
 '\\c',
 '\\cdotp',
 '\\colon',
 '\\diamond',
 '\\emptyset',
 '\\flat',
 '\\left\\|',
 '\\longmapsto',
 '\\ni',
 '\\ref',
 '\\rfloor',
 '\\right\\|',
 '\\sharp',
 '\\subseteq',
 '\\thinspace',
 '\\vline']
# Frequency, in the current vocabulary, of each word listed in `one`.
for w in one:
    print(w, dict_vocab['freq'][w])

('\\varDelta', 55)


In [39]:
# Frequency, in the dataset3 vocabulary, of each word listed in `two`.
for w in two:
    print(w, old_dict_vocab['freq'][w])

('---', 35)
('\\aleph', 24)
('\\backslash', 33)
('\\bigcup', 26)
('\\c', 26)
('\\cdotp', 41)
('\\colon', 25)
('\\diamond', 42)
('\\emptyset', 36)
('\\flat', 28)
('\\left\\|', 43)
('\\longmapsto', 30)
('\\ni', 25)
('\\ref', 35)
('\\rfloor', 36)
('\\right\\|', 43)
('\\sharp', 31)
('\\subseteq', 27)
('\\thinspace', 24)
('\\vline', 25)
