# im2latex(S): Tokenizer

&copy; Copyright 2017 Sumeet S Singh

    This file is part of im2latex solution by Sumeet S Singh.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the Affero GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    Affero GNU General Public License for more details.

    You should have received a copy of the Affero GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

In [1]:
from IPython.display import display, HTML
# Inject a CSS rule that sets the notebook's .container element to 70% width.
display(HTML("<style>.container { width:70% !important; }</style>"))

In [2]:
import pandas as pd
import os
import re
import codecs
from six.moves import cPickle as pickle
import string
from PIL import Image

In [3]:
# pandas display limits used for every frame shown in this notebook.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 160)

# Dataset locations; each path is available under two names.
data_dir = '../data/dataset3'
data_folder = data_dir
image_dir = os.path.join(data_dir, 'formula_images')
image_folder = image_dir
# Sub-folder of data_dir that receives this notebook's pickles.
output_dir = 'step2'

#### Character Set Chart for Reference
##### ASCII Control Characters

                        CTRL   (^D means to hold the CTRL key and hit d)
    Oct  Dec Char  Hex  Key     Comments
    \000   0  NUL  \x00  ^@ \0 (Null byte)
    \001   1  SOH  \x01  ^A    (Start of heading)
    \002   2  STX  \x02  ^B    (Start of text)
    \003   3  ETX  \x03  ^C    (End of text) (see: UNIX keyboard CTRL)
    \004   4  EOT  \x04  ^D    (End of transmission) (see: UNIX keyboard CTRL)
    \005   5  ENQ  \x05  ^E    (Enquiry)
    \006   6  ACK  \x06  ^F    (Acknowledge)
    \007   7  BEL  \x07  ^G    (Ring terminal bell)
    \010   8   BS  \x08  ^H \b (Backspace)  (\b matches backspace inside [] only)
                                            (see: UNIX keyboard CTRL)
    \011   9   HT  \x09  ^I \t (Horizontal tab)
    \012  10   LF  \x0A  ^J \n (Line feed)  (Default UNIX NL) (see End of Line below)
    \013  11   VT  \x0B  ^K    (Vertical tab)
    \014  12   FF  \x0C  ^L \f (Form feed)
    \015  13   CR  \x0D  ^M \r (Carriage return)  (see: End of Line below)
    \016  14   SO  \x0E  ^N    (Shift out)
    \017  15   SI  \x0F  ^O    (Shift in)
    \020  16  DLE  \x10  ^P    (Data link escape)
    \021  17  DC1  \x11  ^Q    (Device control 1) (XON) (Default UNIX START char.)
    \022  18  DC2  \x12  ^R    (Device control 2)
    \023  19  DC3  \x13  ^S    (Device control 3) (XOFF)  (Default UNIX STOP char.)
    \024  20  DC4  \x14  ^T    (Device control 4)
    \025  21  NAK  \x15  ^U    (Negative acknowledge)  (see: UNIX keyboard CTRL)
    \026  22  SYN  \x16  ^V    (Synchronous idle)
    \027  23  ETB  \x17  ^W    (End of transmission block)
    \030  24  CAN  \x18  ^X    (Cancel)
    \031  25  EM   \x19  ^Y    (End of medium)
    \032  26  SUB  \x1A  ^Z    (Substitute character)
    \033  27  ESC  \x1B  ^[    (Escape)
    \034  28  FS   \x1C  ^\    (File separator, Information separator four)
    \035  29  GS   \x1D  ^]    (Group separator, Information separator three)
    \036  30  RS   \x1E  ^^    (Record separator, Information separator two)
    \037  31  US   \x1F  ^_    (Unit separator, Information separator one)
    \177 127  DEL  \x7F  ^?    (Delete)  (see: UNIX keyboard CTRL)
    
    string.printable = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'
    string.whitespace = '\t\n\x0b\x0c\r '

In [4]:
def makeDatasetDetails(data_dir, overwrite=False):
    """Add image size and formula length to the step-1 dataset map and pickle it.

    Reads ``step1/im2latex_dataset_map.df.pkl`` under ``data_dir``, opens every
    image named in the ``image`` column to record its width and height, stores
    ``len(row.latex)`` as ``formula_len`` and writes the augmented frame to
    ``<data_dir>/<output_dir>/df2_dataset_details.pkl``.

    Args:
        data_dir: Root folder of the dataset.
        overwrite: When False (the default), refuse to run if the output
            pickle already exists.

    Returns:
        The dataset frame with ``width``, ``height`` and ``formula_len`` added.

    Raises:
        Exception: If the output pickle exists and ``overwrite`` is False.
    """
    out_folder = os.path.join(data_dir, output_dir)
    pickle_path = os.path.join(out_folder, 'df2_dataset_details.pkl')
    if (not overwrite) and os.path.exists(pickle_path):
        raise Exception('File %s already exists'%pickle_path)

    widths = []
    heights = []
    formula_lens = []
    datasetDF = pd.read_pickle(os.path.join(data_dir, 'step1', 'im2latex_dataset_map.df.pkl'))
    for _, row in datasetDF.iterrows():
        # NOTE(review): images are looked up in the module-level image_folder,
        # not under the data_dir argument - confirm this is intended.
        # The context manager closes each file handle right away instead of
        # leaving one open per image until garbage collection.
        with Image.open(os.path.join(image_folder, row.image)) as im:
            width, height = im.size
        widths.append(width)
        heights.append(height)
        formula_lens.append(len(row.latex))
    print(len(widths), len(heights))
    datasetDF = datasetDF.assign(width=widths, height=heights, formula_len=formula_lens)
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)
    datasetDF.to_pickle(pickle_path)
    return datasetDF
    
def getDatasetDetails(data_dir):
    """Load the pickled dataset-details frame, print its shape and show its first row.

    Args:
        data_dir: Root folder of the dataset; the pickle is read from
            ``<data_dir>/<output_dir>/df2_dataset_details.pkl``.

    Returns:
        The loaded DataFrame.
    """
    df = pd.read_pickle(os.path.join(data_dir, output_dir, 'df2_dataset_details.pkl'))
    # Single-argument print() gives the same output on Python 2 and Python 3.
    print(df.shape)
    display(df.iloc[:1])
    return df

In [5]:
# Load the previously pickled details frame. Uncomment the makeDatasetDetails
# call to rebuild the pickle from the images instead.
# df2_dataset_details = makeDatasetDetails(data_dir)
df2_dataset_details = getDatasetDetails(data_dir)

(100700, 6)


Unnamed: 0,image,formula_name,latex,formula_len,height,width
0,23be72ded29e9b4_basic.png,23be72ded29e9b4_basic,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,331,78,738


In [6]:
def load_df_clean(data_dir_, df_image_details_):
    """Load the pickled cleaned dataset.

    NOTE(review): make_df_clean currently returns its result without writing
    this pickle, so the file must come from elsewhere - confirm.

    Args:
        data_dir_: Root folder of the dataset; the pickle is read from
            ``<data_dir_>/<output_dir>/df3_clean.pkl``.
        df_image_details_: Unused here (same parameter list as make_df_clean).

    Returns:
        The loaded DataFrame.
    """
    return pd.read_pickle(os.path.join(data_dir_, output_dir, 'df3_clean.pkl'))
    
def make_df_clean(data_dir_, df_image_details_):
    """Validate and normalise the LaTeX strings and drop unusable rows.

    Two columns are added: ``latex_ascii`` (the formula with every whitespace
    run collapsed to one space and both ends stripped) and ``latex_ascii_len``
    (its length). Rows whose original ``latex`` contains a character outside
    ``string.printable`` or a ``%`` sign are removed. The result is returned
    only; nothing is written to disk.

    Args:
        data_dir_: Unused.
        df_image_details_: DataFrame with a ``latex`` column.

    Returns:
        The filtered DataFrame with the two extra columns.

    Raises:
        UnicodeError: If any formula contains a non-ASCII character. Code that
            filters such rows out would then be needed.
    """
    # string.printable holds the sequence '[\]', so its ']' is read as an
    # escaped literal and the character class below stays well-formed.
    NOT_PRINTABLE_CHARS_RE = r'[^\\' + string.printable + r']'
    PERCENTS_RE = r'%'
    WHITESPACE_RE = re.compile(r'\s+')

    def _require_ascii(text):
        # Raise on the first non-ASCII character, otherwise hand the text back.
        # Works for byte strings and text strings, so the check no longer
        # degrades to NaN when the formulas are Python 3 ``str`` objects.
        if isinstance(text, bytes):
            decoded = text.decode('ascii')
            # On Python 2 ``bytes`` is ``str``: keep the native string type.
            return text if str is bytes else decoded
        text.encode('ascii')
        return text

    def _squeeze(text):
        # Coalesce whitespace to a single space, then strip both sides. Done
        # with ``re`` directly so the result does not depend on the pandas
        # default for ``str.replace(regex=...)``.
        return WHITESPACE_RE.sub(' ', text).strip()

    df = df_image_details_
    cleaned = df.latex.map(_require_ascii, na_action='ignore')
    cleaned = cleaned.map(_squeeze, na_action='ignore')
    # Discard strings with non-printable characters
    bad1 = df.latex.str.contains(NOT_PRINTABLE_CHARS_RE)
    print('nonprintables #:  %s' % (bad1.sum(),))
    # Discard strings with embedded percent signs (because textogif ignores everything after the % sign)
    bad2 = df.latex.str.contains(PERCENTS_RE)
    # Report the percent-sign count (bad2); this line used to repeat bad1.
    print('percents #:  %s' % (bad2.sum(),))
    good = ~(bad1 | bad2)
    df = df.assign(latex_ascii=cleaned, latex_ascii_len=cleaned.str.len())
    print('good #:  %s' % (good.sum(),))
    return df[good]


In [7]:
# Clean the dataset-details frame and show the shape of the result.
df3_clean = make_df_clean(data_dir, df2_dataset_details)
display(df3_clean.shape)

nonprintables #:  0
percents #:  0
good #:  100700


(100700, 8)

In [8]:
# Show the first two cleaned rows.
df3_clean.iloc[:2]

Unnamed: 0,image,formula_name,latex,formula_len,height,width,latex_ascii,latex_ascii_len
0,23be72ded29e9b4_basic.png,23be72ded29e9b4_basic,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,331,78,738,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,331
1,8c904c5d9c7bd63_basic.png,8c904c5d9c7bd63_basic,d s ^ { 2 } = ( 1 - { \frac { q c o s \theta } { r } } ) ^ { \frac { 2 } { 1 + \alpha ^ { 2 } } ...,326,94,962,d s ^ { 2 } = ( 1 - { \frac { q c o s \theta } { r } } ) ^ { \frac { 2 } { 1 + \alpha ^ { 2 } } ...,326


In [9]:
# Check that no row's cleaned length differs from the original formula length.
assert df3_clean[df2_dataset_details.formula_len != df3_clean.latex_ascii_len].shape[0] == 0

In [10]:
# Check that no row's cleaned text differs from the original latex string.
assert df3_clean[df2_dataset_details.latex != df3_clean.latex_ascii].shape[0] == 0

In [11]:
# Shorter alias for the cleaned frame, used by later cells.
df_clean = df3_clean

In [12]:
class TokenDict(object):
    """Running frequency table of tokens."""

    def __init__(self):
        # Maps token -> number of times it has been counted.
        self._tokens = {}

    def account(self, token_list):
        """Count every token in ``token_list`` once per occurrence."""
        for tok in token_list:
            self._count(tok)

    def _count(self, token):
        """Increase the tally of ``token`` by one; always returns 1."""
        self._tokens[token] = self._tokens.get(token, 0) + 1
        return 1

    @property
    def dict(self):
        """The underlying token -> count mapping (not a copy)."""
        return self._tokens

    @property
    def tokens(self):
        """All counted tokens as a sorted list."""
        return sorted(self._tokens)

def append_special_words(df_vocab_, freq_):
    r"""Return the vocabulary with the ``\eos`` (id 0) and ``\bos`` (id 1) rows added.

    Args:
        df_vocab_: Vocabulary indexed by word, with ``id`` and ``freq`` columns.
            Ids 0 and 1 must not be in use.
        freq_: Frequency recorded for both special words.

    Returns:
        A new DataFrame; ``df_vocab_`` itself is not modified.

    Raises:
        AssertionError: If id 0 or id 1 is already taken.
        ValueError: If ``\eos`` or ``\bos`` is already in the index.
    """
    for word_id, word in ((0, r'\eos'), (1, r'\bos')):
        assert word_id not in df_vocab_.id.values
        special = pd.DataFrame({'id': word_id, 'freq': freq_}, index=[word])
        # DataFrame.append was removed in pandas 2.0; concat offers the same
        # duplicate-index check through verify_integrity.
        df_vocab_ = pd.concat([df_vocab_, special], verify_integrity=True)
    return df_vocab_

def remove_special_words(df_vocab_):
    r"""Return the vocabulary without its ``\eos`` and ``\bos`` rows."""
    special_words = [r'\eos', r'\bos']
    return df_vocab_.drop(labels=special_words)
    
def make_vocabulary(df_, data_dir_, already_tokenized=False):
    """Tokenize every formula and build the token vocabulary.

    Args:
        df_: DataFrame of formulas. ``latex_ascii`` is read when
            ``already_tokenized`` is False, ``latex`` otherwise.
        data_dir_: Unused.
        already_tokenized: True when the formulas are already split into
            tokens separated by single spaces.

    Returns:
        ``(df_vocab, df_tokenized)``. ``df_vocab`` is indexed by token and has
        ``id`` and ``freq`` columns, including the rows added by
        ``append_special_words``. ``df_tokenized`` is ``df_`` with the
        ``latex_tokenized`` and ``tokenized_len`` columns added.
    """
    ## Split latex into tokens.
    if not already_tokenized:
        ## Isolate latex commands first - i.e.
        ## (optionally even number of backslashes) followed by one backslash followed by letters.
        ## Everything else is a one-character token in itself.
        LATEX_RE = re.compile(r"(?:(?<=\\\\\\\\\\\\)\\[a-zA-Z]+)|(?:(?<=\\\\\\\\)\\[a-zA-Z]+)|(?:(?<=\\\\)\\[a-zA-Z]+)|(?:(?<!\\)\\[a-zA-Z]+)|.")
        sr_token = df_.latex_ascii.str.findall(LATEX_RE)
    else:
        ## Assume that the latex formula strings are already tokenized into string-tokens separated by whitespace
        ## Hence we just need to split the string by whitespace.
        sr_token = df_.latex.str.split(' ')

    sr_tokenized_len = sr_token.str.len()
    df_tokenized = df_.assign(latex_tokenized=sr_token, tokenized_len=sr_tokenized_len)
    ## Aggregate the tokens. A plain loop is used because Series.agg is meant
    ## for reductions and may hand the whole Series to the callable.
    vocab = TokenDict()
    for token_list in sr_token:
        vocab.account(token_list)
    ## Sort and save
    tokens = vocab.tokens
    count = [vocab.dict[t] for t in tokens]
    ## Assign token-ids starting at 2. Ids 0 and 1 are taken by the special
    ## words that append_special_words adds below.
    df_vocab = pd.DataFrame({'id': list(range(2, len(tokens)+2)), 'freq': count}, index=tokens, columns=['id', 'freq'])
    df_vocab = append_special_words(df_vocab, df_.shape[0])
    print('Vocab Size =  %s' % (df_vocab.shape[0],))
    max_id = df_vocab.id.max()
    print('Max TokenID =  %s %s' % (max_id, type(max_id)))

    if not already_tokenized:
        ## Now ensure that space is the last ID.
        ## This is required by the CTC decoder if we wanted to use space as blank-token for CTC
        max_idx = df_vocab[df_vocab.id == max_id].index[0]
        space_id = df_vocab.loc[' ', 'id']
        # Write through a single .loc[row, col] call: the chained form
        # df.loc[row].id = value assigns to a temporary object and can leave
        # df_vocab unchanged.
        df_vocab.loc[' ', 'id'] = max_id
        df_vocab.loc[max_idx, 'id'] = space_id
        print('swapped ids %d and %d'%(max_id, space_id))
        print('SpaceTokenID = ', df_vocab.loc[' '])

    display(df_tokenized.iloc[:1])
    return df_vocab, df_tokenized

def make_vocabulary2(df_dataset_details_, data_dir_):
    """
    Build the vocabulary straight from the dataset details, skipping the make_df_clean step.

    The original latex formulas are assumed to be already normalized (e.g. using katex) and
    tokenized, with each token separated by one space character, so the cleaning step is not
    needed. This is roughly how the harvardnlp im2latex solution preprocessor creates its
    vocabulary; the procedure is included here to compare the two vocabularies.
    """
    df_with_ascii = df_dataset_details_.assign(latex_ascii=df_dataset_details_.latex)
    return make_vocabulary(df_with_ascii, data_dir_, already_tokenized=True)

def load_vocabulary(df_, data_dir_):
    """Read the pickled vocabulary and tokenized frames from the output folder.

    ``df_`` is not used. Returns ``(df_vocab, df_tokenized)``.
    """
    folder = os.path.join(data_dir_, output_dir)
    frames = [pd.read_pickle(os.path.join(folder, name))
              for name in ('df_vocab.pkl', 'df_tokenized.pkl')]
    return frames[0], frames[1]

In [13]:
# Vocabulary obtained by splitting the formulas on single spaces.
df_vocab, df_tokenized = make_vocabulary2(df2_dataset_details, data_dir)

Vocab Size =  519
Max TokenID =  518 <type 'numpy.int64'>


Unnamed: 0,image,formula_name,latex,formula_len,height,width,latex_ascii,latex_tokenized,tokenized_len
0,23be72ded29e9b4_basic.png,23be72ded29e9b4_basic,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,331,78,738,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,"[\int, _, {, -, \epsilon, }, ^, {, \infty, }, d, l, \:, \mathrm, {, e, }, ^, {, -, l, \zeta, }, ...",112


In [14]:
# Vocabulary obtained by regex-tokenizing the cleaned formulas.
df_vocab2, df_tokenized2 = make_vocabulary(df_clean, data_dir)

Vocab Size =  450
Max TokenID =  449 <type 'numpy.int64'>
swapped ids 449 and 2
('SpaceTokenID = ', freq    6491267
id          449
Name:  , dtype: int64)


Unnamed: 0,image,formula_name,latex,formula_len,height,width,latex_ascii,latex_ascii_len,latex_tokenized,tokenized_len
0,23be72ded29e9b4_basic.png,23be72ded29e9b4_basic,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,331,78,738,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,331,"[\int, , _, , {, , -, , \epsilon, , }, , ^, , {, , \infty, , }, , d, , l, , \, :, ,...",228


In [15]:
# Show the whitespace-split vocabulary ordered by token id.
df_vocab.sort_values(by='id')

Unnamed: 0,freq,id
\eos,100700,0
\bos,100700,1
!,2445,2
"""",60,3
&,30701,4
',646,5
(,176035,6
),175851,7
*,6313,8
+,99643,9


In [16]:
# Show the regex-tokenized vocabulary ordered by token id.
df_vocab2.sort_values(by='id')

Unnamed: 0,freq,id
\eos,100700,0
\bos,100700,1
~,30529,2
!,7977,3
"""",60,4
#,95,5
&,30709,6
',647,7
(,204056,8
),203870,9


In [17]:
# Word sets of the two vocabularies: set1 from df_vocab, set2 from df_vocab2.
set1 = set(df_vocab.index.tolist())
set2 = set(df_vocab2.index.tolist())

In [18]:
# Words that occur only in df_vocab2.
pd.Series(list(set2-set1))

0          
1         #
2    \rceil
3    \right
4     \left
5    \begin
6      \end
dtype: object

In [19]:
# Words that occur only in df_vocab.
pd.Series(list(set1-set2))

0                 1.7
1             \right|
2                  cm
3       \end{tabular}
4                 0.1
5                 0.3
6             \left\{
7                 0.5
8                 0.4
9                  \{
10      \right\rbrace
11         \left\vert
12           \right\}
13       \left\langle
14                 in
15     \operatorname*
16        \end{cases}
17      \right\rbrack
18             \left|
19     \begin{matrix}
20      \right\rangle
21            \right[
22      \begin{cases}
23            \right]
24                 pt
25               0.25
26                 \:
27                 \;
28            Object]
29                 \&
30                 \'
31         \left\Vert
32                 \#
33    \begin{tabular}
34                 \!
35             \left[
36                 \/
37                 \,
38                 \-
39                 \*
40                 \\
41             \left]
42                 \]
43       \left\lfloor
44       \left\lbrace
45        

In [20]:
## Erroneous tokens in the normalized latex code:
## list the vocabulary entries whose word contains 'bject'.
df_vocab[df_vocab.index.str.contains('bject')]

Unnamed: 0,freq,id
Object],23,56
[object,23,69


In [21]:
# Remove low-frequency words. Also remove the words '[object' and 'Object]', which are probably an artifact of
# a bug in the harvardnlp latex normalization code because they are not valid latex commands. Their frequency is 23,
# therefore we set the frequency threshold to 24.
def prune_vocab(df_data_, df_vocab_, min_freq=24):
    r"""Drop rare words from the vocabulary and the samples that use them.

    Args:
        df_data_: Tokenized samples with a ``latex_tokenized`` column holding
            one list of tokens per row.
        df_vocab_: Vocabulary indexed by word with ``id`` and ``freq`` columns,
            including the ``\eos`` and ``\bos`` rows.
        min_freq: Words whose ``freq`` is below this value are removed.

    Returns:
        ``(df_pruned, df_vocab_keep)``: the samples that contain none of the
        removed words, and the surviving vocabulary re-numbered from 2 with
        the special words re-appended.
    """
    df_vocab_keep = df_vocab_[df_vocab_.freq >= min_freq]
    # Work from the vocabulary that was passed in. The notebook-level df_vocab
    # was referenced here before, which gave wrong results for any other
    # vocabulary.
    remove_words = set(df_vocab_.index.values.tolist()) - set(df_vocab_keep.index.values.tolist())
    remove_words |= set(['[object', 'Object]'])
    # Keep the vocabulary consistent with the samples: the two artifact words
    # are dropped even if they reach min_freq.
    df_vocab_keep = df_vocab_keep[~df_vocab_keep.index.isin(remove_words)]
    print('Removing the following %d words from the vocabulary: %s'%(len(remove_words), remove_words))
    sr_keep = df_data_.latex_tokenized.map(lambda tokens: remove_words.isdisjoint(tokens))
    df_pruned = df_data_[sr_keep]
    kept = df_pruned.shape[0]
    removed = df_data_.shape[0] - kept
    print('%d samples (%.1f%%) removed'%(removed, removed*100./df_data_.shape[0]))
    print('df_pruned.shape = %s'%(df_pruned.shape,))
    display(df_pruned[:1])

    ## Prune vocabulary: renumber the surviving words from 2, then put the
    ## special words back with ids 0 and 1.
    df_vocab_keep = remove_special_words(df_vocab_keep)
    num_words_keep = df_vocab_keep.shape[0]
    df_vocab_keep = df_vocab_keep.assign(id=list(range(2, num_words_keep+2)))
    df_vocab_keep = append_special_words(df_vocab_keep, df_data_.shape[0])
    # NOTE(review): the first number counts the special words, the second does not.
    print('Vocabulary size reduced from %d to %d'%(df_vocab_.shape[0], num_words_keep))
    print('Pruned vocab shape = %s'%(df_vocab_keep.shape,))
    return df_pruned, df_vocab_keep

In [22]:
# Prune the whitespace-split vocabulary and its samples with a frequency threshold of 24.
df_data_pruned, df_vocab_pruned = prune_vocab(df_tokenized, df_vocab, 24)

Removing the following 162 words from the vocabulary: set(['\\searrow', '\\succ', '\\makebox', '\\setminus', '\\bigtriangledown', '\\medskip', '\\oslash', '\\bigsqcup', '\\scshape', '\\&', "\\'", '\\left\\Vert', '\\hfil', '\\smallint', '\\nearrow', '\\surd', '\\hspace', '\\-', '\\*', '\\lfloor', '\\left]', '\\atopwithdelims', '@', '\\textcircled', '\\rlap', '\\supseteq', '\\]', '\\left/', '\\[', '\\of', '\\ddag', '\\verb', '\\arrowvert', '\\left>', '\\right\\rceil', '\\fbox', '\\normalsize', '\\Huge', '\\def', '\\framebox', '\\b', '?', '\\bigcirc', '\\j', '\\AA', '\\longleftarrow', '\\lceil', '\\pounds', '\\mathnormal', '\\footnotemark', '\\left\\lceil', '\\sc', '\\triangleleft', '\\rightleftharpoons', '\\right\\rbrack', '\\SS', '\\right[', '\\symbol', '\\triangleright', '0.25', '\\Longleftarrow', '\\succeq', '0.23', '\\mathord', '\\coprod', '\\bigwedge', '\\asymp', '\\nulldelimiterspace', '\\mathbin', '\\land', '\\hphantom', '\\hookrightarrow', '\\right<', '\\right\\rfloor', '\\ominus

Unnamed: 0,image,formula_name,latex,formula_len,height,width,latex_ascii,latex_tokenized,tokenized_len
0,23be72ded29e9b4_basic.png,23be72ded29e9b4_basic,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,331,78,738,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,"[\int, _, {, -, \epsilon, }, ^, {, \infty, }, d, l, \:, \mathrm, {, e, }, ^, {, -, l, \zeta, }, ...",112


Vocabulary size reduced from 519 to 355
Pruned vocab shape = (357, 2)


In [23]:
# Show the pruned vocabulary, most frequent words first.
df_vocab_pruned.sort_values(by='freq', ascending=False)

Unnamed: 0,freq,id
},1110918,355
{,1110918,353
_,356305,324
^,303405,323
2,191217,18
(,176035,6
),175851,7
1,144127,17
-,134013,11
=,128043,29


In [24]:
# Column-keyed dict of the pruned vocabulary; dict_vocab['id'] maps word -> id.
dict_vocab = df_vocab_pruned.to_dict()

In [25]:
def reverse_dict(d):
    """Return a new dict that maps each value of ``d`` back to its key.

    When several keys share a value, the key visited last wins.
    """
    return {value: key for key, value in d.items()}
# Inverse lookup: token id -> word.
dict_id2word = reverse_dict(dict_vocab['id'])

In [26]:
# Display the word -> id mapping.
dict_vocab['id']

{'!': 2,
 '"': 3,
 '&': 4,
 "'": 5,
 '(': 6,
 ')': 7,
 '*': 8,
 '+': 9,
 ',': 10,
 '-': 11,
 '--': 12,
 '---': 13,
 '.': 14,
 '/': 15,
 '0': 16,
 '1': 17,
 '2': 18,
 '3': 19,
 '4': 20,
 '5': 21,
 '6': 22,
 '7': 23,
 '8': 24,
 '9': 25,
 ':': 26,
 ';': 27,
 '<': 28,
 '=': 29,
 '>': 30,
 'A': 31,
 'B': 32,
 'C': 33,
 'D': 34,
 'E': 35,
 'F': 36,
 'G': 37,
 'H': 38,
 'I': 39,
 'J': 40,
 'K': 41,
 'L': 42,
 'M': 43,
 'N': 44,
 'O': 45,
 'P': 46,
 'Q': 47,
 'R': 48,
 'S': 49,
 'T': 50,
 'U': 51,
 'V': 52,
 'W': 53,
 'X': 54,
 'Y': 55,
 'Z': 56,
 '[': 57,
 '\\': 58,
 '\\!': 59,
 '\\#': 60,
 '\\,': 61,
 '\\/': 62,
 '\\:': 63,
 '\\;': 64,
 '\\Big': 65,
 '\\Bigg': 66,
 '\\Biggl': 67,
 '\\Biggr': 68,
 '\\Bigl': 69,
 '\\Bigr': 70,
 '\\Delta': 71,
 '\\Gamma': 72,
 '\\Im': 73,
 '\\L': 74,
 '\\Lambda': 75,
 '\\Large': 76,
 '\\Leftrightarrow': 77,
 '\\Longleftrightarrow': 78,
 '\\Longrightarrow': 79,
 '\\O': 80,
 '\\Omega': 81,
 '\\P': 82,
 '\\Phi': 83,
 '\\Pi': 84,
 '\\Psi': 85,
 '\\Re': 86,
 '\\Righ

In [27]:
# Display the id -> word mapping.
dict_id2word

{0: '\\eos',
 1: '\\bos',
 2: '!',
 3: '"',
 4: '&',
 5: "'",
 6: '(',
 7: ')',
 8: '*',
 9: '+',
 10: ',',
 11: '-',
 12: '--',
 13: '---',
 14: '.',
 15: '/',
 16: '0',
 17: '1',
 18: '2',
 19: '3',
 20: '4',
 21: '5',
 22: '6',
 23: '7',
 24: '8',
 25: '9',
 26: ':',
 27: ';',
 28: '<',
 29: '=',
 30: '>',
 31: 'A',
 32: 'B',
 33: 'C',
 34: 'D',
 35: 'E',
 36: 'F',
 37: 'G',
 38: 'H',
 39: 'I',
 40: 'J',
 41: 'K',
 42: 'L',
 43: 'M',
 44: 'N',
 45: 'O',
 46: 'P',
 47: 'Q',
 48: 'R',
 49: 'S',
 50: 'T',
 51: 'U',
 52: 'V',
 53: 'W',
 54: 'X',
 55: 'Y',
 56: 'Z',
 57: '[',
 58: '\\',
 59: '\\!',
 60: '\\#',
 61: '\\,',
 62: '\\/',
 63: '\\:',
 64: '\\;',
 65: '\\Big',
 66: '\\Bigg',
 67: '\\Biggl',
 68: '\\Biggr',
 69: '\\Bigl',
 70: '\\Bigr',
 71: '\\Delta',
 72: '\\Gamma',
 73: '\\Im',
 74: '\\L',
 75: '\\Lambda',
 76: '\\Large',
 77: '\\Leftrightarrow',
 78: '\\Longleftrightarrow',
 79: '\\Longrightarrow',
 80: '\\O',
 81: '\\Omega',
 82: '\\P',
 83: '\\Phi',
 84: '\\Pi',
 85: '\\P

In [28]:
# Pickle both lookup tables into the output folder, using the highest
# pickle protocol available.
for _pkl_name, _table in (('dict_vocab.pkl', dict_vocab), ('dict_id2word.pkl', dict_id2word)):
    with open(os.path.join(data_dir, output_dir, _pkl_name), 'wb') as f:
        pickle.dump(_table, f, pickle.HIGHEST_PROTOCOL)

In [29]:
# Count the pruned formulas whose original latex contains ten consecutive backslashes.
# Single-argument print() gives the same output on Python 2 and Python 3.
print(df_data_pruned.latex_ascii[df_clean.latex.str.contains(r'\\\\\\\\\\\\\\\\\\\\')].count())

0


In [30]:
def make_word2id(df_tokenized_, df_vocab_):
    """Translate every token list into the matching list of token ids.

    Args:
        df_tokenized_: Samples with ``latex_tokenized`` (one list of tokens per
            row) and ``tokenized_len`` columns.
        df_vocab_: Vocabulary indexed by word with an ``id`` column.

    Returns:
        ``df_tokenized_`` with ``word2id`` (list of ids) and ``word2id_len``
        columns added and ``tokenized_len`` dropped.

    Raises:
        KeyError: If a token is missing from the vocabulary.
        AssertionError: If an id list's length differs from ``tokenized_len``.
    """
    word2id = df_vocab_.id.to_dict()
    # Build real lists. On Python 3 map() returns a lazy iterator, which has no
    # length and would make the length check below fail.
    sr_word2id = df_tokenized_.latex_tokenized.apply(lambda tokens: [word2id[t] for t in tokens])
    df_ = df_tokenized_.assign(word2id=sr_word2id, word2id_len=sr_word2id.str.len())
    assert df_.word2id_len.equals(df_.tokenized_len)
    df_ = df_.drop(columns=['tokenized_len'])
    print(df_.shape)
    display(df_[:1])
    return df_

In [31]:
# Convert the pruned samples to id sequences using the pruned vocabulary.
df_word2id = make_word2id(df_data_pruned, df_vocab_pruned)

(100088, 10)


Unnamed: 0,image,formula_name,latex,formula_len,height,width,latex_ascii,latex_tokenized,word2id,word2id_len
0,23be72ded29e9b4_basic.png,23be72ded29e9b4_basic,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,331,78,738,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,"[\int, _, {, -, \epsilon, }, ^, {, \infty, }, d, l, \:, \mathrm, {, e, }, ^, {, -, l, \zeta, }, ...","[173, 324, 353, 11, 154, 355, 323, 353, 172, 355, 330, 338, 63, 207, 353, 331, 355, 323, 353, 11...",112


In [32]:
# Pickle the id-sequence frame into the output folder.
df_word2id.to_pickle(os.path.join(data_dir, output_dir, 'df_word2id.pkl'))

### Temp