# im2latex(S): Tokenizer

&copy; Copyright 2017 Sumeet S Singh

    This file is part of im2latex solution by Sumeet S Singh.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the Affero GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    Affero GNU General Public License for more details.

    You should have received a copy of the Affero GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

In [1]:
import pandas as pd
import os
import re
import codecs
from IPython.display import display
from six.moves import cPickle as pickle
import string
from PIL import Image

In [2]:
# Notebook-wide pandas display limits (rows, columns, cell width, line width).
pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 160)

# Root folder of the generated dataset and the sub-folder holding formula images.
data_dir = '../data/generated2'
image_folder = os.path.join(data_dir, 'formula_images')

#### Character Set Chart for Reference
##### ASCII Control Characters

                        CTRL   (^D means to hold the CTRL key and hit d)
    Oct  Dec Char  Hex  Key     Comments
    \000   0  NUL  \x00  ^@ \0 (Null byte)
    \001   1  SOH  \x01  ^A    (Start of heading)
    \002   2  STX  \x02  ^B    (Start of text)
    \003   3  ETX  \x03  ^C    (End of text) (see: UNIX keyboard CTRL)
    \004   4  EOT  \x04  ^D    (End of transmission) (see: UNIX keyboard CTRL)
    \005   5  ENQ  \x05  ^E    (Enquiry)
    \006   6  ACK  \x06  ^F    (Acknowledge)
    \007   7  BEL  \x07  ^G    (Ring terminal bell)
    \010   8   BS  \x08  ^H \b (Backspace)  (\b matches backspace inside [] only)
                                            (see: UNIX keyboard CTRL)
    \011   9   HT  \x09  ^I \t (Horizontal tab)
    \012  10   LF  \x0A  ^J \n (Line feed)  (Default UNIX NL) (see End of Line below)
    \013  11   VT  \x0B  ^K    (Vertical tab)
    \014  12   FF  \x0C  ^L \f (Form feed)
    \015  13   CR  \x0D  ^M \r (Carriage return)  (see: End of Line below)
    \016  14   SO  \x0E  ^N    (Shift out)
    \017  15   SI  \x0F  ^O    (Shift in)
    \020  16  DLE  \x10  ^P    (Data link escape)
    \021  17  DC1  \x11  ^Q    (Device control 1) (XON) (Default UNIX START char.)
    \022  18  DC2  \x12  ^R    (Device control 2)
    \023  19  DC3  \x13  ^S    (Device control 3) (XOFF)  (Default UNIX STOP char.)
    \024  20  DC4  \x14  ^T    (Device control 4)
    \025  21  NAK  \x15  ^U    (Negative acknowledge)  (see: UNIX keyboard CTRL)
    \026  22  SYN  \x16  ^V    (Synchronous idle)
    \027  23  ETB  \x17  ^W    (End of transmission block)
    \030  24  CAN  \x18  ^X    (Cancel)
    \031  25  EM   \x19  ^Y    (End of medium)
    \032  26  SUB  \x1A  ^Z    (Substitute character)
    \033  27  ESC  \x1B  ^[    (Escape)
    \034  28  FS   \x1C  ^\    (File separator, Information separator four)
    \035  29  GS   \x1D  ^]    (Group separator, Information separator three)
    \036  30  RS   \x1E  ^^    (Record separator, Information separator two)
    \037  31  US   \x1F  ^_    (Unit separator, Information separator one)
    \177 127  DEL  \x7F  ^?    (Delete)  (see: UNIX keyboard CTRL)
    
    string.printable = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'
    string.whitespace = '\t\n\x0b\x0c\r '

In [3]:
def loadImageList(filepath):
    """Read a whitespace-delimited, header-less image list into a DataFrame.

    The fields of each line are named ``id``, ``name`` and ``type``; only
    ``id`` and ``name`` are loaded.

    Args:
        filepath: Path of the list file to read.

    Returns:
        pandas.DataFrame with the columns ``id`` (read as int) and ``name``
        (read as str).
    """
    # sep=r'\s+' is the documented equivalent of delim_whitespace=True, which
    # newer pandas releases deprecate. The dtype mapping only lists the
    # columns that are actually loaded.
    return pd.read_table(
        filepath,
        header=None,
        names=['id', 'name', 'type'],
        sep=r'\s+',
        usecols=['id', 'name'],
        dtype={'id': int, 'name': str},
    )

def getImageDetails(data_dir):
    """Return the pixel size of every listed image, using a CSV cache if present.

    If ``image_details.csv`` can be read from ``data_dir`` it is returned as is.
    Otherwise the image list ``im2latex.lst`` in ``data_dir`` is loaded, each
    listed image (base name + ``.png``) is opened to read its size, and the
    result is written to ``image_details.csv`` before being returned.

    NOTE(review): images are looked up in the module-level ``image_folder``,
    not in a folder derived from the ``data_dir`` argument.

    Args:
        data_dir: Folder containing ``im2latex.lst`` and the CSV cache.

    Returns:
        pandas.DataFrame with the columns ``filename``, ``width`` and ``height``.
        Images that could not be opened are left out.
    """
    cache_path = os.path.join(data_dir, 'image_details.csv')
    try:
        return pd.read_csv(cache_path, index_col=0)
    except Exception:
        # Best-effort cache: any failure to load it (typically a missing file)
        # means the details are rebuilt below. Unlike a bare ``except`` this
        # lets KeyboardInterrupt / SystemExit propagate.
        pass

    widths = []
    heights = []
    filenames = []
    imageList = loadImageList(os.path.join(data_dir, 'im2latex.lst'))
    # The second column of the image list holds the image base name.
    for base_name in imageList.iloc[:, 1]:
        try:
            image_name = base_name + '.png'
            # The context manager closes the underlying file once the size is read.
            with Image.open(os.path.join(image_folder, image_name)) as im:
                width, height = im.size
        except Exception:
            # Unreadable or missing images are deliberately skipped.
            continue
        # Append only after everything succeeded so the three lists stay aligned.
        widths.append(width)
        heights.append(height)
        filenames.append(image_name)
    print(len(widths), len(filenames), len(heights))
    dff = pd.DataFrame({'filename':filenames, 'width':widths, 'height':heights})
    dff.to_csv(cache_path)
    return dff


def getDatasetDetails(data_dir):
    """Return the dataset table extended with image sizes and formula lengths.

    If ``df_image_details.pkl`` can be unpickled from ``data_dir`` it is
    returned as is. Otherwise ``im2latex_map.pkl`` is loaded, every row's image
    is opened to read its size, and the table is returned with the added
    columns ``width``, ``height`` and ``formula_len`` (``len`` of the row's
    ``latex`` value) after being pickled to ``df_image_details.pkl``.

    NOTE(review): images are looked up in the module-level ``image_folder``,
    not in a folder derived from the ``data_dir`` argument.

    Args:
        data_dir: Folder containing ``im2latex_map.pkl`` and the pickle cache.

    Returns:
        pandas.DataFrame as described above.

    Raises:
        Whatever ``Image.open`` raises if an image cannot be opened while the
        table is being rebuilt (no image is skipped here).
    """
    cache_path = os.path.join(data_dir, 'df_image_details.pkl')
    try:
        return pd.read_pickle(cache_path)
    except Exception:
        # Best-effort cache: any failure to load it means a rebuild. Unlike a
        # bare ``except`` this lets KeyboardInterrupt / SystemExit propagate.
        pass

    datasetDF = pd.read_pickle(os.path.join(data_dir, 'im2latex_map.pkl'))
    widths = []
    heights = []
    formula_lens = []
    # Walk the two needed columns directly instead of materialising every row.
    for image_name, latex in zip(datasetDF['image'], datasetDF['latex']):
        # The context manager closes the underlying file once the size is read.
        with Image.open(os.path.join(image_folder, image_name)) as im:
            width, height = im.size
        widths.append(width)
        heights.append(height)
        formula_lens.append(len(latex))
    print(len(widths), len(heights))
    datasetDF = datasetDF.assign(width=widths, height=heights, formula_len=formula_lens)
    datasetDF.to_pickle(cache_path)
    return datasetDF

# Load (or build and cache) the dataset table with image sizes and formula lengths.
df_image_details = getDatasetDetails(data_dir)

In [4]:
def get_df_clean(data_dir_, df_image_details_):
    """Return the dataset with normalised formulas and unusable rows removed.

    If ``df_clean.pkl`` can be unpickled from ``data_dir_`` it is returned as
    is. Otherwise the table is rebuilt from ``df_image_details_``:

    * ``latex_ascii`` is the ``latex`` column passed through an ASCII
      decode/encode round trip, with whitespace runs coalesced to one space,
      then stripped of surrounding whitespace and surrounding percent signs;
      ``latex_ascii_len`` is its length.
    * Rows whose raw ``latex`` contains a character outside
      ``string.printable`` or any percent sign are dropped. Because the filter
      looks at the raw column, formulas with only leading/trailing percent
      signs are dropped as well.

    The rebuilt table is pickled to ``df_clean.pkl`` and returned, and the
    number of rows in each category is printed.

    Args:
        data_dir_: Folder holding the pickle cache.
        df_image_details_: DataFrame with a ``latex`` column.

    Returns:
        pandas.DataFrame containing the kept rows.
    """
    cache_path = os.path.join(data_dir_, 'df_clean.pkl')
    try:
        return pd.read_pickle(cache_path)
    except Exception as e:
        # Cache miss (or unreadable cache): report why, then rebuild below.
        print(e)

    # Matches any character that is not in string.printable. re.escape keeps
    # the character class well-formed even though string.printable itself
    # contains ']', '\', '^' and '-'.
    nonprintable_chars_re = '[^' + re.escape(string.printable) + ']'
    percents_re = r'%'

    df = df_image_details_
    # NOTE(review): the decode/encode round trip looks written for Python 2
    # byte strings, and str.replace is relied on to treat the pattern as a
    # regex (newer pandas needs regex=True for that) - confirm before porting.
    cleaned = (df.latex.str.decode('ascii').str.encode('ascii')
               .str.replace(r"\s+", ' ').str.strip().str.strip('%'))
    df = df.assign(latex_ascii=cleaned, latex_ascii_len=cleaned.str.len())

    # textogif ignores everything after a percent sign, hence the percent filter.
    bad1 = df.latex.str.contains(nonprintable_chars_re)
    bad2 = df.latex.str.contains(percents_re)
    good = ~(bad1 | bad2)
    # Report actual counts: .sum() of a boolean mask is the number of hits.
    print('nonprintables #: %d' % bad1.sum())
    print('percents #: %d' % bad2.sum())
    print('good #: %d' % good.sum())

    df = df[good]
    df.to_pickle(cache_path)
    return df

In [5]:
# Load (or build and cache) the cleaned dataset, then show its (rows, columns) shape.
df_clean = get_df_clean(data_dir, df_image_details)
display(df_clean.shape)

(99600, 8)

In [6]:
class TokenDict(object):
    """Tallies how many times each token has been seen."""

    def __init__(self):
        # Maps token -> number of occurrences recorded so far.
        self._tokens = {}

    def account(self, token_list):
        """Record one occurrence for every item of ``token_list``."""
        for tok in token_list:
            self._count(tok)

    def _count(self, token):
        """Add one to the tally of ``token`` (unseen tokens start at 1); returns 1."""
        self._tokens[token] = self._tokens.get(token, 0) + 1
        return 1

    @property
    def dict(self):
        """The token -> count mapping itself (the live dict, not a copy)."""
        return self._tokens

    @property
    def tokens(self):
        """All recorded tokens as a sorted list."""
        return sorted(self._tokens)
            
def get_vocabulary(df_, data_dir_):
    """Tokenize the formulas and build the token vocabulary, with pickle caching.

    If both ``df_vocab.pkl`` and ``df_tokenized.pkl`` can be unpickled from
    ``data_dir_`` they are returned as is. Otherwise every ``latex_ascii``
    string of ``df_`` is split into tokens, the tokens are counted, and both
    tables are pickled to ``data_dir_`` before being returned.

    Args:
        df_: DataFrame with a ``latex_ascii`` column.
        data_dir_: Folder holding the pickle caches.

    Returns:
        Tuple ``(df_vocab, df_tokenized)``. ``df_vocab`` is indexed by token
        (sorted) with the columns ``id`` (1-based; 0 is reserved as a 'null'
        token) and ``freq``. ``df_tokenized`` is ``df_`` plus a
        ``latex_tokenized`` column holding each formula's token list.
    """
    vocab_path = os.path.join(data_dir_, 'df_vocab.pkl')
    tokenized_path = os.path.join(data_dir_, 'df_tokenized.pkl')
    try:
        return pd.read_pickle(vocab_path), pd.read_pickle(tokenized_path)
    except Exception as e:
        # Cache miss (or unreadable cache): report why, then rebuild below.
        print(e)

    ## Split latex into tokens. Isolate latex commands first - i.e.
    ## (optionally even number of backslashes) followed by one backslash followed by letters.
    ## Everything else is a one-character token in itself.
    ## NOTE(review): the lookbehinds only test that the preceding characters are
    ## backslashes, not that the whole run has even length, so a command after an
    ## odd number (three or more) of backslashes is still matched - confirm intent.
    LATEX_RE = re.compile(r"(?:(?<=\\\\\\\\\\\\)\\[a-zA-Z]+)|(?:(?<=\\\\\\\\)\\[a-zA-Z]+)|(?:(?<=\\\\)\\[a-zA-Z]+)|(?:(?<!\\)\\[a-zA-Z]+)|.")
    sr_token = df_.latex_ascii.str.findall(LATEX_RE)
    df_tokenized = df_.assign(latex_tokenized=sr_token)

    ## Aggregate the tokens (plain loop: the counting is a pure side effect)
    vocab = TokenDict()
    for token_list in sr_token:
        vocab.account(token_list)

    ## Sorted tokens and their frequencies, in matching order
    tokens = vocab.tokens
    count = [vocab.dict[t] for t in tokens]

    ## Assign token-ids. Start with 1. Reserve 0 as a 'null' token.
    df_vocab = pd.DataFrame({'id': list(range(1, len(tokens) + 1)), 'freq': count},
                            index=tokens, columns=['id', 'freq'])

    ## Persist to disk
    df_vocab.to_pickle(vocab_path)
    df_tokenized.to_pickle(tokenized_path)
    return df_vocab, df_tokenized

In [7]:
# Load (or build and cache) the token vocabulary and the tokenized dataset.
df_vocab, df_tokenized = get_vocabulary(df_clean, data_dir)

[Errno 2] No such file or directory: '../data/generated2\\df_vocab.pkl'


In [13]:
#df_vocab[df_vocab.index.str.contains(r'\\')]

Unnamed: 0,id,freq
\,60,151304
\AA,61,5
\Big,62,2042
\Bigg,63,307
\Biggl,64,185
\Biggm,65,5
\Biggr,66,194
\Bigl,67,826
\Bigm,68,6
\Bigr,69,878


In [14]:
# Count the rows whose raw latex contains a long run of consecutive backslashes.
print df_clean.latex_ascii[df_clean.latex.str.contains(r'\\\\\\\\\\\\\\\\\\\\')].count()

0


In [15]:
def get_word2id(df_tokenized_, df_vocab_, data_dir_):
    """Map every tokenized formula to its sequence of token ids, with caching.

    If ``df_word2id.pkl`` can be unpickled from ``data_dir_`` it is returned as
    is. Otherwise each token list in ``latex_tokenized`` is translated through
    the ``id`` column of ``df_vocab_`` and the result is pickled to
    ``df_word2id.pkl`` in ``data_dir_`` before being returned.

    Args:
        df_tokenized_: DataFrame with a ``latex_tokenized`` column of token lists.
        df_vocab_: DataFrame indexed by token with an ``id`` column.
        data_dir_: Folder holding the pickle cache.

    Returns:
        ``df_tokenized_`` plus the columns ``word2id`` (list of token ids per
        row) and ``word2id_len`` (length of that list).

    Raises:
        KeyError: If a token is missing from ``df_vocab_`` during a rebuild.
    """
    # Read and write the cache in the same folder (the one that was passed in).
    cache_path = os.path.join(data_dir_, 'df_word2id.pkl')
    try:
        return pd.read_pickle(cache_path)
    except Exception as e:
        # Cache miss (or unreadable cache): report why, then rebuild below.
        print(e)

    word2id = df_vocab_.id.to_dict()
    # A list comprehension yields a real list on both Python 2 and Python 3.
    sr_word2id = df_tokenized_.latex_tokenized.apply(
        lambda token_list: [word2id[t] for t in token_list])
    df_ = df_tokenized_.assign(word2id=sr_word2id, word2id_len=sr_word2id.str.len())
    df_.to_pickle(cache_path)
    return df_

In [16]:
# Load (or build and cache) the dataset with per-formula token-id sequences.
df_word2id = get_word2id(df_tokenized, df_vocab, data_dir)

[Errno 2] No such file or directory: '../data/generated2\\df_word2id.pkl'


In [17]:
# Show the (rows, columns) shape of the token-id dataset.
df_word2id.shape

(99600, 11)