<h2>Get LaTeX equations from PDFs</h2>

In [1]:
import pdfminer

from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

In [2]:
import os
import re

In [3]:
def extract_text_from_pdf(pdfname):
    output_string = StringIO()

    with open('pdfs/'+pdfname,'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr,output_string,laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            try:
                interpreter.process_page(page)
            except:
                print(page)
    return output_string.getvalue()

In [4]:
def getAllLatexEquations(origtext,text):    
    all_eqs = []

    # $...$
    dollar_eqs = re.findall("\$.*?\$",origtext)
    all_eqs+=dollar_eqs

    # \begin{equation*}. OR \begin{equation}.. OR \begin{subequations} OR \begin{empheq}
    begineq_eqs =  re.findall("\\\\begin{equation\*.*?equation\*}|\\\\begin{equation.*?equation}|\\\\begin{subequations}.*?subequations}|\\\\begin{empheq}.*?empheq}",text)
    all_eqs+=begineq_eqs

    # \[...\]
    square_brackets = re.findall("\\\\\[.*?\\\\\]",text)
    all_eqs+=square_brackets


    # \begin{align} OR \begin{align*}
    begin_align = re.findall("begin{align\*}.*?end{align\*}|begin{align}.*?end{align}",text)
    all_eqs+=begin_align


    # \begin{multline}.....\end{multline}
    begin_multiline = re.findall("\\\\begin{multline}.*?\\\\end{multline}",text)
    all_eqs+=begin_multiline

    # \begin{eqnarray} OR \begin{array}
    begin_array = re.findall("\\\\begin{eqnarray}.*?\\\\end{eqnarray}|\\\\begin{array}.*?\\\\end{array}",text)
    all_eqs+=begin_array


    # \begin{verbatim}
    begin_verbatim = re.findall("\\\\begin{verbatim}.*?\\\\end{verbatim}",text)
    all_eqs+=begin_verbatim

    # \begin{gather*}. ... \end{gather*}
    begin_gather = re.findall("\\\\begin{gather}.*?\\\\end{gather}|\\\\begin{gather\*}.*?\\\\end{gather\*}",text)
    all_eqs+=begin_gather


    #\begin{cases}
    begin_cases = re.findall("\\\\begin{gather}.*?\\\\end{gather}|\\\\begin{gather\*}.*?\\\\end{gather\*}",text)
    all_eqs+=begin_cases


    #\begin{pmat}
    begin_pmat = re.findall("\\\\begin{pmat}.*?\\\\end{pmat}",text)
    all_eqs+=begin_pmat


    return all_eqs

In [5]:
def writeToTextFile(fileName, all_eqs):

    f = open("training_data/"+fileName,"w")
    strng = ""
    for equation in all_eqs:
        strng = strng + '\n' +equation
    print(len(all_eqs))
    f.write(strng)
    f.close()

In [6]:
os.listdir('pdfs')

['mil_maths.pdf',
 'GSWLaTeX_maths.pdf',
 'novices-report_math.pdf',
 'Latex_math.pdf',
 'TheLaTeXMathematicsCompanionGai.pdf']

<h3>TheLatexMathematicsCompanion<h1>


In [7]:
origtext = extract_text_from_pdf('TheLaTeXMathematicsCompanionGai.pdf')

In [8]:
text = re.sub("\n", " ", origtext) # removing newline characters in equations

In [9]:
all_eqs = getAllLatexEquations(origtext,text)
print(len(all_eqs))


337


In [10]:
writeToTextFile("TheLaTeXMathematicsCompanionGai.txt", all_eqs)

337


<h3>Mil</h3>

In [11]:
origtext = extract_text_from_pdf('mil_maths.pdf')

In [12]:
text = re.sub("\n", " ", origtext) # removing newline characters in equations

In [14]:
all_eqs = getAllLatexEquations(origtext,text)
print(len(all_eqs))


157


In [15]:
custom = re.findall("\\\\.*? ",text)

In [16]:
all_eqs+=custom
#writeToTextFile("mil_maths.txt",all_eqs)


1798


<h3> GSW </h3>

In [17]:
origtext = extract_text_from_pdf("GSWLaTeX_maths.pdf")

In [18]:
text = re.sub("\n", " ", origtext)

In [None]:
all_eqs = getAllLatexEquations(origtext,text)
print(len(all_eqs))
for eq in all_eqs:
    print(eq)

In [22]:
custom = re.findall("\\\\.*?[ \n]",text)

In [25]:
all_eqs+=custom

In [26]:
#writeToTextFile("GSWLaTeX_maths.txt",all_eqs)

753


<h3>novices report</h3>

In [9]:
origtext = extract_text_from_pdf("novices-report_math.pdf")

In [11]:
text = re.sub("\n", " ", origtext) # removing newline characters in equations

In [12]:
all_eqs = getAllLatexEquations(origtext,text)
print(len(all_eqs))


110


In [19]:
#writeToTextFile("novices_report.txt",all_eqs)

110


<h3> Latex_math </h3>

In [20]:
origtext = extract_text_from_pdf("Latex_math.pdf")

In [21]:
text = re.sub("\n", " ", origtext) # removing newline characters in equations

In [22]:
all_eqs = getAllLatexEquations(origtext,text)

In [25]:
custom = re.findall("\\\\.*?[ \n]",text)

In [27]:
all_eqs+=custom

In [28]:
#writeToTextFile("Latex_math.txt",all_eqs)

1099
