<h1>Notebook to get LaTeX equations from Website URLs</h1>

<h3>Importing libraries</h3>

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

import re
import os

<h3>Extract LaTeX equations from text</h3>
<p>This method returns the equations. see comments for details. Comments will tell which regular expression extracts which equaiton set<p>

In [154]:
def getAllLatexEquations(text,text2):    
    all_eqs = []

    # $...$
    dollar_eqs = re.findall("\$.*?\$",text)
    all_eqs+=dollar_eqs

    # \begin{equation*}. OR \begin{equation}.. OR \begin{subequations} OR \begin{empheq}
    begineq_eqs =  re.findall("\\\\begin{equation\*.*?equation\*}|\\\\begin{equation.*?equation}|\\\\begin{subequations}.*?subequations}|\\\\begin{empheq}.*?empheq}",text)
    all_eqs+=begineq_eqs

    # \[...\]
    square_brackets = re.findall("\\\\\[.*?\\\\\]",text)
    all_eqs+=square_brackets
    
    # \(...\)
    round_brackets = re.findall("\\\\\(.*?\\\\\)",text)
    all_eqs+=round_brackets


    # \begin{align} OR \begin{align*}
    begin_align = re.findall("begin{align\*}.*?end{align\*}|begin{align}.*?end{align}",text)
    all_eqs+=begin_align


    # \begin{multline}.....\end{multline}
    begin_multiline = re.findall("\\\\begin{multline}.*?\\\\end{multline}",text)
    all_eqs+=begin_multiline

    # \begin{eqnarray} OR \begin{array}
    begin_array = re.findall("\\\\begin{eqnarray}.*?\\\\end{eqnarray}|\\\\begin{array}.*?\\\\end{array}",text)
    all_eqs+=begin_array


    # \begin{verbatim}
    begin_verbatim = re.findall("\\\\begin{verbatim}.*?\\\\end{verbatim}",text)
    all_eqs+=begin_verbatim

    # \begin{gather*}. ... \end{gather*}
    begin_gather = re.findall("\\\\begin{gather}.*?\\\\end{gather}|\\\\begin{gather\*}.*?\\\\end{gather\*}",text)
    all_eqs+=begin_gather


    #\begin{cases}
    begin_cases = re.findall("\\\\begin{gather}.*?\\\\end{gather}|\\\\begin{gather\*}.*?\\\\end{gather\*}",text)
    all_eqs+=begin_cases


    #\begin{pmat}
    begin_pmat = re.findall("\\\\begin{pmat}.*?\\\\end{pmat}",text)
    all_eqs+=begin_pmat


    #terms
    terms = re.findall("\\\\.*? ",text2)
    all_eqs+=terms
    
    return all_eqs

<h3>Given Url extract text</h3>
<p>Note: This method is currently only customized for the website named 'overleaf'</p>

In [165]:
def get_text(url):
    page = urlopen(url)
    full_html = page.read().decode("utf-8")
    soup = BeautifulSoup(full_html,"html.parser")
    div_html_ob = soup.find("div", class_="mw-parser-output")
    # Stringify html
    div_html_txt=""
    for content in div_html_ob.contents:
        div_html_txt+=str(content)
    text = BeautifulSoup(div_html_txt, "html.parser").get_text()
    text = re.sub("\n", "",text)
    
    try:
        text2 = get_table_text(full_html)
    except:
        text2 = ""
    return text,text2

<h3>This method gets the entries of tables present in the website named 'overleaf'</h3>

In [141]:
def get_table_text(full_html):
    soup = BeautifulSoup(full_html,"html.parser")
    table_html_obj = soup.find("table")
    # Stringify html
    table_html_txt=""
    for content in table_html_obj.contents:
        table_html_txt+=str(content)
    
    soup2 = BeautifulSoup(table_html_txt,"html.parser")
    text=""
    for c in soup2.find_all("code"):
        text+=c.get_text()
    return text

<h3>Writes the equaions to a text file</h3>

In [None]:
def writeToTextFile(fileName, all_eqs):

    f = open("training_data/"+fileName,"w")
    strng = ""
    for equation in all_eqs:
        strng = strng + '\n' +equation
    print(len(all_eqs))
    f.write(strng)
    f.close()

<h3>URLs</h3>
<p>All the Urls on the website named 'overleaf' related to mathematics<p>

In [166]:
url0 = "https://www.overleaf.com/learn/latex/Mathematical_expressions"
url1 = "https://www.overleaf.com/learn/Subscripts_and_superscripts"
url2 = "https://www.overleaf.com/learn/Brackets_and_Parentheses"
url3 = "https://www.overleaf.com/learn/Fractions_and_Binomials"
url4 = "https://www.overleaf.com/learn/Aligning_equations"
url5 = "https://www.overleaf.com/learn/Operators"
url6 = "https://www.overleaf.com/learn/Spacing_in_math_mode"
url7 = "https://www.overleaf.com/learn/Integrals,_sums_and_limits"
url8 = "https://www.overleaf.com/learn/Display_style_in_math_mode"
url9 = "https://www.overleaf.com/learn/List_of_Greek_letters_and_math_symbols"
url10 = "https://www.overleaf.com/learn/Mathematical_fonts"

urls = [url0,url1,url2,url3,url4,url5,url6,url7,url8,url9,url10]

<h4>Collect equation from all Urls</h4>

In [167]:
text=""
for url in urls:
    text1, text2 = get_text(url)
    eqs = getAllLatexEquations(text1,text2)
    
    print(url)
    for e in eqs:
       
        text+= "\n"+e


https://www.overleaf.com/learn/latex/Mathematical_expressions
https://www.overleaf.com/learn/Subscripts_and_superscripts
https://www.overleaf.com/learn/Brackets_and_Parentheses
https://www.overleaf.com/learn/Fractions_and_Binomials
https://www.overleaf.com/learn/Aligning_equations
https://www.overleaf.com/learn/Operators
https://www.overleaf.com/learn/Spacing_in_math_mode
https://www.overleaf.com/learn/Integrals,_sums_and_limits
https://www.overleaf.com/learn/Display_style_in_math_mode
https://www.overleaf.com/learn/List_of_Greek_letters_and_math_symbols
https://www.overleaf.com/learn/Mathematical_fonts


<h4>write all the equations to the file named 'overleaf.txt'</h4>

In [181]:
text
f = open("training_data/websites/overleaf.txt","w")
f.write(text)


5382