In [None]:
from bs4 import BeautifulSoup, SoupStrainer
from IPython.display import Markdown
from loaders import HtmlDocumentLoader
from preprocessors import ArxivHtmlPaperPreprocessor 

def display_md(content):
  """Render ``content`` as Markdown in the notebook's cell output."""
  rendered = Markdown(content)
  display(rendered)

# Second argument passed to HtmlDocumentLoader; presumably the directory the
# loader uses for caching (confirm against the loaders module).
cache_path = "./loader_cache"
# URI of the arXiv HTML paper this notebook loads.
doc_uri = "https://arxiv.org/html/2312.10997v5"

In [None]:
# Build a loader for the paper URI (passing the cache path) and load the document.
loader = HtmlDocumentLoader(doc_uri, cache_path)
doc = loader.load()
# Show the loaded object in the cell output.
display(doc)

In [None]:
# Run the arXiv HTML preprocessor over the loaded document and render the
# text it returns as Markdown.
preprocessor = ArxivHtmlPaperPreprocessor()
cleaned_text = preprocessor.get_text(doc)
display_md(cleaned_text)

#### Load HTML

In [None]:
# Read 'paper.html' (relative to the working directory) as UTF-8 text; the
# extractor functions in the following cells take this string as input.
file_path = 'paper.html'
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

#### Extract Title

In [None]:
def extract_title(html_content):
  """Return the document title found in the given HTML.

  Args:
    html_content: HTML source of the page, as a string.

  Returns:
    The text of every ``<h1 class="ltx_title ltx_title_document">`` element,
    with runs of whitespace collapsed to single spaces and the ends trimmed.
    An empty string is returned when no such heading is present.
  """
  # Restrict parsing to the title heading; everything else is skipped.
  strainer = SoupStrainer('h1', class_="ltx_title ltx_title_document")
  soup = BeautifulSoup(html_content, 'html.parser', parse_only=strainer)
  # Raw get_text() keeps the newlines/indentation of the markup; collapse
  # them so the title comes back as one clean line.
  return ' '.join(soup.get_text().split())

# Print the title extracted from the HTML read earlier in the notebook.
print(extract_title(html_content))

#### Extract Authors and Affiliations

In [None]:
def extract_authors_and_affiliations(html_content):
    """List the authors in the HTML together with their affiliations.

    Args:
        html_content: HTML source of the page, as a string.

    Returns:
        A string with one ``"name: affiliation"`` line per
        ``<span class="ltx_creator ltx_role_author">`` found inside
        ``<div class="ltx_authors">``. Several affiliation spans of one author
        are joined with a single space. Author entries that have no
        ``ltx_personname`` span are skipped. An empty string is returned when
        no author entry is found.
    """
    # Parse only the authors container; the rest of the page is skipped.
    strainer = SoupStrainer('div', class_="ltx_authors")
    soup = BeautifulSoup(html_content, 'html.parser', parse_only=strainer)

    formatted_output = []
    for author in soup.find_all('span', class_='ltx_creator ltx_role_author'):
        name_span = author.find('span', class_='ltx_personname')
        if name_span is None:
            # Without a name element there is nothing to label the entry
            # with; skip it instead of failing on None.get_text().
            continue
        # get_text(strip=True) joins the text nodes with an empty separator,
        # which glues together pieces separated only by tags such as <br>.
        # Join the nodes with a space and then collapse whitespace instead.
        name = ' '.join(name_span.get_text(' ').split())
        affiliation = ' '.join(
            ' '.join(span.get_text(' ').split())
            for span in author.find_all('span', class_='ltx_contact ltx_role_affiliation')
        )
        formatted_output.append(f"{name}: {affiliation}")
    return "\n".join(formatted_output)

# Print the "name: affiliation" lines extracted from the HTML read earlier.
print(extract_authors_and_affiliations(html_content))


#### Extract Abstract

In [None]:
def extract_abstract(html_content):
    """Return the abstract heading and text found in the given HTML.

    Args:
        html_content: HTML source of the page, as a string.

    Returns:
        ``"<heading>\\n\\n<text>"`` where the heading is the text of the
        ``<h6 class="ltx_title ltx_title_abstract">`` element (``"Abstract"``
        when that element is missing) and the text is made of the
        ``<p class="ltx_p">`` paragraphs inside ``<div class="ltx_abstract">``,
        separated by blank lines, with ``ltx_note`` spans removed and
        whitespace collapsed. Returns ``"Abstract not found"`` when the
        abstract container or its paragraphs are missing.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    abstract_div = soup.find('div', class_='ltx_abstract')
    if abstract_div is None:
        return "Abstract not found"

    abstract_title = abstract_div.find('h6', class_='ltx_title ltx_title_abstract')
    if abstract_title is not None:
        abstract_title_text = ' '.join(abstract_title.get_text().split())
    else:
        abstract_title_text = "Abstract"

    # Use every paragraph of the abstract, not only the first one, so that
    # multi-paragraph abstracts are not truncated.
    paragraphs = abstract_div.find_all('p', class_='ltx_p')
    if not paragraphs:
        return "Abstract not found"

    paragraph_texts = []
    for paragraph in paragraphs:
        # Drop footnotes. Re-query after each removal so a note nested inside
        # an already-removed note is never touched again.
        footnote = paragraph.find('span', class_='ltx_note')
        while footnote is not None:
            footnote.decompose()
            footnote = paragraph.find('span', class_='ltx_note')

        # get_text(strip=True) strips each text node and joins them with an
        # empty separator, gluing words around inline tags. Keep the original
        # spacing and collapse whitespace runs instead.
        text = ' '.join(paragraph.get_text().split())
        if text:
            paragraph_texts.append(text)

    abstract_body = "\n\n".join(paragraph_texts)
    return f"{abstract_title_text}\n\n{abstract_body}"
  
# Render the extracted abstract as Markdown in the cell output.
display_md(extract_abstract(html_content))

#### Extract Section with Subheadings

In [None]:
def extract_section_with_subheadings(html_content, section_id):
    """Return the text of one section, including its subsection headings.

    Args:
        html_content: HTML source of the page, as a string.
        section_id: Value of the ``id`` attribute of the ``<section>`` element
            to extract (for example ``"S2"``).

    Returns:
        The section's first ``h2``/``h3`` heading with class ``ltx_title``,
        followed by its paragraphs and subsection headings in document order,
        separated by blank lines. Subsection headings get two extra leading
        newlines. Whitespace inside each piece is collapsed to single spaces.
        Returns ``"Section not found"`` when no ``<section>`` has the given id.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    section = soup.find('section', id=section_id)
    if section is None:
        return "Section not found"

    output_text = []
    main_heading = section.find(['h2', 'h3'], class_='ltx_title')
    if main_heading is not None:
        # get_text(strip=True) would join the heading's text nodes with an
        # empty separator (e.g. the section number and the title); keep the
        # original spacing and collapse whitespace runs instead.
        output_text.append(' '.join(main_heading.get_text().split()))

    # Paragraphs and subsection headings of the section, in document order.
    elements = section.find_all(['p', 'h3'], class_=lambda x: x in ['ltx_p', 'ltx_title ltx_title_subsection'])
    for element in elements:
        if element is main_heading:
            # When section_id names a subsection, its own h3 heading is also
            # matched here; it was already emitted above, so skip it.
            continue
        text = ' '.join(element.get_text().split())
        if element.name == 'h3':
            output_text.append("\n\n" + text)
        else:
            output_text.append(text)
    return '\n\n'.join(output_text)

# Render the section whose id is "S2" as Markdown in the cell output.
display_md(extract_section_with_subheadings(html_content, "S2"))