In [86]:
import tarfile
from tempfile import TemporaryDirectory


from dotenv import load_dotenv
import arxiv
from langchain.tools import Tool
from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain_community.utilities import TextRequestsWrapper
from langchain.docstore.document import Document

from pylatexenc.latex2text import LatexNodes2Text

from bs4 import BeautifulSoup, SoupStrainer

In [2]:
# Load variables from a local .env file into the process environment.
# Presumably this supplies the credentials the Google search wrapper needs — confirm.
load_dotenv()

True

In [3]:
def top_3_results(query):
    """Run `query` through the Google search wrapper and return its first three results.

    A fresh ``GoogleSearchAPIWrapper`` is created on every call.
    """
    google = GoogleSearchAPIWrapper()
    max_hits = 3
    hits = google.results(query, max_hits)
    return hits

In [4]:
# Wrap the search helper as a LangChain Tool so it can be invoked through `tool.run(...)`.
tool = Tool(
    func=top_3_results,
    name="Google Search",
    description="Search Google for recent results.",
)

In [5]:
# Run the search tool for the query "layoutlmv3" and display the raw results
# (each result is a dict with 'title', 'link' and 'snippet' keys, as shown in the output).
searches = tool.run("layoutlmv3")
searches

[{'title': 'LayoutLMv3: Pre-training for Document AI with Unified Text and ...',
  'link': 'https://arxiv.org/abs/2204.08387',
  'snippet': 'Apr 18, 2022 ... Title:LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking ... Abstract:Self-supervised pre-training techniques have\xa0...'},
 {'title': 'Semantic Table Detection with LayoutLMv3',
  'link': 'https://arxiv.org/abs/2211.15504',
  'snippet': 'Nov 25, 2022 ... Title:Semantic Table Detection with LayoutLMv3 ... Abstract:This paper presents an application of the LayoutLMv3 model for semantic table\xa0...'},
 {'title': 'DocILE Benchmark for Document Information Localization and ...',
  'link': 'https://arxiv.org/abs/2302.05658',
  'snippet': 'Feb 11, 2023 ... The benchmark comes with several baselines, including RoBERTa, LayoutLMv3 and DETR-based Table Transformer; applied to both tasks of the\xa0...'}]

In [6]:
# Keep only the URL of each search hit, in the order the hits were returned.
links = [hit['link'] for hit in searches]
links

['https://arxiv.org/abs/2204.08387',
 'https://arxiv.org/abs/2211.15504',
 'https://arxiv.org/abs/2302.05658']

In [87]:
# Wrapper used below to fetch each linked page via `requests.get(link)`.
# NOTE(review): this variable shares its name with the widely used `requests` HTTP library;
# it is the LangChain wrapper instance here, not that module.
requests = TextRequestsWrapper()

In [88]:
# Fetch every linked page, one request per link, preserving the order of `links`.
html_list = list(map(requests.get, links))

In [90]:
# Parse the first fetched page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=html_list[0], features="html.parser")

In [92]:
# Read the paper title from the page's Open Graph metadata.
# The lookup is restricted to <meta> tags, matching the og:description lookup,
# so an unrelated element carrying the same attribute can never be picked up.
soup.find('meta', property='og:title')["content"]

'LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking'

In [94]:
# Read the `content` attribute of the first <meta> tag whose `property` is "og:description"
# (for this page it holds the paper abstract, as shown in the output).
soup.find('meta', property='og:description')["content"]

'Self-supervised pre-training techniques have achieved remarkable progress in Document AI. Most multimodal pre-trained models use a masked language modeling objective to learn bidirectional representations on the text modality, but they differ in pre-training objectives for the image modality. This discrepancy adds difficulty to multimodal representation learning. In this paper, we propose \\textbf{LayoutLMv3} to pre-train multimodal Transformers for Document AI with unified text and image masking. Additionally, LayoutLMv3 is pre-trained with a word-patch alignment objective to learn cross-modal alignment by predicting whether the corresponding image patch of a text word is masked. The simple unified architecture and training objectives make LayoutLMv3 a general-purpose pre-trained model for both text-centric and image-centric Document AI tasks. Experimental results show that LayoutLMv3 achieves state-of-the-art performance not only in text-centric tasks, including form understanding, 

In [96]:
# extract the arXiv identifier from a link
def extract_arxiv_id(link):
    """Return the arXiv identifier contained in an arXiv URL.

    Handles links such as ``https://arxiv.org/abs/2204.08387``,
    ``https://arxiv.org/abs/2204.08387v2?context=cs``,
    ``https://arxiv.org/pdf/2204.08387.pdf`` and old-style identifiers that
    themselves contain a slash (``https://arxiv.org/abs/hep-th/9901001``).

    Args:
        link: URL (or bare identifier) pointing at an arXiv paper.

    Returns:
        The identifier as a string, e.g. ``"2204.08387"``. Query strings,
        fragments, trailing slashes and a ``.pdf`` suffix are removed; a
        version suffix such as ``v2`` is kept.
    """
    # Imported locally so this cell does not depend on an import made elsewhere.
    from urllib.parse import urlparse

    # urlparse drops the query string and fragment; strip() removes leading
    # and trailing slashes so a trailing "/" no longer yields an empty id.
    path = urlparse(link).path.strip("/")

    for prefix in ("abs/", "pdf/", "html/", "e-print/"):
        if path.startswith(prefix):
            # Keep everything after the prefix so old-style ids such as
            # "hep-th/9901001" are preserved in full.
            path = path[len(prefix):]
            break
    else:
        # Unknown layout (or a bare id): fall back to the last path segment,
        # which is what the previous implementation returned.
        path = path.rsplit("/", 1)[-1]

    if path.endswith(".pdf"):
        path = path[:-len(".pdf")]
    return path


In [97]:
# Instantiate the arxiv client with its default settings; it is used below to run the id search.
arxiv_client = arxiv.Client()

In [98]:
# Build a search restricted to a single paper: the id extracted from the first search link.
arxiv_search = arxiv.Search(id_list=[extract_arxiv_id(links[0])])

In [99]:
# Take the first result produced by the client for the search above.
# NOTE(review): `next()` without a default raises StopIteration if no result is yielded.
arxiv_paper = next(arxiv_client.results(arxiv_search))

In [100]:
# Download the paper's source archive into a temp directory and read its main .tex file.
# Reset first so a value left over from an earlier run of this cell is never reused silently.
file_content = None
with TemporaryDirectory() as temp_dir:
    arxiv_paper.download_source(dirpath=temp_dir, filename="paper.tar.gz")

    # read the archive while the temporary directory still exists
    with tarfile.open(f"{temp_dir}/paper.tar.gz", "r:gz") as tar:
        tex_members = [
            m for m in tar.getmembers() if m.isfile() and m.name.endswith(".tex")
        ]
        # Stable sort: members ending in "arxiv.tex" come first (original behaviour),
        # all other .tex files keep their archive order as fallbacks.
        tex_members.sort(key=lambda m: not m.name.endswith("arxiv.tex"))

        for member in tex_members:
            extracted = tar.extractfile(member)
            if extracted is None:  # extractfile returns None for non-regular members
                continue
            # errors="replace" keeps a single non-UTF-8 byte from aborting the whole cell
            tex_source = extracted.read().decode("utf-8", errors="replace")
            # Fallback heuristic: the main document is the one declaring \documentclass.
            if member.name.endswith("arxiv.tex") or "\\documentclass" in tex_source:
                file_content = tex_source
                break

if file_content is None:
    raise FileNotFoundError(
        "No main .tex file (arxiv.tex or one containing \\documentclass) found in the source archive"
    )

In [101]:
# Show only the beginning of the LaTeX source; the full file is tens of thousands of
# characters and would flood the cell output.
file_content[:1000]

'%%\n%% This is file `sample-sigconf.tex\',\n%% generated with the docstrip utility.\n%%\n%% The original source files were:\n%%\n%% samples.dtx  (with options: `sigconf\')\n%% \n%% IMPORTANT NOTICE:\n%% \n%% For the copyright see the source file.\n%% \n%% Any modified versions of this file must be renamed\n%% with new filenames distinct from sample-sigconf.tex.\n%% \n%% For distribution of the original source see the terms\n%% for copying and modification in the file samples.dtx.\n%% \n%% This generated file may be distributed as long as the\n%% original source files, as listed above, are part of the\n%% same distribution. (The sources need not necessarily be\n%% in the same archive or directory.)\n%%\n%% Commands for TeXCount\n%TC:macro \\cite [option:text,text]\n%TC:macro \\citep [option:text,text]\n%TC:macro \\citet [option:text,text]\n%TC:envir table 0 1\n%TC:envir table* 0 1\n%TC:envir tabular [ignore] word\n%TC:envir displaymath 0 word\n%TC:envir math 0 word\n%TC:envir comment 0

In [102]:
# Convert the LaTeX source to plain text and trim leading/trailing whitespace.
paper_text = LatexNodes2Text().latex_to_text(file_content).strip()

In [104]:
# Bundle the plain-text paper and its arXiv metadata into a single LangChain Document.
# NOTE(review): `authors` is stored exactly as the arxiv result provides it — confirm that
# downstream consumers accept that type (they may expect plain strings).
docs = Document(
    metadata=dict(
        title=arxiv_paper.title,
        authors=arxiv_paper.authors,
        abstract=arxiv_paper.summary,
    ),
    page_content=paper_text,
)

In [107]:
# Length of the converted paper text in characters.
len(docs.page_content)

45677

In [108]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [124]:
# Splitter configured for chunks of up to 2000 with an overlap of 200 between neighbours
# (presumably measured in characters, the splitter's default length function — confirm).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)

In [125]:
# Split the paper text into chunks; only the text is passed, so the Document metadata
# is not carried over to the resulting pieces.
splits = text_splitter.split_text(docs.page_content)

In [126]:
# Number of chunks produced by the splitter.
len(splits)

30