## Chat with Arxiv using `claude-v1.3-100k`

In [2]:
import os
import anthropic
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import openai
import time
import tempfile
import shutil
from pathlib import Path
import subprocess
from tqdm import tqdm
import shutil
import requests
import tarfile
import os
import io
import re
import gradio as gr

### Helper - Get Arxiv Data

In [12]:
def download_arxiv_source(paper_id, timeout=60):
    """Download an arXiv paper's LaTeX source and return it as one flattened string.

    The e-print archive for ``paper_id`` is fetched, every regular ``.tex``
    file in it is loaded into memory, and ``\\input{...}`` / ``\\include{...}``
    lines are recursively replaced by the content of the referenced file.

    Args:
        paper_id: arXiv identifier, e.g. ``"2203.11096"``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The expanded ``main.tex`` if the archive contains one; otherwise all
        ``.tex`` files (each expanded) joined with newlines.

    Raises:
        requests.HTTPError: If arXiv answers with an error status.
        tarfile.ReadError: If the downloaded payload is not a tar archive.
    """
    import posixpath  # local import: tar member names always use "/" separators

    url = f"https://arxiv.org/e-print/{paper_id}"

    # A timeout keeps a stalled connection from hanging the caller forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Load all regular .tex files into memory, keyed by normalised archive path
    # so that "./sec/a.tex" in the archive matches "\input{sec/a}" in the text.
    tex_files = {}
    with tarfile.open(fileobj=io.BytesIO(response.content), mode="r") as tar:
        for member in tar.getmembers():
            # isfile() guards against directories/links whose name ends in
            # ".tex"; extractfile() returns None for those.
            if member.isfile() and member.name.endswith(".tex"):
                raw = tar.extractfile(member).read()
                # Older submissions are not always UTF-8; keep going with
                # replacement characters instead of failing the whole paper.
                tex_files[posixpath.normpath(member.name)] = raw.decode(
                    "utf-8", errors="replace"
                )

    # Pattern to match \input{filename} and \include{filename}
    pattern = re.compile(r"\\(input|include){(.*?)}")
    # An unescaped "%" starts a LaTeX comment.
    comment_start = re.compile(r"(?<!\\)%")

    def replace_includes(text, active=()):
        """Expand include lines in ``text``; ``active`` is the chain of files being expanded."""
        output = []
        for line in text.split("\n"):
            match = pattern.search(line)
            # Leave the line alone when there is no include command, or when
            # the command sits inside a comment and LaTeX would ignore it.
            if match is None or comment_start.search(line, 0, match.start()):
                output.append(line)
                continue

            filename = posixpath.normpath(match.group(2).strip())
            # LaTeX automatically adds .tex extension for \input and \include commands
            if not filename.endswith(".tex"):
                filename += ".tex"

            if filename in active:
                # A file that (directly or indirectly) includes itself would
                # otherwise recurse until the interpreter gives up.
                output.append(f"% {line} % SKIPPED CIRCULAR INCLUDE")
            elif filename in tex_files:
                output.append(replace_includes(tex_files[filename], active + (filename,)))
            else:
                output.append(f"% {line} % FILE NOT FOUND")
        return "\n".join(output)

    if "main.tex" in tex_files:
        # Start with the contents of main.tex
        main_tex = replace_includes(tex_files["main.tex"], ("main.tex",))
    else:
        # No main.tex, concatenate all .tex files
        main_tex = "\n".join(
            replace_includes(text, (name,)) for name, text in tex_files.items()
        )

    return main_tex


### Chat with Arxiv

In [4]:
# Create the Anthropic API client. The key is read from the ANTHROPIC_API_KEY
# environment variable; a KeyError is raised here if it is not set.
client = anthropic.Client(api_key=os.environ['ANTHROPIC_API_KEY'])

In [5]:
class ContextualQA:
    """Ask questions about one loaded text document through an Anthropic client.

    The document is kept in ``context`` and sent in full with every question.
    Each question and the raw response list it produced are recorded in
    ``questions`` and ``responses``. The client is left out when the object is
    pickled and comes back as ``None``, so it has to be re-assigned before
    ``ask_question`` can be used on an unpickled instance.
    """

    def __init__(self, client, model="claude-v1.3-100k"):
        """Store the API client and model name and start with an empty context."""
        self.client, self.model = client, model
        self.context, self.questions, self.responses = "", [], []

    def load_text(self, text):
        """Use ``text`` as the document for subsequent questions."""
        self.context = text

    def ask_question(self, question):
        """Send the document plus ``question`` to the model.

        Returns the list of response items yielded by the client and records
        the exchange in ``questions`` / ``responses``.
        """
        leading_prompt = "Consider the text document below:"
        trailing_prompt = "Now answer the following question, use Markdown to format your answer."
        prompt = f"{anthropic.HUMAN_PROMPT} {leading_prompt}\n\n{self.context}\n\n{trailing_prompt}\n\n{anthropic.HUMAN_PROMPT} {question} {anthropic.AI_PROMPT}"

        request = dict(
            prompt=prompt,
            stop_sequences=[anthropic.HUMAN_PROMPT],
            max_tokens_to_sample=6000,
            model=self.model,
            stream=False,
        )
        # Drain the iterator so callers get a plain list.
        responses = list(self.client.completion_stream(**request))

        self.questions.append(question)
        self.responses.append(responses)
        return responses

    def clear_context(self):
        """Forget the document and the recorded question/response history."""
        self.context, self.questions, self.responses = "", [], []

    def __getstate__(self):
        """Return the instance state for pickling, without the client."""
        state = dict(self.__dict__)
        state.pop("client")
        return state

    def __setstate__(self, state):
        """Restore pickled state; the client is reset to ``None``."""
        self.__dict__.update(state)
        self.client = None


In [6]:
def qa_to_pdf(question, answer, output_filename):
    """Render a question/answer pair to a PDF with ``pdflatex``.

    Args:
        question: Question text, inserted into the LaTeX document as-is.
        answer: Response list as returned by ``ContextualQA.ask_question``;
            the text is taken from ``answer[0]["completion"]`` and inserted
            as-is.
        output_filename: Path the generated PDF is copied to.

    Raises:
        subprocess.CalledProcessError: If ``pdflatex`` exits with a non-zero
            status.

    Note:
        Neither text is escaped, so LaTeX special characters in the question
        or answer are interpreted by ``pdflatex``.
    """
    latex_template = r"""
    \documentclass{{standalone}}
    \usepackage[utf8]{{inputenc}}
    \usepackage{{amsmath}}
    \usepackage{{amssymb}}
    \usepackage{{hyperref}}
    \usepackage{{varwidth}}
    \usepackage{{adjustbox}}
    \begin{{document}}
    \begin{{adjustbox}}{{margin=5mm}}
    \begin{{varwidth}}{{\linewidth}}
    \textbf{{Question:}} \\
    {question} \\
    \textbf{{Answer:}} \\
    {answer}
    \end{{varwidth}}
    \end{{adjustbox}}
    \end{{document}}
    """

    answer = answer[0]["completion"]
    latex_content = latex_template.format(question=question, answer=answer)

    # Build in a throw-away directory so pdflatex's auxiliary files are cleaned up.
    with tempfile.TemporaryDirectory() as temp_dir:
        tex_file = Path(temp_dir) / "qa.tex"
        pdf_file = Path(temp_dir) / "qa.pdf"

        # The document declares \usepackage[utf8]{inputenc}, so the file must
        # be UTF-8 regardless of the platform's default encoding.
        with open(tex_file, "w", encoding="utf-8") as f:
            f.write(latex_content)

        subprocess.run(
            [
                "pdflatex",
                "-interaction=nonstopmode",
                "-output-directory",
                temp_dir,
                str(tex_file),
            ],
            check=True,
        )

        # Copy the result out before the temporary directory is removed.
        shutil.copy(pdf_file, output_filename)


In [13]:
# Fetch the flattened LaTeX source of one paper (network access required)
# and show its length in characters as a quick sanity check.
latex_source = download_arxiv_source("2203.11096")
len(latex_source)

102541

In [14]:
# Show only the beginning of the source: printing the whole document puts
# roughly 100k characters into the cell output.
print(latex_source[:2000])

% \documentclass[sigconf,screen,natbib=false]{acmart}
\documentclass[sigconf,screen,authorversion,nonacm,natbib=false]{acmart}

% \acmConference[MSR 2022]{MSR '22: Proceedings of the 19th International Conference on Mining Software Repositories}{May 23–24, 2022}{Pittsburgh, PA, USA}

\usepackage[utf8]{inputenc}
\usepackage[style=ACM-Reference-Format,backend=biber,defernumbers=false,sortcites=true]{biblatex}
\usepackage{subcaption}
\usepackage{multirow}
\usepackage{xspace}
\usepackage{listings}
\usepackage{booktabs}

\RequirePackage{fontawesome}

\usepackage{xcolor, soul}
\sethlcolor{yellow}

% \usepackage{hyperref}
% \hypersetup{
%     colorlinks=true,
%     % linkcolor=blue,
%     % filecolor=magenta,      
%     urlcolor=blue
% }

% \DeclareUnicodeCharacter{0301}{*************************************}
 
% \lstset{
%   backgroundcolor=\color{white},
%   extendedchars=true,
%   basicstyle=\footnotesize\ttfamily,
%   showstringspaces=false,
%   showspaces=false,
%   numbers=none,
%   nu

In [9]:
# Initialize the Anthropic client used by the ContextualQA model.
# The key is read from the ANTHROPIC_API_KEY environment variable; a KeyError
# is raised here if it is not set.
client = anthropic.Client(api_key=os.environ["ANTHROPIC_API_KEY"])


def load_context(paper_id):
    """Download a paper and wrap it in a fresh ``ContextualQA`` model.

    Args:
        paper_id: arXiv identifier as typed by the user; surrounding
            whitespace is ignored.

    Returns:
        A ``(model, chat_history)`` tuple: the model with the paper's LaTeX
        source loaded, and an initial one-entry chat history confirming the
        load.
    """
    # Text pasted into the textbox often carries stray spaces or a trailing
    # newline, which would otherwise end up in the request URL.
    paper_id = str(paper_id).strip()

    latex_source = download_arxiv_source(paper_id)
    model = ContextualQA(client, model="claude-v1.3-100k")
    model.load_text(latex_source)
    return (
        model,
        [(f"Load the paper with id {paper_id}.", "Paper loaded, Now ask a question.")],
    )


def answer_fn(model, question, chat_history):
    """Answer ``question`` with ``model`` and append the exchange to the chat.

    Args:
        model: Object with an ``ask_question`` method (a ``ContextualQA``),
            or ``None`` when no paper has been loaded yet.
        question: Text the user typed.
        chat_history: List of ``(user, bot)`` message tuples; it is extended
            in place. ``None`` is treated as an empty history.

    Returns:
        ``(model, chat_history, "")``; the empty string clears the question
        textbox.
    """
    # Tolerate a missing history instead of failing on .append().
    if chat_history is None:
        chat_history = []

    # If the question is empty or only whitespace, ask the user for one
    # instead of sending the whole paper to the model for nothing.
    if not question or not question.strip():
        chat_history.append(("No Question Asked", "Please ask a question."))
        return model, chat_history, ""

    # Without a loaded paper there is no model to call; calling
    # model.ask_question would raise AttributeError on None.
    if model is None:
        chat_history.append((question, "Please load a paper before asking a question."))
        return model, chat_history, ""

    response = model.ask_question(question)

    chat_history.append((question, response[0]["completion"]))
    return model, chat_history, ""

def clear_context():
    """Return a new empty chat history, used to reset the chatbot display."""
    empty_history = list()
    return empty_history


In [13]:
# Build the Gradio UI: a paper-id box with a load button, a chat window, a
# question box, and buttons to answer and to clear the chat.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Explore ArXiv Papers in Depth with `claude-v1.3-100k` - Ask Questions and Receive Detailed Answers Instantly")
    gr.Markdown(
        "Dive into the world of academic papers with our dynamic app, powered by the cutting-edge `claude-v1.3-100k` model. This app allows you to ask detailed questions about any ArXiv paper and receive direct answers from the paper's content. Utilizing a context length of 100k tokens, it provides an efficient and comprehensive exploration of complex research studies, making knowledge acquisition simpler and more interactive. (This text is generated by GPT-4 )"
    )

    with gr.Column():
        with gr.Row():
            paper_id_input = gr.Textbox(label="Enter Paper ID", value="2303.10130")
            btn_load = gr.Button("Load Paper")
            # State component that carries the ContextualQA object returned by
            # load_context into later answer_fn calls.
            qa_model = gr.State()

        with gr.Column():
            # NOTE(review): `.style(color_map=...)` may not exist in every
            # Gradio release — confirm against the installed version.
            chatbot = gr.Chatbot().style(color_map=("blue", "yellow"))
            question_txt = gr.Textbox(
                label="Question", lines=1, placeholder="Type your question here..."
            )
            btn_answer = gr.Button("Answer Question")

            btn_clear = gr.Button("Clear Chat")

    # Loading a paper replaces both the stored model and the chat history.
    btn_load.click(load_context, inputs=[paper_id_input], outputs=[qa_model, chatbot])

    # The button and pressing Enter in the textbox trigger the same handler;
    # its third output writes back to question_txt.
    btn_answer.click(
        answer_fn,
        inputs=[qa_model, question_txt, chatbot],
        outputs=[qa_model, chatbot, question_txt],
    )

    question_txt.submit(
        answer_fn,
        inputs=[qa_model, question_txt, chatbot],
        outputs=[qa_model, chatbot, question_txt],
    )

    # Only the chatbot display is reset here; qa_model is not among the outputs.
    btn_clear.click(clear_context, outputs=[chatbot])

# Start the app server.
demo.launch()




Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




In [12]:
# Look up a paper's title and abstract through the `arxiv` package.
import arxiv

# The arXiv ID of the paper you're interested in
paper_id = "2108.07258"

# Create a search query with the arXiv ID
search = arxiv.Search(id_list=[paper_id])

# Fetch the paper using its arXiv ID.
# next() without a default raises StopIteration if the search yields nothing.
paper = next(search.results())

# Print the paper's title
print("Title:", paper.title)

# Print the paper's abstract
print("Abstract:", paper.summary)


Title: On the Opportunities and Risks of Foundation Models
Abstract: AI is undergoing a paradigm shift with the rise of models (e.g., BERT,
DALL-E, GPT-3) that are trained on broad data at scale and are adaptable to a
wide range of downstream tasks. We call these models foundation models to
underscore their critically central yet incomplete character. This report
provides a thorough account of the opportunities and risks of foundation
models, ranging from their capabilities (e.g., language, vision, robotics,
reasoning, human interaction) and technical principles(e.g., model
architectures, training procedures, data, systems, security, evaluation,
theory) to their applications (e.g., law, healthcare, education) and societal
impact (e.g., inequity, misuse, economic and environmental impact, legal and
ethical considerations). Though foundation models are based on standard deep
learning and transfer learning, their scale results in new emergent
capabilities,and their effectiveness across so