# Sci Research Agent Dev

## arXiv


In [1]:
import requests
import xml.etree.ElementTree as ET


In [2]:
search_term = "Quantum Gravity"

max_results = 10
query = "+".join(search_term.lower().split())
url = f"http://export.arxiv.org/api/query?search_query=all:{query}max_results={max_results}"
url

'http://export.arxiv.org/api/query?search_query=all:quantum+gravitymax_results=10'

In [3]:
resp = requests.get(url)

In [4]:
resp

<Response [200]>

In [5]:
resp.text

'<?xml version="1.0" encoding="UTF-8"?>\n<feed xmlns="http://www.w3.org/2005/Atom">\n  <link href="http://arxiv.org/api/query?search_query%3Dall%3Aquantum%20gravitymax_results%3D10%26id_list%3D%26start%3D0%26max_results%3D10" rel="self" type="application/atom+xml"/>\n  <title type="html">ArXiv Query: search_query=all:quantum gravitymax_results=10&amp;id_list=&amp;start=0&amp;max_results=10</title>\n  <id>http://arxiv.org/api/vK5e9MYHl4z6Z/AxxTmNPw6+d7c</id>\n  <updated>2025-01-31T00:00:00-05:00</updated>\n  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">420401</opensearch:totalResults>\n  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>\n  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">10</opensearch:itemsPerPage>\n  <entry>\n    <id>http://arxiv.org/abs/quant-ph/0201082v1</id>\n    <updated>2002-01-18T15:08:05Z</updated>\n    <published>2002-01-18T15:08:05Z</publ

In [11]:
import xml.etree.ElementTree as ET
import json
from collections import defaultdict

def parse_arxiv_xml(xml_string):
    # Define namespaces
    namespaces = {
        "atom": "http://www.w3.org/2005/Atom",
        "opensearch": "http://a9.com/-/spec/opensearch/1.1/",
        "arxiv": "http://arxiv.org/schemas/atom"
    }

    root = ET.fromstring(xml_string)

    feed = {
        "title": root.find("atom:title", namespaces).text if root.find("atom:title", namespaces) is not None else "",
        "id": root.find("atom:id", namespaces).text if root.find("atom:id", namespaces) is not None else "",
        "updated": root.find("atom:updated", namespaces).text if root.find("atom:updated", namespaces) is not None else "",
        "totalResults": root.find("opensearch:totalResults", namespaces).text if root.find("opensearch:totalResults", namespaces) is not None else "",
        "startIndex": root.find("opensearch:startIndex", namespaces).text if root.find("opensearch:startIndex", namespaces) is not None else "",
        "itemsPerPage": root.find("opensearch:itemsPerPage", namespaces).text if root.find("opensearch:itemsPerPage", namespaces) is not None else "",
        "entries": []
    }

    for entry in root.findall("atom:entry", namespaces):
        authors = [author.find("atom:name", namespaces).text for author in entry.findall("atom:author", namespaces)]
        categories = [category.attrib.get("term", "") for category in entry.findall("atom:category", namespaces)]

        entry_data = {
            "id": entry.find("atom:id", namespaces).text,
            "updated": entry.find("atom:updated", namespaces).text,
            "published": entry.find("atom:published", namespaces).text,
            "title": entry.find("atom:title", namespaces).text,
            "summary": entry.find("atom:summary", namespaces).text.strip(),
            "authors": authors,
            "comment": entry.find("arxiv:comment", namespaces).text if entry.find("arxiv:comment", namespaces) is not None else "",
            "journal_ref": entry.find("arxiv:journal_ref", namespaces).text if entry.find("arxiv:journal_ref", namespaces) is not None else "",
            "doi": entry.find("arxiv:doi", namespaces).text if entry.find("arxiv:doi", namespaces) is not None else "",
            "links": {link.attrib.get("title", "default"): link.attrib.get("href", "") for link in entry.findall("atom:link", namespaces)},
            "primary_category": entry.find("arxiv:primary_category", namespaces).attrib.get("term", "") if entry.find("arxiv:primary_category", namespaces) is not None else "",
            "categories": categories
        }

        feed["entries"].append(entry_data)

    return feed


In [12]:
data = parse_arxiv_xml(resp.text)

In [13]:
data

{'title': 'ArXiv Query: search_query=all:quantum gravitymax_results=10&id_list=&start=0&max_results=10',
 'id': 'http://arxiv.org/api/vK5e9MYHl4z6Z/AxxTmNPw6+d7c',
 'updated': '2025-01-31T00:00:00-05:00',
 'totalResults': '420401',
 'startIndex': '0',
 'itemsPerPage': '10',
 'entries': [{'id': 'http://arxiv.org/abs/quant-ph/0201082v1',
   'updated': '2002-01-18T15:08:05Z',
   'published': '2002-01-18T15:08:05Z',
   'title': 'Quantum Computers and Quantum Computer Languages: Quantum Assembly\n  Language and Quantum C Language',
   'summary': 'We show a representation of Quantum Computers defines Quantum Turing Machines\nwith associated Quantum Grammars. We then create examples of Quantum Grammars.\nLastly we develop an algebraic approach to high level Quantum Languages using\nQuantum Assembly language and Quantum C language as examples.',
   'authors': ['Stephen Blaha'],
   'comment': '32 pages',
   'journal_ref': '',
   'doi': '',
   'links': {'default': 'http://arxiv.org/abs/quant-ph/

In [15]:
list(data.keys())

['title',
 'id',
 'updated',
 'totalResults',
 'startIndex',
 'itemsPerPage',
 'entries']

In [17]:
len(data["entries"])

10

In [18]:
data["entries"][0]

{'id': 'http://arxiv.org/abs/quant-ph/0201082v1',
 'updated': '2002-01-18T15:08:05Z',
 'published': '2002-01-18T15:08:05Z',
 'title': 'Quantum Computers and Quantum Computer Languages: Quantum Assembly\n  Language and Quantum C Language',
 'summary': 'We show a representation of Quantum Computers defines Quantum Turing Machines\nwith associated Quantum Grammars. We then create examples of Quantum Grammars.\nLastly we develop an algebraic approach to high level Quantum Languages using\nQuantum Assembly language and Quantum C language as examples.',
 'authors': ['Stephen Blaha'],
 'comment': '32 pages',
 'journal_ref': '',
 'doi': '',
 'links': {'default': 'http://arxiv.org/abs/quant-ph/0201082v1',
  'pdf': 'http://arxiv.org/pdf/quant-ph/0201082v1'},
 'primary_category': 'quant-ph',
 'categories': ['quant-ph', 'cs.PL']}

In [19]:
[paper["categories"] for paper in data["entries"]]

[['quant-ph', 'cs.PL'],
 ['quant-ph'],
 ['quant-ph'],
 ['quant-ph'],
 ['quant-ph'],
 ['q-alg', 'math.QA'],
 ['quant-ph'],
 ['quant-ph', 'physics.comp-ph'],
 ['quant-ph'],
 ['quant-ph']]

In [34]:
def get_arxiv_papers(search_term: str) -> dict:
    max_results = 10
    query = "+".join(search_term.lower().split())
    for char in list('()" '):
        if char in query:
            raise ValueError(f"Cannot have character: '{char}' in query: {query}")
    url = f"http://export.arxiv.org/api/query?search_query=all:{query}&max_results={max_results}"
    resp = requests.get(url)
    print(resp)
    data = parse_arxiv_xml(resp.text)
    print(json.dumps([{"title": paper["title"], "categories": paper["categories"]} for paper in data["entries"]], indent=2))
    return data, resp


In [32]:
data, resp = get_arxiv_papers("BEC BCS")

<Response [200]>
[{'title': 'Topological quantum phase transition in the BEC-BCS crossover phenomena', 'categories': ['cond-mat.str-el']}, {'title': 'BEC-BCS Crossover in the Nambu--Jona-Lasinio Model of QCD', 'categories': ['hep-ph', 'cond-mat.supr-con', 'hep-lat', 'hep-th', 'nucl-th']}, {'title': 'Effects of three-body scattering processes on BCS-BEC crossover', 'categories': ['cond-mat.quant-gas', 'cond-mat.supr-con']}, {'title': 'Turning the BEC-BCS crossover into a transition by radiation', 'categories': ['cond-mat.quant-gas']}, {'title': 'Superconductivity of the FeSe/SrTiO3 Interface in the View of BCS-BEC\n  Crossover', 'categories': ['cond-mat.supr-con', 'cond-mat.mtrl-sci']}, {'title': 'Possibility of BCS-BEC crossover in $κ$-type organic\n  superconductors', 'categories': ['cond-mat.supr-con', 'cond-mat.str-el']}, {'title': 'Reliability of the Ginzburg-Landau Theory in the BCS-BEC Crossover by\n  Including Gaussian Fluctuations for 3D Attractive Fermions', 'categories': ['co

In [36]:
data["entries"][0]

{'id': 'http://arxiv.org/abs/1003.4735v1',
 'updated': '2010-03-24T20:03:33Z',
 'published': '2010-03-24T20:03:33Z',
 'title': 'Topological quantum phase transition in the BEC-BCS crossover phenomena',
 'summary': 'A crossover between the Bose Einstein condensation (BEC) and BCS\nsuperconducting state is described topologically in the chiral symmetric\nfermion system with attractive interaction. Using a local Z_2 Berry phase, we\nfound a quantum phase transition between the BEC and BCS phases without\naccompanying the bulk gap closing.',
 'authors': ['Mitsuhiro Arikawa', 'Isao Maruyama', 'Yasuhiro Hatsugai'],
 'comment': '4 pages, 5 figures',
 'journal_ref': 'Phys. Rev. B 82, 073105 (2010)',
 'doi': '10.1103/PhysRevB.82.073105',
 'links': {'doi': 'http://dx.doi.org/10.1103/PhysRevB.82.073105',
  'default': 'http://arxiv.org/abs/1003.4735v1',
  'pdf': 'http://arxiv.org/pdf/1003.4735v1'},
 'primary_category': 'cond-mat.str-el',
 'categories': ['cond-mat.str-el']}

In [38]:
print(data["entries"][0]["summary"])

A crossover between the Bose Einstein condensation (BEC) and BCS
superconducting state is described topologically in the chiral symmetric
fermion system with attractive interaction. Using a local Z_2 Berry phase, we
found a quantum phase transition between the BEC and BCS phases without
accompanying the bulk gap closing.


In [39]:
# Download paper as tex file

import requests
import tarfile
import io

# URL of the .tar.gz file
url = "https://arxiv.org/src/1003.4735v1"

# Download the file into memory
resp = requests.get(url, stream=True)
if resp.status_code == 200:
    tar_gz_data = io.BytesIO(resp.content)  # Load resp content into memory
    
    # Open the tar.gz file in memory
    with tarfile.open(fileobj=tar_gz_data, mode="r:gz") as tar:
        # Iterate over each file in the archive
        for member in tar.getmembers():
            if member.isfile():  # Skip directories
                file_obj = tar.extractfile(member)
                if file_obj:
                    try:
                        content = file_obj.read().decode("utf-8")  # Read and decode file
                        print(f"\n--- {member.name} ---\n")
                        print(content[:500])  # Print first 500 characters
                    except UnicodeDecodeError:
                        print(f"Skipping binary file: {member.name}")
else:
    print(f"Failed to download file. Status code: {resp.status_code}")



--- becbcs.tex ---

%\documentclass[preprint,superscriptaddress,endfloats*,prl,showpacs,amsmath]{revtex4}
\documentclass[twocolumn,superscriptaddress,prb,showpacs,amsmath]{revtex4}
%\documentclass[twocolumn,superscriptaddress,endfloats*,prl,showpacs,amsmath]{revtex4}
%\documentclass[twocolumn,superscriptaddress,prb,showpacs]{revtex4}
\usepackage{amsmath}
%\usepackage{amsmath2000}
\usepackage{graphicx}
\usepackage{dcolumn}
\usepackage{bm}
\usepackage{color}

\begin{document}
\title{Topological quantum phase transiti

--- fig1.eps ---

%!PS-Adobe-3.0
%%Pages: (atend)
%%BoundingBox: 0 0 281 313
%%HiResBoundingBox: 0.000000 0.000000 280.963987 312.683990
%.........................................
%%Creator: ESP Ghostscript 707 (pswrite)
%%CreationDate: 2010/03/25 02:55:35
%%DocumentData: Clean7Bit
%%LanguageLevel: 2
%%EndComments
%%BeginProlog
% This copyright applies to everything between here and the %%EndProlog:
% Copyright 2003 artofcode LLC and Easy Software Products, all rights rese

In [40]:
type(content)

str

In [41]:
content

'%!PS-Adobe-3.0\n%%Pages: (atend)\n%%BoundingBox: 0 0 287 303\n%%HiResBoundingBox: 0.000000 0.000000 286.283991 302.549991\n%.........................................\n%%Creator: ESP Ghostscript 707 (pswrite)\n%%CreationDate: 2010/03/25 02:41:35\n%%DocumentData: Clean7Bit\n%%LanguageLevel: 2\n%%EndComments\n%%BeginProlog\n% This copyright applies to everything between here and the %%EndProlog:\n% Copyright 2003 artofcode LLC and Easy Software Products, all rights reserved.\n%%BeginResource: procset GS_pswrite_2_0_1001\n/GS_pswrite_2_0_1001 80 dict dup begin\n/PageSize 2 array def/setpagesize{ PageSize aload pop 3 index eq exch\n4 index eq and{ pop pop pop}{ PageSize dup  1\n5 -1 roll put 0 4 -1 roll put dup null eq {false} {dup where} ifelse{ exch get exec}\n{ pop/setpagedevice where\n{ pop 1 dict dup /PageSize PageSize put setpagedevice}\n{ /setpage where{ pop PageSize aload pop pageparams 3 {exch pop} repeat\nsetpage}if}ifelse}ifelse}ifelse} bind def\n/!{bind def}bind def/#{load def}

In [42]:
# This doesn't help me. My best bet might be to download the HTML. However there is no HTML for that one

In [45]:
# Looks like none have HTML
[paper["links"] for paper in data["entries"]]

[{'doi': 'http://dx.doi.org/10.1103/PhysRevB.82.073105',
  'default': 'http://arxiv.org/abs/1003.4735v1',
  'pdf': 'http://arxiv.org/pdf/1003.4735v1'},
 {'doi': 'http://dx.doi.org/10.1103/PhysRevD.75.096004',
  'default': 'http://arxiv.org/abs/hep-ph/0703159v3',
  'pdf': 'http://arxiv.org/pdf/hep-ph/0703159v3'},
 {'doi': 'http://dx.doi.org/10.1103/PhysRevA.82.063607',
  'default': 'http://arxiv.org/abs/1110.3417v1',
  'pdf': 'http://arxiv.org/pdf/1110.3417v1'},
 {'default': 'http://arxiv.org/abs/1505.04820v1',
  'pdf': 'http://arxiv.org/pdf/1505.04820v1'},
 {'doi': 'http://dx.doi.org/10.1088/0256-307X/36/10/107404',
  'default': 'http://arxiv.org/abs/1908.11126v1',
  'pdf': 'http://arxiv.org/pdf/1908.11126v1'},
 {'default': 'http://arxiv.org/abs/2411.15510v1',
  'pdf': 'http://arxiv.org/pdf/2411.15510v1'},
 {'doi': 'http://dx.doi.org/10.3390/condmat6040049',
  'default': 'http://arxiv.org/abs/2204.03590v1',
  'pdf': 'http://arxiv.org/pdf/2204.03590v1'},
 {'default': 'http://arxiv.org/a

In [46]:
# I'll have to play the game of PDF parsing. I can do this with Claude actually
# https://docs.anthropic.com/en/docs/build-with-claude/pdf-support

import anthropic
import base64
import httpx

# Load and encode the PDF
pdf_url = "https://assets.anthropic.com/m/1cd9d098ac3e6467/original/Claude-3-Model-Card-October-Addendum.pdf"
pdf_data = base64.standard_b64encode(httpx.get(pdf_url).content).decode("utf-8")

# Send to Claude
client = anthropic.Anthropic()
message = client.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=1024,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "document",
                    "source": {
                        "type": "base64",
                        "media_type": "application/pdf",
                        "data": pdf_data
                    }
                },
                {
                    "type": "text",
                    "text": "What are the key findings in this document?"
                }
            ]
        }
    ],
)

print(message.content)


In [7]:
# How to print state?

from typing import Annotated

from langchain_openai import ChatOpenAI
from typing_extensions import TypedDict

from langgraph.graph import StateGraph
from langgraph.graph.message import add_messages
from langgraph.checkpoint.memory import MemorySaver


class State(TypedDict):
    messages: Annotated[list, add_messages]

graph_builder = StateGraph(State)

config = {"configurable": {"thread_id": "1"}}
memory = MemorySaver()

llm = ChatOpenAI(model="gpt-4o-mini")


def chatbot(state: State):
    return {"messages": [llm.invoke(state["messages"])]}


# The first argument is the unique node name
# The second argument is the function or object that will be called whenever
# the node is used.
graph_builder.add_node("chatbot", chatbot)
graph_builder.set_entry_point("chatbot")
graph_builder.set_finish_point("chatbot")
graph = graph_builder.compile(checkpointer=memory)


def stream_graph_updates(user_input: str):
    for event in graph.stream({"messages": [{"role": "user", "content": user_input}]}, config):
        for value in event.values():
            print("Assistant:", value["messages"][-1].content)

    snapshot = graph.get_state(config)
    return snapshot

while True:
    user_input = input("User: ")
    if user_input.lower() in ["quit", "exit", "q"]:
        print("Goodbye!")
        break

    snapshot = stream_graph_updates(user_input)


Assistant: Hello! How can I assist you today?
Assistant: Nice to meet you, Alex! How can I help you today?
Assistant: Your name is Alex! How can I assist you further?
Assistant: It looks like your message might have been empty. How can I assist you today?
Goodbye!


In [8]:
snapshot

StateSnapshot(values={'messages': [HumanMessage(content='hey', additional_kwargs={}, response_metadata={}, id='971fd0d9-b8ad-4ba9-9478-f8b4df0eadb0'), AIMessage(content='Hello! How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 8, 'total_tokens': 18, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_bd83329f63', 'finish_reason': 'stop', 'logprobs': None}, id='run-02e834d4-9a92-418a-8c5e-37acafb1a4e6-0', usage_metadata={'input_tokens': 8, 'output_tokens': 10, 'total_tokens': 18, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}), HumanMessage(content='im alex', additional_kwargs={}, response_metadata={}, id='948c8f94-1c3

In [13]:
snapshot.*?

snapshot.__add__
snapshot.__annotations__
snapshot.__class__
snapshot.__class_getitem__
snapshot.__contains__
snapshot.__delattr__
snapshot.__dir__
snapshot.__doc__
snapshot.__eq__
snapshot.__format__
snapshot.__ge__
snapshot.__getattribute__
snapshot.__getitem__
snapshot.__getnewargs__
snapshot.__getstate__
snapshot.__gt__
snapshot.__hash__
snapshot.__init__
snapshot.__init_subclass__
snapshot.__iter__
snapshot.__le__
snapshot.__len__
snapshot.__lt__
snapshot.__match_args__
snapshot.__module__
snapshot.__mul__
snapshot.__ne__
snapshot.__new__
snapshot.__orig_bases__
snapshot.__reduce__
snapshot.__reduce_ex__
snapshot.__repr__
snapshot.__rmul__
snapshot.__setattr__
snapshot.__sizeof__
snapshot.__slots__
snapshot.__str__
snapshot.__subclasshook__
snapshot.config
snapshot.count
snapshot.created_at
snapshot.index
snapshot.metadata
snapshot.next
snapshot.parent_config
snapshot.tasks
snapshot.values

In [16]:
snapshot.values

{'messages': [HumanMessage(content='hey', additional_kwargs={}, response_metadata={}, id='971fd0d9-b8ad-4ba9-9478-f8b4df0eadb0'),
  AIMessage(content='Hello! How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 8, 'total_tokens': 18, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_bd83329f63', 'finish_reason': 'stop', 'logprobs': None}, id='run-02e834d4-9a92-418a-8c5e-37acafb1a4e6-0', usage_metadata={'input_tokens': 8, 'output_tokens': 10, 'total_tokens': 18, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),
  HumanMessage(content='im alex', additional_kwargs={}, response_metadata={}, id='948c8f94-1c3e-46dd-9ec3-b38f7

In [17]:
import json

In [18]:
json.dumps(snapshot.values)

TypeError: Object of type HumanMessage is not JSON serializable

In [35]:
snapshot.values["messages"][-1].content

'It looks like your message might have been empty. How can I assist you today?'