# Deep Research v1
- Makes a single call to the arXiv API to retrieve the top 10 sources
- Extracts only the information relevant to the user query
- Compiles a markdown report from the extracted information, with in-line citations to the sources

Improvements:
- Iterative search with agentic framework

In [1]:
import os
from agentjo import *
import re

In [2]:
from dotenv import load_dotenv
# Load settings from a local .env file before any LLM call is made
# (presumably this supplies the API key the OpenAI client reads -- confirm).
load_dotenv()

True

In [3]:
async def llm(system_prompt: str, user_prompt: str) -> str:
    """Send one system/user prompt pair to the chat model and return its reply.

    OpenAI is used here purely for illustration; replace the body with a call
    to any other LLM as long as the signature stays the same.
    """
    # Keep every LLM-specific import local to this function.
    from openai import AsyncOpenAI

    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Swap this client/model for your own LLM if needed.
    openai_client = AsyncOpenAI()
    completion = await openai_client.chat.completions.create(
        model='o3-mini',
        messages=chat_messages,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [4]:
import asyncio
import aiohttp
import feedparser
import re
import urllib.parse
from bs4 import BeautifulSoup
import io
import PyPDF2

def format_arxiv_query(query):
    """
    Build the URL-encoded ``search_query`` value for the arXiv API.

    The query is split on whitespace, every word becomes an ``all:<word>``
    term, the terms are joined with " AND ", and the joined string is
    percent-encoded. A blank query yields an empty string.
    """
    terms = ["all:" + token for token in query.split()]
    if len(terms) == 0:
        return ""
    return urllib.parse.quote(" AND ".join(terms))

def extract_citation_key(bibtex_text):
    """
    Pull the citation key out of a BibTeX entry.

    Parameters:
        bibtex_text (str): Text containing a BibTeX entry.

    Returns:
        str or None: The key of the first ``@type{key,`` header found in the
        text (surrounding whitespace removed), or None when no header matches.
    """
    # "@type", optional spaces, "{", optional spaces, then everything up to
    # the first comma is the key.
    found = re.search(r'@\w+\s*\{\s*([^,]+),', bibtex_text)
    return None if found is None else found.group(1).strip()

async def async_fetch_feed(query_url, session):
    """
    Download the arXiv XML feed at ``query_url`` and return its body as text.

    An HTTP error status is raised through ``raise_for_status``.
    """
    async with session.get(query_url) as feed_response:
        feed_response.raise_for_status()
        feed_xml = await feed_response.text()
    return feed_xml

async def async_get_bibtex_entry(entry, session):
    """
    Asynchronously retrieve the BibTeX entry for one arXiv feed entry.

    Parameters:
        entry: Feed entry whose ``id`` attribute is the abstract URL,
            typically something like "http://arxiv.org/abs/1234.56789v1".
        session: HTTP session whose ``get(url)`` returns an async context
            manager yielding the response.

    Returns:
        str: The BibTeX text on success. On any failure a human-readable
        message is returned instead of raising: "No valid arXiv id found in
        entry.id" when the URL has no "/abs/" part, otherwise
        "Error retrieving BibTeX: <reason>".
    """
    try:
        abs_url = entry.id
        parts = abs_url.split('/abs/')
        if len(parts) < 2:
            return "No valid arXiv id found in entry.id"
        arxiv_id_with_version = parts[1]
        # Strip only a trailing version suffix such as "v1". Splitting on the
        # first "v" would truncate old-style ids whose archive name contains
        # that letter (e.g. "solv-int/9901001v2" would become "sol").
        arxiv_id = re.sub(r'v\d+$', '', arxiv_id_with_version)
        bibtex_url = f"https://arxiv.org/bibtex/{arxiv_id}"
        async with session.get(bibtex_url) as response:
            response.raise_for_status()
            return await response.text()
    except Exception as e:
        # Best effort: one failed lookup must not abort the whole search.
        return f"Error retrieving BibTeX: {str(e)}"

async def async_extract_pdf_text(pdf_url, session):
    """
    Download the PDF at ``pdf_url`` and return the text of all its pages.

    Parsing is blocking work, so it is handed to a worker thread through
    asyncio.to_thread. Any failure (download or parsing) is reported as an
    "Error retrieving PDF text: ..." string rather than raised.
    """
    def read_pdf_pages(pdf_bytes):
        # Collect one newline-terminated chunk per page that yields text.
        chunks = []
        with io.BytesIO(pdf_bytes) as buffer:
            for page in PyPDF2.PdfReader(buffer).pages:
                extracted = page.extract_text()
                if extracted:
                    chunks.append(extracted + "\n")
        joined = "".join(chunks)
        if not joined:
            return "No text could be extracted from the PDF."
        return joined

    try:
        async with session.get(pdf_url) as response:
            response.raise_for_status()
            pdf_bytes = await response.read()
            return await asyncio.to_thread(read_pdf_pages, pdf_bytes)
    except Exception as err:
        return f"Error retrieving PDF text: {err}"

async def async_extract_html_text(html_url, session):
    """
    Download the HTML page at ``html_url`` and return its visible text.

    The markup is parsed with BeautifulSoup, runs of newlines are collapsed to
    a single newline, and the result is stripped. Any failure is reported as
    an "Error retrieving HTML text: ..." string rather than raised.
    """
    try:
        async with session.get(html_url) as response:
            response.raise_for_status()
            markup = await response.text()
            page_text = BeautifulSoup(markup, 'html.parser').get_text(separator="\n")
            return re.sub(r'\n+', '\n', page_text).strip()
    except Exception as err:
        return f"Error retrieving HTML text: {err}"

async def async_retrieve_important(user_output: str, metadata_text: str):
    '''Ask the LLM which parts of metadata_text matter for user_output.

    Returns the filtered, detailed information when the LLM marks the text as
    relevant, and the string 'NA' otherwise.'''
    extraction_prompt = f'''From the Text, extract useful information for query: ```{user_output}```
You must put all details so that another person can understand without referencing the Text.
You must output quantitative results and detailed descriptions whenever applicable.
You must output 'NA' if Text is not useful for query or if you are unsure'''

    # Only the first 200000 characters of the text are sent to the LLM.
    text_payload = "Text: " + metadata_text[:200000]

    answer_schema = {
        "Text Relevant for query": "type: bool",
        "Important Information": "type: str",
        "Filtered Detailed Important Information": f"Be detailed, only those directly related to query ```{user_output}```, 'NA' if not useful, type: str",
    }

    parsed = await strict_json_async(
        extraction_prompt,
        text_payload,
        output_format=answer_schema,
        llm=llm,
    )

    if not parsed["Text Relevant for query"]:
        return 'NA'
    return parsed["Filtered Detailed Important Information"]

async def search_arxiv(query, user_output):
    """
    Search arXiv for papers based on a query that is composed of keywords separated by a space and return the top 10 papers with:
      - metadata: text extracted from PDF (if available) or HTML, else the feed summary
      - BibTeX entry
    Uses asynchronous HTTP requests to speed up the retrieval.

    Parameters:
        query (str): Space-separated keywords for the arXiv search.
        user_output (str): The question / required output format; it is used
            to extract only the relevant information from each paper.

    Returns:
        tuple: ``(bibtex_dict, important_information)``
          - bibtex_dict maps a citation key to the BibTeX text fetched for
            that paper (or to the error message when the fetch failed). It
            has exactly one item per paper, in feed order.
          - important_information is a list, in the same order and of the
            same length, holding the information extracted for each paper
            ('NA' when the paper is not relevant).
        Because both are aligned, ``zip(bibtex_dict, important_information)``
        pairs each key with its own paper.
    """
    base_url = "http://export.arxiv.org/api/query?"
    formatted_query = format_arxiv_query(query)
    query_url = f"{base_url}search_query={formatted_query}&start=0&max_results=10"

    async with aiohttp.ClientSession() as session:
        # Fetch the arXiv feed asynchronously.
        xml_data = await async_fetch_feed(query_url, session)
        feed = feedparser.parse(xml_data)

        # Schedule tasks for retrieving BibTeX entries and metadata concurrently.
        bibtex_tasks = []
        metadata_tasks = []
        for entry in feed.entries:
            # BibTeX task
            bibtex_tasks.append(async_get_bibtex_entry(entry, session))

            # Determine PDF and HTML URLs.
            pdf_url = None
            html_url = None
            for link in entry.get('links', []):
                if link.get('type') == 'application/pdf':
                    pdf_url = link.href
                elif link.get('rel') == 'alternate':
                    html_url = link.href

            # Metadata task: try PDF first, then HTML.
            if pdf_url:
                metadata_tasks.append(async_extract_pdf_text(pdf_url, session))
            elif html_url:
                metadata_tasks.append(async_extract_html_text(html_url, session))
            else:
                # No PDF or HTML link: fall back to the feed summary (empty
                # text when the entry carries no summary either).
                summary_text = entry.get('summary', '').strip()
                metadata_tasks.append(asyncio.sleep(0, result=summary_text))

        # A single outer gather lets the BibTeX and full-text downloads
        # overlap; awaiting the two groups one after the other would only
        # start the second group once the first had finished.
        bibtex_entries, metadata_texts = await asyncio.gather(
            asyncio.gather(*bibtex_tasks),
            asyncio.gather(*metadata_tasks),
        )

        # Make bibtex_entries into dict form, keeping exactly one item per
        # paper. Entries without a parsable key (e.g. a failed download) or
        # with a repeated key get a unique fallback key; otherwise they would
        # overwrite each other and the dict would no longer line up with
        # important_information.
        bibtex_dict = {}
        for position, bibtex_entry in enumerate(bibtex_entries):
            citation_key = (extract_citation_key(bibtex_entry)
                            or f"unresolved_bibtex_{position}")
            while citation_key in bibtex_dict:
                citation_key = f"{citation_key}_{position}"
            bibtex_dict[citation_key] = bibtex_entry

        # Get the important information out
        synthesis_tasks = [async_retrieve_important(user_output, text)
                           for text in metadata_texts]
        important_information = await asyncio.gather(*synthesis_tasks)

        return bibtex_dict, important_information

user_query = input("Enter your search query for arXiv papers: ")
user_output = '''What is memory?
Required output format:
1. Introduction
2. Types of Memory
3. How Memory can be adaptive
4. How Memory schema is created
5. Future focus areas on memory
6. Conclusion'''
bibtex_dict, important_information = await search_arxiv(user_query, user_output)

consolidated_info = []
if bibtex_dict:
    for citationkey, info in zip(bibtex_dict.keys(), important_information):
        print(citationkey, info, sep = "\n")
        consolidated_info.append({"citationkey": citationkey, "Content": info})

Enter your search query for arXiv papers:  memory adaptive neuroscience


kabir2024deepreinforcementlearningtimescale
1. Introduction: Memory in this context is the mechanism by which agents encode, store, and retrieve temporal information. It is key to learning relationships between events (cause and effect) and is inspired by biological systems where timing is critical for survival and decision-making. 
2. Types of Memory: The text highlights different memory architectures used in deep reinforcement learning. Standard types include recurrent neural networks (RNNs) and Long Short-Term Memory networks (LSTMs), while the advanced, cognitively inspired version is the scale invariant memory (also known as CogRNN). The scale invariant memory uses a Laplace transform-based approach to generate a log-compressed representation of temporal history, resembling time cells found in mammalian brains. 
3. How Memory can be adaptive: Memory becomes adaptive through scale invariance. In these systems, when temporal relationships are rescaled, the memory representation shif

# Latex Format Display
- Install a LaTeX distribution that provides `pdflatex` and `biber` on your computer

In [134]:
# !pip install pdflatex

### Step 1: Create biblatex for references

In [5]:
# Concatenate every fetched BibTeX entry, one after another, into a single
# string that becomes the content of references.bib.
biblatex = '\n'.join(bibtex_dict.values())

In [6]:
# Write the combined BibTeX entries to references.bib. The encoding is set
# explicitly because the entries can contain non-ASCII author names and the
# platform default encoding is not guaranteed to represent them.
with open("references.bib", "w", encoding="utf-8") as f:
    f.write(biblatex)

### Step 2: Generate main.tex

In [29]:
# Ask the LLM for the report as LaTeX source. The first argument is the
# instruction text, which embeds the question/outline (user_output) and the
# full references.bib content (biblatex); the second argument is the list of
# per-paper extracts (consolidated_info). The reply is requested under the
# single key "Research Report".
# NOTE(review): the first/second arguments are presumably agentjo's system
# and user prompts -- confirm against the agentjo documentation.
res = await strict_json_async(f'''Generate a research report in latex format for the query: ```{user_output}```
If format is specified, follow format strictly. Use article format.
You must reference whenever possible. Use hyperref and biblatex.
Use as many references as possible for each section of the report.
Have detailed subsections to highlight the various viewpoints.

Print out references at the bottom. Use APA citation.
references.bib: ```{biblatex}```''',
        consolidated_info,
        output_format = {"Research Report": "In latex format, include citations, be as detailed as possible, type: code"},
        llm = llm)

In [30]:
# Replace every backslash followed by a literal TAB character with a backslash
# followed by the letter "t".
# NOTE(review): presumably this repairs LaTeX commands such as \textbf whose
# "\t" was decoded as a tab somewhere in the LLM/JSON round trip -- confirm.
report_latex = res["Research Report"].replace('\\\t', '\\t')

In [31]:
# Write the generated LaTeX source to main.tex. The encoding is set explicitly
# so that non-ASCII characters in the report do not depend on the platform
# default encoding.
with open("main.tex", "w", encoding="utf-8") as f:
    f.write(report_latex)

In [33]:
import subprocess

def _run_build_step(label, command):
    """Run one build command; on failure print its output and raise.

    Parameters:
        label (str): Human-readable name of the step, used in messages.
        command (list[str]): Executable and arguments passed to subprocess.run.

    Returns:
        subprocess.CompletedProcess: The finished process when it exited with 0.

    Raises:
        RuntimeError: When the command exits with a non-zero status.
    """
    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode != 0:
        print(f"Error during {label}:")
        # pdflatex reports its diagnostics on stdout, so show both streams.
        print(completed.stdout)
        print(completed.stderr)
        # Raise instead of calling exit(): this stops the cell with a clear
        # message without trying to terminate the interpreter.
        raise RuntimeError(f"{label} failed with exit code {completed.returncode}")
    return completed


# nonstopmode keeps pdflatex from waiting for keyboard input when the
# generated LaTeX contains an error.
pdflatex_command = ['pdflatex', '-interaction=nonstopmode', 'main.tex']

# Step 1: Run pdflatex to generate aux files.
result = _run_build_step("initial pdflatex compilation", pdflatex_command)

# Step 2: Run biber to process the bibliography.
# If the report uses plain BibTeX instead of biblatex + biber, change 'biber' to 'bibtex'.
result_bib = _run_build_step("biber compilation", ['biber', 'main'])

# Step 3: Run pdflatex again to resolve references.
result = _run_build_step("second pdflatex compilation", pdflatex_command)

# Run pdflatex one more time so remaining cross-references settle.
result = _run_build_step("final pdflatex compilation", pdflatex_command)

print("PDF generated successfully!")

PDF generated successfully!


# Markdown Format Display (alternate)

In [11]:
# Ask the LLM for the report as markdown with numbered, linked in-line
# citations. The instruction text embeds the question/outline (user_output)
# and the string form of bibtex_dict as the citation details; the per-paper
# extracts (consolidated_info) are passed as the second argument. The reply is
# requested under the single key "Research Report".
# NOTE(review): the first/second arguments are presumably agentjo's system
# and user prompts -- confirm against the agentjo documentation.
res = await strict_json_async(f'''Generate a research report in markdown format for the query: ```{user_output}```
If format is specified, follow format strictly.
You must do in-line citation with the [[1]], [[2]], [[3]] ... whenever possible. 
Link the citation url in the [[1]], [[2]], [[3]]
Use as many sources as possible for each section of the report
At the end of the report, list out all the sources using:
```[source_number]: APA citation```

Citation Details: ```{bibtex_dict}```''',
        consolidated_info,
        output_format = {"Research Report": "Include citations, be as detailed as possible, type: str"},
        llm = llm)

In [12]:
# Keep only the markdown text of the report from the structured LLM reply.
report = res["Research Report"]

In [13]:
from IPython.display import Markdown, display
# Render the markdown report inline in the notebook.
display(Markdown(report))

# What is Memory?

## 1. Introduction
Memory is a fundamental process present in both biological and artificial systems. It encompasses the mechanisms through which information—whether temporal events, spatial features, or sensory inputs—is encoded, stored, and later retrieved [[1]](https://arxiv.org/abs/2412.15292), [[2]](https://arxiv.org/abs/q-bio/0403025), [[8]](https://arxiv.org/abs/2006.12616). In biological systems, memory is intimately tied to the adaptability of neural circuits and learning processes, whilst in artificial intelligence, it informs model architectures that must both store past experiences and dynamically update to meet new contexts [[9]](https://arxiv.org/abs/2403.01518). This report examines these multiple aspects of memory across various systems and proposes potential future research directions.

## 2. Types of Memory
Memory manifests in several forms and architectures:

- **Temporal and Scale Invariant Memory:** Deep reinforcement learning models deploy mechanisms, such as recurrent neural networks (RNNs) and Long Short-Term Memory networks (LSTMs), with innovations like scale invariant (or log-compressed) representations of time, facilitating robust performance across varying temporal conditions [[1]](https://arxiv.org/abs/2412.15292).

- **Synaptic Memory in Neural Networks:** Biological systems model memory through synaptic plasticity. Processes like long-term potentiation (LTP) and long-term depression (LTD) enable networks to continuously reshape stored information, offering both stability for frequently used patterns and flexibility through synaptic weakening [[2]](https://arxiv.org/abs/q-bio/0403025), [[7]](https://arxiv.org/abs/0905.2125).

- **Volatile vs. Non-volatile Memory:** In self-organizing systems, memory can be volatile—requiring ongoing activity to maintain—or non-volatile, where information is embedded in the network connectivity even after removal of sustained activation [[5]](https://arxiv.org/abs/2303.12225).

- **Distributed Memory in Language Models:** Modern language models utilize two kinds of memory: one stored in the model's weights (long-term storage) and another in the transient activation states (context memory), each contributing differently to model performance [[9]](https://arxiv.org/abs/2403.01518).

- **Episodic and Integrated Memory:** In memory-augmented architectures, distinct episodic memories store separate, pattern-separated experiences which can later be integrated during inferential reasoning [[10]](https://arxiv.org/abs/2001.10913), as well as frameworks that use saliency-augmented memory for continual learning to prevent catastrophic forgetting [[6]](https://arxiv.org/abs/2212.13242).

## 3. How Memory can be Adaptive
Adaptivity is central to effective memory systems:

- **Scale Invariance:** In computational models, memory is made adaptive by ensuring that the representation of temporal history shifts proportionally when the time scale is altered, maintaining functionality across different environments without the need for readjustment [[1]](https://arxiv.org/abs/2412.15292).

- **Synaptic Plasticity and Selective Updating:** Biological networks adapt by modulating synaptic strengths; pathways responsible for errors are downregulated, while those that yield successful responses are reinforced, allowing quick reconfiguration in response to new information [[2]](https://arxiv.org/abs/q-bio/0403025), [[7]](https://arxiv.org/abs/0905.2125).

- **Dynamic Adaptation in Oscillator Networks:** Memory associated with potential landscapes can shift in response to external forcing, enabling the network to adapt its stored states dynamically to accommodate new patterns [[3]](https://arxiv.org/abs/2008.07448).

- **Online and Continuous Adaptation:** Approaches like dynamic evaluation enable large language models to update their weight memory during inference, thus effectively adapting to distribution shifts and extending their working context [[9]](https://arxiv.org/abs/2403.01518).

## 4. How Memory Schema is Created
Memory schema creation is a multi-step process across different systems:

- **Computational Transformations:** In deep learning, particularly for temporal data, schemas can be derived using mathematical tools such as the Laplace and inverse Laplace transform. This process builds a compressed yet sequential representation of past inputs, mirroring time cells observed in the mammalian brain [[1]](https://arxiv.org/abs/2412.15292).

- **Synaptic Organization:** In biological and bio-inspired networks, repetitive exposure to stimuli leads to the self-organization of neural circuits where bottom-up sensory information, lateral connections, and top-down feedback converge to form stable memory representations [[2]](https://arxiv.org/abs/q-bio/0403025), [[7]](https://arxiv.org/abs/0905.2125).

- **Potential Landscapes:** In oscillator networks, memory schemas emerge as valleys in amplitude or phase potential landscapes derived from differential equations. These landscapes offer stable attractors corresponding to remembered states, which can be recalled through associative dynamics [[3]](https://arxiv.org/abs/2008.07448).

- **Sparse and Salient Representations:** In continual learning models, schemas are constructed by filtering input data through saliency maps, retaining only the most informative features. These features are stored sparsely, and later reconstructed using inpainting methods to alleviate memory storage issues and address catastrophic forgetting [[6]](https://arxiv.org/abs/2212.13242).

- **Evolutionary Processes:** Evolutionary mechanisms enable the formation of memory in recurring processes, where neuroevolution helps shape networks that can sustain and recall short-term memories across multiple time steps [[4]](https://arxiv.org/abs/1204.3221).

## 5. Future Focus Areas on Memory
Future research on memory can be directed along several innovative paths:

- **Integration of Dynamic Discounting:** Investigating the integration of scale invariant temporal discounting with adaptive memory architectures to improve learning efficiency and reduce the need for hyperparameter tuning [[1]](https://arxiv.org/abs/2412.15292).

- **Optimizing Synaptic Balance:** Further exploration of methods to balance synaptic plasticity (LTP and LTD) to prevent issues like runaway potentiation, ensuring both adaptability and long-term stability in biological and artificial systems [[2]](https://arxiv.org/abs/q-bio/0403025).

- **Advanced Neuromorphic Systems:** Extending current methods by exploring global feedback, synchrony in neural oscillator models, and the impact of noise on memory stability could bridge the gap between theoretical models and practical neuromorphic applications [[3]](https://arxiv.org/abs/2008.07448), [[5]](https://arxiv.org/abs/2303.12225).

- **Enhanced Online Adaptation:** With large language models becoming ubiquitous, developing more efficient strategies for online adaptation while balancing computational costs and memory efficiency represents a key research challenge [[9]](https://arxiv.org/abs/2403.01518).

- **Scalable Memory Architectures:** For memory-augmented networks, future work should aim at scalable architectures that can handle longer sequences, deeper relationships, and multi-modal data while ensuring rapid retrieval and minimal interference [[10]](https://arxiv.org/abs/2001.10913), [[6]](https://arxiv.org/abs/2212.13242).

## 6. Conclusion
Memory is a dynamic, multifaceted attribute crucial for both biological cognition and artificial intelligence. From synaptic plasticity in biological systems to advanced computational schemas involving Laplace transforms and dynamic evaluation, memory enables systems to store, retrieve, and adapt information for improved decision-making and learning. The convergence of insights from neuroscience, dynamical systems, and machine learning suggests that future advances in memory research could lead to AI systems capable of continual learning and robust performance in changing environments.

---

### List of Sources

[1]: Banerjee, K. et al. (2024). Deep reinforcement learning with time-scale invariant memory. Retrieved from https://arxiv.org/abs/2412.15292

[2]: Wakeling, J. R. (2004). Adaptivity and `Per learning. Retrieved from https://arxiv.org/abs/q-bio/0403025

[3]: Hoppensteadt, F. (2020). A Frequency-Phase Potential for a Forced STNO Network: an Example of Evoked Memory. Retrieved from https://arxiv.org/abs/2008.07448

[4]: Lakhman, K., & Burtsev, M. (2012). Neuroevolution Results in Emergence of Short-Term Memory for Goal-Directed Behavior. Retrieved from https://arxiv.org/abs/1204.3221

[5]: Neves, F. S., & Timme, M. (2023). Volatile Memory Motifs: Minimal Spiking Neural Networks. Retrieved from https://arxiv.org/abs/2303.12225

[6]: Bai, G., Ling, C., Gao, Y., & Zhao, L. (2022). Saliency-Augmented Memory Completion for Continual Learning. Retrieved from https://arxiv.org/abs/2212.13242

[7]: Jitsev, J., & von der Malsburg, C. (2010). Experience-driven formation of parts-based representations in a model of layered visual memory. Retrieved from https://arxiv.org/abs/0905.2125

[8]: Schillaci, G., Miranda, L., & Schmidt, U. (2020). Prediction error-driven memory consolidation for continual learning. Retrieved from https://arxiv.org/abs/2006.12616

[9]: Rannen-Triki, A., Bornschein, J., Pascanu, R., Hutter, M., György, A., Galashov, A., Teh, Y. W., & Titsias, M. K. (2024). Revisiting Dynamic Evaluation: Online Adaptation for Large Language Models. Retrieved from https://arxiv.org/abs/2403.01518

[10]: Banino, A., Puigdomènech Badia, A., Köster, R., Chadwick, M. J., Zambaldi, V., Hassabis, D., Barry, C., Botvinick, M., Kumaran, D., & Blundell, C. (2020). MEMO: A Deep Network for Flexible Combination of Episodic Memories. Retrieved from https://arxiv.org/abs/2001.10913

# Convert Markdown to PDF

In [14]:
# Save the markdown report as UTF-8 text in main.md.
with open("main.md", mode="w", encoding="utf-8") as md_file:
    md_file.write(report)