### Installing required packages

In [None]:
%%capture
!pip3 install openai pandas sentence-transformers transformers superlinked==19.21.1

### Setting up the imports and initial library checks

In [None]:
import pandas as pd
import superlinked.framework as sl
from datetime import timedelta
from sentence_transformers import SentenceTransformer
from openai import OpenAI
import os
from abc import ABC, abstractmethod
from typing import Any, Optional, Dict
from tqdm import tqdm
from google.colab import userdata

# Abstract Tool Class
class Tool(ABC):
    """Common interface implemented by every agent tool.

    All three methods are abstract, so ``Tool`` itself cannot be instantiated;
    a subclass has to override each of them.
    """

    @abstractmethod
    def name(self) -> str:
        """Return the tool's identifier."""

    @abstractmethod
    def description(self) -> str:
        """Return a human-readable explanation of what the tool does."""

    @abstractmethod
    def use(self, *args, **kwargs) -> Any:
        """Run the tool with tool-specific arguments and return its result."""


# Resolve the OpenAI API key: prefer the Google Colab secret, then fall back to
# the OPENAI_API_KEY environment variable.
# userdata.get reports a missing or inaccessible secret through its own
# exception types (not KeyError), so those are the ones handled here.
try:
    api_key = userdata.get('OPENAI_API_KEY')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    api_key = None

# Only consult the environment when the secret did not supply a key, so a
# configured secret is never overwritten.
if not api_key:
    api_key = os.environ.get("OPENAI_API_KEY")

# No placeholder default: a missing key must fail here with a clear message
# rather than on the first API call.
if not api_key:
    raise ValueError(
        "OPENAI_API_KEY not found. Add it as a Colab secret (key icon in the left sidebar) "
        "or set the OPENAI_API_KEY environment variable."
    )

# Initialize OpenAI Client
client = OpenAI(api_key=api_key)
model = "gpt-4"

  from .autonotebook import tqdm as notebook_tqdm


### Downloading the relevant data

In [None]:
import pandas as pd

!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1FCR3TW5yLjGhEmm-Uclw0_5PWVEaLk1j' -O arxiv_ai_data.csv

--2025-03-13 22:39:51--  https://drive.google.com/uc?export=download&id=1FCR3TW5yLjGhEmm-Uclw0_5PWVEaLk1j
Resolving drive.google.com (drive.google.com)... 142.250.207.238
Connecting to drive.google.com (drive.google.com)|142.250.207.238|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1FCR3TW5yLjGhEmm-Uclw0_5PWVEaLk1j&export=download [following]
--2025-03-13 22:39:52--  https://drive.usercontent.google.com/download?id=1FCR3TW5yLjGhEmm-Uclw0_5PWVEaLk1j&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 142.250.193.225
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|142.250.193.225|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13941471 (13M) [application/octet-stream]
Saving to: ‘arxiv_ai_data.csv’


2025-03-13 22:40:13 (1.81 MB/s) - ‘arxiv_ai_data.csv’ saved [13941471/13941471]



### Data Loading and Truncation

To improve loading times, especially for users on the free Colab tier, the dataset is truncated to 100 rows by default. This means that only the first 100 entries from the original CSV file are loaded initially. You can change it to use the full dataset if you want!

In [None]:
# Parse only the first 100 rows instead of loading the whole file and then
# discarding everything after row 100.
df = pd.read_csv('arxiv_ai_data.csv', nrows=100)
df.head()

Unnamed: 0,authors,categories,comment,doi,entry_id,journal_ref,pdf_url,primary_category,published,summary,title,updated
0,[arxiv.Result.Author('M. L. Ginsberg')],['cs.AI'],See http://www.jair.org/ for an online appendi...,,http://arxiv.org/abs/cs/9308101v1,"Journal of Artificial Intelligence Research, V...",http://arxiv.org/pdf/cs/9308101v1,cs.AI,1993-08-01 00:00:00+00:00,Because of their occasional need to return to ...,Dynamic Backtracking,1993-08-01 00:00:00+00:00
1,[arxiv.Result.Author('M. P. Wellman')],['cs.AI'],See http://www.jair.org/ for any accompanying ...,,http://arxiv.org/abs/cs/9308102v1,"Journal of Artificial Intelligence Research, V...",http://arxiv.org/pdf/cs/9308102v1,cs.AI,1993-08-01 00:00:00+00:00,Market price systems constitute a well-underst...,A Market-Oriented Programming Environment and ...,1993-08-01 00:00:00+00:00
2,"[arxiv.Result.Author('I. P. Gent'), arxiv.Resu...",['cs.AI'],See http://www.jair.org/ for any accompanying ...,,http://arxiv.org/abs/cs/9309101v1,"Journal of Artificial Intelligence Research, V...",http://arxiv.org/pdf/cs/9309101v1,cs.AI,1993-09-01 00:00:00+00:00,We describe an extensive study of search in GS...,An Empirical Analysis of Search in GSAT,1993-09-01 00:00:00+00:00
3,"[arxiv.Result.Author('F. Bergadano'), arxiv.Re...",['cs.AI'],See http://www.jair.org/ for any accompanying ...,,http://arxiv.org/abs/cs/9311101v1,"Journal of Artificial Intelligence Research, V...",http://arxiv.org/pdf/cs/9311101v1,cs.AI,1993-11-01 00:00:00+00:00,As real logic programmers normally use cut (!)...,The Difficulties of Learning Logic Programs wi...,1993-11-01 00:00:00+00:00
4,"[arxiv.Result.Author('J. C. Schlimmer'), arxiv...",['cs.AI'],See http://www.jair.org/ for an online appendi...,,http://arxiv.org/abs/cs/9311102v1,"Journal of Artificial Intelligence Research, V...",http://arxiv.org/pdf/cs/9311102v1,cs.AI,1993-11-01 00:00:00+00:00,To support the goal of allowing users to recor...,Software Agents: Completing Patterns and Const...,1993-11-01 00:00:00+00:00


In [None]:
# List the column names of the loaded frame
df.columns

Index(['authors', 'categories', 'comment', 'doi', 'entry_id', 'journal_ref',
       'pdf_url', 'primary_category', 'published', 'summary', 'title',
       'updated'],
      dtype='object')

In [None]:
# Load the truncated dataset and derive the fields used for indexing.
# Written as a single chain starting from the raw CSV so re-running the cell
# always produces the same frame.
df = (
    pd.read_csv('arxiv_ai_data.csv', nrows=100)
    .assign(
        # Parse to timezone-aware datetimes, normalised to UTC
        published=lambda frame: pd.to_datetime(frame['published'], utc=True),
        # Missing values become empty strings; a plain astype(str) would turn
        # them into the literal text "nan"
        title=lambda frame: frame['title'].fillna('').astype(str),
        summary=lambda frame: frame['summary'].fillna('').astype(str),
    )
    # Add 'text' column for similarity search (title and summary are both
    # guaranteed to be strings at this point)
    .assign(text=lambda frame: frame['title'] + " " + frame['summary'])
)

In [None]:
def _print_papers(heading, papers):
    """Print `heading`, then the publication date and title of each row in `papers`."""
    divider = "-" * 50
    print(heading)
    print(divider)
    for published, title in zip(papers['published'], papers['title']):
        print(f"Date: {published.strftime('%Y-%m-%d')}")
        print(f"Title: {title}")
        print(divider)


# Newest papers first
df_sorted = df.sort_values(by='published', ascending=False)

# Top of the sorted frame holds the most recent papers, the bottom the oldest
_print_papers("5 Most Recent Papers:", df_sorted.head())
_print_papers("\n5 Oldest Papers:", df_sorted.tail())

5 Most Recent Papers:
--------------------------------------------------
Date: 2011-06-30
Title: Phase Transitions and Backbones of the Asymmetric Traveling Salesman Problem
--------------------------------------------------
Date: 2011-06-30
Title: A Comprehensive Trainable Error Model for Sung Music Queries
--------------------------------------------------
Date: 2011-06-30
Title: Finding Approximate POMDP solutions Through Belief Compression
--------------------------------------------------
Date: 2011-06-30
Title: Ordered Landmarks in Planning
--------------------------------------------------
Date: 2011-06-30
Title: On Prediction Using Variable Order Markov Models
--------------------------------------------------

5 Oldest Papers:
--------------------------------------------------
Date: 1993-11-01
Title: Software Agents: Completing Patterns and Constructing User Interfaces
--------------------------------------------------
Date: 1993-11-01
Title: The Difficulties of Learning Logic

## Understanding Time-Aware Paper Discovery with RecencySpace

When searching through research papers, finding relevant content is only half the story - knowing when it was published can be just as important. That's why we've implemented a smart time-aware search system using RecencySpace. Imagine you're organizing your digital library: you want to find papers not just by what they're about, but also by when they were written. Our system does exactly that, but automatically and intelligently.

We've set up our search to look at both what a paper contains and when it was published. Using several time windows (in this notebook: one, two, and three years), our system can prioritize papers based on their publication dates while still keeping track of their relevance to your search. It's like having a research assistant who knows exactly how to balance the importance of time and content in your search results.

For our collection of AI research papers this time-aware approach helps us understand how ideas evolved during these foundational years. When you search for topics like "quantum computing" or "neural networks," the system doesn't just find papers containing these terms - it also considers their place in the timeline of AI development. This makes it easier to trace how concepts developed and changed over time, giving us a clearer picture of AI's historical progression.

In [None]:
class PaperSchema(sl.Schema):
    """Superlinked schema for one paper record ingested from the DataFrame."""

    # Title and summary joined into one string; this is the field the text space reads
    text: sl.String
    # Publication time; the recency space reads this field.
    # NOTE(review): the DataFrame column holds pandas datetimes — confirm the parser
    # converts them into whatever sl.Timestamp expects.
    published: sl.Timestamp
    # Record identifier, fed from the 'entry_id' column
    entry_id: sl.IdField
    title: sl.String
    summary: sl.String

paper = PaperSchema()

# Define spaces

# Text similarity over `paper.text`, passed through sl.chunk (chunk_size=200,
# chunk_overlap=50) and embedded with the all-mpnet-base-v2 model.
text_space = sl.TextSimilaritySpace(
    text=sl.chunk(paper.text, chunk_size=200, chunk_overlap=50),
    model="sentence-transformers/all-mpnet-base-v2"
)
# Recency over `paper.published`, configured with three period lengths.
# NOTE(review): the printed sample shows papers dated 1993–2011. If recency is
# measured against the current time, every paper presumably falls outside all
# three periods and only receives `negative_filter` — confirm, and consider
# longer periods or an explicit reference time.
recency_space = sl.RecencySpace(
    timestamp=paper.published,
    period_time_list=[
        sl.PeriodTime(timedelta(days=365)),      # 1-year period
        sl.PeriodTime(timedelta(days=2*365)),    # 2-year period
        sl.PeriodTime(timedelta(days=3*365)),    # 3-year period
    ],
    negative_filter=-0.25
)

### Creating the index and ingesting the data with Superlinked's in-memory executor


In [None]:
# Index combining the text-similarity and recency spaces
paper_index = sl.Index([text_space, recency_space])

# Schema field -> DataFrame column that feeds it
field_to_column = {
    paper.entry_id: "entry_id",
    paper.published: "published",
    paper.text: "text",
    paper.title: "title",
    paper.summary: "summary",
}
parser = sl.DataFrameParser(paper, mapping=field_to_column)

# In-memory source and executor; the app must be running before data is put
source = sl.InMemorySource(paper, parser=parser)
executor = sl.InMemoryExecutor(sources=[source], indices=[paper_index])
app = executor.run()

# Feed the DataFrame in fixed-size slices so a progress bar can be shown
batch_size = 10
batch_starts = range(0, len(df), batch_size)
data_batches = [df[start:start + batch_size] for start in batch_starts]
progress = tqdm(data_batches, total=len(data_batches), desc="Loading Data into Source")
for batch in progress:
    source.put([batch])

### Query definition

In [None]:
# Weights for each space are left as named parameters, filled in at query time
space_weights = {
    text_space: sl.Param("relevance_weight"),
    recency_space: sl.Param("recency_weight"),
}

# Fields returned for every hit
selected_fields = (paper.entry_id, paper.published, paper.text, paper.title, paper.summary)

# Similarity search on the text space; search text and result limit are parameters too
knowledgebase_query = (
    sl.Query(paper_index, weights=space_weights)
    .find(paper)
    .similar(text_space, sl.Param("search_query"))
    .select(*selected_fields)
    .limit(sl.Param("limit"))
)

### Defining the tools for the kernel agent

### Retrieval Tool

In [None]:
class RetrievalTool(Tool):
    """Finds papers relevant to a free-text query by running the Superlinked query."""

    def __init__(self, df, app, knowledgebase_query, client, model):
        """Keep references to the data, the running app and the query definition.

        Args:
            df: DataFrame of papers (stored; not read by this tool).
            app: Object whose ``query`` method executes ``knowledgebase_query``.
            knowledgebase_query: Query definition passed to ``app.query``.
            client: LLM client (stored; not read by this tool).
            model: LLM model name (stored; not read by this tool).
        """
        self.df = df
        self.app = app
        self.knowledgebase_query = knowledgebase_query
        self.client = client
        self.model = model

    def name(self) -> str:
        """Return the tool's identifier."""
        return "RetrievalTool"

    def description(self) -> str:
        """Return a one-sentence description of the tool."""
        return "Retrieves a list of relevant papers based on a query using Superlinked."

    def use(
        self,
        query: str,
        limit: int = 5,
        relevance_weight: float = 1.0,
        recency_weight: float = 0.5,
    ) -> pd.DataFrame:
        """Run the knowledge-base query and return the hits as a DataFrame.

        Args:
            query: Free-text search string.
            limit: Maximum number of papers to return (default 5).
            relevance_weight: Weight passed for the text-similarity space (default 1.0).
            recency_weight: Weight passed for the recency space (default 0.5).

        Returns:
            DataFrame produced by ``sl.PandasConverter.to_pandas``; when it has a
            'summary' column, that column is guaranteed to hold strings.

        Raises:
            ValueError: If ``limit`` is smaller than 1.
        """
        if limit < 1:
            raise ValueError("limit must be at least 1")

        result = self.app.query(
            self.knowledgebase_query,
            relevance_weight=relevance_weight,
            recency_weight=recency_weight,
            search_query=query,
            limit=limit
        )
        df_result = sl.PandasConverter.to_pandas(result)
        if 'summary' in df_result.columns:
            # Missing summaries become "" instead of the literal text "nan"
            df_result['summary'] = df_result['summary'].fillna("").astype(str)
        else:
            print("Warning: 'summary' column not found in retrieved DataFrame.")
        return df_result

### Summarization Tool

In [None]:
class SummarizationTool(Tool):
    """LLM-backed tool that condenses the stored summaries of selected papers."""

    def __init__(self, df, client, model):
        """Store the paper DataFrame, the chat client and the model name."""
        self.df, self.client, self.model = df, client, model

    def name(self) -> str:
        """Return the tool's identifier."""
        return "SummarizationTool"

    def description(self) -> str:
        """Return a one-sentence description of the tool."""
        return "Generates a concise summary of specified papers using an LLM."

    def use(self, query: str, paper_ids: list) -> str:
        """Summarize the papers whose 'entry_id' appears in `paper_ids`.

        `query` is accepted for interface compatibility but is not used when
        building the prompt. Returns the model's reply with surrounding
        whitespace removed, or a fixed message when no paper matches.
        """
        id_matches = self.df['entry_id'].isin(paper_ids)
        selected = self.df[id_matches]
        if selected.empty:
            return "No papers found with the given IDs."

        summary_str = "\n\n".join(selected['summary'].tolist())
        prompt = f"""
        Summarize the following paper summaries:\n\n{summary_str}\n\nProvide a concise summary.
        """
        request = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.7,
            "max_tokens": 500,
        }
        completion = self.client.chat.completions.create(**request)
        first_choice = completion.choices[0]
        return first_choice.message.content.strip()

### Question Answering Tool

In [None]:
class QuestionAnsweringTool(Tool):
    """Answers a question with an LLM, grounded in retrieved paper summaries when any exist."""

    def __init__(self, retrieval_tool, client, model):
        """Store the retrieval tool used for context, the chat client and the model name."""
        self.retrieval_tool = retrieval_tool
        self.client = client
        self.model = model

    def name(self) -> str:
        """Return the tool's identifier."""
        return "QuestionAnsweringTool"

    def description(self) -> str:
        """Return a one-sentence description of the tool."""
        return "Answers questions about research topics using retrieved paper summaries or general knowledge if no specific context is available."

    def use(self, query: str) -> str:
        """Answer `query`.

        Papers are retrieved for the query first. If that yields at least one
        non-blank summary, the model is asked to answer from those summaries;
        otherwise (no 'summary' column, no rows, or only blank summaries) the
        question is sent as a general-knowledge question, as the tool's
        description promises.

        Returns:
            The model's reply with surrounding whitespace removed ("" if the
            reply carries no content).
        """
        df_result = self.retrieval_tool.use(query)

        # Collect usable context; an empty result must not produce a
        # "grounded" prompt that has nothing to ground on.
        contexts = []
        if 'summary' in df_result.columns:
            contexts = [
                str(summary)
                for summary in df_result['summary'].dropna().tolist()
                if str(summary).strip()
            ]

        if contexts:
            # Use paper summaries for specific context
            context_str = "\n\n".join(contexts)
            prompt = f"""
            You are a research assistant. Use the following paper summaries to answer the user's question. If you don't know the answer based on the summaries, say 'I don't know.'

            Paper summaries:
            {context_str}

            User's question: {query}
            """
        else:
            # Tag as a general question when no summaries are available
            prompt = f"""
            You are a knowledgeable research assistant. This is a general question tagged as [GENERAL]. Answer based on your broad knowledge, not limited to specific paper summaries. If you don't know the answer, provide a brief explanation of why.

            User's question: {query}
            """

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=500
        )
        # content can be None; treat that as an empty answer instead of crashing
        return (response.choices[0].message.content or "").strip()

### Setting up the kernel agent

In [None]:
class KernelAgent:
    """Classifies a user prompt with the LLM and routes it to the matching tool."""

    def __init__(self, retrieval_tool: "RetrievalTool", summarization_tool: "SummarizationTool", question_answering_tool: "QuestionAnsweringTool", client, model):
        """Store the three tools plus the chat client and model used for classification."""
        self.retrieval_tool = retrieval_tool
        self.summarization_tool = summarization_tool
        self.question_answering_tool = question_answering_tool
        self.client = client
        self.model = model

    def classify_query(self, query: str) -> str:
        """Ask the LLM which category `query` belongs to.

        Returns:
            The model's answer lower-cased, with surrounding whitespace, quotes
            and punctuation removed and inner spaces/hyphens turned into
            underscores. Expected values are 'retrieval', 'summarization',
            'question_answering' or 'unknown', but any other reply is passed
            through in normalized form.
        """
        prompt = f"""
        Classify the following user prompt into one of the three categories:
        - retrieval: The user wants to find a list of papers based on some criteria (e.g., 'Find papers on AI ethics from 2020').
        - summarization: The user wants to summarize a list of papers (e.g., 'Summarize papers with entry_id 123, 456, 789').
        - question_answering: The user wants to ask a question about research topics and get an answer (e.g., 'What is the latest development in AI ethics?').

        User prompt: {query}

        Respond with only the category name (retrieval, summarization, question_answering).
        If unsure, respond with 'unknown'.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,  # routing should not vary between identical prompts
            max_tokens=10
        )
        # content can be None; replies such as '"Retrieval."' must still match
        raw_label = response.choices[0].message.content or ""
        classification = raw_label.strip().lower().strip(" \t\n\"'`.,:;!")
        classification = classification.replace("-", "_").replace(" ", "_")
        print(f"Query type: {classification}")
        return classification

    def process_query(self, query: str, params: Optional[Dict] = None) -> str:
        """Classify `query` and return the response of the matching tool.

        Args:
            query: The user's prompt.
            params: Optional extra inputs; summarization requires a 'paper_ids'
                entry holding a list of entry_ids.

        Returns:
            A formatted list of papers, a summary, an answer, or an error
            message when the query cannot be classified or inputs are missing.
        """
        query_type = self.classify_query(query)
        if query_type == 'retrieval':
            df_result = self.retrieval_tool.use(query)
            if df_result.empty:
                return "No matching papers were found."
            lines = ["Here are the top papers:\n"]
            # Number by position, not by index label, so any index still yields 1..n
            for position, (_, row) in enumerate(df_result.iterrows(), start=1):
                raw_summary = row.get('summary')
                # Ensure summary is a string and handle missing values
                summary = str(raw_summary) if pd.notna(raw_summary) else ""
                lines.append(f"{position}. {row['title']} \nSummary: {summary[:200]}...\n\n")
            return "".join(lines)
        elif query_type == 'summarization':
            if not params or 'paper_ids' not in params:
                return "Error: Summarization query requires a 'paper_ids' parameter with a list of entry_ids."
            return self.summarization_tool.use(query, params['paper_ids'])
        elif query_type == 'question_answering':
            return self.question_answering_tool.use(query)
        else:
            return "Error: Unable to classify query as 'retrieval', 'summarization', or 'question_answering'."

In [None]:
# Build the three tools; the question-answering tool reuses the retrieval tool for context
retrieval_tool = RetrievalTool(
    df=df,
    app=app,
    knowledgebase_query=knowledgebase_query,
    client=client,
    model=model,
)
summarization_tool = SummarizationTool(df=df, client=client, model=model)
question_answering_tool = QuestionAnsweringTool(
    retrieval_tool=retrieval_tool,
    client=client,
    model=model,
)

# Agent that classifies each prompt and dispatches it to one of the tools
kernel_agent = KernelAgent(
    retrieval_tool=retrieval_tool,
    summarization_tool=summarization_tool,
    question_answering_tool=question_answering_tool,
    client=client,
    model=model,
)

In [None]:
# Test query: a "find papers" prompt, which the recorded run classified as 'retrieval'
print(kernel_agent.process_query("Find papers on quantum computing in last 10 years"))

Query type: retrieval
Here are the top papers:
1. Quantum Computing and Phase Transitions in Combinatorial Search 
Summary: We introduce an algorithm for combinatorial search on quantum computers that
is capable of significantly concentrating amplitude into solutions for some NP
search problems, on average. This is done by...

2. The Road to Quantum Artificial Intelligence 
Summary: This paper overviews the basic principles and recent advances in the emerging
field of Quantum Computation (QC), highlighting its potential application to
Artificial Intelligence (AI). The paper provi...

3. Solving Highly Constrained Search Problems with Quantum Computers 
Summary: A previously developed quantum search algorithm for solving 1-SAT problems in
a single step is generalized to apply to a range of highly constrained k-SAT
problems. We identify a bound on the number o...

4. The model of quantum evolution 
Summary: This paper has been withdrawn by the author due to extremely unscientific
errors.

In [None]:
# Summarization needs the target papers passed explicitly as entry_ids via `params`
print(kernel_agent.process_query("Summarize this paper", params={"paper_ids": ["http://arxiv.org/abs/cs/9311101v1"]}))

Query type: summarization
The paper discusses the challenges of learning logic programs that contain the cut predicate. It suggests a method of first generating a base program without cut and then adding cut where necessary to make it consistent with the positive examples. The paper concludes that learning programs with cut is difficult due to the need for intensional evaluation, and current induction techniques may need to be restricted to purely declarative logic languages.
