In [5]:
import pandas as pd
import numpy as np
import json
import os
import re

from ollama import chat
from ollama import ChatResponse

In [60]:
MODEL_NAME = 'qwen3:4b' # qwen3:4b, qwen3:8b, qwen3:14b

# The embeddings and the dataframe created and saved in Part 1
PATH_TO_EMBEDS = 'compressed_array.npz'
PATH_TO_DF = 'compressed_dataframe.csv.gz'

# Define the API clients

In [None]:
from dotenv import load_dotenv
import os

load_dotenv(os.path.expanduser("~/Desktop/dot-env-api-keys/my-api-keys.env"))

TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")

In [18]:
# Tavily web search

from tavily import TavilyClient

tavily_client = TavilyClient(api_key=TAVILY_API_KEY)

# Helper functions

In [66]:
def initialize_message_history(system_message):

    message_history = [
                        {
                            "role": "system",
                            "content": system_message
                        }
                    ]

    return message_history

In [64]:
def create_message_history(system_message, user_input):

    """
    Create a message history messages list.
    Args:
        system_message (str): The system message
        user_query (str): The user input
    Returns:
        A list of dicts in OpenAi chat format
    """

    message_history = [
                        {
                            "role": "system",
                            "content": system_message
                        },
                        {
                            "role": "user",
                            "content": user_input
                        }
                    ]

    return message_history



In [65]:
def initialize_message_history(system_message):

    """
    Create a message history messages list.
    Args:
        system_message (str): The system message
        user_query (str): The user input
    Returns:
        A list of dicts in OpenAi chat format
    """

    message_history = [
                        {
                            "role": "system",
                            "content": system_message
                        }
                    ]

    return message_history



In [22]:
# Function to separate the thinking and the response

def process_response(text):
    
    text1 = text.split('</think>')[0]
    text2 = text.split('</think>')[1]
    
    thinking_text = text1 + '</think>'
    response_text = text2.strip()

    return thinking_text, response_text

## System message

In [23]:
chat_agent_system_message = f"""
You are a helpful research assistant.

Your knowledge cutoff date: October 2023
Current date: August 2025

1. You provide polite answers to simple questions.
If the user's input requires only a simple answer, then output your answer as JSON.

Example session:

Question: Hello. How are you?

You output:

{{
"Answer": "I'm fine, thanks.",
"Status": "DONE"
}}

2. You can also run in a loop of Thought, Action, PAUSE, Observation.
At the end of the loop, you output an Answer.
Use Thought to describe your thoughts about the question you have been asked.
Use Action to run one of the actions available to you - then return PAUSE.
Observation will be the result of running those actions.
Output your response as a JSON string.

Your available actions are:

find_arxiv_research_papers:
e.g. find_arxiv_research_papers: [list of search keywords and phrases for a RAG search of the ArXiv database.]
Returns research papers from the ArXiv database.

run_web_search:
e.g. run_web_search: [list of search keywords and phrases for a web search]
Returns text content from search results.

You can only call one action at a time.

Example session:

Question: What are the latest techniques for detecting Pneumonia on x-rays using AI?
{{
"Thought": "I should look for relevant research papers in the ArXiv database by using find_arxiv_research_papers.",
"Action": {{"function":"find_arxiv_research_papers", "input": ["Pneumonia detection with AI", "Computer vision", "Object detection"]}},
"Status": "PAUSE"
}}

You will be called again with this:

Observation: <results>A list of research papers and their content</results>

You then output:
{{
"Answer": "Your final report.",
"Status": "DONE"
}}
""".strip()


# Set up the LLM

In [24]:
def make_llm_api_call(message_history):

    model_name = 'qwen3:4b'

    response: ChatResponse = chat(model=model_name, 
                                  messages=message_history,
                                )

    output_text = response['message']['content']

    thinking_text, response_text = process_response(output_text)

    return response_text


# Example

system_message = "Your name is Molly."
user_message = "What's your name?"

message_history = create_message_history(system_message, user_message)

response_text = make_llm_api_call(message_history)

print(response_text)

#print(response['message']['content'])

My name is Molly. How can I assist you today? 😊


## Set up the tools



### ArXiv RAG search tool

In [25]:
def run_faiss_search(query_text, top_k):
    
    # Run FAISS exhaustive search
    
    query = [query_text]

    # Vectorize the query string
    query_embedding = sent_model.encode(query)

    # Run the query
    # index_vals refers to the chunk_list index values
    scores, index_vals = faiss_index.search(query_embedding, top_k)
    
    # Get the list of index vals
    index_vals_list = index_vals[0]
    
    return index_vals_list
    

def run_rerank(index_vals_list, query_text):
    
    chunk_list = list(df_data['prepared_text'])

    # Replace the chunk index values with the corresponding strings
    pred_strings_list = [chunk_list[item] for item in index_vals_list]

    # Format the input for the cross encoder
    # The input to the cross_encoder is a list of lists
    # [[query_text, pred_text1], [query_text, pred_text2], ...]

    cross_input_list = []

    for item in pred_strings_list:

        new_list = [query_text, item]

        cross_input_list.append(new_list)


    # Put the pred text into a dataframe
    df = pd.DataFrame(cross_input_list, columns=['query_text', 'pred_text'])

    # Save the orginal index (i.e. df_data index values)
    df['original_index'] = index_vals_list

    # Now, score all retrieved passages using the cross_encoder
    cross_scores = cross_encoder.predict(cross_input_list)

    # Add the scores to the dataframe
    df['cross_scores'] = cross_scores

    # Sort the DataFrame in descending order based on the scores
    df_sorted = df.sort_values(by='cross_scores', ascending=False)
    
    # Reset the index (*This was missed previously*)
    df_sorted = df_sorted.reset_index(drop=True)

    pred_list = []

    for i in range(0,len(df_sorted)):

        text = df_sorted.loc[i, 'pred_text']

        # Get the arxiv id
        # original_index refers to the index values in df_filtered
        original_index = df_sorted.loc[i, 'original_index']
        arxiv_id = df_data.loc[original_index, 'id']
        cat_text = df_data.loc[original_index, 'cat_text']
        title = df_data.loc[original_index, 'title']

        # Crete the link to the research paper pdf
        link_to_pdf = f'https://arxiv.org/pdf/{arxiv_id}'

        item = {
            'arxiv_id': arxiv_id,
            'link_to_pdf': link_to_pdf,
            'cat_text': cat_text,
            'title': title,
            'abstract': text
        }

        pred_list.append(item)

    return pred_list


def print_search_results(pred_list, num_results_to_print):
    
    for i in range(0,num_results_to_print):
        
        pred_dict = pred_list[i]
        
        link_to_pdf = pred_dict['link_to_pdf']
        abstract = pred_dict['abstract']
        cat_text = pred_dict['cat_text']
        title = pred_dict['title']

        print('Title:',title)
        print('Categories:',cat_text)
        print('Abstract:',abstract)
        print('Link to pdf:',link_to_pdf)
        print()
    
   
def run_arxiv_search(query_text, top_k=50):
    
    # Run a faiss greedy search
    pred_index_list = run_faiss_search(query_text, top_k)

    # This returns a list of dicts with length equal to top_k
    pred_list = run_rerank(pred_index_list, query_text)
    
    # Print the results
    #print_search_results(pred_list, num_results_to_print)
    
    return pred_list
    

In [28]:
# Load the compressed array
embeddings = np.load(PATH_TO_EMBEDS)

# Access the array by the name you specified ('my_array' in this case)
embeddings = embeddings['array_data']

embeddings.shape

(2421966, 384)

In [29]:
# Load the compressed DataFrame

df_data = pd.read_csv(PATH_TO_DF, compression='gzip')

print(df_data.shape)

df_data.head()

(2421966, 6)


  df_data = pd.read_csv(PATH_TO_DF, compression='gzip')


Unnamed: 0,id,title,abstract,categories,cat_text,prepared_text
0,704.0001,Calculation of prompt diphoton production cros...,A fully differential calculation in perturbati...,hep-ph,High Energy Physics - Phenomenology,Calculation of prompt diphoton production cros...
1,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pe...",math.CO cs.CG,"Combinatorics, Computational Geometry",Sparsity-certifying Graph Decompositions {titl...
2,704.0003,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is describe...,physics.gen-ph,General Physics,The evolution of the Earth-Moon system based o...
3,704.0004,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle n...,math.CO,Combinatorics,A determinant of Stirling cycle numbers counts...
4,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\Lam...,math.CA math.FA,"Classical Analysis and ODEs, Functional Analysis",From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...


In [30]:
# Initialize FAISS

import faiss

embed_length = embeddings.shape[1]

faiss_index = faiss.IndexFlatL2(embed_length)

# Add the embeddings to the index
faiss_index.add(embeddings)

faiss_index.is_trained

True

In [33]:
# Initialize sentence_transformers

from sentence_transformers import SentenceTransformer

sent_model = SentenceTransformer("all-MiniLM-L6-v2")

In [34]:
# Initialize the cross_encoder for reranking

from sentence_transformers import CrossEncoder

# We use a cross-encoder, to re-rank the results list to improve the quality
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [35]:
# Example

query_text = """
I want to build an invisibility cloak like the one in Harry Potter.
"""


# RUN THE SEARCH
num_results_to_print = 20 # top_k = 300
pred_list = run_arxiv_search(query_text, top_k=5)

In [36]:
pred_list[0]

{'arxiv_id': 1101.0904,
 'link_to_pdf': 'https://arxiv.org/pdf/1101.0904',
 'cat_text': 'Classical Physics',
 'title': "Harry Potter's Cloak",
 'abstract': 'Harry Potter\'s Cloak {title} The magic "Harry Potter\'s cloak" has been the dream of human beings for really long time. Recently, transformation optics inspired from the advent of metamaterials offers great versatility for manipulating wave propagation at will to create amazing illusion effects. In the present work, we proposed a novel transformation recipe, in which the cloaking shell somehow behaves like a "cloaking lens", to provide almost all desired features one can expect for a real magic cloak. The most exciting feature of the current recipe is that an object with arbitrary characteristics (e.g., size, shape or material properties) can be invisibilized perfectly with positive-index materials, which significantly benefits the practical realization of a broad-band cloaking device fabricated with existing materials. Moreover, 

### Tavily web search

In [37]:
def run_tavily_search(query, num_results=10):

    """
    Uses the Tavily API to run a web search
    Args:
        query (str): The user query
        num_results (int): Num search results
    Returns:
        tav_response (json string): The search results in json format
    """

    # For basic search:
    tav_response = tavily_client.search(query=query, max_results=num_results)

    return tav_response


# Example

query = "Who is the current UK Prime Minister?"

results = run_tavily_search(query, num_results=2)

print(results)

{'query': 'Who is the current UK Prime Minister?', 'follow_up_questions': None, 'answer': None, 'images': [], 'results': [{'url': 'https://en.wikipedia.org/wiki/Prime_Minister_of_the_United_Kingdom', 'title': 'Prime Minister of the United Kingdom - Wikipedia', 'content': "Prime Minister of the United Kingdom ; Incumbent Keir Starmer. since 5 July 2024 ; Government of the United Kingdom · Prime Minister's Office", 'score': 0.87782305, 'raw_content': None}, {'url': 'https://www.ox.ac.uk/about/oxford-people/british-prime-ministers', 'title': 'British Prime Ministers | University of Oxford', 'content': 'British Prime Ministers ; Sir Keir Starmer (1962- ), St Edmund Hall, Jul 2024-, Labour ; Rishi Sunak (1980- ), Lincoln, Oct 2022-Jul 2024, Conservative.', 'score': 0.8051581, 'raw_content': None}], 'response_time': 0.9, 'request_id': '6e71a307-c656-4865-9ec3-326dc5173277'}


# Set up the Agents

In [46]:
def run_chat_agent(message_history):

    model_name = MODEL_NAME

    response: ChatResponse = chat(model=model_name, 
                                  messages=message_history,
                                )

    output_text = response['message']['content']

    thinking_text, response_text = process_response(output_text)

    print(thinking_text)

    return response_text


# Example

user_message = "Hello"

message_history = create_message_history(chat_agent_system_message, user_message)

response_text = run_chat_agent(message_history)

print(response_text)

#print(response['message']['content'])

<think>
Okay, the user said "Hello". I need to respond politely. Let me check the guidelines. The first example shows a simple answer. So I should just say "I'm fine, thanks." and mark it as DONE. No need for any actions here. Just a straightforward reply.
</think>
{
"Answer": "I'm fine, thanks.",
"Status": "DONE"
}


In [40]:
def run_router_agent(llm_response):

    """
    Route to web search or not.
    Args:
        state (dict): The current graph state
    Returns:
        str: Next node to call
    """

    print("---ROUTER AGENT---")

    # Extract the status
    json_response = json.loads(llm_response)
    status = json_response['Status']
    
    print("Status:", status)

    if status == 'PAUSE':
        print("Route: to_research_agent")
        return "to_research_agent"

    else:
        print("Route: to_final_answer")
        return "to_final_answer"
            



# Example

user_message = "hello"

message_history = create_message_history(chat_agent_system_message, user_message)

# Prompt the chat_agent
response = run_chat_agent(message_history)

# Run router_agent
route = run_router_agent(response)

print(route)

---ROUTER AGENT---
Status: DONE
Route: to_final_answer
to_final_answer


In [41]:
def run_research_agent(llm_response):

    print("---RESEARCH AGENT---")

    # Extract the status
    json_response = json.loads(llm_response)
    action_dict = json_response['Action']
    func_to_run = action_dict['function']
    func_input_list = action_dict['input']
    
    answer_list = []

    if func_to_run == "find_arxiv_research_papers":
        for search_query in func_input_list:
            answer = run_arxiv_search(search_query, top_k=5)
            answer_list.append(answer)
    else:
        for search_query in func_input_list:
            answer = run_tavily_search(search_query, num_results=5)
            answer_list.append(answer)

    print("func_to_run:", func_to_run)
    print("func_arg:", func_input_list)
    print("Output:", answer_list)

    return answer_list



# Example

user_message = "Has OpenAi released any new open source models?"

message_history = create_message_history(chat_agent_system_message, user_message)

# Prompt the chat_agent
response = run_chat_agent(message_history)

# Run router_agent
route = run_router_agent(response)


if route == "to_research_agent":
    answer = run_research_agent(response)

    # Update message history
    #message = {"role": "user", "content": f"Observation: {answer}"}
    #message_history.append(message)



---ROUTER AGENT---
Status: DONE
Route: to_final_answer


In [57]:
def check_output_type(output):
    try:
        json.loads(output)
        return "is_json"
    except json.JSONDecodeError:
        return "is_plain_text"

In [58]:
def run_final_answer_agent(llm_response):

    # Check if the output is JSON
    output_type = check_output_type(llm_response)

    print("---FINAL ANSWER AGENT---")

    if output_type == 'is_json':

        json_response = json.loads(llm_response)
        final_answer = json_response['Answer']
    
        print("Final answer:", final_answer)

    # The model has ouput plain text that's
    # not being formatted as per the system message.
    else:
        print(llm_response)


# Run the system

In [61]:

user_input = "What is the current year?"

message_history = create_message_history(chat_agent_system_message, user_input)

for i in range(0,10):

    # Prompt the chat_agent
    llm_response = run_chat_agent(message_history)
    
    # Update message history
    message = {"role": "assistant", "content": llm_response}
    message_history.append(message)

    # Run router_agent
    route = run_router_agent(llm_response)


    if route == "to_research_agent":

        answer = run_research_agent(llm_response)
        
        user_input = f"Observation: {answer}"
        message = {"role": "user", "content": user_input}
        message_history.append(message)

    else:

        run_final_answer_agent(llm_response)

        break



<think>
Okay, the user is asking for the current year. Let me think. The user's question is straightforward. The current date is August 2025, so the answer should be 2025. I don't need to use any actions here because it's a simple question that requires only the current year. The answer is clear and doesn't need any external data. So I'll just provide the answer as specified.
</think>
---ROUTER AGENT---
Status: DONE
Route: to_final_answer
---FINAL ANSWER AGENT---
Final answer: 2025


# Run a chat loop

In [62]:
# NOTES
# 1. The model thinking is being printed out.


# Initialize the message history
message_history = initialize_message_history(chat_agent_system_message)

while True:

    print()
    print("==========")
    user_input = input("Enter something (or 'q' to quit): ")
    print("==========")

    # Update message history
    message = {"role": "user", "content": user_input}
    message_history.append(message)
    
    i = i + 1

    if user_input.lower() == 'q':
        print("Exiting the loop. Goodbye!")
        break  # Exit the loop


    for i in range(0,10):
           
        llm_response = run_chat_agent(message_history)
        
        # Update message history
        message = {"role": "assistant", "content": llm_response}
        message_history.append(message)

        # Run router_agent
        route = run_router_agent(llm_response)
            

        if route == "to_research_agent":

            answer = run_research_agent(llm_response)
            
            user_input = f"Observation: {answer}"
            message = {"role": "user", "content": user_input}
            message_history.append(message)

        else:

            run_final_answer_agent(llm_response)

            break






Enter something (or 'q' to quit):  Hello


<think>
Okay, the user said "Hello". I need to respond politely. Let me check the guidelines. The first example shows a simple answer. So I should just reply with a friendly message. No need for any actions here. Just a standard greeting.
</think>
---ROUTER AGENT---
Status: DONE
Route: to_final_answer
---FINAL ANSWER AGENT---
Final answer: Hello! How can I assist you today?



Enter something (or 'q' to quit):  How are you?


<think>
Okay, the user asked, "How are you?" I need to respond politely. Since they're asking about my well-being, I should say I'm fine and offer help. The previous example showed a JSON response with "Answer" and "Status". I'll follow that structure. No need for any actions here because it's a straightforward question. Just a simple reply.
</think>
---ROUTER AGENT---
Status: DONE
Route: to_final_answer
---FINAL ANSWER AGENT---
Final answer: I'm fine, thanks. How can I assist you today?



Enter something (or 'q' to quit):  Has OpenAi released any new open source models recently?


<think>
Okay, the user is asking if OpenAI has released any new open-source models recently. Let me think about how to approach this.

First, I need to recall what I know about OpenAI's recent activities. OpenAI is known for releasing models like GPT-3, GPT-4, and others, but they've been cautious about open-source models, especially given their business model and security concerns. I remember that they have released some models under the GPT family, but they aren't typically open-source. For example, GPT-3.5 and GPT-4 are available through their API, but the code isn't open-source. 

Wait, the user specifically mentioned "open-source models." OpenAI has a policy where they don't release the code for their models, even if they are publicly available. So, maybe there haven't been any new open-source models recently. But I should verify this. 

I should check the latest news or announcements from OpenAI. Since my knowledge cutoff is October 2023, I need to rely on that. As of that date, 

Enter something (or 'q' to quit):  Please run a web search


<think>
Okay, the user is asking if OpenAI has released any new open-source models recently. My previous answer was based on information up to October 会被2023, but the current date is August 2025. I need to check for any updates since then.

First, I should run a web search to get the latest information. The user wants a web search, so I'll use the run_web_search action. The keywords should be "OpenAI new open source models recent" to find the most up-to-date news or announcements. 

Wait, maybe I should also include "2025" in the search terms to ensure the results are from the latest period. Let me adjust the search query to "OpenAI new open source models 2025" to get the most relevant information. That should help find any announcements made in August 2025. 

I need to make sure the search is comprehensive. Maybe adding "latest" or "announcements" could help. But the user just said "run a web search," so sticking to the main keywords. Let's proceed with "OpenAI new open source models 

Enter something (or 'q' to quit):  What are the latest Qwen models?


<think>
Okay, the user is asking about the latest Qwen models. Let me start by recalling what I know about Qwen. Qwen is a series of large language models developed by Alibaba Cloud. The latest versions, as of my knowledge cutoff in October 2023, would be Qwen3 and Qwen3.5. But since the current date is August 2025, there might have been updates beyond that.

Wait, the user's previous interaction was about OpenAI's new models in 2025. The assistant provided information about OpenAI releasing gpt-oss-120b and gpt-oss-20b. Now the user is switching to Qwen. I need to make sure I provide accurate information up to 2025.

I should check if there have been any recent updates to Qwen after October 2023. However, since my knowledge is up to October 2023, I can't confirm anything beyond that. The user might be expecting information on the latest versions, but if there's no data beyond that, I should state that and suggest checking official sources.

Also, the user might be confused between Ope

Enter something (or 'q' to quit):  Please run a web search


<think>
Okay, the user asked about the latest Qwen models. I need to check if there's recent information beyond my knowledge cutoff in October 2023. Since the user is asking for the latest, I should run a web search to get up-to-date data.

First, I'll use the run_web_search action with the query "latest Qwen models 2025". That should fetch the most recent information. Let me make sure the search terms are relevant. The results might include official announcements, blog posts, or tech news articles from 2025. 

After getting the search results, I'll need to parse the information to find details about new Qwen models. The user might be interested in model versions, features, release dates, or deployment options. I should check the sources for credibility, like official Alibaba Cloud pages or trusted tech websites. 

If the search results mention newer models like Qwen3.5 or Qwen4, or any updates in capabilities, I'll summarize that. Also, check if there are any changes in functionality,

Enter something (or 'q' to quit):  q


Exiting the loop. Goodbye!


In [63]:
# Model output formatting is a issue.
# The model does not always format the output in the
# specified JSON format.
# I suspect that as the size of the chat history increases,
# the models ability to follow system message instructions degrades.
# This is an issue for both the 4B and the 14B models.

#print(llm_response)