In [None]:
#imports
import faiss
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from typing import List
from langchain_core.tools import tool
from langchain_ollama import ChatOllama
from langchain.agents import create_tool_calling_agent
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.tools import tool
from langchain.agents import AgentExecutor

In [None]:
# OPTIONS: strategy used to store a finished chat in memory (read by save_chat_to_memory).
# Kept switchable so the three strategies can be compared against each other:
#   "whole_chat"          - the entire chat is embedded and stored as a single entry
#   "individual_messages" - every message is embedded and stored as its own entry
#   "pairs"               - consecutive messages are embedded and stored two at a time
# Any other value makes save_chat_to_memory print an error and store nothing.
save_chat_to_memory_method = "whole_chat" 
# save_chat_to_memory_method = "individual_messages"
# save_chat_to_memory_method = "pairs"

In [None]:
# Create the vector index (aka the database for vectors): a FAISS index that compares
# vectors by L2 distance. Every vector passed to index.add() must have this exact size.
# NOTE(review): 384 is presumably the output size of the embedding model loaded below - confirm
# if the model is ever swapped.
index = faiss.IndexFlatL2(384) #384 is the size of the vectors

In [None]:
# Load the embedding model and its tokenizer. Both come from the same checkpoint name, which
# has to stay identical on the two lines.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
# Texts behind the vectors in `index`. The functions that call index.add() also append here,
# and fetch_From_Memory uses the row numbers returned by the FAISS search to index this list,
# so the two have to grow in lock-step.
stored_memory = [] 

In [None]:
#create log file for debugging
# Close the handle left over from an earlier run of this cell, so re-running it does not leak
# an open file or let stale buffered text land in the freshly truncated log.
_stale_log_file = globals().get("log_file")
if _stale_log_file is not None and not _stale_log_file.closed:
    _stale_log_file.close()
# buffering=1 selects line buffering: every completed line is written out right away, so
# log.txt can be inspected while the session is still running (the handle is never closed
# explicitly anywhere in the notebook).
log_file = open("log.txt", "w", encoding="utf-8", buffering=1)

In [None]:
#tokenise plaintext (aka slice up text in small pieces and convert those small pieces to numerical data)
def get_tokenised_sections(texts):
    """Tokenise a list of texts into one padded batch for the embedding model.

    texts: list of strings to tokenise together.
    return: whatever the module-level `tokenizer` produces for the batch, requested as
        PyTorch tensors ('pt') with padding and truncation switched on.
    """
    # Padding lets texts of different length share one batch; truncation cuts off whatever
    # the tokenizer considers too long instead of raising.
    return tokenizer(texts, return_tensors='pt', truncation=True, padding=True)


#tokenise each text in the array of texts in batches of 16
# def get_tokenised_sections(texts):
#     tokenised_sections = []
#     for i in range(0, len(texts), 16):
#         batch = texts[i:i+16]
#         tokenised_batch = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
#         tokenised_sections.append(tokenised_batch)
#     return tokenised_sections
    

In [None]:
#embed the texts using the tokens (aka convert the numerical data to vectors that represent the semantics of the text)
def get_embeddings(tokenised_sections):
    """Turn a tokenised batch into one L2-normalised sentence vector per input text.

    The token vectors of each text are averaged (mean pooling), ignoring padding, and the
    result is scaled to unit length. This is the pooling that the all-MiniLM-L6-v2 model card
    prescribes; taking only the first ([CLS]) token, as was done before, is not what this
    checkpoint was trained for. With unit-length vectors the L2 distance used by the index
    ranks results the same way cosine similarity would.

    tokenised_sections: output of get_tokenised_sections; must contain "attention_mask".
    return: numpy array with one row per input text.
    """
    with torch.no_grad():
        model_output = model(**tokenised_sections)
        token_embeddings = model_output.last_hidden_state

        # 1 for real tokens, 0 for padding; extra axis so it broadcasts over the vector size.
        mask = tokenised_sections["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        # clamp guards against a division by zero for an input without any real token
        token_counts = mask.sum(dim=1).clamp(min=1e-9)

        sentence_embeddings = F.normalize(summed / token_counts, p=2, dim=1)
    return sentence_embeddings.cpu().numpy()


# def get_embeddings(tokenised_sections):
#     embeddings = []
#     for section in tokenised_sections:
#         with torch.no_grad():
#             model_output = model(**section)
#             embeddings.append(model_output.last_hidden_state[:,0,:].detach().cpu().numpy())
#     return torch.cat(embeddings).numpy()


# def get_embeddings(texts):
#     tokenised_sections = get_tokenised_sections(texts)
#     embeddings = []
#     for tokenised_batch in tokenised_sections:
#         with torch.no_grad():
#             model_output = model(**tokenised_batch)
#             embeddings.append(model_output.last_hidden_state[:,0,:].detach().cpu().numpy())
#     return torch.cat(embeddings).numpy()

In [None]:
#test different things: test memory of whole chat dialog vs memory of each individual message vs memory of pairs of question and answer
#  
#putting whole chat as a memory entry
def add_to_index_wholechat(chat):
    """Store a complete chat as one single memory entry.

    chat: list of message strings. Messages at even positions are tagged as agent messages,
        messages at odd positions as user messages.

    All tagged messages are glued together into one string, which is embedded, added to the
    index and appended to stored_memory. The string goes through get_tokenised_sections, which
    tokenises with truncation enabled.
    """
    # (opening tag, closing tag) selected by position % 2: 0 -> agent, 1 -> user
    speaker_tags = (
        ("<START OF AGENT MESSAGE>", "<END OF AGENT MESSAGE>"),
        ("<START OF USER MESSAGE>", "<END OF USER MESSAGE>"),
    )

    tagged_messages = []
    for position, text in enumerate(chat):
        opening, closing = speaker_tags[position % 2]
        tagged_messages.append(opening + text + closing)
    transcript = "".join(tagged_messages)

    vectors = get_embeddings(get_tokenised_sections([transcript]))
    index.add(vectors)
    stored_memory.append(transcript)

#putting each individual message as a memory entry
def add_to_index_individual_messages(chat):
    """Store every message of a chat as its own memory entry.

    chat: list of message strings. Messages at even positions are tagged as agent messages,
        messages at odd positions as user messages.

    Each tagged message is embedded, added to the index and appended to stored_memory.
    """
    for i, message in enumerate(chat):
        if i % 2 == 0:
            tagged_message = "<START OF AGENT MESSAGE>" + message + "<END OF AGENT MESSAGE>"
        else:
            tagged_message = "<START OF USER MESSAGE>" + message + "<END OF USER MESSAGE>"

        embeddings = get_embeddings(get_tokenised_sections([tagged_message]))
        index.add(embeddings)
        # Store the text itself, not the tokenised tensors: fetch_From_Memory concatenates
        # the entries of stored_memory into the string it hands back to the llm.
        stored_memory.append(tagged_message)

#putting each pair of user message and agent message as a memory entry
def add_to_index_pairs(chat):
    """Store a chat two messages at a time, one memory entry per pair.

    chat: list of message strings. Messages are paired as (chat[0], chat[1]),
        (chat[2], chat[3]), ...; a trailing message without a partner is not stored.

    Each pair is joined into ONE string and embedded as ONE vector. Embedding the two
    messages separately would add two rows to the index for a single stored_memory entry,
    after which the row numbers returned by a search no longer point at the right text.

    NOTE(review): the first message of each pair (even position) is tagged as USER here,
    while add_to_index_wholechat and add_to_index_individual_messages tag even positions as
    AGENT. The labels are left as they were - confirm who speaks first in `chat`.
    """
    for i in range(1, len(chat), 2):
        message_pair = (
            "<START OF USER MESSAGE>" + chat[i-1] + "<END OF USER MESSAGE>"
            + "<START OF AGENT MESSAGE>" + chat[i] + "<END OF AGENT MESSAGE>"
        )
        embeddings = get_embeddings(get_tokenised_sections([message_pair]))
        index.add(embeddings)
        # plain string, so fetch_From_Memory can concatenate it into its answer
        stored_memory.append(message_pair)


In [None]:
#add individual messages to index (can be used for metadata or important things the user wants the bot to remember for the current session.
#  this includes instructions for the session, user preferences, etc.)
def add_to_index(data):
    """Store one piece of text as a single memory entry.

    data: the string to remember. It is embedded as-is, added to the index, and the same
        string is appended to stored_memory so it can be returned by a later search.
    """
    vector = get_embeddings(get_tokenised_sections([data]))
    index.add(vector)
    stored_memory.append(data)

In [None]:
#similarity search
def search(query, k):
    """Look up the k stored vectors closest to a query text.

    query: text to search for; it is embedded the same way as the stored entries.
    k: number of neighbours to ask the index for.
    return: the two values produced by index.search, in that order (distances first, then
        the row numbers of the matches).
    """
    query_vector = get_embeddings(get_tokenised_sections([query]))
    distances, positions = index.search(query_vector, k)
    return distances, positions

In [None]:
#create tool that can be called by the llm to fetch data from memory
@tool
def fetch_From_Memory(query, k=3):
    """
    Fetch data from memory that can be used to generate a response to the user
    query: any string you think will have the highest similarity to the data you want to fetch. THIS SHOULD BE AS INFORMATIVE AS POSSIBLE TO GET THE BEST RESULTS
    k: the number of entries you want to fetch. BEST IS TO KEEP BELOW 5. 
    return: the data that has the highest similarity to the query
    """
    # NOTE: the docstring above doubles as the tool description shown to the llm, so it is
    # left exactly as it was; implementation notes live in comments instead.

    # k comes from the llm and may be a string or something unusable, so coerce it and only
    # swallow the errors that int() raises for bad input.
    try:
        int_k = int(k)
    except (TypeError, ValueError, OverflowError):
        return "Please enter a valid number for k"
    if int_k < 1:
        # a search for zero or a negative number of entries makes no sense
        return "Please enter a valid number for k"

    _distances, positions = search(query, int_k)

    # Collect the text of every hit. The index reports -1 when it holds fewer than k vectors;
    # the upper bound protects against rows that have no matching stored_memory entry.
    matches = [
        stored_memory[position]
        for position in positions[0]
        if 0 <= position < len(stored_memory)
    ]
    if not matches:
        return "No information found"

    information = "".join(entry + "\n" for entry in matches)

    #write to log file (flushed straight away so the log is readable during the session)
    log_file.write(f"Query: {query}\n")
    log_file.write("Information: " + information + "\n")
    log_file.write("\n")
    log_file.flush()

    return information
        
    
    
    

In [None]:
#create tool that can be called by the llm to save data to memory
@tool
def save_data_to_memory(data):
    """
    Save data to memory that can be fetched later to generate a response to the user. 
    call this function when you want to save important data like user preferences, user instructions, user personal information.
    data: any string you want to save to memory. please format it in a way that it can be easily fetched later.
    """
    # NOTE: the docstring above doubles as the tool description shown to the llm, so it is
    # left exactly as it was.

    # Store first and log afterwards, so the log never claims a save that actually failed.
    add_to_index(data)

    #write to log file (flushed straight away so the log is readable during the session)
    log_file.write(f"Data saved: {data}\n")
    log_file.write("\n")
    log_file.flush()

    # Give the llm an explicit confirmation as tool result instead of None.
    return "Data saved to memory"

In [None]:
#used after the session ends to save the chat to memory for future sessions. This is not done by the llm but automatically after each session
def save_chat_to_memory(chat):
    """Store a finished chat using the strategy named by save_chat_to_memory_method.

    chat: list of message strings, handed unchanged to the selected add_to_index_* function.

    An unknown strategy name is reported with print() and nothing is stored.
    """
    # Lambdas keep the lookup lazy: only the selected strategy function is ever touched.
    strategies = {
        "whole_chat": lambda: add_to_index_wholechat(chat),
        "individual_messages": lambda: add_to_index_individual_messages(chat),
        "pairs": lambda: add_to_index_pairs(chat),
    }

    store_chat = strategies.get(save_chat_to_memory_method)
    if store_chat is None:
        print("Invalid save_chat_to_memory_method: " + save_chat_to_memory_method)
        return
    store_chat()

        

In [None]:
# #without tools
# llm = ChatOllama(
#     model="mistral",
#     temperature=0,
# )

# #with tools
# llm = ChatOllama(
#     model="mistral",
#     temperature=0,
# ).bind_tools([fetch_From_Memory, save_data_to_memory])

In [None]:
# Collects the messages of the current session so they can be put into the memory
# (see save_chat_to_memory) after the session ends.
# NOTE(review): in the visible code only the commented-out send_message appends to this
# list - confirm where it is meant to be filled.
chat_messages = []  

In [None]:
############################################
### TESTING WITH SOME RANDOM MEMORY DATA ###
############################################

# Seed the memory with a (profile, instructions) pair for three made-up users.
seed_profiles = [
    (
        "name: Jordy, age: 21, very enthousiastic, prefers reading practice questions, favourite colour: green",
        "jordy wants to practice translating sentences about school from chinese to english, he wants you to give him feedback on his translations and also provice a new sentence after each feedback",
    ),
    (
        "name: John, age: 25, very calm, prefers reading books",
        "john wants to practice translating sentences about animals from chinese to english, he wants to have conversations with you in chinese, his level is quite advanced, he has studied for 3 years already, last conversation he has held a conversation with you in chinese about studying abroad",
    ),
    (
        "name: 玛丽, age: 30, very energetic, prefers reading stories",
        "玛丽 is a chinese teacher, she is preparing for a class about tones, she wants you to make example sentences about the weather specifying the tones of each character",
    ),
]

# save_data_to_memory is a LangChain tool object: run it through .invoke() with its
# arguments as a dict. Calling the tool object directly like a function is deprecated.
for user_data, user_instructions in seed_profiles:
    save_data_to_memory.invoke({"data": user_data})
    save_data_to_memory.invoke({"data": user_instructions})

############################################
### TESTING WITH SOME RANDOM MEMORY DATA ###
############################################

In [None]:
# #send a message to the llm
# def send_message(message):
#     messages = [
#     ("system", """YOU ARE A CONVERSATIONAL AGENT WHOSE TASK IT IS TO HELP PEOPLE WITH STUDYING LANGUAGES. 
#      YOUR NAME IS AILLP.
#       YOU HAVE ACCESS TO A MEMORY THAT YOU CAN USE TO FETCH FROM PAST SESSIONS AND WRITE USER INFO TO REMEMBER.
#       THE CONTENTS OF THE MEMORY IS ALL FROM THE USER AND THEIR PREFERENCES OR THEIR INSTRUCTIONS OR THEIR INFO OR THEIR PAST SESSIONS WITH YOU.
#       EACH TIME YOU START TALKING TO A NEW USER, YOU SHOULD FETCH FROM THE MEMORY TO SEE IF YOU HAVE ANY INFO ABOUT THE USER SUCH AS THEIR NAME, AGE, PREFERENCES, INSTRUCTIONS, OR PAST SESSIONS.
#       DON'T MENTION ANYTHING ABOUT THE TOOLS OR YOUR MEMORY TO THE USER.
#       IF YOU DON'T USE YOUR MEMORY THEN JUST ANSWER THE USERS QUESTION WITHOUT SAYING ANYTHING ABOUT THE MEMORY."""),
#     ("user", message),
#     ]
#     chat_messages.append(messages)
#     response = llm.invoke(messages)
#     chat_messages.append(response)
#     return response

# #stream output from llm (TODO LATER)
# def stream_messages(messages):
#     messages = [
#     ("system", "if needed you can try to fetch from memory using the tools provided. It is not necessary to use the tools. DON'T MENTION ANYTHING ABOUT THE TOOLS TO THE USER."),
#     ("user", "Return the words Hello World!"),
#     ]
#     for chunk in llm.stream(messages):
#         print(chunk.text(), end="")
    

# #testing phase 
# response = send_message("hey, im John, i want to practice chinese, can we continue from where we left of last time?")
# print(response.content)

In [None]:
#test: query the memory tool by hand, without the llm in between.
# The tool is run through .invoke() with its arguments as a dict; calling a LangChain tool
# object directly like a function is deprecated. k is omitted, so its default is used.
print(fetch_From_Memory.invoke({"query": "John"}))



In [None]:
#use langchain agents for an integration of tool calling into the llm

# chat model that drives the agent
agents_llm = ChatOllama(temperature=1.0, model="llama3.1")

# the two memory tools, listed once and reused below
tools = [fetch_From_Memory, save_data_to_memory]

# the same chat model with the tools bound to it
llm_with_tools = agents_llm.bind_tools(tools)

# Prompt for the agent, built from three parts in this order:
#   1. a system message with the agent's instructions,
#   2. the user's message, filled in through the {input} variable,
#   3. a placeholder named "agent_scratchpad".
# The commented-out entries at the top of the list are earlier drafts of the system message.
prompt = ChatPromptTemplate.from_messages([
    # ("system", """YOU ARE A CONVERSATIONAL AGENT WHOSE TASK IT IS TO HELP PEOPLE WITH STUDYING LANGUAGES. 
     
    #  YOUR NAME IS AILLP.
     
    #   YOU HAVE ACCESS TO A MEMORY THAT YOU CAN USE TO FETCH FROM PAST SESSIONS AND WRITE USER INFO TO REMEMBER.
    #   THE CONTENTS OF THE MEMORY IS ALL FROM THE USER AND THEIR PREFERENCES OR THEIR INSTRUCTIONS OR THEIR INFO OR THEIR PAST SESSIONS WITH YOU.
    #   EACH TIME YOU START TALKING TO A NEW USER, YOU ABSOLUTELY MUST FETCH FROM THE MEMORY TO SEE IF YOU HAVE ANY INFO ABOUT THE USER SUCH AS THEIR NAME, AGE, PREFERENCES, INSTRUCTIONS, OR PAST SESSIONS.
    #   FOR EXAMPLE YOU CAN FETCH \"NAME EVA USER PREFERENCES\" TO GET THE USER PREFERENCES OF EVA.
    #   GIVE A BRIEF RESPONSE TO THE USER.
    #   IF YOU DON'T USE YOUR MEMORY THEN JUST ANSWER THE USERS QUESTION WITHOUT SAYING ANYTHING ABOUT THE MEMORY."""),
    #("system", "if needed you can try to fetch from memory using the tools provided. It is not necessary to use the tools."),
    #("system", "you have to first read the users message!. Then if you know how to answer and in what way you have to answer, you can answer the user. If you don't know how to answer or in what way you have to use the tools to fetch from your memory by providing a query to the fetch_From_Memory tool. Then wait for the response of the tool and then you can answer the user. do not mention anything about using your tools to the user."),
    ("system", """YOU ARE A CONVERSATIONAL AGENT WHO HELPS USERS WITH LANGUAGE LEARNING. 
     YOUR NAME IS AILLP.
     YOU HAVE ACCESS TO A MEMORY TOOL TO FETCH PAST USER SESSIONS AND PREFERENCES.
     
     AT THE START OF EVERY CONVERSATION, YOU MUST CALL THE `fetch_From_Memory` TOOL USING THE USER'S NAME.
     YOU MUST CALL THIS TOOL BEFORE RESPONDING.
     
     NEVER GUESS PAST SESSIONS. ALWAYS FETCH MEMORY FIRST.
     IF MEMORY IS FOUND, SUMMARIZE IT BEFORE ASKING THE USER HOW THEY WANT TO CONTINUE.
     IF NO MEMORY IS FOUND, CONTINUE AS USUAL WITHOUT MENTIONING MEMORY.

     NEVER MENTION THAT YOU ARE USING A TOOL TO THE USER."""),
    ("user", "{input}"),  # filled from the "input" key given to the agent executor
    # NOTE(review): presumably where the agent's intermediate tool calls and their results
    # are inserted - confirm against the create_tool_calling_agent documentation.
    MessagesPlaceholder("agent_scratchpad")
])

# create_tool_calling_agent binds `tools` to the model itself, so it is given the plain chat
# model; handing it the already tool-bound llm_with_tools would bind the same tools twice.
agent = create_tool_calling_agent(agents_llm, tools, prompt)

In [None]:
# Wrap the agent in an executor that has access to the tools (verbose: print every step)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

# Message the user opens the session with
user_input = "hey, im John"

# Run the agent on that message
result = agent_executor.invoke({"input": user_input})

# Show only the agent's final answer
print("\nFinal Response:", result["output"])