In [None]:
import os
from dotenv import load_dotenv

In [None]:
load_dotenv(os.path.join('..', '.env'))

# 1. OpenAI API and its Cost

In [None]:
import openai

Let's look at how the OpenAI API works:

In [None]:
# A chat request is a list of role-tagged messages; here it holds a single
# user turn.
question = "What is the closest star to Earth?"
messages = [{"role": "user", "content": question}]

# Send the one-message conversation to the chat model and keep the raw
# response object for inspection in the following cells.
response = openai.ChatCompletion.create(
    temperature=0,
    messages=messages,
    model="gpt-3.5-turbo",
)

In [None]:
response.keys()

In [None]:
response.choices[0].keys()

In [None]:
print(response.choices[0].message)

This is the answer "message". Note that the AI is called "assistant".

Another important piece of information is the number of tokens used, as that determines the cost of the API call:

In [None]:
print(response.usage)

Looking up the prices on the OpenAI website, we find:
- 0.0015 USD per 1000 prompt tokens
- 0.0020 USD per 1000 completion tokens

So, the total cost is:

In [None]:
# Prices in USD per 1000 tokens (prompt first, completion second).
prompt_cost, completion_cost = 0.0015, 0.0020

# Weight each token count by its price, then rescale from "per 1000 tokens"
# to "per token".
cost = (
    response.usage.prompt_tokens * prompt_cost
    + response.usage.completion_tokens * completion_cost
) / 1000

In [None]:
print(f"{cost:.6f} USD")

Let's wrap this together in a class:

In [None]:
class BaseAgent:
    """Stateless chat agent that tracks the cost of every API call.

    Each call to `reply` sends only the current question, so the model sees
    nothing of earlier exchanges.

    Args:
        model: Name of the chat model passed to the OpenAI API.
        prices: Two-item sequence `(prompt_price, completion_price)`,
            expressed in USD per 1000 tokens.
    """

    # The default is a tuple rather than a list so that no mutable object is
    # shared between instances through the default argument.
    def __init__(self, model="gpt-3.5-turbo", prices=(0.0015, 0.0020)):
        self.model = model
        self.prompt_cost = prices[0]
        self.completion_cost = prices[1]
        self.cost_list = []  # cost of each call, in call order
        self.total_cost = 0  # running sum of `cost_list`

    def reply(self, question):
        """Send `question` to the model, record the call's cost and print the answer."""
        messages = [
            {
                "role": "user",
                "content": question
            }
        ]

        response = openai.ChatCompletion.create(
            model=self.model,
            messages=messages,
            temperature=0
        )

        self.calculate_cost(response.usage)

        message = response.choices[0].message

        print(message.content)

    def calculate_cost(self, usage):
        """Add the cost of one call to `cost_list` and `total_cost`.

        Args:
            usage: Object exposing `prompt_tokens` and `completion_tokens`.
        """
        cost = (usage.prompt_tokens * self.prompt_cost + usage.completion_tokens * self.completion_cost) / 1000
        self.cost_list.append(cost)
        self.total_cost += cost

Test it:

In [None]:
agent = BaseAgent()

In [None]:
agent.reply("What is the closest star to Earth?")

In [None]:
print(f"{agent.total_cost:.6f} USD")

Let's ask it another question:

In [None]:
agent.reply("What does 'USD' stand for in the forex context?")

In [None]:
print(f"{agent.total_cost:.6f} USD")

We can see that `total_cost` is accumulating the costs of all questions asked.

In [None]:
sum(agent.cost_list)

# 2. It's Hard to have a Conversation with Someone that doesn't Listen

In [None]:
agent = BaseAgent()

In [None]:
agent.reply("Hi, my name is Andrea.")

In [None]:
agent.reply("What is my name?")

Problem: each call to the LLM is independent of the previous ones, so the AI ignores the history of the conversation. To fix this, we need to give it a "memory":

In [None]:
class Memory:
    """Ordered store of the chat messages exchanged so far."""

    def __init__(self):
        self.messages = list()

    def add(self, message):
        """Append a plain-dict copy of `message` to the history."""
        entry = dict(message)
        self.messages += [entry]

    def __str__(self):
        """Render the history with one message per line."""
        return "\n".join(map(str, self.messages))

Let's modify the `BaseAgent` class to make use of the memory:

In [None]:
class BaseAgent:
    """Chat agent that sends the whole stored conversation on every call.

    Args:
        memory: Conversation store exposing `add(message)` and a `messages`
            list; it is both read (as the request payload) and extended.
        model: Name of the chat model passed to the OpenAI API.
        prices: Two-item sequence `(prompt_price, completion_price)`,
            expressed in USD per 1000 tokens.
    """

    # The default is a tuple rather than a list so that no mutable object is
    # shared between instances through the default argument.
    def __init__(self, memory, model="gpt-3.5-turbo", prices=(0.0015, 0.0020)):
        self.memory = memory
        self.model = model
        self.prompt_cost = prices[0]
        self.completion_cost = prices[1]
        self.cost_list = []  # cost of each call, in call order
        self.total_cost = 0  # running sum of `cost_list`

    def reply(self, question):
        """Store `question`, query the model with the full history and print the answer.

        Both the user message and the model's message are appended to
        `memory`, so later calls include them.
        """
        human_message = {
            "role": "user",
            "content": question
        }
        self.memory.add(human_message)

        response = openai.ChatCompletion.create(
            model=self.model,
            messages=self.memory.messages,
            temperature=0
        )

        self.calculate_cost(response.usage)

        agent_message = response.choices[0].message
        self.memory.add(agent_message)

        print(agent_message.content)

    def calculate_cost(self, usage):
        """Add the cost of one call to `cost_list` and `total_cost`.

        Args:
            usage: Object exposing `prompt_tokens` and `completion_tokens`.
        """
        cost = (usage.prompt_tokens * self.prompt_cost + usage.completion_tokens * self.completion_cost) / 1000
        self.cost_list.append(cost)
        self.total_cost += cost

Test it:

In [None]:
memory = Memory()
agent = BaseAgent(memory=memory)

In [None]:
agent.reply("Hi, my name is Andrea.")

In [None]:
print(agent.memory)

In [None]:
agent.reply("What is my name?")

In [None]:
print(agent.memory)

With each exchange in the conversation, the whole history has to be sent as input to the LLM. Problems:

1. LLMs have a finite input size
2. providers (such as OpenAI) charge based on the number of tokens

To solve these issues, as the memory grows, we must start deleting the oldest history.

To do so, we define a `delete_history` method in the `Memory` class: it counts the tokens in the conversation and, as long as the total exceeds `max_tokens`, deletes the oldest message.

In [None]:
import tiktoken

In [None]:
class Memory:
    """Conversation store that can trim itself to a token budget.

    Args:
        max_tokens: Largest total token count (summed over the `content` of
            every stored message) that `delete_history` leaves in place.
    """

    def __init__(self, max_tokens=3000):
        self.messages = []
        self.max_tokens = max_tokens
        # Tokenizer used only to measure message length.
        self.encoding = tiktoken.get_encoding("cl100k_base")

    def add(self, message):
        """Append a plain-dict copy of `message` to the history."""
        self.messages.append(dict(message))

    def delete_history(self):
        """Drop the oldest messages until the history fits in `max_tokens`.

        Only the `content` text of each message is counted. Every message is
        tokenized once and the running total is updated as messages are
        dropped, instead of re-tokenizing the whole history on each pass.
        The loop is bounded by the number of messages, so it always
        terminates, even when the budget cannot be met.
        """
        token_counts = [
            len(self.encoding.encode(message["content"]))
            for message in self.messages
        ]
        total_tokens = sum(token_counts)

        dropped = 0
        while total_tokens > self.max_tokens and dropped < len(token_counts):
            total_tokens -= token_counts[dropped]
            dropped += 1

        if dropped:
            self.messages = self.messages[dropped:]

    def __str__(self):
        """Render the history with one message per line."""
        return "\n".join([str(message) for message in self.messages])

We then modify `BaseAgent` so as to delete the history every time the API is called. We do so by moving the logic to a new method `generate_response`, so that `reply` is cleaner:

In [None]:
class BaseAgent:
    """Chat agent with a token-bounded conversation memory.

    Args:
        memory: Conversation store exposing `add(message)`,
            `delete_history()` and a `messages` list.
        model: Name of the chat model passed to the OpenAI API.
        prices: Two-item sequence `(prompt_price, completion_price)`,
            expressed in USD per 1000 tokens.
    """

    # The default is a tuple rather than a list so that no mutable object is
    # shared between instances through the default argument.
    def __init__(self, memory, model="gpt-3.5-turbo", prices=(0.0015, 0.0020)):
        self.memory = memory
        self.model = model
        self.prompt_cost = prices[0]
        self.completion_cost = prices[1]
        self.cost_list = []  # cost of each call, in call order
        self.total_cost = 0  # running sum of `cost_list`

    def reply(self, question):
        """Store `question`, obtain the model's answer, store it and print it."""
        human_message = {
            "role": "user",
            "content": question
        }
        self.memory.add(human_message)

        agent_message = self.generate_response()
        self.memory.add(agent_message)

        print(agent_message.content)

    def generate_response(self):
        """Trim the memory, call the model on what remains and return its message.

        The cost of the call is recorded as a side effect.
        """
        # Trim before every request so the payload respects the memory's budget.
        self.memory.delete_history()

        response = openai.ChatCompletion.create(
            model=self.model,
            messages=self.memory.messages,
            temperature=0
        )

        self.calculate_cost(response.usage)

        agent_message = response.choices[0].message

        return agent_message

    def calculate_cost(self, usage):
        """Add the cost of one call to `cost_list` and `total_cost`.

        Args:
            usage: Object exposing `prompt_tokens` and `completion_tokens`.
        """
        cost = (usage.prompt_tokens * self.prompt_cost + usage.completion_tokens * self.completion_cost) / 1000
        self.cost_list.append(cost)
        self.total_cost += cost

Test it:

In [None]:
memory = Memory(max_tokens=30)
agent = BaseAgent(memory=memory)

In [None]:
agent.reply("My name is Andrea.")

In [None]:
print(agent.memory)

In [None]:
agent.reply("What is 1+1?")

In [None]:
print(agent.memory)

In [None]:
agent.reply("What is the most common word in English?")

In [None]:
print(agent.memory)

In [None]:
agent.reply("What is my name?")

By playing with `max_tokens`, we can control the trade-off between larger context (and hence more reliable answer) and lower costs.

# 3. Initial Prompt and Prompt Engineering

Until now we have not specified any "initial instruction" to the AI, so the only input it takes is the user's question. But what if we want the agent to act in a specific way, e.g., to talk only in one language, or to only focus on a specific subject?

This can be done by passing to it an initial prompt, which in OpenAI is nothing but a message with the "system" role. Let's therefore add this to the `Memory` class, and modify the `delete_history` so as never to delete this first message.

In [None]:
class Memory:
    """Conversation store with a pinned system prompt and a token budget.

    Args:
        initial_prompt: Text of the leading "system" message; `None` is
            stored as an empty string.
        max_tokens: Largest total token count (summed over the `content` of
            every stored message) that `delete_history` tries to keep to.
    """

    def __init__(self, initial_prompt=None, max_tokens=3000):
        if initial_prompt is None:
            initial_prompt = ""
        # Index 0 always holds the system prompt; `delete_history` never removes it.
        self.messages = [
            {
                "role": "system",
                "content": initial_prompt
            }
        ]
        self.max_tokens = max_tokens
        # Tokenizer used only to measure message length.
        self.encoding = tiktoken.get_encoding("cl100k_base")

    def add(self, message):
        """Append a plain-dict copy of `message` to the history."""
        self.messages.append(dict(message))

    def delete_history(self):
        """Drop the oldest non-system messages until the history fits in `max_tokens`.

        The message at index 0 (the system prompt) is always kept. If the
        system prompt alone exceeds the budget, trimming stops once it is the
        only message left; without that bound the loop would never finish,
        because there would be nothing further to delete.

        Only the `content` text of each message is counted, and each message
        is tokenized once.
        """
        token_counts = [
            len(self.encoding.encode(message["content"]))
            for message in self.messages
        ]
        total_tokens = sum(token_counts)

        dropped = 0
        # Deletion candidates start at index 1; index 0 is the system prompt.
        while total_tokens > self.max_tokens and 1 + dropped < len(token_counts):
            total_tokens -= token_counts[1 + dropped]
            dropped += 1

        if dropped:
            del self.messages[1:1 + dropped]

    def __str__(self):
        """Render the history with one message per line."""
        return "\n".join([str(message) for message in self.messages])

Test it:

In [None]:
initial_prompt = """
You are a finance expert. Answer the user's financial question providing technical details if needed.
If a question is not about finance, politely decline to answer, as that is beyond your scope and expertise.
"""

memory = Memory(initial_prompt=initial_prompt)
agent = BaseAgent(memory=memory)

In [None]:
agent.reply("What are some of the main drivers of FX volatility?")

In [None]:
agent.reply("How many stars are there in the Milky Way?")

This is just modifying the style of the AI, but you can use the prompt to fundamentally alter the nature of the agent. For example, you may use it as an "intention classifier":

In [None]:
initial_prompt = """
The user will provide a restaurant review.
If it's positive, output 1; if it's negative, output -1; in all other cases, output 0.
"""

memory = Memory(initial_prompt=initial_prompt)
agent = BaseAgent(memory=memory)

In [None]:
user_review = """
Our visit was perfect! The place is outstanding and comfy. People are gentle and well educated.
Food is amazing!! They serve a 5-course menu and everything is delicious! Their wine menu is one of the best!!!
We had a very good time and took home sweet memories! For sure on our next visit to the city we’ll go back there!
"""

In [None]:
agent.reply(user_review)

In some cases, a little prompt engineering can turn the model's output from bad to good:

In [None]:
initial_prompt = """
The user will provide a restaurant review. From it, determine if the restaurant should be recommended to a vegan person.
Provide a reasoning for your answer. Output your reasoning. Then in the new line output 1 if yes, -1 if no, and 0 in all other cases.
"""

memory = Memory(initial_prompt=initial_prompt)
agent = BaseAgent(memory=memory)

agent.reply(user_review)

The final use case we look at is having a conversation based on a specific context. For example, you could include a user-provided text in the prompt, which the user can then query via the agent:

In [None]:
webpage_content = """
Ebury Logo
About us
Corporates
Institutions
Partner with us
Careers
Login
Powering growth beyond borders.
From payments, collections, risk management, financing and more – we help businesses maximise their global growth potential.

OUR STORY

We make international trade more accessible, simple and personal.
We believe in a world where any business, big or small, should be able to transact globally with the same ease
and reliability they experience locally. We bring together our in-house platform,
in-depth expertise and custom solutions to help businesses go borderless and achieve their ambitions faster.


£ 27 B transacted in FY2022

1 M+ payments processed in the last 12 months

50,000 + clients served worldwide

1,600 + employees in 21 countries

Meet our team Leadership:

Peter Holmes, SVP of Client Onboarding
Richard Hughes, SVP of Credit Risk
Toby Young, Group Technology Director
Venancio Gallego, Strategic Advisor
Zafeer Ahmed, Global Head of Dealing
Ana Muñoz Fenollosa, Group Financial Director
Duane Swailes, SVP of Sales Acceleration & Marketing
Enrique Colin, SVP of Product and Data
Enrique Diaz-Alvarez, Chief Risk Officer
Fernando Pierri, Chief Commercial Officer
Juan Lobato, Founder & CEO

JOIN OUR TEAM
Join us as we build the international trade platform of the future and transform how businesses transact globally.

Company
Our story
Press room
Our global presence
Careers
Resource Hub
Blog
Podcast
Ebury Labs
Help Centre
Corporate solutions
E-commerce
NGO's and charities
Mass Payments
Corporate products
Payments and collections
Digital platforms
Business lending
FX risk management
Institutions
Ebury Institutional Solutions
Partner with us
White Label Solution
Branded affiliates
Affiliates

Get in touch with us
We’re happy to help! Contact us to learn more.

Subscribe to our blog   
Expert insights to grow your business globally.

FX Talk an Ebury podcast   
Get a breakdown of the global markets from our experts.

Join our team   
Explore open roles across 32+ offices worldwide.

Legal Privacy Notice Cookie Notice Manage cookies
United Kingdom - English

Ebury Partners UK Ltd is authorised and regulated by the Financial Conduct Authority as an Electronic Money Institution. Reference number: 900797. Ebury Partners UK Ltd is registered with the Information Commissioners Office, with registration number: ZA345828. Ebury Partners Markets Ltd is authorised and regulated by the Financial Conduct Authority as an Investment Firm to provide advice and execute trades in MiFID derivative products. Reference number: 784063. EBURY and EBURY What Borders? are trademarks.

Ebury Partners UK Ltd © 2023
"""

In [None]:
initial_prompt = f"""
Answer the user's questions based solely on the following context, which comes from Ebury's website.

CONTEXT:
'''
{webpage_content}
'''
"""

memory = Memory(initial_prompt=initial_prompt)
agent = BaseAgent(memory=memory)

In [None]:
agent.reply("What is the website about?")

In [None]:
agent.reply("Who is Ebury's responsible for Client Onboarding?")

# 4. Embedding Vectors and Vector Databases

Consider a very domain-specific question, such as "Can I export luxury goods to Russia in 2023?". Likely, the LLM does not know that:

In [None]:
memory = Memory()
agent = BaseAgent(memory=memory)

agent.reply("Can I export luxury goods to Russia in 2023?")

How can we use the ideas above to improve the answers that the AI gives?

Suppose we have access to Ebury's internal documentation: the answer is in there, but we cannot paste the whole corpus into the context! We then need to identify the document which contains the answer, and in it, the paragraph that contains the answer. We are then going to provide only that as context.

The way to "find the relevant paragraph in the corpus" is to use **embedding vectors**.

Let's start with the simplest type of embedding vector: a "word embedding".

In [None]:
import gensim.downloader as api

In [None]:
model = api.load("glove-wiki-gigaword-100")

In [None]:
vec = model.get_vector("beer")

len(vec)

In [None]:
vec[:5]

In [None]:
model.most_similar(vec)

Let's try to compose vectors, and see if we find something interesting in their meaning.

In [None]:
vec1 = model.get_vector("beer")
vec2 = model.get_vector("germany")
vec3 = model.get_vector("italy")

new_vec = vec1 - vec2 + vec3

model.most_similar(new_vec)

Word embeddings generalize to document embedding: a sentence (and even a full text) can be transformed into a vector, which captures its semantic content.

In [None]:
from langchain.embeddings import OpenAIEmbeddings

In [None]:
embeddings = OpenAIEmbeddings()

vec = embeddings.embed_query("The dog plays with the ball.")

len(vec)

In [None]:
vec2 = embeddings.embed_query("The hound plays with the ball.")
vec3 = embeddings.embed_query("The cat plays with the ball.")
vec4 = embeddings.embed_query("Spain is hereby established as a social and democratic State.")

In [None]:
import numpy as np

In [None]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors `a` and `b`.

    Both arguments are converted to NumPy arrays; the result is their dot
    product divided by the product of their Euclidean norms.
    """
    u, v = np.array(a), np.array(b)
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return np.dot(u, v) / norm_product

In [None]:
cosine_similarity(vec, vec2)

In [None]:
cosine_similarity(vec, vec3)

In [None]:
cosine_similarity(vec, vec4)

Now, the idea is simple: given all the documents in the corpus, we split them into text chunks (the "value"), and compute the embedding vector of each chunk (the "key"). We save each key-value pair in a database which, given its structure, is called a **vector database**.

Then, when the user asks a question, we turn the question into an embedding vector and look for the most similar vectors among the database keys; we then retrieve the corresponding values (the text chunks).

Let's start constructing the vector database:

In [None]:
from langchain.text_splitter import TokenTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader

In [None]:
text_splitter = TokenTextSplitter(chunk_size=300, chunk_overlap=0)

In [None]:
# PDF documents that make up the corpus.
files = [
    os.path.join('..', 'data', 'sanctions-russia.pdf'),
    os.path.join('..', 'data', 'X_FAQ.pdf'),
    os.path.join('..', 'data', 'FX_FAQ.pdf')
]

# Load each PDF, split it into chunks with `text_splitter`, and collect every
# chunk in a single flat list.
all_docs = []
for file in files:
    loader = UnstructuredPDFLoader(file)
    docs = loader.load_and_split(text_splitter=text_splitter)
    # Extend in place instead of rebuilding the list on every iteration.
    all_docs.extend(docs)

In [None]:
print(all_docs[5].page_content)

In [None]:
from langchain.vectorstores import FAISS

In [None]:
embeddings = OpenAIEmbeddings()

vdb = FAISS.from_documents(all_docs, embeddings)

In [None]:
vdb.save_local(os.path.join('..', 'src', 'vector_databases', 'my_database'))

Test it:

In [None]:
doc_list = vdb.similarity_search("Can I export luxury goods to Russia in 2023?", k=5)

In [None]:
print(doc_list[0].page_content)

Assuming that this piece of text contains the answer to the question, we would now like to pass it to the chatbot as context. Of course, we can do this by hand (copy-paste), but then the bot is hardly automatic.

Rather, we must give the LLM the **option** to search the vector database. In OpenAI, we can do so via the feature of **function call**.

# 5. Function Call for RAG

How do we integrate the vector database search seen above with our chatbot?

In OpenAI API, we do so using the following structure:

In [None]:
search_documentation_string = """
{
    "name": "search_documentation",
    "description": "Access information from internal documentation.",
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "The user's query."
            }
        },
        "required": ["query"]
    }
}
"""

In [None]:
import json

search_documentation_json = json.loads(search_documentation_string)

We now pass this parsed function definition to the `openai.ChatCompletion.create` call (together with the messages):

In [None]:
# Same one-message conversation as before, but this time the request also
# advertises a function the model may ask us to call.
question = "Can I export luxury goods to Russia in 2023?"
messages = [{"role": "user", "content": question}]

response = openai.ChatCompletion.create(
    temperature=0,
    functions=[search_documentation_json],
    messages=messages,
    model="gpt-3.5-turbo",
)

Let's look at the response:

In [None]:
answer = response["choices"][0]
print(answer)

The LLM did not output a message content, but rather a "function call". Inside it, it tells us the name of the function it wants to call, and the arguments.

As you see, the LLM cannot **directly** run a function: it can only output text! But we can parse this output to actually run the function, and provide it the answer as a 'role' = 'function' message:

In [None]:
# The model returns the function arguments as a JSON string; extract the query
# it chose and run the search with THAT query (not a hard-coded question), so
# the retrieved context matches what the model actually asked for.
query = json.loads(answer.message.function_call.arguments)['query']
doc_list = vdb.similarity_search(query, k=5)
# Concatenate the retrieved chunks, separated by blank lines.
function_output = "\n\n".join([doc.page_content for doc in doc_list])

In [None]:
# Replay the exchange so far: the user's question, the model's function-call
# message, and the function's output under the "function" role.
messages = [
    {"role": "user", "content": question},
    answer.message,
    {"role": "function", "name": "search_documentation", "content": function_output},
]

response = openai.ChatCompletion.create(
    temperature=0,
    functions=[search_documentation_json],
    messages=messages,
    model="gpt-3.5-turbo",
)

In [None]:
answer = response["choices"][0]
print(answer)

Now, we automate this process, giving the bot the possibility of querying the internal documentation, which we call the **knowledge base**.

First, we define a new class `KnowledgeBase`:

In [None]:
import re

In [None]:
class KnowledgeBase:
    """Searchable wrapper around a vector database, exposed to the LLM as a function.

    Args:
        vdb: Vector store providing `similarity_search(query, k=...)`.
        max_chunks: Number of chunks requested from the store per search.
    """

    def __init__(self, vdb, max_chunks=5):
        self.vdb = vdb
        self.max_chunks = max_chunks
        # The docstring of `search` IS the function schema sent to the API,
        # so it must stay valid JSON.
        self.function = json.loads(self.search.__doc__)
        # Read the name from the schema so the two can never drift apart.
        self.function_name = self.function["name"]

    # NOTE: documentation for `search` lives in these comments because its
    # docstring is parsed as JSON in `__init__`.
    # Retrieves the `max_chunks` chunks most similar to `query` and returns
    # them as the string form of a dict mapping a title (position and source
    # file name) to the chunk text with whitespace runs collapsed.
    def search(self, query):
        """
        {
            "name": "search_documentation",
            "description": "Access information from internal documentation.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "The user's query."
                    }
                },
                "required": ["query"]
            }
        }
        """
        retrieved = self.vdb.similarity_search(query, k=self.max_chunks)
        context = {}
        for i, doc in enumerate(retrieved):
            file_path = doc.metadata["source"]
            # Keep only the last path component for a readable title.
            file_name = os.path.normpath(file_path).split(os.sep)[-1]
            title = f"INFORMATION {i + 1} (from {file_name})"
            # Raw string: "\s" in a normal literal is an invalid escape sequence.
            content = re.sub(r"\s+", " ", doc.page_content)
            context[title] = content
        return str(context)

The docstring of this function, jsonized into the attribute `function`, is what we pass to OpenAI's API: it tells the LLM that, if it wants to access information from internal documentation, it can do so by calling the function "search_documentation" with input the query (a string).

In order for this to work, we must modify the `BaseAgent`:

In [None]:
class BaseAgent:
    
    def __init__(self, memory, knowledge_base, model="gpt-3.5-turbo", prices=[0.0015, 0.0020]):
        self.memory = memory
        self.knowledge_base = knowledge_base
        self.model = model
        self.prompt_cost = prices[0]
        self.completion_cost = prices[1]
        self.cost_list = []
        self.total_cost = 0
        
    def reply(self, question):

        human_message = {
            "role": "user",
            "content": question
        }
        self.memory.add(human_message)

        answer = False
        while not answer:
            
            agent_message = self.generate_response()
            self.memory.add(agent_message)
            
            if agent_message.content is not None:
                answer = agent_message.content
            
            else:
                function_call = agent_message.function_call
                function_name = function_call.name
                kwargs = json.loads(function_call.arguments)
                
                print(f"[Agent calling function {function_name} with arguments {kwargs}]\n")
                
                if function_name == self.knowledge_base.function_name:
                    function_output = self.knowledge_base.search(**kwargs)
                    function_message = {
                        "role": "function",
                        "name": function_name,
                        "content": function_output
                    }
                    self.memory.add(function_message)
                
                else:
                    function_output = "WARNING: Function not found!"
        
        print(answer)

    def generate_response(self):
        
        self.memory.delete_history()
        
        response = openai.ChatCompletion.create(
            model=self.model,
            messages=self.memory.messages,
            functions=[self.knowledge_base.function],
            temperature=0
        )

        self.calculate_cost(response.usage)

        agent_message = response.choices[0].message
        
        return agent_message
    
    def calculate_cost(self, usage):
        cost = (usage.prompt_tokens * self.prompt_cost + usage.completion_tokens * self.completion_cost)/1000
        self.cost_list.append(cost)
        self.total_cost += cost

Finally, we need to make a modification to the `Memory` class:

In [None]:
class Memory:
    """Conversation store with a pinned system prompt and a token budget.

    Messages whose `content` is `None` are accepted and count as zero
    tokens; the agent stores such messages when the model answers with a
    function call instead of text.

    Args:
        initial_prompt: Text of the leading "system" message; `None` is
            stored as an empty string.
        max_tokens: Largest total token count (summed over the `content` of
            every stored message) that `delete_history` tries to keep to.
    """

    def __init__(self, initial_prompt=None, max_tokens=3000):
        if initial_prompt is None:
            initial_prompt = ""
        # Index 0 always holds the system prompt; `delete_history` never removes it.
        self.messages = [
            {
                "role": "system",
                "content": initial_prompt
            }
        ]
        self.max_tokens = max_tokens
        # Tokenizer used only to measure message length.
        self.encoding = tiktoken.get_encoding("cl100k_base")

    def add(self, message):
        """Append a plain-dict copy of `message` to the history."""
        self.messages.append(dict(message))

    def _count_tokens(self, message):
        """Return the token count of the message's `content` (0 when absent or None)."""
        content = message.get("content")
        if content is None:
            return 0
        return len(self.encoding.encode(content))

    def delete_history(self):
        """Drop the oldest non-system messages until the history fits in `max_tokens`.

        The message at index 0 (the system prompt) is always kept. If the
        system prompt alone exceeds the budget, trimming stops once it is the
        only message left; without that bound the loop would never finish,
        because there would be nothing further to delete.

        Each message is tokenized once.
        """
        token_counts = [self._count_tokens(message) for message in self.messages]
        total_tokens = sum(token_counts)

        dropped = 0
        # Deletion candidates start at index 1; index 0 is the system prompt.
        while total_tokens > self.max_tokens and 1 + dropped < len(token_counts):
            total_tokens -= token_counts[1 + dropped]
            dropped += 1

        if dropped:
            del self.messages[1:1 + dropped]

    def __str__(self):
        """Render the history with one message per line."""
        return "\n".join([str(message) for message in self.messages])

Test it:

In [None]:
initial_prompt = """
Answer the user's question/request: Consult the internal documentation by calling the function 'search_documentation'.
"""

memory = Memory(initial_prompt=initial_prompt)
knowledge_base = KnowledgeBase(vdb)
agent = BaseAgent(memory=memory, knowledge_base=knowledge_base, model='gpt-4')

In [None]:
agent.reply("Can I export luxury goods to Russia in 2023?")

We can take a look at the memory, to see what happened:

In [None]:
print(agent.memory)

# 6. A Webapp for your LLM

Now we have all the core pieces for our chatbot, and can therefore move out of the notebook to an actual script. What we want to do is

1. Put some order in the code
2. Give the chatbot an interface, i.e., build a webapp

I have already done that: all the code can be found in the `src` folder.

# 7. Competition: Can your Chatbot Answer Correctly to our Questions?

Competition! You have 20 minutes to play around with the prompt (or, if you have other ideas, feel free to try them!): the goal is to answer as many questions as you can. Some questions will:

- Be based on general knowledge.
- Involve translations (when someone says 'English', do they mean British English or American English?)
- Require a little mathematical reasoning.
- Be related to the provided internal documentation.