In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_community.retrievers import TFIDFRetriever
from langchain_community.vectorstores import FAISS
from langchain_chroma import Chroma
from langchain_community.embeddings import OllamaEmbeddings

#### Instantiate the embeddings model

In [4]:
# Embedding model handed to both the Chroma and FAISS loaders below.
# NOTE(review): presumably this must be the same model the persisted stores
# were built with — confirm before swapping it.
embedding = OllamaEmbeddings(model="nomic-embed-text:latest")  # 768 dims

#### Load the previously made vector stores

In [5]:
# TF-IDF: retriever restored from a local pickle file.
# SECURITY: allow_dangerous_deserialization=True permits unpickling, which can
# execute arbitrary code — only load files you produced yourself.
tfidf_retriever = TFIDFRetriever.load_local("tfidf_aoe2.pkl", allow_dangerous_deserialization=True)

# Chroma: open the persisted collection and expose it as a retriever returning 5 results.
chroma_vectorstore = Chroma(embedding_function=embedding, persist_directory="chroma_aoe2")
chroma_retriever = chroma_vectorstore.as_retriever(search_kwargs={"k": 5})

# FAISS: load the saved index (same deserialization caveat as the TF-IDF pickle)
# and expose it as a retriever with its default search settings.
faiss_retriever = FAISS.load_local("faiss_aoe2", embedding, allow_dangerous_deserialization=True).as_retriever()

In [6]:
from langchain_core.retrievers import BaseRetriever
from langchain_ollama import ChatOllama
# Chat model shared by every agent in this notebook.
# NOTE(review): format="json" requests JSON-formatted output, while the agent's
# system prompt asks for a brief prose answer — confirm this combination is intended.
llm = ChatOllama(model="llama3.1:latest", format="json", temperature=0)

In [7]:
def make_llama_3_prompt(user, system="", context=""):
    """Build a raw Llama 3 chat prompt string.

    Args:
        user: Text of the user turn.
        system: Optional system message. When empty, no system block is emitted.
        context: Unused; kept so existing callers that pass it keep working.

    Returns:
        The prompt string, ending with an open assistant header so the model
        continues with its answer.
    """
    # Default to an empty block so the final f-string never reads an unbound
    # name when no system message is supplied.
    system_prompt = ""
    if system != "":
        system_prompt = (
            f"<|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>"
        )
    return f"<|begin_of_text|>{system_prompt}<|start_header_id|>user<|end_header_id|>\n\n{user}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

In [8]:
class BasicAgent:
    """Retrieval-augmented question answering over a single retriever.

    For each question the agent retrieves ``k`` chunks, packs them into a
    system prompt, and asks the chat model for a short answer.
    """

    def __init__(self, llm:ChatOllama, retriever:BaseRetriever) -> None:
        """Store the chat model and the retriever used to answer queries."""
        self.retriever = retriever
        self.llm = llm

    def custom_retriever(self, user_query:str, k):
        """Retrieve ``k`` documents for ``user_query`` and format them as context.

        Note that ``k`` is written onto the retriever object itself, so the
        value persists on that retriever after this call returns.

        Args:
            user_query: The question to search for.
            k: Number of documents to request from the retriever.

        Returns:
            A ``(context, retrieved_docs)`` tuple: the concatenated text shown
            to the model and the raw documents returned by the retriever.
        """
        # Some retrievers take the result count as a `k` attribute; others
        # reject that assignment and read it from `search_kwargs` instead.
        try:
            self.retriever.k = k
        except (AttributeError, TypeError, ValueError):
            self.retriever.search_kwargs['k'] = k
        retrieved_docs = self.retriever.invoke(user_query)
        # A chunk without page metadata is labelled "unknown" instead of
        # aborting the whole query with a KeyError.
        context = "".join(
            f"Extracted from page {doc.metadata.get('page', 'unknown')} \n{doc.page_content} \n\n"
            for doc in retrieved_docs
        )
        return context, retrieved_docs

    def query(self, user_query:str, k=5):
        """Answer ``user_query`` using the top-``k`` retrieved chunks.

        Args:
            user_query: The question to answer.
            k: Number of chunks to retrieve (default 5).

        Returns:
            A ``(answer_text, context, retrieved_docs)`` tuple.
        """
        context, retrieved_docs = self.custom_retriever(user_query, k)
        # Adjacent string literals are concatenated verbatim, so each fragment
        # ends with an explicit space to keep sentences and tags from fusing.
        system_prompt = ("You are helpful assistant, your role is to assist people getting their way around the rules and mechanics of the famous game Age of Empires 2. "
                        "You have the task to answer using the following context "
                        f"<CONTEXT>{context}</CONTEXT> "
                        "Keep you answers brief, make reference to the pagees used and keep the answer at 50 words at max. "
                        "If the answer is not contained in the context, say you don't know")
        prompt = self.make_llama_3_prompt(user_query, system_prompt)
        # NOTE(review): the chat model wrapper may apply its own chat template
        # on top of these hand-written Llama 3 tokens — confirm they are not duplicated.
        answer = self.llm.invoke(prompt)
        return answer.content, context, retrieved_docs

    def make_llama_3_prompt(self, user, system="", context=""):
        """Build a raw Llama 3 chat prompt string.

        Args:
            user: Text of the user turn.
            system: Optional system message. When empty, no system block is emitted.
            context: Unused; kept for backward compatibility with callers.

        Returns:
            The prompt string, ending with an open assistant header.
        """
        # Default to an empty block so the final f-string never reads an
        # unbound name when no system message is supplied.
        system_prompt = ""
        if system != "":
            system_prompt = (
                f"<|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>"
            )
        return f"<|begin_of_text|>{system_prompt}<|start_header_id|>user<|end_header_id|>\n\n{user}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

In [9]:
# One agent per retrieval backend; all three share the same `llm` instance.
TFIDF_Agent = BasicAgent(llm, tfidf_retriever)
Chroma_Agent = BasicAgent(llm, chroma_retriever)
Faiss_Agent = BasicAgent(llm, faiss_retriever)

# Evaluation

Create synthetic testing datasets

In [10]:
# Separate model used to generate the synthetic question/answer pairs and to
# repair malformed JSON; format="json" requests JSON-formatted output.
llm_eval_generator = ChatOllama(model="mistral:instruct",  format="json", temperature=0)

In [50]:
# Template filled with str.format: {context} is the only placeholder. The
# doubled braces around the JSON example render as literal { } after formatting.
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output::: JSON with two keys, the --question-- and the --answer--
{{"question": (your factoid question), "answer": (your answer to the factoid question)}}

Now here is the context.

Context: {context}\n
Output:::
"""

In [75]:
# Template filled with str.format: {bad_json} is the only placeholder and
# receives the malformed model output that should be repaired.
JSON_Corrector_prompt = """
Your task is to correct malformed JSON objects
Given a this damaged JSON: {bad_json}\n

Correct any formatting mistakes. Output the corrected JSON with no text before or after the JSON.
"""

In [13]:
# Fetch the stored records from the Chroma collection as a dict of parallel lists.
docs = chroma_vectorstore.get()

In [16]:
docs.keys()

dict_keys(['ids', 'embeddings', 'metadatas', 'documents', 'uris', 'data', 'included'])

In [24]:
docs['metadatas'][0:10]

[{'page': 96, 'source': 'docs/Age_of_Empires_2_Manual.pdf'},
 {'page': 62, 'source': 'docs/Age_of_Empires_2_Manual.pdf'},
 {'page': 126, 'source': 'docs/Age_of_Empires_2_Manual.pdf'},
 {'page': 60, 'source': 'docs/Age_of_Empires_2_Manual.pdf'},
 {'page': 126, 'source': 'docs/Age_of_Empires_2_Manual.pdf'},
 {'page': 89, 'source': 'docs/Age_of_Empires_2_Manual.pdf'},
 {'page': 71, 'source': 'docs/Age_of_Empires_2_Manual.pdf'},
 {'page': 22, 'source': 'docs/Age_of_Empires_2_Manual.pdf'},
 {'page': 72, 'source': 'docs/Age_of_Empires_2_Manual.pdf'},
 {'page': 85, 'source': 'docs/Age_of_Empires_2_Manual.pdf'}]

In [18]:
docs['documents'][:10]

['even faster than Double-Bit Axe.\nThe bow saw had a rounded handle like a bow with the saw blade connecting the bow ends. The bow saw was\na more precise tool than previous saws. W oodcutters using it got more usable wood from each tree by reducing\nwaste.\nTwo-Man Saw\nT wo-Man Saw (at the Lumber Camp) makes villagers chop\nwood even faster than the Bow Saw.',
 '60 Chapter VII  -  Units\nMan-at-Arms\nStronger than Militia; cheap and quick to create.\n/c67/c114/c101/c97/c116/c101/c100/c32/c97/c116 Barracks\n/c83/c116/c114/c111/c110/c103/c32/c118/c115 /c46 skirmishers, camels, Light Cavalry\n/c87 /c101/c97/c107/c32/c118/c115 /c46 archers, scorpions, cavalry archers, mangonels, Cataphracts\n/c85/c112/c103/c114/c97/c100/c101/c115 Attack — Forging, Iron Casting, Blast Furnace (Blacksmith)',
 'and combat stances  43custom  43described  42setting  42types of  43Fortified Wall  57\nFrankish Technology Tree\n120\nG\ngame\ninstalling  2mastering basics of  3\ngame types  12.\nSee also\nindivi

In [19]:
## Let's take a random sample of 30 chunks

In [40]:
from random import seed
import numpy as np
# Pair each stored chunk with its metadata, then draw a reproducible sample.
temp_all_docs, temp_all_metadatas = docs['documents'], docs['metadatas']
all_docs = [
    {"page_content": chunk_text, "metadata": chunk_meta}
    for chunk_text, chunk_meta in zip(temp_all_docs, temp_all_metadatas)
]
# Fixed seed so the same 30 chunks are selected on every run.
np.random.seed(1234)
docs_sample = np.random.choice(all_docs, size=30, replace=False)

In [33]:
len(docs_sample)

30

In [43]:
docs_sample[0:10]

array([{'page_content': 'The heavy camel was an especially experienced warrior and camel rider who wore some armor . They were\nused by desert civilizations of the Middle East who fought against archers from the Byzantine Empire and\nhorse archers raiding down from the steppes of Asia.', 'metadata': {'page': 78, 'source': 'docs/Age_of_Empires_2_Manual.pdf'}},
       {'page_content': 'the game. You can display the objectives again during a game by clicking the Objectives\nbutton at the top of the screen.\nStandard victory\nYou can win any Random Map or Death Match game by being the first player or team todefeat your enemies in military conquest, control all relics, or build a Wonder. You andyour opponents do not have to pursue the same victory condition. For example, you may', 'metadata': {'page': 18, 'source': 'docs/Age_of_Empires_2_Manual.pdf'}},
       {'page_content': '138 Indexcavalry units  73\nCeltic Technology Tree  116Chain Barding Armor  104Chain Mail Armor  98Champion  61Chem

In [85]:
import json
def json_corrector(llm, json_to_correct, MAX_TRIES):
    """Ask ``llm`` to repair a malformed JSON string and parse the result.

    Args:
        llm: Chat model whose ``invoke(prompt)`` result exposes ``.content``.
        json_to_correct: The malformed JSON text to repair.
        MAX_TRIES: Maximum number of repair attempts.

    Returns:
        The parsed JSON value from the first attempt that parses, or ``None``
        when every attempt fails or ``MAX_TRIES`` is zero or negative.
    """
    correction_prompt = JSON_Corrector_prompt.format(bad_json=json_to_correct)

    # Bound up front so the return below is valid even if the loop never runs.
    corrected_json = None
    # NOTE(review): the same prompt is re-sent on every attempt; with a
    # deterministic model the retries may repeat the same output — confirm.
    for i in range(MAX_TRIES):
        raw_output = llm.invoke(correction_prompt).content
        print(f"Try {i}: output {raw_output}")
        try:
            corrected_json = json.loads(raw_output)
        except (TypeError, ValueError) as e:
            # json.JSONDecodeError is a ValueError; TypeError covers non-text content.
            print(e)
            print('FAILED')
            continue
        print('SUCCESS')
        break
    return corrected_json

In [86]:
json_corrector(llm_eval_generator, "{question: 'how do i make a cookie?, 'answer':'you bake it'}", 3)

Try 0: output  {
      "question": "how do i make a cookie?",
      "answer": "you bake it"
   }
SUCCESS


{'question': 'how do i make a cookie?', 'answer': 'you bake it'}

In [107]:
import json
# Generate one question/answer pair per sampled chunk.
qas = []

MAX_TRIES = 3
for doc in docs_sample:
    prompt = QA_generation_prompt.format(context=doc['page_content'])
    qa = llm_eval_generator.invoke(prompt).content
    try:
        qa_dict = json.loads(qa)
    except (TypeError, ValueError):
        # Only parsing failures trigger a repair; interrupts and unrelated
        # errors propagate instead of being silently swallowed.
        qa_dict = json_corrector(llm_eval_generator, qa, MAX_TRIES)
    # qa_dict is None when the repair also failed; the entry is still kept so
    # it can be spotted and removed during the manual review.
    temp_doc = doc.copy()
    temp_doc['evalqa'] = qa_dict
    qas.append(temp_doc)
        

Try 0: output  {
      "question": "Which religious groups embraced the practice of living in monasteries?",
      "answer": "Monastic life was embraced by several religions, including Christianity and Buddhism."
   }
SUCCESS
Try 0: output  {
      "question": "What does a well-built stone wall offer protection against in a civilization?",
      "answer": "A well-built stone wall offers protection against raiders because it can be broken down only by a determined effort."
   }
SUCCESS


In [104]:
qas[6]

{'page_content': 'invasion of 1066.  The Normans also invaded Sicily and southern Italy. Many of the Crusades involved sea\nmovement from France to the Holy Land.  The English brought armies into France several times during theHundred Y ear’ s W ar .\nDemolition Ship & Heavy Demolition Ship\nFilled with explosives. Pilot near enemy ships and detonate to wrest\ncontrol of the sea from an entrenched opponent.\n/c66/c117/c105/c108/c116/c32/c97/c116 Dock',
 'metadata': {'page': 86, 'source': 'docs/Age_of_Empires_2_Manual.pdf'},
 'evalqa': {'question': 'Who were the Normans and where did they invade besides England in 1066?',
  'answer': 'The Normans, also known as the Northmen, were a group of Vikings who settled in northern France. They invaded Sicily and southern Italy besides England in 1066.'}}

Let's dump it into a JSON file and inspect it manually; we might need to remove some of the samples.
We are striving for a high-quality evaluation set.

In [108]:
# Write all generated QA records to disk in one go so they can be reviewed by hand.
with open("eval_original.json", "w") as f:
    f.write(json.dumps(qas))

From the original 30 questions, 5 were removed due to their poor quality. These poor-quality questions were partially due to problems during the PDF parsing process. It was later found that the text had been extracted from a table, which explains its lack of formatting and cohesiveness. More advanced parsing techniques should be implemented to improve this process. For example:

```json
{
        "page_content": "BRACERDOUBLE-BIT AXECOINAGE BANKING\nCARTOGRAPHYTRADE CART\nBOW SAWHORSE COLLAR HEAVY PLOW\nGUILDS\nTWO-MAN SAWCROP ROTATION\nWATCH TOWERFORTIFIED WALL\nGUARD TOWER KEEPBOMBARDTOWERSTONE WALL\nMILITIAFISHING SHIP\nMAN-AT-ARMSTRACKING\nSPEARMAN\nELITESKIRMISHERCAVALRYARCHER\nCROSSBOWMANHEAVY CAVALRYARCHER HANDCANNONEERARBALESTPIKEMAN SQUIRES\nLONGSWORDSMANTWO-HANDEDSWORDSMANCHAMPIONKNIGHT CAVALIER PALADIN",
        "metadata": {
            "page": 112,
            "source": "docs/Age_of_Empires_2_Manual.pdf"
        },
        "evalqa": {
            "question": "What are the types of cavalry mentioned in the context?",
            "answer": "The types of cavalry mentioned in the context are ARCHER, HEAVY CAVALRY, SKIRMISHER, and CROSSBOWMAN"
        }
    }
```