# **ZakaBot v1.0.0**

**Last updated 11-05-2023 by Wissam M.**





In [1]:
!pip install -q langchain==0.0.150 tiktoken transformers openai faiss-cpu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m648.4/648.4 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.9/71.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import openai
import pathlib
from transformers import GPT2TokenizerFast
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain

In [3]:
os.environ["OPENAI_API_KEY"] = ""

In [4]:
%mkdir 'TrainingData'
directory_path = '/content/TrainingData'
directory_files = os.listdir(directory_path)

In [19]:
## Manual step: drop the AI_Bootcamp_Syllabus.txt and AI_Certification_Syllabus.txt inside /content/TrainingData

In [6]:
model = "gpt-3.5-turbo"
min_match_error = 0.4

In [27]:
class Zaka_Bot:
    vector_db = None
    qa_chain = None

    def __init__(self):
        print("# Initializing Zaka_Bot class...")
        self.initialize_bot()
        
    def initialize_bot(self):
        training_chunks = self.read_and_chunk_training_data()
        self.initialize_vector_indices(training_chunks)
        self.initialize_qa_chain()

    def read_and_chunk_training_data(self) -> str:
        training_data_dir_path = './TrainingData/'
        training_data_files = os.listdir(training_data_dir_path)

        training_text = ''
        for file_name in training_data_files:
            if pathlib.Path(file_name).suffix != '.txt':
                continue

            with open(training_data_dir_path + file_name, 'r') as f:
                print('> Reading file:', file_name)
                text = f.read()
            
            training_text += '\n' + text

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size = 500,
            chunk_overlap  = 24,
            length_function = self.count_tokens)

        chunks = text_splitter.create_documents([training_text])

        print("> Training data chunked")
        return chunks

    def initialize_qa_chain(self):
        chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff")
        self.qa_chain = chain
        print("> QA chain loaded")

    def initialize_vector_indices(self, chunks):
        embeddings = OpenAIEmbeddings()

        vector_db = self.get_local_vector_indices(embeddings)

        if not vector_db:
            vector_db = FAISS.from_documents(chunks, embeddings)
            self.save_vector_indices(vector_db)
        else:
            print('Existing local vector database detected')

        self.vector_db = vector_db
        print("> Vector database initialized")

    def get_local_vector_indices(self, embeddings):
        try:
            return FAISS.load_local("faiss_index", embeddings)
        except:
            print('No local vector database detected')
            return None

    def save_vector_indices(self, vector_db):
        vector_db.save_local("faiss_index")
        print("> Vector database saved locally")

    def count_tokens(self, text: str) -> int:
        tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
        return len(tokenizer.encode(text))

    def assess_docs_and_score(self, docs_and_scores) -> bool:
        for doc_and_score in docs_and_scores:
            if doc_and_score[1] <= min_match_error:
                return True
        return False

    def get_docs_from_docs_and_scores(self, docs_and_scores):
        return [doc for doc, score in docs_and_scores]

    def get_answer(self, question: str) -> str:
        print('> Processing question...')
        answer = ''

        if self.vector_db is None:
            raise Exception("Error: vector index not initialized")
        if self.qa_chain is None:
            raise Exception("Error: qa_chain not initialized")
        
        print("> Running similarity search")
        docs_and_scores = self.vector_db.similarity_search_with_score(question)

        match_found = self.assess_docs_and_score(docs_and_scores)
        print("> Found relevant information in vector corpus" 
                      if match_found 
                      else f"> No relevant information found in local vector corpus, querying {model}..")

        print("> Running QA chain...")
        docs = self.get_docs_from_docs_and_scores(docs_and_scores)
        answer = self.qa_chain.run(input_documents=docs, question=question)
        
        answer = self.augment_answer(question, answer, match_found)

        print("> Response received")
        return answer

    def augment_answer(self, question: str, answer: str, match_found: bool) -> str:
        identity_init = """
        You are 'Zaka Bot', a friendly chatbot and assistant who works for the company 'Zaka'
        """
        idendity_message = {"role": "system", "content": identity_init}

        rephrase_order = """
        Your next message will be your last message rephrased, without opening or trailing statements
        """

        topic_adjustive = """
        At the end of the answer, ask the user if they have any questions about Zaka
        """

        if match_found:
            response = openai.ChatCompletion.create(
                model=model,
                messages=[
                    idendity_message,
                    {"role": "user", "content": question},
                    {"role": "assistant", "content": answer},
                    {"role": "system", "content": rephrase_order}
                    ])
            
        else:
            response = openai.ChatCompletion.create(
                model=model,
                messages=[
                    idendity_message,
                    {"role": "user", "content": question},
                    {"role": "assistant", "content": topic_adjustive},
                    ])

        return response['choices'][0]['message']['content']

In [28]:
bot = Zaka_Bot()

# Initializing Zaka_Bot class...
> Reading file: AI_Bootcamp_Syllabus.txt
> Reading file: AI_Certification_Syllabus.txt
> Training data chunked
No local vector database detected
> Vector database saved locally
> Vector database initialized
> QA chain loaded


In [29]:
question = 'Who is Ronaldo?'

In [30]:
answer = bot.get_answer(question)

> Processing question...
> Running similarity search
> No relevant information found in local vector corpus, querying gpt-3.5-turbo..
> Running QA chain...
> Response received


In [31]:
answer

'Ronaldo can refer to several people with the same name, but it is commonly associated with the famous Portuguese professional footballer Cristiano Ronaldo. He is widely considered one of the greatest football players of all time and has won numerous awards throughout his career. Is there anything related to Zaka that you would like to know more about?'