In [1]:
# #!pip install chromadb
# %pip install -U langchain langchainhub openai --quiet

#  Initialise GenAI call

In [2]:
# GenAI Variables and models

import os

OPENAI_API_KEY = 'sxxx'
VERTEX_API_KEY = "xxx"

OPENAI_MODEL_GPT4_0125 = 'gpt-4-0125-preview'
OPENAI_MODEL_VISION_PREVIEW = 'gpt-4-1106-vision-preview'
OPENAI_MODEL_GPT3 = 'gpt-3.5-turbo-0125'

VERTEX_MODEL_GEMINI = 'gemini-1.0-pro'
VERTEX_MODEL_GEMINI_VISION = 'gemini-1.0-pro-vision'

VERTEX_MODEL_ANTROPIC_HAIKU = 'claude-3-haiku@20240307'
VERTEX_MODEL_ANTROPIC_SONNET = 'claude-3-sonnet@20240229'
VERTEX_MODEL_MISTRAL = 'mistralai/Mixtral-8x7B-v0.1'
OPENAI_EMBEDDING_3_LARGE = 'text-embedding-3-large'
OPENAI_EMBEDDING_3_SMALL = 'text-embedding-3-small'
OPENAI_EMBEDDING_ADA_002 = 'text-embedding-ada-002'


os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY


In [3]:
def list_file (dir:str, filter):
    return [os.path.join(root, file)  for root, dirs, files in os.walk(dir) for file in files if filter in file]

# Initilise ChromaDB

In [4]:
import chromadb
from chromadb.utils import embedding_functions
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

embedding_fn = OpenAIEmbeddingFunction(api_key=os.environ.get('OPENAI_API_KEY'), model_name=OPENAI_EMBEDDING_ADA_002)
# embedding_fn = embedding_functions.DefaultEmbeddingFunction()

class Collections:
    Name: str
    Documents: list
    Metadatas: list
    Ids: list
    count_items: int

    def __init__ (self, Name, Documents, Metadatas, Ids):
        if ( 
            len(Documents) != len (Ids) &
            len(Metadatas) != len(Documents)
        ):
            raise Exception("Error: Length of parameters do not match")
        
        self.Name = Name
        self.Documents = Documents
        self.Metadatas = Metadatas
        self.Ids = Ids
        self.count_items = len(Ids)

    def get_count(self):return self.count_items
    def get_Documents(self): return self.Documents
    def get_Metadatas(self): return self.Metadatas
    def get_Ids(self): return self.Ids
    def get_Name(self): return self.Name


class Document_vdb:
    vdb: chromadb
    collections: Collections

    def __init__ (
            self, 
            vdb = None, 
            collections = None, 
            persistent_dir = None
        ):
        if vdb == None:
            if persistent_dir == None:
                self.vdb = chromadb.Client()
            else:
                self.vdb = chromadb.PersistentClient(path = persistent_dir)
        else:
            self.vdb = vdb
        
        if collections != None:
            self.set_collections(collections)
        else: self.collections = None

    def get_vdb (self):
        return self.vdb
    
    def get_collection(
            self, 
            collection_name:str
        ):
        return self.vdb.get_collection(name = collection_name )

    def set_collections(
            self, 
            collections:Collections, 
            emb_fn = None
        ):

        if emb_fn == None: emb_fn = embedding_fn
        count_items = collections.get_count()

        collection = self.vdb.get_or_create_collection(
            name=collections.get_Name(), 
            embedding_function=emb_fn 
        )
        
        Documents = collections.get_Documents()
        Metadatas = collections.get_Metadatas()
        Ids = collections.get_Ids()

        for i in range(0, count_items):
            collection.add( 
                documents = Documents[i],
                metadatas = Metadatas[i],
                ids = Ids[i]
            )


In [41]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter, TokenTextSplitter
import hashlib
import uuid 
import pypdf


text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=2000,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)

text_splitter_token = TokenTextSplitter(chunk_size=500, chunk_overlap=0)

def create_collections_dir (dir, collection_name) -> Collections:
    files = list_file(dir, ".pdf")
    Metadatas = []
    Ids = []
    Documents = []

    for file in files:
        with open(file, 'rb') as f:
            hash = str(hashlib.md5(f.read()).hexdigest())

        pdfFileObject = open(file, 'rb')
        pdfReader = pypdf.PdfReader(pdfFileObject)
        count = len(pdfReader.pages)

        for i in range(count):
            metadata = {
                'source': file,
                'page': i,
                'source_hash': hash,
            }
            page_chunks = text_splitter.split_text (pdfReader.pages[i].extract_text())
            for page_chunk in page_chunks:
                Ids.append(uuid.uuid1().hex)
                Documents.append(page_chunk)
                Metadatas.append(metadata)

    return Collections(
        Name = collection_name ,
        Documents=Documents,
        Metadatas=Metadatas,
        Ids=Ids
    )


def create_collection_token (file, collection_name) -> Collections:
    Metadatas = []
    Ids = []
    Documents = []

    with open(file, 'rb') as f:
        hash = str(hashlib.md5(f.read()).hexdigest())

    pdfFileObject = open(file, 'rb')
    pdfReader = pypdf.PdfReader(pdfFileObject)
    count = len(pdfReader.pages)

    for i in range(count):
        metadata = {
            'source': file,
            'page': i,
            'source_hash': hash,
        }
        page_chunks = text_splitter_token.split_text (pdfReader.pages[i].extract_text())
        for page_chunk in page_chunks:
            Ids.append(uuid.uuid1().hex)
            Documents.append(page_chunk)
            Metadatas.append(metadata)

    return Collections(
        Name = collection_name ,
        Documents=Documents,
        Metadatas=Metadatas,
        Ids=Ids
    )



def create_collection_char (file, collection_name) -> Collections:
    Metadatas = []
    Ids = []
    Documents = []

    with open(file, 'rb') as f:
        hash = str(hashlib.md5(f.read()).hexdigest())

    pdfFileObject = open(file, 'rb')
    pdfReader = pypdf.PdfReader(pdfFileObject)
    count = len(pdfReader.pages)

    for i in range(count):
        metadata = {
            'source': file,
            'page': i,
            'source_hash': hash,
        }
        page_chunks = text_splitter.split_text (pdfReader.pages[i].extract_text())
        for page_chunk in page_chunks:
            Ids.append(uuid.uuid1().hex)
            Documents.append(page_chunk)
            Metadatas.append(metadata)

    return Collections(
        Name = collection_name ,
        Documents=Documents,
        Metadatas=Metadatas,
        Ids=Ids
    )


In [42]:
from langchain.embeddings import OpenAIEmbeddings


dir = r'/Users/peterwirija/Documents/GenAI/Data/Tan_Hooi_Ling'
persist_dir = r'/Users/peterwirija/Documents/GenAI/Data/Tan_Hooi_Ling/chromadb_THL_pages_token'
collection_name = "Tan_Hooi_Ling"
files = list_file( dir, ".pdf")
no_files = len(files)
i = 1

for file in files:
    print (f"===== Processing file {i} / {no_files} -- {file}")
    FS_collection  = create_collection_token(file, collection_name=f"{collection_name}_{i}")
    test = Document_vdb(collections= FS_collection, persistent_dir= persist_dir)
    i = i + 1

===== Processing file 1 / 6 -- /Users/peterwirija/Documents/GenAI/Data/Tan_Hooi_Ling/Grab co-founder Tan Hooi Ling exits from operational roles by end 2023 | New Straits Times | Malaysia General Business Sports and Lifestyle News.pdf
===== Processing file 2 / 6 -- /Users/peterwirija/Documents/GenAI/Data/Tan_Hooi_Ling/Grab FY 2022 Form 20-F.pdf
===== Processing file 3 / 6 -- /Users/peterwirija/Documents/GenAI/Data/Tan_Hooi_Ling/Tan Hooi Ling - Wikipedia.pdf
===== Processing file 4 / 6 -- /Users/peterwirija/Documents/GenAI/Data/Tan_Hooi_Ling/Lunch With Sumiko: Grab whiz Tan Hooi Ling happy to stay low-key | The Straits Times.pdf
===== Processing file 5 / 6 -- /Users/peterwirija/Documents/GenAI/Data/Tan_Hooi_Ling/Grab co-founder Tan Hooi Ling to exit her operational roles | The Straits Times.pdf
===== Processing file 6 / 6 -- /Users/peterwirija/Documents/GenAI/Data/Tan_Hooi_Ling/0000950170-24-037611.pdf


In [43]:
from langchain.embeddings import OpenAIEmbeddings


dir = r'/Users/peterwirija/Documents/GenAI/Data/Tan_Hooi_Ling'
persist_dir_char = r'/Users/peterwirija/Documents/GenAI/Data/Tan_Hooi_Ling/chromadb_THL_pages_char'
collection_name = "Tan_Hooi_Ling"
files = list_file( dir, ".pdf")
no_files = len(files)
i = 1

for file in files:
    print (f"===== Processing file {i} / {no_files} -- {file}")
    FS_collection  = create_collection_char(file, collection_name=f"{collection_name}_{i}")
    test = Document_vdb(collections= FS_collection, persistent_dir= persist_dir_char)
    i = i + 1

===== Processing file 1 / 6 -- /Users/peterwirija/Documents/GenAI/Data/Tan_Hooi_Ling/Grab co-founder Tan Hooi Ling exits from operational roles by end 2023 | New Straits Times | Malaysia General Business Sports and Lifestyle News.pdf
===== Processing file 2 / 6 -- /Users/peterwirija/Documents/GenAI/Data/Tan_Hooi_Ling/Grab FY 2022 Form 20-F.pdf
===== Processing file 3 / 6 -- /Users/peterwirija/Documents/GenAI/Data/Tan_Hooi_Ling/Tan Hooi Ling - Wikipedia.pdf
===== Processing file 4 / 6 -- /Users/peterwirija/Documents/GenAI/Data/Tan_Hooi_Ling/Lunch With Sumiko: Grab whiz Tan Hooi Ling happy to stay low-key | The Straits Times.pdf
===== Processing file 5 / 6 -- /Users/peterwirija/Documents/GenAI/Data/Tan_Hooi_Ling/Grab co-founder Tan Hooi Ling to exit her operational roles | The Straits Times.pdf
===== Processing file 6 / 6 -- /Users/peterwirija/Documents/GenAI/Data/Tan_Hooi_Ling/0000950170-24-037611.pdf


In [26]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI


# OpenAIEmbeddings = OpenAIEmbeddings(api_key=os.environ.get('OPENAI_API_KEY'), model_name=OPENAI_EMBEDDING_ADA_002)

llm = ChatOpenAI(model_name=OPENAI_MODEL_GPT3, temperature=0)

vectordb = Chroma(persist_directory=persist_dir, collection_name = collection_name, embedding_function=OpenAIEmbeddings())

retriever = vectordb.as_retriever()


qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

question = """
    Question: Write a summary of Tan Hooi Ling?

    Output:
        Section 1: Tan Hooi Ling's summary
        Section 2: Tan Hooi Ling's educational background, including all schools she had attended or graduated from, in markdown format:
                    |Year | School | Degree | Course Description
    
    """


result = qa_chain({"query": question})
print (result['result'])

I can provide a summary of Tan Hooi Ling based on the information provided in the context. 

Section 1: Tan Hooi Ling is a Malaysian businesswoman based in Singapore, known as the co-founder and Chief Operating Officer (COO) of Grab Holdings Inc. She was born and raised in Kuala Lumpur, Malaysia, in a middle-class Chinese household. Tan attended the University of Bath in the United Kingdom, where she earned a bachelor's degree in mechanical engineering. She later co-founded Grab with Anthony Tan while attending Harvard Business School. Tan has been involved in various roles within Grab, focusing on product, human resources, and customer experience. She announced her decision to step down from her official Grab positions by the end of 2023.

Section 2: 
| Year | School | Degree | Course Description |
| ---- | ------ | ------ | ------------------- |
| University of Bath | Mechanical Engineering | Bachelor's Degree | Mechanical Engineering |
| Harvard Business School | - | - | Co-founded 

# Summarising Tan Hooi Ling

In [55]:
persist_dir_char = r'/Users/peterwirija/Documents/GenAI/Data/Tan_Hooi_Ling/chromadb_THL_pages_char'
persist_dir_token = r'/Users/peterwirija/Documents/GenAI/Data/Tan_Hooi_Ling/chromadb_THL_pages_token'

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever


# OpenAIEmbeddings = OpenAIEmbeddings(api_key=os.environ.get('OPENAI_API_KEY'), model_name=OPENAI_EMBEDDING_ADA_002)

llm = ChatOpenAI(model_name=OPENAI_MODEL_GPT3, temperature=0)

client = chromadb.PersistentClient(path=persist_dir_char)  # or HttpClient()
collections = client.list_collections()

for collection in collections:
    print (collection.name)


    vectordb = Chroma(persist_directory=persist_dir, collection_name = collection.name, embedding_function=OpenAIEmbeddings())

    retriever = vectordb.as_retriever()


    customer_name = "Tan Hooi Ling"


    retriever_from_llm = MultiQueryRetriever.from_llm( retriever=retriever, llm=llm)


    question = """

        Assume the role of a research assistant researching  a biography of someone famour. Your current target is {customer_name}.
        You are to review the content in the text below. 

        Task: Review a document about {customer_name} and identify key biographical facts:

        1) Early Life:
            1a) Birthplace & Date
            1b) Hometown
            1c) Siblings (number)
        2) Education:
            2a) Schools attended (names & locations)
            2b) Degrees/Courses studied
            2c) Dates of attendance
        3) Career:
            3a) Employers & Positions
            3b) Employment Dates (start date and end date)


        Output:
         - Output will be markdown format.
         - Each of the 3 section will be a # Heading level 1
         - Refer to the output template in markdown format below

        <Output Template> 
        # Summary of {customer_name}
        <Place the summary of {Customer_Name} here>

        # Early Life       
        <place the {Customer_Name} early Life here in text>

        # Education 
        Output the Education background result in the following markdown format
        | School Name | Location | Course name | Degree type | Start Year | End Year |
        |---|---|---|---|---|---|
        
        # Career 
        Output the career result in the following markdown format
        | Employer Name | Location | Job title | Start Year | End Year |
        |---|---|---|---|---|


        contrainst:    
            Where you are unable to find the information, state them as "Not Available"
        """
    qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    # unique_docs = retriever_from_llm.from_llm(query=question)
    result = qa_chain({"query": question})

    print (result['result'])

    print ("================ END OF COLLECTION 1 =================")



Tan_Hooi_Ling_3
# Summary of Tan Hooi Ling
Tan Hooi Ling is a Malaysian businesswoman based in Singapore, best known as the co-founder and Chief Operating Officer (COO) of Grab Holdings Inc.

# Early Life
- Birthplace & Date: Not Available
- Hometown: Kuala Lumpur, Malaysia
- Siblings (number): One older brother

# Education
| School Name | Location | Course name | Degree type | Start Year | End Year |
|---|---|---|---|---|---|
| University of Bath | United Kingdom | Mechanical Engineering | Bachelor's degree | Not Available | 2006 |
| Harvard Business School | Not Available | Not Available | Master of Business Administration | Not Available | 2011 |

# Career
| Employer Name | Location | Job title | Start Year | End Year |
|---|---|---|---|---|
| McKinsey & Company | Malaysia | Business Analyst | Not Available | Not Available |
| Salesforce | San Francisco | Not Available | Not Available | Not Available |
| Grab Holdings Inc. | Singapore | Chief Operating Officer (COO) | 2015 | 2023 |

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


I'm sorry, but I cannot provide information on a specific customer's biography as the context provided does not mention any specific individual's name. If you have a particular person in mind, please provide their name, and I can assist you in gathering the required biographical facts.
Tan_Hooi_Ling_5


Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


# Summary of Tan Hooi Ling
Tan Hooi Ling co-founded Grab Holdings, a Singapore-based ride-hailing and food delivery company. She is stepping down from her operational roles in 2023 after leading the company's technology and corporate strategy teams.

# Early Life
- Birthplace & Date: Not Available
- Hometown: Not Available
- Siblings (number): Not Available

# Education
| School Name | Location | Course name | Degree type | Start Year | End Year |
|---|---|---|---|---|---|
| Not Available | Not Available | Not Available | Not Available | Not Available | Not Available |

# Career
| Employer Name | Location | Job title | Start Year | End Year |
|---|---|---|---|---|
| Grab Holdings | Singapore | Co-founder, Technology & Corporate Strategy Lead | Not Available | Not Available |


In [80]:

combined_results = "\n\n\n".join ([text for text in llm_results])


prompt_combine_result= rf"""
    You are summarising a set of result from LLM retrieval. The topic is the biography you like to write include {customer_name} early Life, Education, and Career
    Each result of the LLM is separated by three blank lines (\n\n\n).
    The documents were originally formatted in Markdown.and structured in the following manner 
    
    <RESULT STRUCTURE> 
    # Summary of document
    Place the summary of the document here
    
    # Summary of {customer_name}
    <Place a comprehensive summary of {customer_name} here>

    # Early Life       
    <place the {customer_name} early Life here in text>

    # Education 
    Output the Education background result in the following markdown format
    | School Name | Location | Course name | Degree type | Start Year | Graduation Year |
    |---|---|---|---|---|---|
    
    # Career 
    Output the career result in the following markdown format
    | Employer Name | Location | Job title | Start Year | End Year |
    |---|---|---|---|---|



    Task: 
    review the text between <RESULTS> and <\RESULTS>. 
    Combine the information for each section 
        - # Summary of {customer_name}
        - # Early Life  
        - # Education 
        - # Career 

    <RESULTS>
     {combined_results}
    <\RESULTS>

    contrainst:
        Ensure that # Summary of {customer_name} is comprehensive
        The output should retain the same structure in each section
        Do not review results that are "No information found!!"
        Do not include <RESULTS> or <\RESULTS> in the output
"""

result = llm.invoke(prompt_combine_result)

In [81]:
print (result.content)

# Summary of Tan Hooi Ling
Tan Hooi Ling is a co-founder of Grab Holdings Ltd, known for her instrumental role in leading various operations and technology teams within the company. She is stepping down from operational roles by the end of 2023 to transition into an advisory role.

# Early Life
- **Birthplace & Date:** Kuala Lumpur, Malaysia
- **Hometown:** Petaling Jaya
- **Siblings:** An older brother who is a software programmer

# Education
| School Name | Location | Course name | Degree type | Start Year | Graduation Year |
|---|---|---|---|---|---|
| University of Bath | Britain | Mechanical Engineering | Bachelor's | 2006 | Not Available |
| Harvard Business School | Not Available | Business | MBA | 2011 | Not Available |

# Career
| Employer Name | Location | Job title | Start Year | End Year |
|---|---|---|---|---|
| McKinsey & Company | Malaysia | Management Consultant | Not Available | Not Available |
| Grab Holdings Ltd | Singapore | Co-founder | 2012 | 2023 |


# Summary of Tan Hooi Ling
Tan Hooi Ling is a co-founder of Grab Holdings Ltd, known for her instrumental role in leading various operations and technology teams within the company. She is stepping down from operational roles by the end of 2023 to transition into an advisory role.

# Early Life
- **Birthplace & Date:** Kuala Lumpur, Malaysia
- **Hometown:** Petaling Jaya
- **Siblings:** An older brother who is a software programmer

# Education
| School Name | Location | Course name | Degree type | Start Year | Graduation Year |
|---|---|---|---|---|---|
| University of Bath | United Kingdom | Mechanical Engineering | Bachelor's | 2006 | Not Available |
| Harvard Business School | United States | Business Administration | Master's | Not Available | 2011 |

# Career
| Employer Name | Location | Job title | Start Year | End Year |
|---|---|---|---|---|
| McKinsey & Company | Malaysia | Management Consultant | Not Available | Not Available |
| Grab Holdings Ltd | Singapore | Co-founder | 2012 | 2023 |