## Imports

In [None]:
!pip install cohere
!pip install sentence_transformers
!pip install datasets
!pip install pinecone

In [2]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import cohere
import numpy as np
import warnings
from IPython.display import display
warnings.filterwarnings("ignore")

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Name of the sentence-transformers checkpoint used for every embedding in this notebook.
EMBEDDING_MODEL = "all-MiniLM-L6-v2"

# Shared embedding model instance, reused by the cells below.
model = SentenceTransformer(model_name_or_path=EMBEDDING_MODEL)

In [4]:
def _read_api_key(path):
    """Return the contents of the text file at `path` with surrounding whitespace removed."""
    with open(path) as key_file:
        return key_file.read().strip()


# Each secret lives in its own plain-text file next to the notebook.
# NOTE: the Cohere file name is spelled "chohere" — it must match the file on disk.
COHERE_API_KEY = _read_api_key("chohere_api_key.txt")
PINECONE_API_KEY = _read_api_key("pinecone_api_key.txt")

## First Element - Embedding Model

In [6]:
def load_and_embedd_dataset(
        dataset_name: str = 'fancyzhx/dbpedia_14',
        split: str = 'train',
        model: "SentenceTransformer | None" = None,
        text_field: str = 'content',
        rec_num: int = 400
) -> tuple:
    """
    Load a dataset and embed the first `rec_num` values of its text field
    using a sentence-transformer model.
    Args:
        dataset_name: The name of the dataset to load
        split: The split of the dataset to load
        model: The model to use for embedding. When omitted, an
            'all-MiniLM-L6-v2' SentenceTransformer is created on first use
            (not when this function is defined).
        text_field: The field in the dataset that contains the text
        rec_num: The number of records to load and embedd
    Returns:
        tuple: `(dataset, embeddings)` - the complete loaded split (it is NOT
            truncated to `rec_num`) and the result of `model.encode` for the
            first `rec_num` texts.
    """
    print("Loading and embedding the dataset")

    # Build the default model lazily: a default argument would be evaluated
    # when the `def` runs and load a model even if the caller supplies one.
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # Load the dataset
    dataset = load_dataset(dataset_name, split=split)

    # Slice the rows first and pick the column second, so only `rec_num`
    # records are read instead of the whole text column.
    texts = dataset[:rec_num][text_field]

    # Embed the first `rec_num` rows of the dataset
    embeddings = model.encode(texts)

    print("Done!")
    return dataset, embeddings

Source of the dataset: https://huggingface.co/datasets/fancyzhx/dbpedia_14?row=16

In [7]:
# Hugging Face identifier of the dataset used as the knowledge base.
DATASET_NAME = "fancyzhx/dbpedia_14"

# Embed only the first 40 records with the shared embedding model.
dataset, embeddings = load_and_embedd_dataset(dataset_name=DATASET_NAME, model=model, rec_num=40)

# Kept for later cells: the second entry is used as the vector index dimension.
shape = embeddings.shape

Loading and embedding the dataset


Downloading readme:   0%|          | 0.00/7.64k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/106M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/560000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/70000 [00:00<?, ? examples/s]

Done!


In [8]:
# Convert the loaded split to a pandas DataFrame and preview its first five rows
# (five is the default row count of `DataFrame.head`).
pd_dataset = dataset.to_pandas()
pd_dataset.head()

Unnamed: 0,label,title,content
0,0,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a Br...
1,0,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...
2,0,Q-workshop,Q-workshop is a Polish company located in Poz...
3,0,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...
4,0,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...


In [9]:
# Sanity check: one row per embedded record; the second value is the embedding
# dimension that is later passed to the vector index as its dimension.
print(f"The embeddings shape: {embeddings.shape}")

The embeddings shape: (40, 384)


## Second Element - Vector Database
We will use Pinecone's free-to-use vectorDB

In [10]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
        cloud: str = 'aws',
        region: str = 'us-east-1',
) -> "Pinecone":
    """
    Create a serverless pinecone index if it does not exist
    Args:
        index_name: The name of the index
        dimension: The dimension of the index
        metric: The metric to use for the index
        cloud: The cloud provider passed to the serverless spec (used only when the index is created)
        region: The cloud region passed to the serverless spec (used only when the index is created)
    Returns:
        Pinecone: A pinecone object which can later be used for upserting vectors and connecting to VectorDBs
    Notes:
        Reads the API key from the notebook-level `PINECONE_API_KEY` variable.
        If an index called `index_name` already exists it is left untouched,
        so `dimension`, `metric`, `cloud` and `region` are ignored in that case.
    """
    print("Creating a Pinecone index...")
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Only create the index when no index with this name is listed for the account.
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            # Remember! It is crucial that the metric you will use in your VectorDB will also be a metric your embedding
            # model works well with!
            metric=metric,
            spec=ServerlessSpec(
                cloud=cloud,
                region=region
            )
        )
    print("Done!")
    return pc

In [11]:
# Name of the Pinecone index that stores the embedded records.
INDEX_NAME = "db-pedia"

# Create the vector database (a no-op if the index already exists).
# The index dimension must equal the embedding size, i.e. the second entry of `shape`.
pc = create_pinecone_index(index_name=INDEX_NAME, dimension=shape[1])

Creating a Pinecone index...
Done!


In [12]:
def upsert_vectors(
        index: Pinecone,
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str = 'content',
        batch_size: int = 128
):
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index object (anything exposing `upsert(vectors=...)`,
            e.g. the object returned by `pc.Index(name)`)
        embeddings: The embeddings to upsert, one row per record
        dataset: The dataset containing the metadata; `dataset[text_field][i]`
            must be the text that produced `embeddings[i]`
        text_field: The dataset field whose text is stored as metadata, under
            the same key
        batch_size: The batch size to use for upserting
    Returns:
        An updated pinecone index
    Raises:
        ValueError: If `batch_size` is not a positive integer
    """
    print("Upserting the embeddings to the Pinecone index...")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = dataset[text_field]

    # The dataset may hold far more rows than were embedded; only pair up the
    # records that have both an embedding and a text.
    n_records = min(embeddings.shape[0], len(texts))

    for start in tqdm(range(0, n_records, batch_size)):
        stop = min(start + batch_size, n_records)
        # Build (id, vector, metadata) tuples for this batch only, instead of
        # materialising metadata for every row of the dataset up front.
        batch = [
            (str(row), embeddings[row].tolist(), {text_field: texts[row]})
            for row in range(start, stop)
        ]
        index.upsert(vectors=batch)
    return index


In [13]:
# Open a handle to the index created above ...
index = pc.Index(INDEX_NAME)

# ... and store the embeddings in it, together with their source text as metadata.
index_upserted = upsert_vectors(index=index, embeddings=embeddings, dataset=dataset)

Upserting the embeddings to the Pinecone index...


100%|██████████| 1/1 [00:00<00:00,  1.18it/s]


## Third Element - LLM
We will use [Cohere's chat API](https://cohere.com/chat)

In [14]:
def LLM_query(query: str, llm_model: str = 'command-r-plus'):
    """
    Query the Cohere chat API and print the response.

    Args:
        query (str): The query string to send to the Cohere API.
        llm_model (str): The Cohere chat model to use. Defaults to 'command-r-plus'.

    Returns:
        None
            The function prints the response directly.

    Notes:
        Reads the API key from the notebook-level `COHERE_API_KEY` variable.
    """
    co = cohere.Client(api_key=COHERE_API_KEY)
    response = co.chat(
        model=llm_model,
        message=query,
    )
    print(f'Response without source knowledge: {response.text}')

In [15]:
def augment_prompt(
        query: str,
        model: "SentenceTransformer | None" = None,
        index=None,
        top_k: int = 3,
        text_field: str = 'content',
) -> tuple:
    """
    Augment the prompt with the top results from the knowledge base
    Args:
        query: The query to augment
        model: The model used to embed the query. When omitted, an
            'all-MiniLM-L6-v2' SentenceTransformer is created on first use
            (not when this function is defined).
        index: The vectorstore object; must expose `query(...)` and is required
        top_k: The number of matches to retrieve from the knowledge base
        text_field: The metadata key that holds the text of each match
    Returns:
        tuple: `(augmented_prompt, source_knowledge)` - the prompt to send to
            the LLM and the retrieved texts joined by blank lines
    Raises:
        ValueError: If `index` is not provided
    """
    if index is None:
        raise ValueError("augment_prompt requires a vector index to query; pass `index=`")

    # Build the default model lazily: a default argument would be evaluated
    # when the `def` runs and load a model even if the caller supplies one.
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # The query vector is sent as plain Python floats.
    query_vector = [float(value) for value in model.encode(query)]

    # get the top matches from the knowledge base; only the metadata is used
    # below, so the stored vectors themselves are not requested
    query_results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True
    )['matches']

    # get the text from the results
    text_matches = [match['metadata'][text_field] for match in query_results]
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge

In [16]:
def LLM_query_with_source(query: str, llm_model: str = 'command-r-plus'):
    """
    Query the Cohere chat API with an augmented prompt that includes source knowledge.

    Args:
        query (str): The query string to augment and send to the Cohere API.
        llm_model (str): The Cohere chat model to use. Defaults to 'command-r-plus'.

    Returns:
        None
            The function prints the response with source knowledge and the source knowledge itself.

    Notes:
        Relies on notebook-level variables: `model` (the embedding model),
        `index` (the vector index to search) and `COHERE_API_KEY`. The cells
        defining them must have been run before calling this function.
    """
    # Retrieve the supporting texts and wrap the query in the augmented prompt.
    augmented_prompt, source_knowledge = augment_prompt(query, model=model, index=index)

    co = cohere.Client(api_key=COHERE_API_KEY)
    response = co.chat(
        model=llm_model,
        message=augmented_prompt,
    )
    print(f'Response with source knowledge: {response.text}')

    print('Source:')
    print(source_knowledge)

# Q&A Before and After Source Knowledge

In [17]:
# Ask the same question without and then with retrieved source knowledge.
q1 = "In what year did Abbott of Farnham E D Abbott Limited close its operations?"
for answer_fn in (LLM_query, LLM_query_with_source):
    answer_fn(q1)

Response without source knowledge: 1991
Response with source knowledge: 1972
Source:
 Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972.

 Donald Healey Motor Company Limited was a British car manufacturer.

 Witcomb Cycles formerly known as Witcomb Lightweight Cycles is the trading name of the Witcomb Trading Company. It was a British company based in Deptford South London specialising in custom handmade steel bicycle frames. The company was founded in 1949 by Ernie Witcomb and his wife Lily. The London shop closed in May 2009.


In [18]:
# Ask the same question without and then with retrieved source knowledge.
q2 = "When did Q-workshop establish its official website and online store?"
for answer_fn in (LLM_query, LLM_query_with_source):
    answer_fn(q2)

Response without source knowledge: Q-workshop established its official website and online store in 2012.
Response with source knowledge: Q-workshop established its official website and online store in 2005.
Source:
 Q-workshop is a Polish company located in Poznań that specializes in designand production of polyhedral dice and dice accessories for use in various games (role-playing gamesboard games and tabletop wargames). They also run an online retail store and maintainan active forum community.Q-workshop was established in 2001 by Patryk Strzelewicz – a student from Poznań. Initiallythe company sold its products via online auction services but in 2005 a website and online store wereestablished.

 I-innovate (UK) is a London-based independent record label that diversified from video production into music management from 2009. I-innovate was founded by Najero Okenabirhie in 2008. I-innovate work with freelance directors marketers and artists in music and graphic design providing ad hoc

In [19]:
# Ask the same question without and then with retrieved source knowledge.
q3 = "In what year did The Unsigned Guide transition from a printed directory to an online-only resource?"
for answer_fn in (LLM_query, LLM_query_with_source):
    answer_fn(q3)

Response without source knowledge: 2013
Response with source knowledge: The Unsigned Guide transitioned from a printed directory to an online-only resource in 2011.
Source:
 The Unsigned Guide is an online contacts directory and careers guide for the UK music industry. Founded in 2003 and first published as a printed directory The Unsigned Guide became an online only resource in November 2011.

 SCAN Health Plan (SCAN) is a not-for-profit health plan founded in 1977 and based in Long Beach California. The organization serves more than 110000 people with Medicare in Kern Los Angeles Orange Riverside San Bernardino San Diego and Ventura counties California and Maricopa county Arizona. The company also offers a health plan for Medicare and Medicaid-eligible individuals as part of the state’s long term care program in Maricopa county.

 Cavity Search Records is a record label based in Portland Oregon formed in 1992 by Christopher Cooper and Denny Swofford. It is known for producing debut r