In [220]:
!pip install datasets
!pip install pincone
!pip install cohere
!pip install pandas
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import cohere
import numpy as np
import warnings
import pandas as pd
from IPython.display import display
warnings.filterwarnings("ignore")



You should consider upgrading via the 'c:\users\haito\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.
ERROR: Could not find a version that satisfies the requirement pincone (from versions: none)
ERROR: No matching distribution found for pincone
You should consider upgrading via the 'c:\users\haito\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'c:\users\haito\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'c:\users\haito\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [239]:
# Read the API keys without echoing them into the saved notebook output.
# Environment variables take precedence so "Restart & Run All" can run unattended;
# otherwise the user is prompted and the typed value stays hidden.
from getpass import getpass

COHERE_API_KEY = os.environ.get("COHERE_API_KEY") or getpass("Please enter your Cohere API key: ")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Please enter your Pinecone API key: ")

In [222]:
from sentence_transformers import SentenceTransformer

# Checkpoint name of the sentence-transformers encoder.
EMBEDDING_MODEL = "all-MiniLM-L6-v2"

# Build the encoder once; later cells pass this same object to the
# dataset loader and to the prompt-augmentation helper.
model = SentenceTransformer(EMBEDDING_MODEL)

In [223]:
def load_and_embedd_dataset(
        directory_name,
        model: "SentenceTransformer" = None,
        text_field: str = 'textuals',
) -> tuple:
    """
    Load every usable CSV in a directory and embed one text column of the result
    Args:
        directory_name: The name of the directory in which the CSV files are stored
        model: The model to use for embedding (any object exposing `encode`).
            When omitted, an 'all-MiniLM-L6-v2' SentenceTransformer is created on
            first use instead of at function-definition time.
        text_field: The column of the assembled dataset that is embedded. The
            function always builds a 'textuals' column of the form
            "name (file): description"; any other value must name one of the
            kept columns ('name', 'description', 'file').
    Returns:
        tuple: A tuple containing the dataset (columns 'name', 'description',
        'file', 'textuals') and the embeddings returned by `model.encode`
    Raises:
        ValueError: If no CSV file with both 'name' and 'description' columns is found
    """

    print("Loading and embedding the dataset")

    # Build the default encoder lazily so merely defining this function stays cheap.
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # Collect one frame per CSV and concatenate once (concat inside the loop is quadratic).
    # The listing is sorted so row order, and therefore the ids derived from it
    # downstream, is reproducible across runs and platforms.
    frames = []
    for file in sorted(os.listdir(directory_name)):
        if not file.endswith(".csv"):
            continue
        frame = pd.read_csv(os.path.join(directory_name, file))
        # Files without both text columns cannot contribute; skip them.
        if 'name' not in frame.columns or 'description' not in frame.columns:
            continue
        # Remember which file (without its extension) each row came from.
        frames.append(
            frame[['name', 'description']].assign(file=os.path.splitext(file)[0])
        )

    if not frames:
        raise ValueError(
            f"No CSV files with 'name' and 'description' columns found in {directory_name!r}"
        )

    dataset = pd.concat(frames, ignore_index=True)
    dataset['textuals'] = dataset['name'] + ' (' + dataset['file'] + '): ' + dataset['description']

    # A missing name or description makes the concatenation NaN; drop those rows,
    # then drop rows whose text is only whitespace.
    dataset = dataset.dropna(subset=['textuals'])
    dataset = dataset[dataset['textuals'].str.strip().astype(bool)]

    # Re-number rows 0..n-1 so positions line up with the embedding rows.
    dataset = dataset.reset_index(drop=True)

    # Hand the encoder a plain list of strings rather than a pandas Series.
    embeddings = model.encode(dataset[text_field].tolist())

    print("Done!")
    return dataset, embeddings

In [224]:
# Folder that holds the CSV files. Built with os.path.join so the separator is
# correct on every OS; the old literal 'data\elden-ring-data' relied on the
# invalid escape sequence '\e' and only worked on Windows.
DIRECTORY_NAME = os.path.join('data', 'elden-ring-data')

dataset, embeddings = load_and_embedd_dataset(
    directory_name=DIRECTORY_NAME,
    model=model,
)
# Kept as a separate name because the index-creation cell reads shape[1].
shape = embeddings.shape

Loading and embedding the dataset
Done!


In [225]:
# Report the shape of the embedding matrix produced above.
print("The embeddings shape: {}".format(embeddings.shape))

The embeddings shape: (2273, 384)


In [226]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
        api_key: str = None,
        cloud: str = 'aws',
        region: str = 'us-east-1',
        wait_timeout: float = 120.0,
):
    """
    Create a serverless pinecone index if it does not exist and wait until it is ready
    Args:
        index_name: The name of the index
        dimension: The dimension of the index
        metric: The metric to use for the index
        api_key: The Pinecone API key; when omitted the notebook-level
            PINECONE_API_KEY is used
        cloud: The cloud provider passed to ServerlessSpec
        region: The cloud region passed to ServerlessSpec
        wait_timeout: Maximum number of seconds to wait for the index to report ready
    Returns:
        Pinecone: A pinecone object which can later be used for upserting vectors and connecting to VectorDBs
    Raises:
        TimeoutError: If the index does not report ready within `wait_timeout` seconds
    """
    import time
    from pinecone import Pinecone, ServerlessSpec
    print("Creating a Pinecone index...")
    pc = Pinecone(api_key=PINECONE_API_KEY if api_key is None else api_key)
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            # Remember! It is crucial that the metric you will use in your VectorDB will also be a metric your embedding
            # model works well with!
            metric=metric,
            spec=ServerlessSpec(
                cloud=cloud,
                region=region
            )
        )

    # Index creation is asynchronous: poll until the index reports ready so the
    # caller can upsert immediately. Bounded so a failed initialization cannot hang the notebook.
    deadline = time.monotonic() + wait_timeout
    while not pc.describe_index(index_name).status['ready']:
        if time.monotonic() > deadline:
            raise TimeoutError(
                f"Pinecone index {index_name!r} was not ready after {wait_timeout} seconds"
            )
        time.sleep(1)

    print("Done!")
    return pc

In [227]:
# Name of the Pinecone index used throughout the notebook.
INDEX_NAME = "tomershirshelly"

# Set up the vector database.
# The index is sized to the width of the embeddings (second entry of their shape).
pc = create_pinecone_index(index_name=INDEX_NAME, dimension=shape[1])

Creating a Pinecone index...
Done!


In [228]:
def upsert_vectors(
        index,
        embeddings: np.ndarray,
        dataset: pd.DataFrame,
        text_field: str = 'textuals',
        batch_size: int = 128
):
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index object (as returned by `pc.Index(name)`); only
            its `upsert(vectors=...)` method is used
        embeddings: The embeddings to upsert, one row per record
        dataset: The dataset containing the metadata; `dataset[text_field]` must
            yield one text per embedding row, in the same order
        text_field: The dataset column stored as metadata (under the same key)
        batch_size: The batch size to use for upserting
    Returns:
        An updated pinecone index
    Raises:
        ValueError: If `batch_size` is not positive, or the number of texts
            differs from the number of embedding rows
    """
    print("Upserting the embeddings to the Pinecone index...")
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    texts = list(dataset[text_field])
    total = embeddings.shape[0]
    # zip() would silently drop the surplus rows; fail loudly instead so vectors
    # never end up paired with the wrong text.
    if len(texts) != total:
        raise ValueError(
            f"Got {total} embeddings but {len(texts)} '{text_field}' entries; they must match"
        )

    for start in tqdm(range(0, total, batch_size)):
        stop = min(start + batch_size, total)
        # (id, vector, metadata) tuples; ids are the row positions as strings and
        # vectors are converted to plain lists of Python floats.
        batch = [
            (str(row), embeddings[row].tolist(), {text_field: texts[row]})
            for row in range(start, stop)
        ]
        index.upsert(vectors=batch)
    return index


In [229]:
# Open a handle to the index, show what is about to be sent, then push the
# embeddings (with their texts as metadata) into Pinecone.
index = pc.Index(INDEX_NAME)
print(index, embeddings.shape, dataset.shape)
index_upserted = upsert_vectors(index=index, embeddings=embeddings, dataset=dataset)

<pinecone.data.index.Index object at 0x0000015F8425B5E0> (2273, 384) (2273, 4)
Upserting the embeddings to the Pinecone index...


100%|██████████| 18/18 [00:16<00:00,  1.08it/s]


In [238]:
import cohere

# Baseline: ask the LLM directly, without any retrieved context.
queries = ["In Elden Ring, What is the starting class that has a club?", "What is a Deathbird?", "Who is the Black Blade?", "Who is able to mimic the abilities of the player?"]

# One client serves every request; creating it inside the loop rebuilt it per query.
# The name `co` is kept because the retrieval-augmented cell further down uses it too.
co = cohere.Client(api_key=COHERE_API_KEY)

for query in queries:
    response = co.chat(
            model='command-r-plus',
            message=query,
        )
    print('Query: ',query)
    print('\nAnswer: ',response.text)
    print('-------------------')

Query:  In Elden Ring, What is the starting class that has a club?

Answer:  The starting class that begins with a club in Elden Ring is the Vagabond.
-------------------
Query:  What is a Deathbird?

Answer:  A Deathbird is a fictional character in the Marvel Universe. She is a powerful and dangerous mutant with the ability to generate powerful energy blasts and manipulate darkforce energy. She is known for her bird-like appearance, with feathered wings and a bird-like head, and is usually depicted as an antagonist to the X-Men and other superheroes in the Marvel Universe.

Deathbird, whose real name is Cal'syee Neramani, is a member of the Shi'ar Empire, an alien civilization in the Marvel Universe. She is the sister of Lilandra, the Empress of the Shi'ar, and has often clashed with the X-Men and other superheroes due to her desire for power and control. She has served as both a villain and occasionally as an anti-hero, and has been a member of teams such as the Brotherhood of Mutant

In [235]:
def augment_prompt(
        query: str,
        model: "SentenceTransformer" = None,
        index=None,
        top_k: int = 3,
        text_field: str = 'textuals',
) -> tuple:
    """
    Augment the prompt with the top results from the knowledge base
    Args:
        query: The query to augment
        model: The model used to embed the query (any object exposing `encode`).
            When omitted, an 'all-MiniLM-L6-v2' SentenceTransformer is created on
            first use instead of at function-definition time.
        index: The vectorstore object; required, must expose `query(...)`
        top_k: The number of matches to retrieve
        text_field: The metadata key under which each match stores its text
    Returns:
        tuple: (augmented prompt, source knowledge), where the source knowledge is
        the retrieved texts joined by blank lines
    Raises:
        ValueError: If `index` is not provided
    """
    if index is None:
        raise ValueError("augment_prompt requires a vector index; pass index=<pinecone index>")

    # Build the default encoder lazily so merely defining this function stays cheap.
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # The index expects a plain list of Python floats, not a numpy array.
    query_vector = np.asarray(model.encode(query), dtype=float).tolist()

    # get top results from knowledge base; only the metadata is read below, so
    # the stored vectors themselves are not requested.
    query_results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True
    )['matches']
    text_matches = [match['metadata'][text_field] for match in query_results]

    # get the text from the results
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query. The contexts are in the format of 'name of entity (type of entity, i.e source file): description of entity'.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge

In [236]:
# Questions answered with retrieved context prepended to the prompt.
queries = [
    "What is the starting class that has a club?",
    "What is a Deathbird?",
    "Who is the Black Blade?",
    "Who is able to mimic the abilities of the player?",
]

for query in queries:
    # Retrieve supporting passages and wrap them, with the question, into one prompt.
    augmented_prompt, source_knowledge = augment_prompt(query, model=model, index=index)
    response = co.chat(model='command-r-plus', message=augmented_prompt)
    # Show the question, the model's answer, and the passages it was given.
    print('Query: ', query)
    print('\nAnswer: ', response.text)
    print('\nSources: ', source_knowledge)
    print('-------------------')

Query:  What is the starting class that has a club?

Answer:  Wretch.

Sources:  Wretch (classes): A poor purposeless osd naked as they day they were born. A nice club is all they have

Club (weapons): A thick, solid lump of wood. Wielding this striking weapon requires no skill. A simple, primitive weapon that requires only brute strength and persistence to hammer your foe into the ground.

Large Club (weapons): A thick, solid lump of wood. Wielding this striking weapon requires no skill. A simple, primitive weapon that requires only brute strength to swing about.
-------------------
Query:  What is a Deathbird?

Answer:  A Deathbird is a bipedal bird with arms and a bird skull for a head. It is found in various locations and carries a barbed rod called Death's Poker. They are fire keepers of graveyards and are known to rake out the ashes of the dead from their kilns.

Sources:  Deathbird (bosses): A bipedal bird with arms and a bird skull for a head and is found in various locations

