In [133]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import pandas as pd
import cohere
import numpy as np
import nltk
import string
nltk.download('punkt')
import warnings
from IPython.display import display
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /home/student/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


##### First things first - APIs

In [134]:
with open("chohere_api_keys.txt") as f:
    COHERE_API_KEY = f.read().strip()
with open("pinecone_api_key.txt") as f:
    PINECONE_API_KEY = f.read().strip()

# Document Reading, Preprocessing and Chunking

In [135]:
from sentence_transformers import SentenceTransformer

EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL)

In [136]:
def clean_text(text):
    # Convert the text to lowercase
    text = text.lower()

    # Tokenize the text using NLTK
    tokens = nltk.word_tokenize(text)
    
    # Filter out tokens that are punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Rejoin the tokens into a single string
    cleaned_text = ' '.join(tokens)
    
    return cleaned_text

In [137]:
def chunking(records_text, sentences_per_segment, dataset):
    """
    The function creates a new dataset by breaking each paragraph into segments of 3 sentences with
    a stride of 1. In addition, it maps the meta data into the new dataset so the context could be
    retrieved.
    Args:
        records_text: List of the paragraphs
        sentences_per_segment: The size of the context window
        dataset: The original dataset

    Returns:
        new_records: A list of the "new paragraphs"
        meta_df: A DataFrame which contains the meta data of each "new paragraph"
    """

    new_records = []
    meta_df = pd.DataFrame(columns=dataset.column_names)

    for n, paragraph in enumerate(records_text):
        # Splitting each paragraph into it's sentences
        sentences = paragraph.replace('\n', '').split('. ')
        
        # Iterating over the paragraph and dividing it into segments while 
        # maintaining it's original meta data.
        for i in range(len(sentences)- sentences_per_segment + 1):
            new_segment = ''
            for j in range(sentences_per_segment):
             new_segment += sentences[i + j] + '. '
            meta_df.loc[len(meta_df)] = dataset[n]
            meta_df.loc[len(meta_df) - 1]['text'] = new_segment
            new_records.append(clean_text(new_segment[:-1]))
    
    return new_records, meta_df    

In [138]:
def load_and_embedd_dataset(
        dataset_name: str = 'aav-ds/Israel-HAMAS_war_news',
        split: str = 'train',
        model: SentenceTransformer = SentenceTransformer('all-MiniLM-L6-v2'),
        text_field: str = 'text',
        rec_num: int = 400
) -> tuple:
    """
    Load a dataset and embedd the text field using a sentence-transformer model
    Args:
        dataset_name: The name of the dataset to load
        split: The split of the dataset to load
        model: The model to use for embedding
        text_field: The field in the dataset that contains the text
        rec_num: The number of records to load and embedd
    Returns:
        tuple: A tuple containing the dataset and the embeddings
    """
    from datasets import load_dataset, Dataset

    
    print("Loading and embedding the dataset")
    
    # Load the dataset
    dataset = load_dataset(dataset_name, split=split)
    
    
    # Chunking each paragraph in the data into context windows of 3 and stride 1.
    chunked_dataset, meta_data = chunking(dataset[text_field][:rec_num], 3, dataset)
    
    # Embed the first `rec_num` rows of the dataset
    embeddings = model.encode(chunked_dataset)
    
    # Converting the pd.DataFrame object into Dataset one.
    ds = Dataset.from_pandas(meta_data)
    
    print("Done!")
    return ds, embeddings

In [139]:
DATASET_NAME = 'aav-ds/Israel-HAMAS_war_news'

dataset, embeddings = load_and_embedd_dataset(
    dataset_name=DATASET_NAME,
    rec_num=13103,
    model=model,
)
shape = embeddings.shape

Loading and embedding the dataset
Done!


Lets us look at the dataset and the embeddings' shape

In [140]:
pd_dataset = dataset.to_pandas()
pd_dataset.head(5)

Unnamed: 0,url,datetime,title,text,provider,source,__index_level_0__
0,https://www.bbc.com/news/live/world-middle-eas...,2023-11-18T18:05:00,A look at where things stand,Confusion over al-Shifa evacuation: The direct...,BBC,site-live-news,0
1,https://www.bbc.com/news/live/world-middle-eas...,2023-11-18T18:05:00,A look at where things stand,"Israel denied this, saying it had agreed to an...",BBC,site-live-news,1
2,https://www.bbc.com/news/live/world-middle-eas...,2023-11-18T18:05:00,A look at where things stand,(It has shown pictures of an alleged tunnel sh...,BBC,site-live-news,2
3,https://www.bbc.com/news/live/world-middle-eas...,2023-11-18T18:05:00,A look at where things stand,"Footage, which BBC Verify has analysed, shows ...",BBC,site-live-news,3
4,https://www.bbc.com/news/live/world-middle-eas...,2023-11-18T18:05:00,A look at where things stand,The United Arab Emirates (UAE) says it's plann...,BBC,site-live-news,4


In [141]:
print(f"The embeddings shape: {embeddings.shape}")

The embeddings shape: (17320, 384)


# Pinecone Vector Database

In [142]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
):
    """
    Create a pinecone index if it does not exist
    Args:
        index_name: The name of the index
        dimension: The dimension of the index
        metric: The metric to use for the index
    Returns:
        Pinecone: A pinecone object which can later be used for upserting vectors and connecting to VectorDBs
    """
    from pinecone import Pinecone, ServerlessSpec
    print("Creating a Pinecone index...")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            # Remember! It is crucial that the metric you will use in your VectorDB will also be a metric your embedding
            # model works well with!
            metric=metric,
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )
    print("Done!")
    return pc

In [143]:
INDEX_NAME = 'israel-news'

# Create the vector database
# We are passing the index_name and the size of our embeddings
pc = create_pinecone_index(INDEX_NAME, shape[1])

Creating a Pinecone index...


Done!


In [144]:
def upsert_vectors(
        index: Pinecone,
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str = 'text',
        batch_size: int = 128
):
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index object
        embeddings: The embeddings to upsert
        dataset: The dataset containing the metadata
        batch_size: The batch size to use for upserting
    Returns:
        An updated pinecone index
    """
    print("Upserting the embeddings to the Pinecone index...")
    shape = embeddings.shape
    
    ids = [str(i) for i in range(shape[0])]
    meta = [{text_field: text} for text in dataset[text_field]]
    
    # create list of (id, vector, metadata) tuples to be upserted
    to_upsert = list(zip(ids, embeddings, meta))
    
    for i in tqdm(range(0, shape[0], batch_size)):
        i_end = min(i + batch_size, shape[0])
        index.upsert(vectors=to_upsert[i:i_end])
    return index


In [145]:
# Upsert the embeddings to the Pinecone index
index = pc.Index(INDEX_NAME)
index_upserted = upsert_vectors(index, embeddings, dataset)

Upserting the embeddings to the Pinecone index...


  0%|          | 0/136 [00:00<?, ?it/s]

100%|██████████| 136/136 [00:48<00:00,  2.78it/s]


Let's view the index statistics!

In [146]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 18176}},
 'total_vector_count': 18176}

# RAG

In [147]:
def augment_prompt(
        query: str,
        model: SentenceTransformer = SentenceTransformer('all-MiniLM-L6-v2'),
        index=None,
) -> str:
    """
    Augment the prompt with the top 3 results from the knowledge base
    Args:
        query: The query to augment
        index: The vectorstore object
    Returns:
        str: The augmented prompt
    """
    results = [float(val) for val in list(model.encode(query))]
    
    # get top 3 results from knowledge base
    query_results = index.query(
        vector=results,
        top_k=3,
        include_values=True,
        include_metadata=True
    )['matches']
    text_matches = [match['metadata']['text'] for match in query_results]
    
    # get the text from the results
    source_knowledge = "\n\n".join(text_matches)
    
    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge

In [148]:
def get_answers_for_query(query, model, index):
    
    # Generating a response **WITHOUT** the context
    co = cohere.Client(api_key=COHERE_API_KEY)
    response1 = co.chat(
            model='command-r-plus',
            message=query,
        )
    not_rag_answer = response1.text

    # Generating a response **WITH** the context
    augmented_prompt, source_knowledge = augment_prompt(query, model=model, index=index)
    response2 = co.chat(
            model='command-r-plus',
            message=augmented_prompt,
        )
    rag_answer = response2.text
    
    return not_rag_answer, rag_answer, source_knowledge
    

In [149]:
def query_summ(query, model, index, print_source=False):
    """
    The function is written to run the query with/without RAG easily
    Args:
        query: The query
        model: The embedding model
        index: The vectorstore object
        print_source: A flag to determine wether print the source knowledge or not
    """
    not_rag, rag, knowledge = get_answers_for_query(query, model=model, index=index)
    print("Not RAG: " + not_rag)
    print("RAG: " + rag)
    if print_source:
        print("\nKnowledge: \n" + knowledge)

In [150]:
query = 'answer as shortly as possible when was the recent conflict between Israel and Gaza (as accurate date as possible)?'

query_summ(query, model=model, index=index, print_source=True)

Not RAG: May 2021
RAG: The recent conflict between Israel and Gaza took place in October 2023.

Knowledge: 
What started as a covert ethnic cleansing and mass dispossession of Palestinians in 1948 has evolved into an overt Israeli policy in 2023. The challenge, however, lies in the West's reluctance to acknowledge Israeli actions for what they are, as openly voiced by Israeli leaders."Israel’s" war on Gaza, withal, is not about October 7 as Western mainstream media would suggest. It’s about the 75-year-old war to empty Palestine of its indigenous population. 

It was not immediately clear if there were any impacts or interceptions. A four-day truce between Israel and Hamas is due to take effect Friday at 7 a.m. local time (midnight ET), after Qatar helped broker a deal to free hostages held in Gaza since October 7.. 

In this episode of UpFront, we look back at the history and context leading up to the current Israeli war on Gaza.Nearly two months after the October 7 attack by Hamas, I

In [162]:
query = 'answer as shortly as possible how many israeli people were killed by hamas in October 2023?'

query_summ(query, model=model, index=index, print_source=True)

Not RAG: None.
RAG: 1,400

Knowledge: 
The number of Palestinians killed in Gaza since Hamas attacked Israel on October 7 has surpassed 10,000, the Hamas-controlled Ministry of Health in Gaza announced Monday.Ashraf Al Qudra, spokesperson for the ministry, said 10,022 Palestinians in the enclave had been killed by Israeli strikes, including 4,104 children, 2,641 women and 611 elderly people. Those numbers suggest about three-quarters of the dead are from vulnerable populations.The ministry reported another 25,408 people have been injured.It's unclear how many combatants are included in the total. CNN cannot independently verify the numbers released by the ministry in Gaza, which is sealed off by Israel and mostly sealed by Egypt.Some background: Israel declared war on Hamas after the Islamist militant group launched a brutal attack on October 7, killing 1,400 in Israel and kidnapping more than 240. 

The number of Palestinians killed in Gaza since Hamas attacked Israel on October 7 has

In [152]:
query = 'answer as shortly as possible which countries/organizations currently fighting Israel?'

query_summ(query, model=model, index=index, print_source=True)

Not RAG: As of June 2024, there is no active war between Israel and any other country or organization. However, Israel has ongoing tensions and conflicts with neighboring territories, including Palestinian militant groups in Gaza, such as Hamas, and Hezbollah in Lebanon.
RAG: Hamas, Hezbollah, Iran, possibly also Houthis, and assorted Iraqi militias.

Knowledge: 
With a consistent gap, "Israel" has repeatedly gone to war with all its neighbors. Even in the last two decades, "Israel" cheered the war in Iraq in 2003. 2006, it went to war with Lebanon; in 2008-2009 and 2020, it attacked Gaza, and the list continues. 

If this Gaza war was like all the others, a ceasefire would probably have been in force by now.But now the fault lines that divide the Middle East are rumbling. For at least two decades, the most serious rift in the region has been between the friends and allies of Iran, and the friends and allies of the United States.The core of Iran's network is made up of Hezbollah in Leb

In [153]:
query = 'answer as shortly as possible why Israel attacked Hamas in 2023?'
query_summ(query, model=model, index=index, print_source=True)

Not RAG: The Israel-Hamas conflict in 2023 was triggered by the escape of six Palestinian prisoners from an Israeli high-security prison in September 2022, which led to a series of events, including Israeli arrest raids in the West Bank and Hamas rocket attacks on Israel.
RAG: Israel responded to Hamas' surprise attack on Israeli settlements and citizens.

Knowledge: 
But what drove Hamas to carry out such a daring and risky act? Did they not anticipate the harsh and brutal response from "Israel"? Did they not expect the unconditional support of the United States for "Israel"? To answer these questions, we need to go beyond the political games and look for the deeper motives and roots of this issue. The political roots of this issue go back to the creation of "Israel" in 1948 and the subsequent occupation of parts of Palestine, Syria, Jordan, and Lebanon. Since then, the world has witnessed a series of disputes over sovereignty, borders, security, religion, and identity that have fuele