<a href="https://colab.research.google.com/github/wjleece/semantic-RAG/blob/main/wjleece_LlamaIndex_RAG_Mistral_L2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
!pip install llama-index --quiet
!pip install llama-index-llms-mistralai --quiet
!pip install llama-index-embeddings-mistralai --quiet
!pip install langchain --quiet
!pip install langchain-community --quiet
!pip install faiss-gpu --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.8/176.8 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.0/78.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38

In [None]:
import faiss
import numpy as np
from google.colab import drive, userdata
from langchain.prompts import PromptTemplate
from llama_index.core import Settings, SimpleDirectoryReader
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.mistralai import MistralAIEmbedding
from llama_index.llms.mistralai import MistralAI

# Mount Google Drive; the source PDF is read from a path under /content/drive.
drive.mount('/content/drive')

# Fetch the Mistral API key via Colab's userdata helper instead of hardcoding it.
mistral_api_key = userdata.get('MISTRAL_API_KEY')

Mounted at /content/drive


In [None]:
!nvidia-smi

Wed Oct 23 15:56:24 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   38C    P8              12W /  72W |      1MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Select Desired Mistral LLM

In [None]:
# Configure the hosted Mixtral model and register it as the LlamaIndex default LLM.
llm = MistralAI(
    model="open-mixtral-8x7b",
    temperature=0.5,
    api_key=mistral_api_key,
)
Settings.llm = llm

# Smoke test: a single completion shows the key and model name are accepted.
resp = llm.complete("What are you and who trained you?")
print(resp)

I am a large language model trained by Mistral AI, a leading AI company based in Paris. I was trained on a wide variety of internet text to generate human-like responses to text-based queries. I don't have personal experiences or emotions, I simply generate responses based on patterns I've learned during my training.


# Select Desired Mistral Embedding Model

In [None]:
embed_model = MistralAIEmbedding(model_name="mistral-embed", api_key=mistral_api_key)

In [None]:
type(embed_model)

# Load Document

In [None]:
documents = SimpleDirectoryReader(input_files=["/content/drive/My Drive/AI/Datasets/Google-10-q/goog-10-q-q2-2024.pdf"]).load_data()

# Create Semantic Splitter & Create Nodes (Chunks)

In [None]:
# Create a semantic splitter (chunker). It uses `embed_model` to embed the text
# and decides chunk boundaries from how dissimilar neighbouring sentence groups are.
splitter = SemanticSplitterNodeParser(
    buffer_size=1,  # Number of sentences grouped together when evaluating similarity (per LlamaIndex docs)
    breakpoint_percentile_threshold=90,  # Dissimilarity percentile that must be exceeded to start a new chunk; lower => more chunks. Tunable.
    embed_model=embed_model
)

# Run semantic chunking over the loaded documents; the result is the list of nodes (chunks)
nodes = splitter.get_nodes_from_documents(documents)

In [None]:
type(nodes)

list

In [None]:
len(nodes)

136

# Create Document Embeddings

In [None]:
# Embed every chunk (one embedding call per node) and attach the vector to its node.
# The loop variable stays named `node` because later cells inspect it.
for node in nodes:
    chunk_text = node.get_text()
    node.embedding = embed_model.get_text_embedding(chunk_text)

In [None]:
type(node)

In [None]:
dir(node)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__class_vars__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__fields__',
 '__fields_set__',
 '__format__',
 '__ge__',
 '__get_pydantic_core_schema__',
 '__get_pydantic_json_schema__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__pretty__',
 '__private_attributes__',
 '__pydantic_complete__',
 '__pydantic_core_schema__',
 '__pydantic_custom_init__',
 '__pydantic_decorators__',
 '__pydantic_extra__',
 '__pydantic_fields_set__',
 '__pydantic_generic_metadata__',
 '__pydantic_init_subclass__',
 '__pydantic_parent_namespace__',
 '__pydantic_post_init__',
 '__pydantic_private__',
 '__pydantic_root_model__',
 '__pydantic_serializer__',
 '__pydantic_validator__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__repr_a

In [None]:
len(nodes[0].embedding)

1024

In [None]:
#sanity check
for node in nodes:
    print(node.text)

UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
________________________________________________________________________________________
FORM 10-Q  
________________________________________________________________________________________
(Mark One)
☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the quarterly period ended June 30, 2024
OR
☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the transition period from _______ to _______
Commission file number: 001-37580  
________________________________________________________________________________________
Alphabet Inc.  
(Exact name of registrant as specified in its charter)
________________________________________________________________________________________
Delaware 61-1767919
(State or other jurisdiction of incorporation or organization) (I.R.S. Employer Identification Number)
1600 Amphitheatre Parkway
Mountain View 

In [None]:
len(nodes)

136

# Create a Vector Store Index


In [None]:
len(node.embedding)

1024

In [None]:
# L2 Distance

# 1. Stack the per-node embeddings into a single float32 matrix (one row per node);
#    FAISS works on float32 arrays.
embeddings = np.array([node.embedding for node in nodes]).astype('float32')

print(f"Embeddings shape: {embeddings.shape}")

# 2. Create FAISS index
# Take the dimensionality from the matrix itself rather than from `node`, which is
# only defined here if an earlier cell's for-loop happened to leave it behind.
dimension = embeddings.shape[1]

# Option 1: flat (exhaustive) index using L2 (Euclidean) distance
index = faiss.IndexFlatL2(dimension)

# 3. Add vectors to the index
index.add(embeddings)

# 4. Store texts for later lookup; texts[i] corresponds to row i of `embeddings`
texts = [node.text for node in nodes]

# Verify everything is set up correctly
print(f"Total vectors in index: {index.ntotal}")

Embeddings shape: (136, 1024)
Total vectors in index: 136


In [None]:
#Cosine Similarity

# 1. Convert document embeddings to numpy array
#embeddings = np.array([node.embedding for node in nodes]).astype('float32')
#print(f"Embeddings shape: {embeddings.shape}")

# 2. Create FAISS index
#dimension = len(node.embedding)

# Option 2: Cosine similarity implementation
# Normalize vectors first
#normalized_embeddings = embeddings / np.linalg.norm(embeddings, axis=1)[:, np.newaxis]

# Create IP index
#index = faiss.IndexFlatIP(dimension)

# 3. Add vectors to the index - IMPORTANT: only add normalized embeddings once!
#index.add(normalized_embeddings)  # Don't add embeddings again after this

# 4. Store texts for lookup later
#texts = [node.text for node in nodes]

# Verify everything is set up correctly
# print(f"Total vectors in index: {index.ntotal}")

# Create a Search Function

In [None]:
# For Euclidean (L2) distance

def search(query, k=5):
    """Return up to ``k`` stored chunks nearest to ``query`` in the FAISS index.

    Relies on the notebook-level ``embed_model``, ``index`` and ``texts``.

    Args:
        query (str): Query text to embed and look up.
        k (int): Maximum number of neighbours to return (default 5).

    Returns:
        list[dict]: One ``{'text': ..., 'distance': ...}`` dict per hit, in the
        order the index returns them. Contains fewer than ``k`` entries when the
        index cannot supply ``k`` neighbours.
    """
    query_vector = embed_model.get_text_embedding(query)

    # The index is searched with a 2-D float32 batch, so wrap the single vector in a list.
    query_vector = np.array([query_vector]).astype('float32')

    # Search returns distances and indices, one row per query vector
    distances, indices = index.search(query_vector, k)

    results = []
    for distance, idx in zip(distances[0], indices[0]):
        # FAISS pads missing neighbours with index -1; without this guard
        # texts[-1] would silently return the last chunk as a bogus hit.
        if idx < 0:
            continue
        results.append({
            'text': texts[idx],
            'distance': float(distance)  # plain Python float, not a numpy scalar
        })
    return results

In [None]:
#for cosine similarity

#def search_index(query_embedding, k=5):
    # Normalize the query vector
 #   query_normalized = query_embedding / np.linalg.norm(query_embedding)

    # Reshape for FAISS
  #  query_normalized = query_normalized.reshape(1, -1).astype('float32')

    # Search
   # distances, indices = index.search(query_normalized, k)

    # Return results
    #results = [
     #   {
      #      'text': texts[idx],
       #     'score': float(score)  # Convert to Python float
     #   }
     #   for score, idx in zip(distances[0], indices[0])
   # ]

    # return results

# Testing: Return k Nodes to Create RAG Context





In [None]:
#Raw Search (L2)

question_rag = "What were cloud revenues in Q2 2024?"

context_rag = search(question_rag)

print(context_rag) #This returns k nodes - lots of text - so let's next pass that through an LLM via a RAG pipeline to summarize concisely

[{'text': "Google Cloud\nGoogle Cloud revenues increased  $2.3 billion  and $4.4 billion  from the three and six months ended June 30, \n2023  to the three and six months ended June 30, 2024 , respectively. The growth was primarily driven by Google \nCloud Platform followed by Google Workspace offerings. Google Cloud's  infrastructure and platform services  were \nthe largest drivers of growth in Google Cloud Platform.\nRevenues by Geography\nThe following table presents revenues by geography as a percentage of revenues, determined based on the \naddresses of our customers:\nThree Months Ended Six Months Ended \n June 30, June 30,\n 2023 2024 2023 2024\nUnited States  47 %  49 %  47 %  48 %\nEMEA  30 %  29 %  30 %  29 %\nAPAC  17 %  16 %  17 %  17 %\nOther Americas  6 %  6 %  6 %  6 %\nHedging gains (losses)  0 %  0 %  0 %  0 %\n38", 'distance': 0.2980365753173828}, {'text': 'Google Cloud\nGoogle Cloud revenues are comprised of the following:\n•Google Cloud Platform, which generates co

In [None]:
type(context_rag)

list

# Create a RAG Pipeline

In [None]:
#for L2

def generate_response(query: str, context_rag: list, llm) -> dict:
    """
    Generate a response using Mistral based on search results.

    Args:
        query (str): The original query
        context_rag (list): List of dictionaries with 'text' and 'distance' keys
        llm: LLM instance to use for generation; it must provide
            ``complete(prompt)`` returning an object with a ``.text`` attribute

    Returns:
        dict: Always ``{'response': str, 'sources': list}``.
            - On success, 'response' is the LLM's text and 'sources' repeats the
              'text' and 'distance' of every entry in ``context_rag``.
            - With an empty ``context_rag``, 'response' is a fixed
              "no relevant context" message and 'sources' is empty; the LLM is
              not called.
            - On any exception, 'response' is ``"Error: <message>"`` and
              'sources' is empty (best-effort: errors are reported, not raised).
    """
    try:
        # Extract the text field from each dictionary in context_rag
        context_texts = [doc['text'] for doc in context_rag]

        if not context_texts:
            # Same dict shape as every other path, so callers can always
            # index ['response'] / ['sources'] without a type check.
            return {
                'response': "No relevant context found to answer the question.",
                'sources': []
            }

        # Join the texts with blank lines between chunks
        context = "\n\n".join(context_texts)

        # Create prompt template
        prompt = PromptTemplate(template="""Context information is below:
           ---------------
            {context}
            ---------------
            Given the context information, provide a direct and concise answer to the question: {query}

            Requirements:
            - Focus only on information present in the context
            - If the answer isn't in the context, say "Information not found in context"
            - If the question is not related to the context, say "Question not related to context"
            - Include specific numbers and metrics when available

            Answer:""")

        # Format prompt with context and query
        formatted_prompt = prompt.format(
            context=context,
            query=query
        )

        # Get response from Mistral
        response = llm.complete(formatted_prompt)

        return {
            'response': response.text,
            'sources': [{'text': doc['text'], 'distance': doc['distance']}
                       for doc in context_rag]
        }

    except Exception as e:
        # Deliberately broad: report the failure in the response, do not raise.
        return {'response': f"Error: {str(e)}", 'sources': []}

# Answer Questions

In [None]:
question_rag = "What were cloud revenues in Q2 2024?"

context_rag = search(question_rag)

answer = generate_response(question_rag, context_rag, llm)

print(answer)


{'response': 'The cloud revenues (Google Cloud) for the second quarter of 2024 were $10,347 million. This information can be found in the "Disaggregated Revenues" table under the "Three Months Ended June 30, 2024" section.', 'sources': [{'text': "Google Cloud\nGoogle Cloud revenues increased  $2.3 billion  and $4.4 billion  from the three and six months ended June 30, \n2023  to the three and six months ended June 30, 2024 , respectively. The growth was primarily driven by Google \nCloud Platform followed by Google Workspace offerings. Google Cloud's  infrastructure and platform services  were \nthe largest drivers of growth in Google Cloud Platform.\nRevenues by Geography\nThe following table presents revenues by geography as a percentage of revenues, determined based on the \naddresses of our customers:\nThree Months Ended Six Months Ended \n June 30, June 30,\n 2023 2024 2023 2024\nUnited States  47 %  49 %  47 %  48 %\nEMEA  30 %  29 %  30 %  29 %\nAPAC  17 %  16 %  17 %  17 %\nOth

In [None]:
question_rag = "What were the main drivers of revenue growth in Q2?"

context_rag = search(question_rag)

answer = generate_response(question_rag, context_rag, llm)

print(answer)

{'response': 'The main drivers of revenue growth in Q2 were Google Cloud and Google Services. Google Cloud revenues increased by $2.3 billion for the three months ended June 30, ', 'sources': [{'text': 'For further \ndetails on our segments, see Note 13  of the Notes to Consolidated Financial Statements included in Item 1 of this \nQuarterly Report on Form 10-Q.\nRevenues and Monetization Metrics  \nWe generate revenues by delivering relevant, cost-effective online advertising; cloud-based solutions that \nprovide enterprise customers  of all sizes  with infrastructure and platform services as well as communication and \ncollaboration tools; sales of other products and services, such as fees received for subscription-based products, \napps and in-app purchases, and devices.  For additional information on how we recognize revenue, see Note 1 of \nthe Notes to Consolidated Financial Statements included in Part II, Item 8 in our Annual Report on Form 10-K for \nthe fiscal year ended Decem

In [None]:
question_rag = "What were YouTube revenues in Q2?"

context_rag = search(question_rag)

answer = generate_response(question_rag, context_rag, llm)

print(answer)


{'response': 'YouTube ads revenues for the second quarter (Q2) of the year were $8,663 million. This information can be found in the "Disaggregated Revenues" table provided in the context.', 'sources': [{'text': '•Repurchases of Class A and Class C shares wer e $3.3 billion  and $12.3 billion , respectively, totaling $15.6 \nbillion  for the three months ended June 30, 2024 . For additional information, see Note 9  of the Notes to \nConsolidated Financial Statements included in Item 1 of this Quarterly Report on Form 10-Q.\n•Operating cash flow was $26.6 billion  for the three months ended June 30, 2024 .\n•Capital expenditures, which primarily reflected investments in technical infrastructure, were $13.2 billion  for \nthe three months ended June 30, 2024 .\n•As of June 30, 2024 , we had 179,582  employees.\nFinancial Results\nRevenues\nThe following table presents revenues by type (in millions): \n Three Months Ended Six Months Ended \nJune 30, June 30,\n 2023 2024 2023 2024\nGoogle 

In [None]:
question_rag = "How much did YouTube ad revenues grow in Q2 in APAC?"

context_rag = search(question_rag)

answer = generate_response(question_rag, context_rag, llm)

print(answer)

{'response': 'Information not found in context. The context provided does not include specific revenue growth figures for YouTube ads in the APAC region for Q2.', 'sources': [{'text': '•Repurchases of Class A and Class C shares wer e $3.3 billion  and $12.3 billion , respectively, totaling $15.6 \nbillion  for the three months ended June 30, 2024 . For additional information, see Note 9  of the Notes to \nConsolidated Financial Statements included in Item 1 of this Quarterly Report on Form 10-Q.\n•Operating cash flow was $26.6 billion  for the three months ended June 30, 2024 .\n•Capital expenditures, which primarily reflected investments in technical infrastructure, were $13.2 billion  for \nthe three months ended June 30, 2024 .\n•As of June 30, 2024 , we had 179,582  employees.\nFinancial Results\nRevenues\nThe following table presents revenues by type (in millions): \n Three Months Ended Six Months Ended \nJune 30, June 30,\n 2023 2024 2023 2024\nGoogle Search & other $ 42,628 $ 48

In [None]:
question_rag = "What are some of the key Legal events?"

context_rag = search(question_rag)

answer = generate_response(question_rag, context_rag, llm)

print(answer)

{'response': "The key legal events mentioned in the context include:\n\n1. Antitrust lawsuits in the U.S.: The Department of Justice (DOJ) and several state Attorneys General filed a lawsuit in October 2020 alleging that Google violated U.S. antitrust laws relating to Search and Search advertising. The trial ended in November 2023, and a decision is expected in 2024.\n\n2. Investigation by the Australian Competition and Consumer Commission (ACCC) and the United Kingdom's Competition and Markets Authority (CMA): Both authorities opened an investigation into Google's Search distribution practices in June 2022.\n\n3. Antitrust complaint in the U.S.: A number of state Attorneys General filed an antitrust complaint in December 2020, alleging that Google violated U.S. antitrust laws and state deceptive trade laws relating to its advertising technology. A trial is scheduled for March 2025.\n\n4. Antitrust complaint by the DOJ and several state Attorneys General: In January 2023, the DOJ, alon

In [None]:
question_rag = "Can you summarize recent key antitrust matters?"

context_rag = search(question_rag)

answer = generate_response(question_rag, context_rag, llm)

print(answer)

{'response': "The European Commission (EC) has made significant antitrust decisions against Google. In 2017, Google was fined €2.4 billion ($2.7 billion) for infringing competition law regarding its shopping search results and ads. Google appealed this decision, but it was rejected by the General Court in November 2021, and Google subsequently appealed to the European Court of Justice. In 2018, Google was fined €4.3 billion ($5.1 billion) for certain provisions in its Android-related distribution agreements, a decision that Google also appealed. The General Court reduced the fine to €4.1 billion in September 2022, and Google has filed an appeal with the European Court of Justice.\n\nIn the US, the Department of Justice (DOJ) and several state Attorneys General filed a lawsuit in October 2020 alleging that Google violated U.S. antitrust laws relating to Search and Search advertising. The trial ended in November 2023, and a decision is expected in 2024. Additionally, state Attorneys Gene

In [None]:
question_rag ="which pages of this document discuss potential legal issues?"

context_rag = search(question_rag)

answer = generate_response(question_rag, context_rag, llm)

print(answer)

{'response': 'The potential legal issues are discussed on several pages of this document. The specific pages are as follows:\n\n1. Note 8 under the "Commitments and Contingencies - Legal Matters" section of the Notes to Consolidated Financial Statements (Part I, Item 1). This is mentioned in ITEM 1 of the context.\n2. The "Risk Factors" section, specifically the risks related to legal proceedings, litigation, and regulatory investigations. This is mentioned in ITEM 1A of the context.\n3. The "Legal Proceedings" section (ITEM 1) and the "Risk Factors" section (ITEM 1A) discuss specific legal issues and risks.\n\nAdditionally, the context includes information about ongoing investigations and lawsuits related to privacy, patent and intellectual property claims, and other legal matters.', 'sources': [{'text': "We continue \nto cooperate with federal and state regulators in the U.S., the EC, and other regulators around the world.\nPrivacy Matters\nWe are subject to a number of privacy-relat

In [None]:
question_rag ="Can you tell me the page numbers related to the above issues?"

context_rag = search(question_rag)

answer = generate_response(question_rag, context_rag, llm)

print(answer)

{'response': 'The requested information related to page numbers is not explicitly provided in the context. The context only refers to various items and notes in the Annual Report on Form 10-K for the year ended December 31, 2023 and the Quarterly Report on Form 10-Q for the quarter ended March 31, 2024, but it does not specify exact page numbers.', 'sources': [{'text': 'PART II.  ', 'distance': 0.5197727084159851}, {'text': 'OTHER INFORMATION\nITEM 1. LEGAL PROCEEDINGS\nFor a description of our material pending legal proceedings, see Note 8  “Commitments and Contingencies  - \nLegal Matters” of the Notes to Consolidated Financial Statements included in Part I, Item 1 of this Quarterly Report \non Form 10-Q, which is incorporated herein by reference.\nITEM 1A. RISK FACTORS     \nOur operations and financial results are subject to various risks and uncertainties, including those described in \nPart I, Item 1A, "Risk Factors" in our Annual Report on Form 10-K for the year ended December 3

In [None]:
question_rag ="Who is the CFO of Alphabet?"

context_rag = search(question_rag)

answer = generate_response(question_rag, context_rag, llm)

print(answer)

{'response': 'The CFO of Alphabet is Ruth M. Porat. This is stated in the context as "Ruth M. Porat, President and Chief Investment Officer; Chief Financial Officer, ALPHABET INC."', 'sources': [{'text': "July 23, 2024 By:/s/    RUTH M. PORAT        \nRuth M. Porat\nPresident and Chief Investment Officer; Chief Financial Officer\nALPHABET INC.\nJuly 23, 2024 By:/s/    AMIE THUENER O'TOOLE        \nAmie Thuener O'Toole\nVice President, Corporate Controller and Principal Accounting \nOfficer\n52", 'distance': 0.3597266972064972}, {'text': 'Alphabet Inc.\nNOTES TO CONSOLIDATED FINANCIAL STATEMENTS\n(Unaudited)\nNote 1.    Summary of Significant Accounting Policies  \nNature of Operations\nGoogle was incorporated in California in September 1998 and re-incorporated in the State of Delaware in \nAugust 2003. In 2015, we implemented a holding company reorganization, and as a result, Alphabet Inc. \n("Alphabet") became the successor issuer to Google.\nWe generate revenues by delivering relevan