In [10]:
import os
import sys
import openai

from dotenv import load_dotenv
from os import environ

# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio

# Allow llama-parse's async code to run inside the notebook's event loop.
nest_asyncio.apply()

# Load environment variables (python-dotenv reads them from a .env file).
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
llama_cloud_api_key = os.getenv("LLAMA_CLOUD_API_KEY")

# Never print the secret values themselves: cell outputs are saved with the
# notebook and leak as soon as it is committed or shared. Report only whether
# each key was found.
print("OPENAI_API_KEY:", "set" if openai_api_key else "MISSING")
print("LLAMA_CLOUD_API_KEY:", "set" if llama_cloud_api_key else "MISSING")

[REDACTED: OpenAI API key - rotate this credential]
[REDACTED: LlamaCloud API key - rotate this credential]


In [11]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings

# Models used by the notebook: gpt-4o for generation and
# text-embedding-3-large for embeddings.
llm = OpenAI(model="gpt-4o")
embed_model = OpenAIEmbedding(model="text-embedding-3-large")

# Register both on llama_index's shared Settings object.
Settings.embed_model = embed_model
Settings.llm = llm

In [13]:
from llama_parse import LlamaParse

# Path of the PDF to parse
file_path = "./data/wp_McKinsey - Exploring Generative AI for Business Success.pdf"

# Build a LlamaParse instance that returns markdown, then load the file
markdown_parser = LlamaParse(result_type="markdown")
documents = markdown_parser.load_data(file_path)

Started parsing the file under job_id cac11eca-48ea-47ac-9f1a-795b3d978d41


In [14]:
from copy import deepcopy
from llama_index.core.schema import TextNode
from llama_index.core import VectorStoreIndex


def get_page_nodes(docs, separator="\n---\n"):
    """Split every document into per-page nodes.

    Each document's ``text`` is split on ``separator``; every resulting chunk
    becomes one ``TextNode`` carrying its own deep copy of the source
    document's ``metadata``.

    Args:
        docs: Iterable of objects exposing ``text`` and ``metadata``.
        separator: String on which each document's text is split.

    Returns:
        A flat list of ``TextNode`` objects, in document order and then
        chunk order.
    """
    return [
        TextNode(text=page_text, metadata=deepcopy(doc.metadata))
        for doc in docs
        for page_text in doc.text.split(separator)
    ]

In [15]:
# One TextNode per separator-delimited chunk of the LlamaParse documents.
page_nodes = get_page_nodes(docs=documents)

In [18]:
from llama_index.core.node_parser import MarkdownElementNodeParser

# Reuse the `llm` instance created for Settings earlier in the notebook rather
# than constructing a second OpenAI(model="gpt-4o"); the model choice then
# lives in one place and cannot drift between cells.
node_parser = MarkdownElementNodeParser(llm=llm, num_workers=8)

In [19]:
# Run the markdown element parser over the LlamaParse documents.
nodes = node_parser.get_nodes_from_documents(documents)

# Split the parsed nodes into base nodes and "objects". Judging by the cell
# output, objects carry table summaries - TODO confirm against llama_index docs.
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

# Bare last expression: shows the first object's content as the cell result.
# Assumes the parser produced at least one object.
objects[0].get_content()

23it [00:00, 23018.13it/s]
  self._core_bpe = _tiktoken.CoreBPE(mergeable_ranks, special_tokens, pat_str)
100%|██████████| 23/23 [00:18<00:00,  1.27it/s]


'**Table Title/Caption:** Evolution of GenAI Considerations\n\n**Table ID:** Not provided\n\n**Summary:** This table outlines the progression of questions and considerations regarding Generative AI (GenAI) from understanding its basics and distinguishing between hype and reality, to practical steps for implementation, governance, partnerships, risk management, talent, and technology implications.\n\n**Keep Table:** Yes,\nwith the following columns:\n'

In [20]:
# Build one vector index over everything: the parser's base nodes, the
# indexed table objects, and the raw per-page text nodes.
recursive_index = VectorStoreIndex(
    nodes=[*base_nodes, *objects, *page_nodes],
)

In [24]:
# Spot-check: print the content of the page node at index 1.
print(page_nodes[1].get_content())

What a difference a few months makes!

|From a few months ago...|... to today|
|---|---|
|What is GenAI? What it is not?|Where and how should we start with GenAI?|
|Is it hype or reality?|How do we organize and govern GenAI?|
| |Which player(s) should we partner with?|
| |How do we balance risk and value creation?|
| |What are the talent and tech stack implications?|
| |How do we get going and learn fast?|

QuantumBlack, AI by McKinsey


In [26]:
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

# Reranker that keeps the top 5 nodes, using the BAAI/bge-reranker-large model.
reranker = FlagEmbeddingReranker(model="BAAI/bge-reranker-large", top_n=5)

# Query engine over the combined index; retrieved nodes pass through the
# reranker before answering.
recursive_query_engine = recursive_index.as_query_engine(
    node_postprocessors=[reranker],
    similarity_top_k=5,
    verbose=True,
)

  subpatternappend((LITERAL, _ord(this)))


tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [27]:
# Sanity check: number of nodes returned by the markdown element parser.
print(len(nodes))

64


In [33]:
from llama_index.core import SimpleDirectoryReader

# Baseline pipeline: load the same PDF with SimpleDirectoryReader so both
# query engines are compared on one document. Reuse `file_path` (defined where
# the PDF is sent to LlamaParse) instead of a second hard-coded path; the old
# literal omitted the ./data/ prefix and could resolve to a different file or
# to none at all.
reader = SimpleDirectoryReader(input_files=[file_path])
base_docs = reader.load_data()
raw_index = VectorStoreIndex.from_documents(base_docs)

# Same retrieval depth and reranker as the recursive engine.
raw_query_engine = raw_index.as_query_engine(
    similarity_top_k=5, node_postprocessors=[reranker]
)

could not convert string to float: '0.000000000-5820766' : FloatObject (b'0.000000000-5820766') invalid; use 0.0 instead
could not convert string to float: '0.000000000-5820766' : FloatObject (b'0.000000000-5820766') invalid; use 0.0 instead
could not convert string to float: '0.000000000-5820766' : FloatObject (b'0.000000000-5820766') invalid; use 0.0 instead
could not convert string to float: '0.000000000-5820766' : FloatObject (b'0.000000000-5820766') invalid; use 0.0 instead


In [35]:
query = "Examples of how companies have started incorporating GenAI. Please give examples from the document"

# Baseline engine first; its banner and answer print before the recursive
# engine is queried.
response_1 = raw_query_engine.query(query)
print("\n***********Basic Query Engine***********", response_1, sep="\n")

# LlamaParse + recursive retriever engine.
response_2 = recursive_query_engine.query(query)
print(
    "\n***********New LlamaParse+ Recursive Retriever Query Engine***********",
    response_2,
    sep="\n",
)


***********Basic Query Engine***********
Examples of how companies have started incorporating Generative AI include:

1. **Morgan Stanley**: Building an AI assistant using GPT-4 to help wealth managers quickly find and synthesize answers from a massive internal knowledge base.
2. **COVU**: Developing a GenAI solution to manage insurance operations, customer service, and cross-selling.
3. **OTP Bank**: Generated a Hungarian large language model to enable over 30 banking use cases, focusing on customer interactions, fraud detection, and cybersecurity.
4. **Stripe**: Leveraging Generative AI to improve Q&A search of its documentation database.
5. **Insilico Medicine**: Developed a GenAI model to predict clinical trial success rates with over 80% accuracy.
6. **AstraZeneca**: Accelerating drug discovery by training AI models on the "grammar" of biochemistry and digital pathology images to help generate new molecules.
7. **Jasper**: Using GPT-3 to generate marketing content such as copywri

In [36]:
query = "How can GenAI transform the business to interact with the technology?"

# Baseline engine first; its banner and answer print before the recursive
# engine is queried.
response_1 = raw_query_engine.query(query)
print("\n***********Basic Query Engine***********", response_1, sep="\n")

# LlamaParse + recursive retriever engine.
response_2 = recursive_query_engine.query(query)
print(
    "\n***********New LlamaParse+ Recursive Retriever Query Engine***********",
    response_2,
    sep="\n",
)


***********Basic Query Engine***********
GenAI can transform business interactions with technology by enabling natural language interfaces that allow users to write or speak in natural language to perform tasks automatically and achieve better outcomes. This shift from traditional command prompts and graphical user interfaces to AI-enabled natural language interfaces can streamline operations, enhance user experience, and improve efficiency.

***********New LlamaParse+ Recursive Retriever Query Engine***********
GenAI can transform business interactions with technology by enabling natural language interfaces that allow users to perform tasks automatically and more efficiently. This transformation includes the ability to write or speak in natural language to execute tasks, improving outcomes compared to traditional command prompts or graphical user interfaces. Additionally, businesses can leverage GenAI to enhance customer service, streamline operations, and generate new content, such 

In [37]:
query = "can you list the industries that will be impacted by GenAI? Maybe you can rank them by the level of impact in a tabular format?"

# Baseline engine first; its banner and answer print before the recursive
# engine is queried.
response_1 = raw_query_engine.query(query)
print("\n***********Basic Query Engine***********", response_1, sep="\n")

# LlamaParse + recursive retriever engine.
response_2 = recursive_query_engine.query(query)
print(
    "\n***********New LlamaParse+ Recursive Retriever Query Engine***********",
    response_2,
    sep="\n",
)


***********Basic Query Engine***********
Certainly! Here is a list of industries that will be impacted by Generative AI, ranked by the level of impact in a tabular format:

| Rank | Industry                                | Productivity Impact ($ billion) |
|------|-----------------------------------------|---------------------------------|
| 1    | Tech                                    | 240-460                         |
| 2    | Retail                                  | 240-390                         |
| 3    | Banking                                 | 200-340                         |
| 4    | Travel, Transport & Logistics           | 180-300                         |
| 5    | Advanced Manufacturing                  | 170-290                         |
| 6    | Consumer Packaged Goods                 | 160-270                         |
| 7    | Healthcare                              | 150-260                         |
| 8    | Administrative & Professional Services  | 150-250   

In [38]:
query = "Give me a summary of this document, like you would give in a presentation."

# Baseline engine first; its banner and answer print before the recursive
# engine is queried.
response_1 = raw_query_engine.query(query)
print("\n***********Basic Query Engine***********", response_1, sep="\n")

# LlamaParse + recursive retriever engine.
response_2 = recursive_query_engine.query(query)
print(
    "\n***********New LlamaParse+ Recursive Retriever Query Engine***********",
    response_2,
    sep="\n",
)


***********Basic Query Engine***********
This document, titled "Exploring Generative AI for Business Success," is a presentation by McKinsey & Company, specifically QuantumBlack, AI by McKinsey, delivered at The Secret Sauce Summit 2023. It explores the transformative potential of Generative AI (GenAI) in business, emphasizing its applications and benefits. Key highlights include:

1. **Generative AI Applications**: The document outlines the "4 C's" of GenAI value—Creative content, Customer engagement, Concision, and Coding & software. It details how GenAI can generate text, images, and other content, enhance customer service, draft marketing emails, and assist in software development and mainframe migration.

2. **Risks and Mitigation**: It identifies key risks associated with GenAI, such as hallucination, security vulnerabilities, privacy concerns, fairness, trust issues, and legal implications. The document suggests various controls and measures to mitigate these risks, including p