In [1]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.node_parser import SentenceSplitter

# create the sentence window node parser w/ default settings
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

# base node parser is a sentence splitter
text_splitter = SentenceSplitter()

llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-mpnet-base-v2", max_length=512
)

from llama_index.core import Settings

Settings.llm = llm
Settings.embed_model = embed_model
Settings.text_splitter = text_splitter

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from llama_index.core import SimpleDirectoryReader

documents = SimpleDirectoryReader(
    input_files=["/home/huangj2/Documents/rag-in-action/90-文档-Data/复杂PDF/IPCC_AR6_WGII_Chapter03.pdf"]
).load_data()

In [4]:
nodes = node_parser.get_nodes_from_documents(documents)

In [5]:
base_nodes = text_splitter.get_nodes_from_documents(documents)

In [6]:
from llama_index.core import VectorStoreIndex

sentence_index = VectorStoreIndex(nodes)

In [7]:
base_index = VectorStoreIndex(base_nodes)

In [8]:
from llama_index.core.postprocessor import MetadataReplacementPostProcessor

query_engine = sentence_index.as_query_engine(
    similarity_top_k=2,
    # the target key defaults to `window` to match the node_parser's default
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
)
window_response = query_engine.query(
    "What are the concerns surrounding the AMOC?"
)
print(window_response)

There is low confidence in the quantification of AMOC changes in the 20th century due to low agreement in quantitative reconstructed and simulated trends. Additionally, direct observational records since the mid-2000s are considered too short to determine the relative contributions of internal variability, natural forcing, and anthropogenic forcing to AMOC change. Despite these uncertainties, it is very likely that the AMOC will decline over the 21st century for all Shared Socioeconomic Pathways (SSP) scenarios, but it is not expected to involve an abrupt collapse before 2100.


In [9]:
window = window_response.source_nodes[0].node.metadata["window"]
sentence = window_response.source_nodes[0].node.metadata["original_text"]

print(f"Window: {window}")
print("------------------")
print(f"Original Sentence: {sentence}")

Window: 4.3.2.2, 9.6.3 (Fox-Kemper 
et al., 2021; Lee et al., 
2021)
Extreme sea levels
Relative sea level rise is driving a global increase 
in the frequency of extreme sea levels (high 
confidence).
 9.6.4 (Fox-Kemper et al., 
2021)
Rising mean relative sea level will continue to 
drive an increase in the frequency of extreme sea 
levels (high confidence).
 9.6.4 (Fox-Kemper et al., 
2021)
Ocean circulation
Ocean stratification
‘The upper ocean has become more stably 
stratified since at least 1970 […] (virtually 
certain).’
9.2.1.3 (Fox-Kemper et al., 
2021)
‘Upper-ocean stratification will continue to 
increase throughout the 21st century (virtually 
certain).’
9.2.1.3 (Fox-Kemper et al., 
2021)
Eastern boundary 
upwelling systems
‘Only the California current system 
has experienced some large-scale 
upwelling-favourable wind intensification since 
the 1980s (medium confidence).’
9.2.5 (Fox-Kemper et al., 
2021)
‘Eastern boundary upwelling systems will 
change, with a dipole spatia

In [10]:
query_engine = base_index.as_query_engine(similarity_top_k=2)
vector_response = query_engine.query(
    "What are the concerns surrounding the AMOC?"
)
print(vector_response)

The concerns surrounding the AMOC include potential slowdown or collapse due to climate change, which could have significant impacts on regional and global climate patterns, including sea level rise, temperature changes, and extreme weather events.


In [11]:
query_engine = base_index.as_query_engine(similarity_top_k=5)
vector_response = query_engine.query(
    "What are the concerns surrounding the AMOC?"
)
print(vector_response)

Concerns surrounding the AMOC include potential slowdown or collapse due to climate change, which could lead to significant impacts on regional and global climate patterns, including changes in temperature, precipitation, and sea level rise.


In [12]:
for source_node in window_response.source_nodes:
    print(source_node.node.metadata["original_text"])
    print("--------")

2.3.3.4, 9.2.3 (Fox-Kemper 
et al., 2021; Gulev et al., 
2021)
The AMOC will decline over the 21st century 
(high confidence, but low confidence for 
quantitative projections).

--------
Over the 21st century, AMOC will very likely decline for all SSP 
scenarios but will not involve an abrupt collapse before 2100 (WGI 
AR6 Sections 4.3.2, 9.2.3.1; Fox-Kemper et al., 2021; Lee et al., 2021).

--------


In [13]:
for node in vector_response.source_nodes:
    print("AMOC mentioned?", "AMOC" in node.node.text)
    print("--------")

AMOC mentioned? False
--------
AMOC mentioned? False
--------
AMOC mentioned? False
--------
AMOC mentioned? False
--------
AMOC mentioned? False
--------


In [14]:
print(vector_response.source_nodes[2].node.text)

Heightened risk of non-indigenous species immigration 
from vessel traffic plus climate change further endangers MPA success 
(Iacarella et  al., 2020), a particular concern in the Mediterranean 
(D’Amen and Azzurro, 2020; Mannino and Balistreri, 2021), where 
the current MPA network is already highly vulnerable to climate 
change (Kyprioti et  al., 2021). This new evidence supports SROCC’s 
high confidence assessment that present governance arrangements, 
including MPAs, are too fragmented to provide integrated responses 
to the increasing and cascading risks from climate change in the ocean 
(SROCC SPMC1.2; IPCC, 2019c).
Strategic conservation planning can yield future MPA networks 
substantially more ready for climate change (e.g., Section  3.6.3.1.5; 
SROCC SPM C2.1; IPCC, 2019c; Frazão Santos et al., 2020; Rassweiler 
et  al., 2020). Global protection is increasing (Worm, 2017; Claudet 
et  al., 2020b) as nations pursue international targets (e.g., SDG14, 
Life Below Water aimed t

In [15]:
from llama_index.core.evaluation import DatasetGenerator, QueryResponseDataset

from llama_index.llms.openai import OpenAI
import nest_asyncio
import random

nest_asyncio.apply()

In [16]:
num_nodes_eval = 30
# there are 428 nodes total. Take the first 200 to generate questions (the back half of the doc is all references)
sample_eval_nodes = random.sample(base_nodes[:200], num_nodes_eval)
# NOTE: run this if the dataset isn't already saved
# generate questions from the largest chunks (1024)
dataset_generator = DatasetGenerator(
    sample_eval_nodes,
    llm=OpenAI(model="gpt-4"),
    show_progress=True,
    num_questions_per_chunk=2,
)

  dataset_generator = DatasetGenerator(


In [17]:
# eval_dataset = await dataset_generator.agenerate_dataset_from_nodes()
eval_dataset.save_json("data/ipcc_eval_qr_dataset.json")
# optional
eval_dataset = QueryResponseDataset.from_json("data/ipcc_eval_qr_dataset.json")

100%|█████████████████████████████████████████████████████████████████████████████| 30/30 [00:12<00:00,  2.42it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.24s/it]
100%|███████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.43s/it]
100%|███████████████████████████████████████████████████████████████████████████████| 2/2 [00:09<00:00,  4.52s/it]
100%|███████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00,  2.74s/it]
100%|███████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.13s/it]
100%|███████████████████████████████████████████████████████████████████████████████| 2/2 [00:08<00:00,  4.28s/it]
100%|███████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00,  2.92s/it]
100%|███████████████████████████████████████████████████████████████████████████

FileNotFoundError: [Errno 2] No such file or directory: 'data/ipcc_eval_qr_dataset.json'

In [18]:
eval_dataset.save_json("ipcc_eval_qr_dataset.json")

In [19]:
import asyncio
import nest_asyncio

nest_asyncio.apply()

In [20]:
from llama_index.core.evaluation import (
    CorrectnessEvaluator,
    SemanticSimilarityEvaluator,
    RelevancyEvaluator,
    FaithfulnessEvaluator,
    PairwiseComparisonEvaluator,
)


from collections import defaultdict
import pandas as pd

# NOTE: can uncomment other evaluators
evaluator_c = CorrectnessEvaluator(llm=OpenAI(model="gpt-4"))
evaluator_s = SemanticSimilarityEvaluator()
evaluator_r = RelevancyEvaluator(llm=OpenAI(model="gpt-4"))
evaluator_f = FaithfulnessEvaluator(llm=OpenAI(model="gpt-4"))
# pairwise_evaluator = PairwiseComparisonEvaluator(llm=OpenAI(model="gpt-4"))

In [21]:
from llama_index.core.evaluation.eval_utils import (
    get_responses,
    get_results_df,
)
from llama_index.core.evaluation import BatchEvalRunner

max_samples = 30

eval_qs = eval_dataset.questions
ref_response_strs = [r for (_, r) in eval_dataset.qr_pairs]

# resetup base query engine and sentence window query engine
# base query engine
base_query_engine = base_index.as_query_engine(similarity_top_k=2)
# sentence window query engine
query_engine = sentence_index.as_query_engine(
    similarity_top_k=2,
    # the target key defaults to `window` to match the node_parser's default
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
)

In [22]:
import numpy as np

base_pred_responses = get_responses(
    eval_qs[:max_samples], base_query_engine, show_progress=True
)
pred_responses = get_responses(
    eval_qs[:max_samples], query_engine, show_progress=True
)

pred_response_strs = [str(p) for p in pred_responses]
base_pred_response_strs = [str(p) for p in base_pred_responses]

100%|█████████████████████████████████████████████████████████████████████████████| 30/30 [00:03<00:00,  8.40it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 30/30 [00:15<00:00,  1.90it/s]


In [23]:
evaluator_dict = {
    "correctness": evaluator_c,
    "faithfulness": evaluator_f,
    "relevancy": evaluator_r,
    "semantic_similarity": evaluator_s,
}
batch_runner = BatchEvalRunner(evaluator_dict, workers=2, show_progress=True)

In [24]:
evaluator_dict = {
    "correctness": evaluator_c,
    "faithfulness": evaluator_f,
    "relevancy": evaluator_r,
    "semantic_similarity": evaluator_s,
}
batch_runner = BatchEvalRunner(evaluator_dict, workers=2, show_progress=True)

In [25]:
eval_results = await batch_runner.aevaluate_responses(
    queries=eval_qs[:max_samples],
    responses=pred_responses[:max_samples],
    reference=ref_response_strs[:max_samples],
)
base_eval_results = await batch_runner.aevaluate_responses(
    queries=eval_qs[:max_samples],
    responses=base_pred_responses[:max_samples],
    reference=ref_response_strs[:max_samples],
)
results_df = get_results_df(
    [eval_results, base_eval_results],
    ["Sentence Window Retriever", "Base Retriever"],
    ["correctness", "relevancy", "faithfulness", "semantic_similarity"],
)
display(results_df)

100%|███████████████████████████████████████████████████████████████████████████| 120/120 [01:24<00:00,  1.42it/s]
100%|███████████████████████████████████████████████████████████████████████████| 120/120 [01:26<00:00,  1.39it/s]


Unnamed: 0,names,correctness,relevancy,faithfulness,semantic_similarity
0,Sentence Window Retriever,4.5,1.0,1.0,0.933759
1,Base Retriever,4.366667,0.966667,0.966667,0.909008
