# RAG Evaluation Dataset Generation

In [2]:
import os
import datetime
import pandas as pd
import json
import random
from sqlalchemy import make_url
import nest_asyncio
nest_asyncio.apply() 

from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.vector_stores.postgres import PGVectorStore
from llama_index.core.vector_stores.types import VectorStoreQuery
from llama_index.llms.openai_like import OpenAILike

import sys
utils_path = "../../../utils"
if utils_path not in sys.path:
    sys.path.append(utils_path)

from helpers import (
    remove_recs_without_query,
    remove_duplicated_queries,
)

In [3]:
# PGVector DB params.
# Connection values can be overridden through environment variables so real
# credentials never have to be written into the notebook; the fallbacks are the
# demo values this notebook was originally written with.
DB_PORT = int(os.environ.get("PGVECTOR_PORT", "5432"))
DB_USER = os.environ.get("PGVECTOR_USER", "demouser")
DB_PASSWD = os.environ.get("PGVECTOR_PASSWORD", "demopasswd")
DEFAULT_DB = "postgres"
DB_NAME = "vectordb"
DB_HOST = os.environ.get("PGVECTOR_HOST", "localhost")
TABLE_NAME = "NASA_HISTORY_BOOKS"
# NOTE(review): user and password are interpolated verbatim; values containing
# URL-reserved characters (e.g. "@", ":", "/") presumably need percent-encoding
# before being placed in this string -- confirm against the URL parser used.
CONNECT_STRING = f"postgresql://{DB_USER}:{DB_PASSWD}@{DB_HOST}:{DB_PORT}/{DEFAULT_DB}"

# LlamaIndex embedding model
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-base-en-v1.5",
)
# Embedding dimension, measured by embedding a throw-away string
EMBEDDING_SIZE = len(Settings.embed_model.get_text_embedding("hi"))

# Minimum response length.
# NOTE(review): not referenced by any cell visible here; the cleaning step
# defines its own threshold (MIN_RESPONSE_LENGHT) -- confirm which one is intended.
MIN_RESPONSE_LENGTH = 50

# LLM service settings
LLM_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # Use the most powerful LLM at hand
# Change this value to your vLLM endpoint URL (or export VLLM_API_BASE)
LLM_API_BASE = os.environ.get("VLLM_API_BASE", "vLLM_API_BASE")
# vLLM does not require a key by default
LLM_API_KEY = os.environ.get("VLLM_API_KEY", "NO_KEY")
GEN_TEMP = 0.1      # Generation temperature
MAX_TOKENS = 1024   # Max. number of tokens to generate
REP_PENALTY = 1.03  # Word repetition penalty at generation time

# Initialize LlamaIndex LLM provider
Settings.llm = OpenAILike(
    model=LLM_MODEL,
    api_key=LLM_API_KEY,
    api_base=LLM_API_BASE,
    temperature=GEN_TEMP,
    max_tokens=MAX_TOKENS,
    repetition_penalty=REP_PENALTY,
)

# Number of workers for the dataset generator
NUM_WORKERS = 4

In [5]:
# Directory where the Q/A files are saved: one sub-directory per generator LLM,
# named after the last path component of the model id.
SAVE_DIR = f"../qa_datasets/{LLM_MODEL.split('/')[-1]}"
if os.path.isdir(SAVE_DIR):
    # Keep the results of a previous run by moving them aside under a
    # timestamped name (YYYY-MM-DD_HH:MM:SS, same format as before).
    # NOTE: the ':' characters are not valid in Windows file names.
    now = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
    os.rename(SAVE_DIR, f"{SAVE_DIR}-{now}")
# makedirs (unlike mkdir) also creates the parent "../qa_datasets" when it does
# not exist yet, e.g. on a fresh checkout. It still raises if SAVE_DIR exists.
os.makedirs(SAVE_DIR)
print(f"Saving evaluation datasets at {SAVE_DIR}")

Saving evaluation datasets at ../qa_datasets/Mixtral-8x7B-Instruct-v0.1


In [6]:
%%time

# Open the connection to the pre-populated PGVector store.
# CONNECT_STRING is parsed only to extract host/port/credentials; the database
# actually opened is DB_NAME.
url = make_url(CONNECT_STRING)
vector_store = PGVectorStore.from_params(
    database=DB_NAME,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name=TABLE_NAME,
    embed_dim=EMBEDDING_SIZE,  # embedding model dimension
    cache_ok=True,
    hybrid_search=True,
)

# Load all nodes from the vector store by querying with an all-zero embedding
# and a very large top-k.
# Adjust `similarity_top_k` to be >= number of nodes in the vector store.
query = VectorStoreQuery(
    query_embedding=[0.0] * EMBEDDING_SIZE,
    similarity_top_k=10_000_000,
)
result = vector_store.query(query)
nodes = result.nodes
# Report the node count (not the node objects themselves, which would flood the output)
print(f"Loaded {len(nodes)} nodes from the vector store")

CPU times: user 851 ms, sys: 110 ms, total: 961 ms
Wall time: 1.09 s


In [8]:
# Set the number of Q/A pairs to generate at 10% of the total nodes
EVAL_SAMPLE_FRACTION = 0.1
# Fixed seed so the same nodes are picked on every run
# (for a given ordering of `nodes`).
EVAL_SAMPLE_SEED = 42

num_eval_nodes = round(len(nodes) * EVAL_SAMPLE_FRACTION)
# A dedicated seeded generator keeps the sample repeatable without touching
# the global `random` state.
eval_nodes_sample = random.Random(EVAL_SAMPLE_SEED).sample(nodes, num_eval_nodes)

In [9]:
%%time
# Configure the Q/A generator: one question per sampled node, NUM_WORKERS
# workers, progress bars enabled.
# NOTE(review): no LLM is passed explicitly, so the generator presumably falls
# back to the globally configured Settings.llm -- confirm.
generator_options = dict(
    nodes=eval_nodes_sample,
    num_questions_per_chunk=1,
    workers=NUM_WORKERS,
    show_progress=True,
)
dataset_generator = RagDatasetGenerator(**generator_options)

# Produce the labelled Q/A dataset from the sampled nodes (long-running step)
rag_eval_dataset = dataset_generator.generate_dataset_from_nodes()

100%|██████████| 382/382 [06:25<00:00,  1.01s/it]
100%|██████████| 1/1 [00:03<00:00,  3.90s/it]
100%|██████████| 4/4 [00:20<00:00,  5.10s/it]
100%|██████████| 4/4 [00:08<00:00,  2.05s/it]
100%|██████████| 1/1 [00:09<00:00,  9.45s/it]
100%|██████████| 2/2 [00:15<00:00,  7.55s/it]
100%|██████████| 2/2 [00:07<00:00,  3.86s/it]
100%|██████████| 4/4 [00:27<00:00,  6.97s/it]
100%|██████████| 4/4 [00:06<00:00,  1.73s/it]
100%|██████████| 1/1 [00:06<00:00,  6.44s/it]
100%|██████████| 2/2 [00:10<00:00,  5.04s/it]
100%|██████████| 4/4 [00:25<00:00,  6.48s/it]
100%|██████████| 4/4 [00:21<00:00,  5.44s/it]
100%|██████████| 1/1 [00:14<00:00, 14.02s/it]
100%|██████████| 11/11 [00:37<00:00,  3.41s/it]
100%|██████████| 2/2 [00:09<00:00,  4.81s/it]
100%|██████████| 1/1 [00:01<00:00,  1.71s/it]
100%|██████████| 1/1 [00:02<00:00,  2.30s/it]
100%|██████████| 2/2 [00:07<00:00,  3.60s/it]
100%|██████████| 3/3 [00:16<00:00,  5.66s/it]
100%|██████████| 6/6 [00:20<00:00,  3.47s/it]
100%|██████████| 2/2 [00:45<

CPU times: user 39 s, sys: 4.13 s, total: 43.1 s
Wall time: 1h 18min 50s





In [14]:
# Adjust Pandas display settings so long questions/answers are not truncated
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Minimum length (in words) for an answer in a Q/A pair.
# The misspelled name is kept as-is in case other cells reference it.
MIN_RESPONSE_LENGHT = 100

# Convert the test set to Pandas
rag_eval_df = rag_eval_dataset.to_pandas()

# Keep real questions only: rows whose query contains a literal "?".
# regex=False avoids the invalid "\?" escape sequence; na=False treats a
# missing query as "not a question" instead of putting NaN in the mask
# (which boolean indexing rejects).
is_question = rag_eval_df["query"].str.contains("?", regex=False, na=False)
rag_eval_df = rag_eval_df[is_question]

# Filter out too short responses. Words are counted by whitespace splitting
# (the previous space count undercounted by one and ignored newlines);
# a missing answer counts as zero words and is therefore dropped.
answer_word_counts = rag_eval_df["reference_answer"].fillna("").str.split().str.len()
too_short_responses = answer_word_counts < MIN_RESPONSE_LENGHT
rag_eval_df = rag_eval_df[~too_short_responses]

# Keep a single row per distinct query and renumber the index from 0
clean_rag_eval_df = rag_eval_df.drop_duplicates(
    subset=["query"],
    ignore_index=True,
).copy()

# Serialize the dataset as CSV file
clean_rag_eval_df.to_csv(
    path_or_buf=f"{SAVE_DIR}/NASA_history_qa_only.csv",
    index=False,
)

In [16]:
# Display some examples: up to 20 random rows. The cap avoids the ValueError
# that sample() raises when asked for more rows than the frame holds.
clean_rag_eval_df[['query', 'reference_answer', 'reference_contexts']].sample(
    n=min(20, len(clean_rag_eval_df))
)

Unnamed: 0,query,reference_answer,reference_contexts
57,What was the significance of propeller cuffs in the refinement of aircraft propeller blade design and how did they contribute to the performance of the North American P-51 Mustang fighter and the cooling of radial engine designs such as the Republic P-47 Thunderbolt and the Boeing B-29 Superfortress long-range bomber?,"\nPropeller cuffs were significant in the refinement of aircraft propeller blade design as they allowed for the modification of existing blade designs, particularly in increasing the blade area. This modification enabled aircraft to reach higher speeds, as demonstrated by a single-engine fighter that was theoretically capable of 400 mph with the right propeller but could not yet reach that speed with a conventional propeller root-shape.\n\nThe North American P-51 Mustang fighter was modified with blade cuffs for increased performance, which maximized the aircraft's overall performance. Similarly, cuffed propellers improved the cooling of radial engine designs such as the Republic P-47 Thunderbolt and the four-engine Boeing B-29 Superfortress long-range bomber, which were known for their notorious cooling and engine fire problems.\n\nTherefore, propeller cuffs played a crucial role in enhancing the performance of fighter aircraft and improving the cooling of radial engine designs, contributing to their overall efficiency and safety.","[The NACA and Aircraft Propulsion, 1915–1958\n11\nNACA researchers also worked to refine the aerodynamic properties of a \npropeller blade along its entire length with airfoil sections called cuffs. Blade \ndesign reflected a compromise where most of the blade was an airfoil, but the \nportion where it attached to the hub, called the root, was round for structural \nstrength. In 1939, Langley researchers in the Full-Scale Tunnel investigated a \nsingle-engine fighter that was theoretically capable of 400 mph with the right \npropeller but could not yet reach that speed with a conventional propeller \nroot-shape. 
To each blade root, they attached airfoil-shaped cuffs that covered \nover 45 percent of the blade. The increased blade area enabled the fighter to \nreach 400 mph at 20,000 feet. Langley carried on with extensive research on \ncuffs that allowed for the modification of existing blade designs.49 The North \nAmerican P-51 Mustang fighter was modified with blade cuffs for increased \nperformance, and cuffed propellers likewise improved the cooling of radial \nengine designs such as the Republic P-47 Thunderbolt and—very significantly, \nbecause of its notorious cooling and engine fire problems—the four-engine \nBoeing B-29 Superfortress long-range bomber.\nFigure 1-2. The use of propeller cuffs on fighters like the P-51 Mustang maximized the \naircraft’s overall performance. (National Air and Space Museum, Smithsonian Institution, \nNASM 7A35592)]"
11,Question: What are the features of the ladies' dressing room on the aircraft?,"\nThe ladies' dressing room on the aircraft is furnished in the modern style and equipped with a hand lavatory with hot and cold running water, a dental lavatory, a large double dressing table complete with four lights, two large plateglass mirrors, two benches, receptacles for Kleenex, etc. A ladies' toilet compartment opens from this dressing room, and each toilet is equipped with a standard size toilet seat and is of the flushing type. The flushing water is the waste water from the hand and dental lavatories. As the toilet top is raised, a cylinder is filled with this water and when the lid is lowered, this cylinder discharges the water, flushing the toilet bowl, and then the contents are dumped outside the aircraft.","[347\nDocument 5-23 (a–c)\nfoodstuffs and galley supplies is 256 pounds[,] making a total of 490 pounds. In \naddition, there are two drinking fountains, one on each end of the passengers’ deck. \nIncidentally, the galley equipment was designed and built by Pan American Airways.\nThe ladies’ dressing room is furnished in the modern style and is equipped \nwith a hand lavatory with hot and cold running water, a dental lavatory and a large \ndouble dressing table complete with four lights, two large plateglass mirrors, two \nbenches, receptacles for Kleenex, etc. A ladies’ toilet compartment opens from this \ndressing room. Incidentally, each toilet on this ship is equipped with standard size \ntoilet seat and is of the flushing type. The flushing water is the waste water from the \nhand and dental lavatories. 
As the toilet top is raised, a cylinder is filled with this \nwater and when the lid is lowered this cylinder discharges the water, flushing the \ntoilet bowl[,] and then the contents are dumped outside the aircraft.\nThe men’s dressing room is fitted with two hand lavatories with hot and cold \nrunning water, one dental lavatory and, believe it or not, a stand up urinal. This \nurinal will no doubt be an interesting innovation to the passengers. A separate toilet \ncompartment opens off the men’s dressing room.\nThe interior decorations throughout the passenger deck are in keeping with \nthe present-day trend of modern design, simple, functional, and in refreshing color \narrangement. Fabrics for both upholstery and lining are interesting in texture and \ndesign and are all flameproofed. All compartment lining and soundproofing is \nquickly removable for inspection and cleaning and, therefore, can be kept in excel-\nlent condition.\nTime will not allow me to go further into the details of this interior, but there \ncan be no doubt to those who have seen the actual installation that the design objec-\ntive of unprecedented passenger comfort, spaciousness and luxury has been realized.\nAlthough safety appeared fourth on the list of design criteria, it was by no means \nthe least important and was, in fact, uppermost in our minds during the design and \nconstruction processes. No compromise with safety was ever made. Naturally, all \nmanufacturers and operators strive to make and operate their equipment as safe as \npossible and both the Pan American Airways and Boeing organizations set out to \nembody safety in the fundamental design and to make the 314 one of the safest air-\ncraft possible with present-day knowledge of the science of aviation. 
We believe we \nhave accomplished this aim, and in order to illustrate the extent and thoroughness \nof our endeavor along this line a few safety provisions will be discussed.\nNo doubt many of the items which will be mentioned have long since been \nconsidered and incorporated in certain equipment, but the thoroughness with \nwhich we have attacked them may be interesting.\nRoughly speaking, aircraft accidents come under two general headings—\nmechanical failures and personnel errors. Obviously, accidents under both these \nheadings are more or less the joint responsibility of the manufacturer and operator. \nTherefore, we as a manufacturer have attempted to reduce mechanical failures by \nproducing conservative structures; trouble free mechanical accessories; simple and]"
49,How does the planetary decadal survey process conducted by the Space Studies Board (SSB) differ from the process conducted by the National Academy of Sciences (NAC) in terms of their relationship with NASA and the public availability of their reports?,"\nThe Space Studies Board (SSB) and the National Academy of Sciences (NAC) are two different entities that provide recommendations to NASA, but they differ in their relationship with NASA and the public availability of their reports.\n\nThe NAC is an organization that is internal to NASA, meaning it is part of the agency. The NASA Administrator has the authority to implement NAC recommendations, or not. The NAC does not publish many formal studies, and the recommendations may or may not be made public.\n\nOn the other hand, the SSB is external to NASA and conducts studies for the agency on a contractual basis. Nearly all SSB reports are available to the public, and the board regularly briefs Congress on findings. This means that the SSB's recommendations are more likely to be made public and accessible to the general public.\n\nIn terms of the planetary decadal survey process, the SSB has been conducting decadal surveys for planetary science since the early 2000s, providing recommendations from the science community on programs within NASA and prioritizing the most important questions in planetary science. The SSB's decadal surveys have shifted the focus of programmatic decision-making for NASA, as they include more detailed programmatic recommendations than most previous SSB works.\n\nCongress has often instructed NASA to fund studies on various topics and now regularly includes language in legislation directing NASA to execute the recommendations of SSB decadal surveys. 
Excerpts from decadal surveys have appeared in congressional authorization legislation, indicating the high regard in which members of Congress now hold these reports.\n\nTherefore, the SSB's planetary decadal survey process is more transparent and publicly available than the NAC's process, and it has a more significant impact on NASA's programmatic decision-making.","[77\nCHAPTER 2 • FuNdINg PLANETARY SCIENCE: HISTORY ANd POLITICAL ECONOMY \nhas served to communicate the community’s interests to NASA and other \ngovernment stakeholders.\nWhen planetary science began to compete with the fields of astrophysics, \nEarth science, heliophysics, and microgravity research for limited resources, \nthe threat arose that these space science communities might pit themselves \nagainst one another, with the potential to disrupt the presentation of a uni-\nfied voice when communicating the interests of space science writ large. By \nthe early 2000s, witnessing the success of the SSB’s decadal survey process \nfor astronomy and astrophysics, NASA requested that the SSB undertake a \ndecadal survey for planetary science. The survey provided recommendations \nfrom the science community on programs within NASA and prioritized the \nmost important questions in planetary science. Prioritization allowed NASA \nflexibility in the event of budget cuts, cost overruns, or unexpected circum-\nstances, and directed the Agency toward a strategy that the planetary sci-\nence community supported. The survey also explained NASA’s rationale for \nits programmatic choices to Congress and demonstrated that those choices \nwere supported by the science community. 
The planetary survey represented \na shift in programmatic decision-making for NASA, as it included more \ndetailed programmatic recommendations than most previous SSB works.\nThat trend continues with the most recent decadal survey, Vision and \nVoyages for Planetary Science in the Decades 2013–2022,72 which outlines sci-\nentific and general programmatic recommendations and includes guidelines \non how priorities should shift in differing budget scenarios. Congress has \noften instructed NASA to fund studies on various topics and now regularly \nincludes language in legislation directing NASA to execute the recommen-\ndations of SSB decadal surveys. In recent years, excerpts from decadal sur-\nveys have appeared in congressional authorization legislation, indicating the \nhigh regard in which members of Congress now hold these reports.\nThe slow shift in focus from the NAC to the SSB in making program-\nmatic recommendations represents an interesting dynamic. The NAC is an \norganization independent from the NASA program offices, but it is internal \nto NASA. The NASA Administrator has the authority to implement NAC \nrecommendations, or not, and the council does not publish many formal \nstudies. The SSB, by contrast, is external to NASA and conducts studies for \nthe Agency on a contractual basis. Nearly all SSB reports are available to \nthe public, and the board regularly briefs Congress on findings. As a result, \n72. Committee on the Planetary Science Decadal Survey, Space Studies Board, National \nResearch Council, Vision and Voyages for Planetary Science in the Decades 2013–2022 \n(Washington, DC: The National Academies Press, 2012).]"
105,What was the purpose of flying the XV-15 behind and in close formation to the Lockheed YO-3A research aircraft?,"The purpose of flying the XV-15 behind and in close formation to the Lockheed YO-3A research aircraft was to obtain comparable free flight noise data to determine the effect of the wind tunnel walls on the measured sound during an acoustics test of the XV-15 metal-bladed proprotor in the acoustically treated test section of the Ames 80- by 120-foot wind tunnel. This was done by maintaining the YO-3A microphone location at a fixed distance and position with respect to the XV-15 proprotor, and by operating at the same prop-rotor operating condition. This allowed for a direct comparison between the flight data and wind tunnel test data.","[815\nDocuments 5-49 (a–d)\ncould be used to significantly reduce the noise level and footprint area during tilt \nrotor approaches.\nIn December 1995, with plans being developed for an acoustics test of the \nXV-15 metal-bladed proprotor in the acoustically treated test section of the Ames \n80- by 120-foot wind tunnel, a special flight investigation was required to obtain \ncomparable free flight noise data to determine the effect of the wind tunnel walls \non the measured sound. The evaluation involved flying the XV-15 behind, and in \nclose formation to[,] a quiet research aircraft (the Lockheed YO-3A) which was \nequipped with microphones and recording equipment. By maintaining the YO-3A \nmicrophone location at a fixed distance and position with respect to the XV-15 \nproprotor (shown in figure 60) [not reproduced] corresponding to a microphone \nlocation in the test section of the wind tunnel, and by operating at the same prop-\nrotor operating condition, a direct comparison (with corrections for the second \nproprotor) between the flight data and wind tunnel test data was obtained. This \nexperiment was conducted by Ames researchers. 
The tests involved a Bell flight \ncrew in the XV-15, and a NASA flight crew in the YO-3A.\nCOMPOSITE PROPROTOR BLADES\nFrom the very beginning of the TRRA project the proprotor blades were of \nspecial concern to the Government Project Office. The metal blades used on the \nXV-15 were designed in the late 1960s under Bell’s IR&D funding for the predeces-\nsor tilt rotor aircraft, the Bell Model 300. This aircraft had a design gross weight \nof 12,400 pounds, 600 pounds lighter than that of the XV-15. The concern was \nthat the proprotors would be too highly loaded, i.e. operating too close to aero-\ndynamic stall, to provide adequate reserve thrust for control when operating in \nhover at high gross weights. This could result in a reduction of control effectiveness \nor the need for a substantial increase in power when operating at the high gross \nweight condition.\nFlight tests of the XV-15, however, did not indicate deficiencies. The metal \nbladed proprotor, although sized for a smaller aircraft, performed well at all XV-\n15 operating weights and flight conditions. While performance was satisfactory, \nanother problem emerged that could threaten the future of the XV-15. This was the \npossibility that one or more blades could become unserviceable or unflightworthy \ndue to mishandling or deterioration of the blade’s structural integrity.\nConcern centered on the aft blade section, an aerodynamic fairing constructed \nof a lightweight aluminum honeycomb core covered with a thin steel skin (figure 61) \n[not reproduced]. Over the first few years of aircraft operations, minor surface dam-\nage was incurred due to ground handling. More significantly, small areas of separa-\ntion of the bond between the skin and the honeycomb was [sic] detected on several \nblades. While the size of these “voids” was monitored during frequent inspections, \nthe discovery of a rapid growth in size or an unacceptably large separation area]"
68,"What was the intended mission of the 1963 Soviet Lunar Probe, and what was the outcome of the mission?","\nThe intended mission of the 1963 Soviet Lunar Probe was to verify key technological systems during a simpler mission into deep space and back to Earth. The spacecraft, named ""Object-Probes"" (ob'yekt-zond), was designed to depart from Earth's ecliptic out to 12-16 million kilometers from Earth and then return back to Earth after about six months when its orbit intersected with that of Earth again. The spacecraft was aided by two mid-course corrections using its S5.45 main engine.\n\nHowever, the outcome of the mission was not successful. During the mission, the third and fourth stages separated abnormally, and after reaching Earth orbit, ground control lost telemetry from the Blok L upper stage designed to send the vehicle past the Moon. As a result, the spacecraft remained stranded in Earth orbit. The stage's main engine turbopump probably exploded upon ignition, destroying the spacecraft. With this mission, the Soviets began the practice of giving ""Kosmos"" designations to obscure the failure of lunar and planetary probes that remained stranded in Earth orbit. If the spacecraft had successfully departed from Earth orbit, it would probably have been called ""Zond 1.""","[35\n1963 \n“Object-Probes” (ob’yekt-zond) were designed to \nverify key technological systems during simpler \nmissions into deep space and back to Earth. A gov-\nernment decree on March 21, 1963 had approved \ntwo to three such “object-probe” missions, one of \nwhich (a 3MV-1A) was designed to depart from \nEarth’s ecliptic (the orbital plane of Earth around \nthe Sun) out to 12–16 million kilometers from \nEarth and then return back to Earth after about \nsix months when its orbit intersected with that of \nEarth again, aided by two mid-course corrections \nusing its S5.45 main engine. 
The latter, capable of \ntwo firings, was a lighter version of that used on \nthe 2MV model with higher specific impulse and \na longer burn time. During this mission, the third \nand fourth stages separated abnormally, and after \nreaching Earth orbit, ground control lost teleme-\ntry (at about 06:45:44 UT) from the Blok L upper \nstage designed to send the vehicle past the Moon. \nAs a result, the spacecraft remained stranded in \nEarth orbit. The stage’s main engine turbopump \nprobably exploded upon ignition destroying the \nspacecraft. With this mission, the Soviets began \nthe practice of giving “Kosmos” designations to \nobscure the failure of lunar and planetary probes \nthat remained stranded in Earth orbit. If the space-\ncraft had successfully departed from Earth orbit, it \nwould probably have been called “Zond 1.”]"
...,...,...,...
59,"What are the inferred composition and structure of Comet Tempel 1 based on the Deep Impact mission observations, and what evidence supports these inferences?","\n\nComet Tempel 1 has an albedo of 0.059, a bulk density of about 0.4 grams per cubic centimeter, an extent of 4.4-8.0 kilometers, and an abundance of jet activity. Some of the jets are collimated and produce water vapor (H2O), while others on the night side are rich in carbon dioxide (CO2). The ratio of cometary production rates of CO2 compared to H2O is about 7 percent, suggesting a heterogeneous nucleus.\n\nSpectral observations by the University of Maryland's Jessica Sunshine and colleagues detected water ice in the impact ejecta and in three small patches on the surface of the nucleus. However, these surface patches are too small to explain the total amount of atmospheric water vapor, so most of it must come from subsurface sources. The top few centimeters of the comet's surface are largely ice-free, and the bulk of the H2O and CO2 ices are likely within 1 meter of the surface.\n\nTo account for the observed surface layering and the diverse composition of these layers, Arizona researcher Michael Belton suggested a nucleus model consisting of a pile of randomly stacked layers. These diverse layers are thought to have been produced over time by impacts of comets that originated in differing regions of a non-uniform protoplanetary nebula. 
This model, nicknamed the TALPS model, postulates the formation of layered cometary nuclei by the successive collisions of diverse, smaller, primordial comets.\n\nThe evidence supporting these inferences includes the Deep Impact mission observations, spectral observations by the University of Maryland, and the TALPS model, which was developed based on these observations.","[A History of Near-Earth Objects Research\n200\ncraters, an albedo of 0.059, a bulk density of about 0.4 grams per cubic cen-\ntimeter, an extent of 4.4–8.0 kilometers, and an abundance of jet activity.72 \nSome of the jets were collimated, and most produced water vapor (H2O), but \nsome on the night side were rich in the more volatile carbon dioxide (CO2). \nThe ratio of cometary production rates of CO2 compared to H2O was about \n7 percent, and the differing sources of water and carbon dioxide suggested \na heterogeneous nucleus.73 From spectral observations, the University of \nMaryland’s Jessica Sunshine and colleagues detected water ice in the impact \nejecta and in three small patches on the surface of the nucleus. However, \nthese surface patches were far too small to explain the total amount of atmo-\nspheric water vapor, so most of it must have come from subsurface sources.74 \nA’Hearn and colleagues concluded that the top few centimeters of the comet’s \nsurface are largely ice-free and the bulk of the H2O and CO2 ices are likely \nwithin 1 meter of the surface.75 To account for the observed surface layer-\ning and the diverse composition of these layers, Arizona researcher Michael \nBelton suggested a nucleus model consisting of a pile of randomly stacked \nlayers. These diverse layers were thought to have been produced over time \nby impacts of comets that originated in differing regions of a non-uniform \nprotoplanetary nebula.76\n72. P. C. Thomas et al., “The Shape, Topography, and Geology of Tempel 1 from Deep \nImpact Observations,” Icarus 191 (2007): 51–62; J. E. Richardson, H. 
J. Melosh, \nC. M. Lisse, and B. Carcich, “A Ballistic Analysis of the Deep Impact Ejecta Plume: \nDetermining Comet Tempel 1’s Gravity, Mass, and Density,” Icarus 191 (2007): 176–\n209. Updates for some of Comet Tempel 1’s parameters are provided by Veverka et al., \n“Return to Comet Tempel 1: Overview of Stardust-NExT Results,” Icarus 222 (2013).\n73. L. M. Feaga et al., “Asymmetries in the Distribution of H2O and CO2 in the Inner \nComa of Comet 9P/Tempel 1 as Observed by Deep Impact,” Icarus 191 (2007): \n134–145.\n74. J. M. Sunshine et al., “The Distribution of Water Ice in the Interior of Comet \nTempel 1,” Icarus 191 (2007): 73–83; J. Sunshine et al., “Exposed Water Ice Deposits \non the Surface of Comet 9P/Tempel 1” Science 311 (2006): 1453–1455.\n75. M. F. A’Hearn et al., “Deep Impact and Sample Return,” Earth, Planets and Space 60 \n(2008): 61–66.\n76. M. Belton et al., “The Internal Structure of Jupiter Family Cometary Nuclei from Deep \nImpact Observations: The “TALPS” or “Layered Pile” Model,” Icarus 187 (2007): \n332–344. This model, which postulated the formation of layered cometary nuclei by \nthe successive collisions of diverse, smaller, primordial comets, was nicknamed the \nTALPS model (TALPS spelled backward is SPLAT).]"
27,How did CFD help in the redesign of the wing for the HiMAT flight test vehicle and the retrofitting of the engines on the Boeing 737?,"\nCFD helped in the redesign of the wing for the HiMAT (Highly Maneuverable Aircraft Technology) flight test vehicle by providing the necessary computational power to solve the complex Navier-Stokes equations that were too difficult to resolve using linear equations. This allowed engineers to model the behavior of blunt-body objects and break down the equations into usable subparts, establishing methodologies for resolving multiple variables concurrently. This led to the successful redesign of the wing for the HiMAT vehicle, a 1/3 scale model of a fighter plane used to test new aerodynamic and control concepts.\n\nSimilarly, CFD techniques were also key to the successful retrofitting of the engines on the Boeing 737. By using data from CFD studies, Boeing engineers were able to design the new fanjets for the engines for much greater efficiency. This helped make the Boeing 737 one of the most successful passenger airliners in history.\n\nIt's important to note that while CFD provides great capabilities to resolve Navier-Stokes equations, wind tunnels are necessary to validate CFD findings and demonstrate solutions in the real world. Wind tunnels, often exceedingly small ones that used tiny models, were ideal for that purpose.","[NACA to NASA to Now\n182\nhad been so complex and time-consuming that aerodynamicists had great \ndifficulty applying them efficiently to aircraft design. Engineers using linear \nequations—two-dimensional (2D)—could go only so far. The solving of many \nvariables using multiple equations concurrently required greater computational \npower than was available in the predigital age. \nEngineers at Ames Research Center had long been working to streamline \nthe process of resolving the problem of Navier-Stokes equations. 
An early key \nfigure at Ames was Harvard Lomax, who started experimenting with electronic \ncomputers from the late 1950s into the 1970s, using them to model the behav-\nior of blunt-body objects. By 1976, the possibilities of CFD had progressed suf-\nficiently for Ames Research Center Director Hans Mark to set up a group under \nengineer F. Ronald Bailey to advance the concept. Using the most powerful \ncomputers available—at the time the Cray-1 supercomputer—Ames research-\ners began to break down the equations into usable subparts and establish meth-\nodologies for resolving multiple variables concurrently. Three years later Ames \nformed the Numerical Aerodynamic Simulator (NAS) Projects Office with \nthe specific objective of advancing CFD capabilities. Later, Ames also put a \nCray X-MP and a Cray-2 to use for CFD analysis. Research activities at other \nNASA Centers also helped to transform the entire field. During this process the \nILLIAC IV became the first massively parallel computer; it found significant \nuse in advancing CFD capabilities at Ames Research Center and elsewhere. \nSome friction quickly emerged between traditionalists who had a strong \ncommitment to wind tunnel testing and the emerging capabilities of com-\nputational fluid dynamics. Early on, NASA’s “tunnel rats” resented the rising \namount of funding invested in CFD research, money that they believed could \nbe effectively used for wind tunnel work. Only with efforts to make sure that \nequitable funding existed—and with the training of wind tunnel personnel in \nCFD techniques—did this begin to change. Over time, it became obvious that \nCFD provided great capabilities to resolve Navier-Stokes equations, but that \nwind tunnels were necessary to validate CFD findings. Always, users found \nthat for all the great capabilities of CFD, at some point there also has to be \ndemonstration of solutions in the real world. 
Wind tunnels, often exceedingly \nsmall ones that used tiny models, were ideal for that purpose.\nCFD began to pay off in the late 1970s, with the successful redesign of the \nwing for the HiMAT (Highly Maneuverable Aircraft Technology) flight test \nvehicle, a 1/3 scale model of a fighter plane used to test new aerodynamic and \ncontrol concepts. CFD techniques were also key to the successful retrofitting of \nthe engines on the Boeing 737, making it one of the most successful passenger \nairliners in history. Using data from CFD studies, Boeing engineers were able to \ndesign the new fanjets for the engines for much greater efficiency.]"
109,"Question: What is the significance of the ""single string"" design in the development of single-stage-to-orbit vehicles (SSTO)?","\nThe ""single string"" design refers to a design approach where a vehicle has only one of each critical system, such as engines or avionics. In the context of single-stage-to-orbit vehicles (SSTO), this design approach has been a significant challenge due to the high reliability required for the vehicle to reach orbit. The failure of any single system could result in the mission being lost, making the development of SSTO vehicles with this design approach a complex and risky endeavor. Despite these challenges, the ""single string"" design has been pursued in various SSTO concepts, such as the X-33 Program, due to its potential benefits in terms of simplicity, weight reduction, and cost savings.","[Promise Denied\n388\nS\nSackheim, Robert, 187\nSänger, Eugen, 26\nScaled Composites, 64–65, 128, 297\nSchmucker, Christopher P., 317\nSchmucker, Robert H., 317\nSchriever, Bernard, 204\nScott, Robert Falcon, 37\nSexton, Jeff, 226–227, 242, 245–246\nShadoan, Michael D., 329, 333\nShenlong (“Divine Dragon”), 26\nShnayder, Taila, 349–350\nShoffner, Curt, 226, 272\nSierra Nevada Corporation, 296–299, 319n7\n“single string” design, 44–45, 144, 281–282, \n286, 303, 306, 312–313, 316, 318\nsingle-stage-to-orbit vehicles (SSTO), 1, 3, 18, \n32, 315, 318. 
See also individual vehicles\nair-breathing, 156\nalternatives, 69\nconcept competition, 39\ndevelopment, 24, 33\nfeasibility, 16, 26, 69\nJapan, 24–25\nlaunch needs, 303\nlogistical air-breathing vehicle, 5\nlogistical rocket vehicle, 4\npayloads, 15–16, 23\nPayton, Gary E., 273\nPhase I, 38\nPhase II, 37–38\nPhase III, 37\npropulsion systems, 16–17\nreusability, 17\nsuccess of, 274\ntechnical challenges, 317\nthermal protection system (TPS), 16\nX-30 National Aero-Space Plane (NASP), 54\nX-33 Program, 106\nSirangelo, Mark, 297\nSkeen, Joe, 232\nSkylong reusable launch, 26\nsmall booster technology demonstrator (SBTD), \n103\nSmith, Dane, 176\nSoutheast Kern Weekender, 232\nSpace Access, 71, 104, 107\nSpace Launch Initiative (SLI), 275–276, 283, \n286–287, 295, 299–300, 302, 318\nbudget, 313–314\ndefined, 304\npredictions, 295\nSpace Shuttle, viii, 7, 25, 75, 197, 275\naerodynamic database, 170\naging fleet, 1, 17, 20\nalternatives, 2, 17, 22, 27, 277–278, 314, \n317\nbias toward, 16\nbooster, 7\ncarrier aircraft, 78, 81, 83\nChallenger, 114\nColumbia, 114, 123\ncompared to X-34 Renewed Program, 132\ncosts, 7, 27, 277\ndescent, 127\nEndeavour, 52\nEnterprise, 128, 149\nfeasibility, 274\nfinal flight, 9\nflight rates, 277\nimprovements, 9, 27\nKennedy Space Center (KSC), 231\nLanding Facility, 227\nlaunch needs, 230–231, 277\nlaunch systems, 15, 128\nlessons learned, 4, 123\nloss of, 114\nMain Engine (SSME), 9, 17, 117, 190–191, \n197, 199\nmission failures, 118]"
76,"What is the significance of the Propulsion Control System (PSC) in reducing fuel usage at cruise conditions, maximizing excess thrust during accelerations and climbs, and extending engine life by reducing the fan turbine inlet temperature? How did the PSC contribute to the operational efficiencies and longevity of high-performance aircraft, and what was its impact on the F119-PW-100 engine used on the Lockheed Martin F-22 Raptor aircraft?","\nThe Propulsion Control System (PSC) played a significant role in improving the efficiency and longevity of high-performance aircraft. By reducing fuel usage at cruise conditions, maximizing excess thrust during accelerations and climbs, and extending engine life by reducing the fan turbine inlet temperature, the PSC contributed to greater operational efficiencies and longevity for these aircraft.\n\nThe PSC system was especially suited to high-performance military aircraft and was applied to a wide variety of aircraft. Pratt & Whitney used the self-tuning onboard model in its advanced engine controllers, including those on the F119-PW-100 engine used on the Lockheed Martin F-22 Raptor aircraft. This application of HIDEC technology in the F119-PW-100 engine increased performance and operational longevity.\n\nThe flight demonstration and evaluation performed at NASA Dryden in the F-15 HIDEC contributed to the rapid transition of the technology into operational use. Overall, the PSC system was a significant development in the field of aviation, leading to improvements in fuel efficiency, engine performance, and operational longevity.","[Propulsion Control Enters the Computer Era, 1976–1998\n145\nthe availability of peak engine and maneuvering performance at all times. The \noverall result was that PSC reduced fuel usage at cruise conditions, maximized \nexcess thrust during accelerations and climbs, and extended engine life by \nreducing the fan turbine inlet temperature. 
A byproduct was the capability to \nmonitor the degradation of engine components. When combined with regu-\nlarly scheduled preventative maintenance, the PSC enabled greater operational \nefficiencies and longevity for high-performance aircraft.19\nThe PSC system could be applied to a wide variety of aircraft but was \nespecially suited to high-performance military aircraft. Pratt & Whitney used \nthe self-tuning onboard model in its advanced engine controllers, including \nthose on the F119-PW-100 engine used on the Lockheed Martin F-22 Raptor \naircraft. The manufacturer applied other aspects of HIDEC technology in the \nimproved F100-PW-229, the most widely used fighter engine in the world, \nto increase performance and operational longevity. The flight demonstration \nand evaluation performed at NASA Dryden in the F-15 HIDEC contributed \nto the rapid transition of the technology into operational use.20\nFigure 5-4. Shown is the Lockheed Martin F-22A Raptor. (U.S. Air Force)\nResponse to Tragedy: \nToward Propulsion-Controlled Aircraft\nA series of aircraft accidents through the 1970s and 1980s illustrated the need \nfor better methods of flight control. One of the surprising outcomes was the]"


In [17]:
# Persist the unfiltered dataset first so the raw generation output is kept
raw_dataset_path = f"{SAVE_DIR}/NASA_history_qa_not_clean.json"
rag_eval_dataset.save_json(path=raw_dataset_path)

# Round-trip through JSON to get a plain dict, then filter out the records
# without a query before removing the duplicated queries
dataset_dict = json.loads(rag_eval_dataset.json())
examples_with_query = remove_recs_without_query(dataset_dict['examples'])
dataset_dict['examples'] = remove_duplicated_queries(examples_with_query)

# Write the filtered dataset next to the raw one
clean_dataset_path = f"{SAVE_DIR}/NASA_history_qa_only.json"
with open(clean_dataset_path, "w") as out_file:
    json.dump(dataset_dict, out_file)