- Fine-tuning Embeddings Model
- Expanding Context Window from Retrieved Node



### Nest Asyncio

In [None]:
import nest_asyncio

# nest_asyncio patches asyncio to permit nested event loops. Applied up front because
# later cells drive llama_index async helpers from inside the notebook's running loop.
nest_asyncio.apply()

### Install Dependencies

In [None]:
!pip install openai llama_index==0.9.31 pypdf -q -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.4/221.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m917.6/917.6 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.6/277.6 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.0/143.0 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━

### Provide OpenAI API Key

In [None]:
import os
import getpass

# Prompt only when no key is configured yet, so re-running this cell (or running in an
# environment that already exports the variable) does not ask again.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter Your OpenAI API Key: ")

Enter Your OpenAI API Key: ··········


## Loading Data

The data can be found in [this GitHub repo](https://github.com/AI-Maker-Space/DataRepository/tree/main/high-performance-rag).

It is a collection of Academic Papers related to Camelids!

In [None]:
%cd DataRepository/high-performance-rag

/content/DataRepository/high-performance-rag


In [None]:
!unzip "Camel Papers Test.zip"

Archive:  Camel Papers Test.zip
  inflating: Camel Papers Test/Acute respiratory distress syndrome in an alpaca cria.pdf  
  inflating: Camel Papers Test/Alpaca liveweight variations and fiber production in Mediterranean range of Chile.pdf  


In [None]:
!unzip "Camel Papers Train.zip"

Archive:  Camel Papers Train.zip
  inflating: Camel Papers Train/Antibody response to the epsilon toxin ofClostridium perfringensfollowing vaccination of Lama glamacrias.pdf  
  inflating: Camel Papers Train/Comparative pigmentation of sheep, goats, and llamas what colors are possible through selection.pdf  
  inflating: Camel Papers Train/Conservative management of a ruptured.pdf  
  inflating: Camel Papers Train/Evaluation of cholesterol and vitamin E concentrations in adult alpacas and nursing crias.pdf  
  inflating: Camel Papers Train/Influence of effects on quality traits and relationships between traits of the llama fleece..pdf  
  inflating: Camel Papers Train/Influence of Follicular Fluid on in Vitro.pdf  
  inflating: Camel Papers Train/Neurological Causes of Diaphragmatic Paralysis in 11 Alpacas.pdf  
  inflating: Camel Papers Train/On the morphology of the cerebellum of the alpaca (Lama pacos)..pdf  
  inflating: Camel Papers Train/Relationships between integumental charact

Now we can begin building our simple index for each of the training directories, and the validation directories.

We will use LlamaIndex's `SimpleNodeParser` to achieve this!

In [None]:
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import MetadataMode

# Directory names produced by unzipping the two archives above.
TRAIN_FILES = "Camel Papers Train"  # corpus used to build the fine-tuning dataset
VAL_FILES = "Camel Papers Test"  # corpus used to build the validation dataset

In [None]:
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import MetadataMode

def load_corpus(directory, verbose=False):
    """Read every file under `directory` and split the documents into nodes.

    Args:
        directory: Path handed to `SimpleDirectoryReader`.
        verbose: When true, print progress messages and show the parser's
            progress bar.

    Returns:
        The nodes produced by a default-configured `SimpleNodeParser`.
    """

    def _report(message):
        # Single place for the optional progress output.
        if verbose:
            print(message)

    _report(f"Loading files in {directory}")
    docs = SimpleDirectoryReader(directory).load_data()
    _report(f"Loaded {len(docs)} docs")

    nodes = SimpleNodeParser.from_defaults().get_nodes_from_documents(
        docs, show_progress=verbose
    )
    _report(f"Parsed {len(nodes)} nodes")

    return nodes

In [None]:
# Parse both corpora into nodes; verbose=True prints the doc and node counts for each.
train_nodes = load_corpus(TRAIN_FILES, verbose=True)
val_nodes = load_corpus(VAL_FILES, verbose=True)

Loading files in Camel Papers Train
Loaded 91 docs


[nltk_data] Downloading package punkt to /tmp/llama_index...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Parsing nodes:   0%|          | 0/91 [00:00<?, ?it/s]

Parsed 155 nodes
Loading files in Camel Papers Test
Loaded 9 docs


Parsing nodes:   0%|          | 0/9 [00:00<?, ?it/s]

Parsed 17 nodes


Now that we've split our source documents into a number of nodes, we can move on to constructing a fine-tuning dataset.

#### Constructing a Fine-tuning Dataset

In [None]:
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)

In [None]:
from llama_index.llms import OpenAI

# LLM passed to `generate_qa_embedding_pairs` below; temperature 0.0 to minimise sampling randomness.
llm = OpenAI(temperature=0.0, model="gpt-3.5-turbo")

In [None]:
import os

# Generating the pairs calls the LLM for every node (slow and billed), so reuse a
# previously saved dataset when one exists. Delete the JSON file to force regeneration,
# e.g. after the training corpus or the node parser changes.
if os.path.exists("train_dataset.json"):
    train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
else:
    train_dataset = generate_qa_embedding_pairs(train_nodes, llm=llm)
    train_dataset.save_json("train_dataset.json")

100%|██████████| 155/155 [15:57<00:00,  6.18s/it]


In [None]:
# Reload from the saved JSON so later cells can start from the file on disk.
train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")

In [None]:
import os

# Generating the pairs calls the LLM for every node (slow and billed), so reuse a
# previously saved dataset when one exists. Delete the JSON file to force regeneration,
# e.g. after the validation corpus or the node parser changes.
if os.path.exists("val_dataset.json"):
    val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")
else:
    val_dataset = generate_qa_embedding_pairs(val_nodes, llm=llm)
    val_dataset.save_json("val_dataset.json")

100%|██████████| 17/17 [01:31<00:00,  5.38s/it]


In [None]:
# Reload from the saved JSON so later cells can start from the file on disk.
val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")

#### Fine-tuning `BAAI/bge-small-en-v1.5`


In [None]:
!pip install sentence_transformers -q -U

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/86.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone


We'll be leveraging LlamaIndex's `SentenceTransformersFinetuneEngine` to make fine-tuning our embeddings model a breeze.

In [None]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine

# Configure the fine-tuning run. Training itself is started by the `.finetune()` call
# in a later cell.
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset, # Dataset to be trained on
    model_id="BAAI/bge-small-en-v1.5", # HuggingFace reference to base embeddings model
    model_output_path="llama_model_v1", # Output directory for fine-tuned embeddings model
    val_dataset=val_dataset, # Dataset to validate on
    epochs=2 # Number of Epochs to train for
)

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

All that's left to do now is call `.finetune()`!

In [None]:
# Run training with the settings given to the engine (2 epochs).
finetune_engine.finetune()

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Now that we've fine-tuned our embeddings model, let's grab the model out of the engine so we can use it later!

In [None]:
# Retrieve the fine-tuned model from the engine as a llama_index embedding object.
finetuned_embedding_model = finetune_engine.get_finetuned_model()

In [None]:
# Inspect the wrapper's configuration (model name, pooling, max length, ...) as JSON.
finetuned_embedding_model.to_json()

'{"model_name": "llama_model_v1", "embed_batch_size": 10, "tokenizer_name": "llama_model_v1", "max_length": 512, "pooling": "cls", "normalize": true, "query_instruction": null, "text_instruction": null, "cache_folder": null, "class_name": "HuggingFaceEmbedding"}'

In [None]:
from llama_index.embeddings import HuggingFaceEmbedding

# Load the fine-tuned model from the `llama_model_v1` output directory.
embed_model = HuggingFaceEmbedding(model_name="llama_model_v1")

In [None]:
# Archive the fine-tuned model directory so it can be downloaded from the runtime.
!zip -r /content/file.zip llama_model_v1

  adding: llama_model_v1/ (stored 0%)
  adding: llama_model_v1/config_sentence_transformers.json (deflated 26%)
  adding: llama_model_v1/model.safetensors (deflated 15%)
  adding: llama_model_v1/config.json (deflated 48%)
  adding: llama_model_v1/sentence_bert_config.json (deflated 4%)
  adding: llama_model_v1/tokenizer_config.json (deflated 75%)
  adding: llama_model_v1/tokenizer.json (deflated 71%)
  adding: llama_model_v1/README.md (deflated 55%)
  adding: llama_model_v1/eval/ (stored 0%)
  adding: llama_model_v1/eval/Information-Retrieval_evaluation_results.csv (deflated 83%)
  adding: llama_model_v1/1_Pooling/ (stored 0%)
  adding: llama_model_v1/1_Pooling/config.json (deflated 49%)
  adding: llama_model_v1/special_tokens_map.json (deflated 42%)
  adding: llama_model_v1/modules.json (deflated 62%)
  adding: llama_model_v1/2_Normalize/ (stored 0%)
  adding: llama_model_v1/vocab.txt (deflated 53%)


In [None]:
# Colab-only helper: `google.colab` is not importable outside a Colab runtime.
from google.colab import files
# Download the model archive created in the previous cell.
files.download("/content/file.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Evaluating Embeddings Model

We'll evaluate both embedding models using the `InformationRetrievalEvaluator` from `sentence_transformers`.

The score we'll be looking at by default is `Mean Average Precision @ K` or `MAP@K`.

In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer
from pathlib import Path

def evaluate_st(
    dataset,
    model_id,
    name,
    output_path="results/",
):
    """Score a SentenceTransformer model on a QA embedding dataset.

    Args:
        dataset: Object exposing `queries`, `corpus` and `relevant_docs`
            attributes (the notebook passes an `EmbeddingQAFinetuneDataset`).
        model_id: Model identifier or local directory given to
            `SentenceTransformer`.
        name: Label forwarded to `InformationRetrievalEvaluator` as `name`.
        output_path: Directory passed to the evaluator as its output location;
            created (with parents) if it does not exist. Defaults to
            "results/", the value that was previously hard-coded.

    Returns:
        Whatever the evaluator call returns (a single float score in the
        notebook's recorded runs).
    """
    evaluator = InformationRetrievalEvaluator(
        dataset.queries, dataset.corpus, dataset.relevant_docs, name=name
    )
    model = SentenceTransformer(model_id)
    # Make sure the output directory exists before handing it to the evaluator.
    Path(output_path).mkdir(exist_ok=True, parents=True)
    return evaluator(model, output_path=output_path)

In [None]:
# Baseline score: the base model, before fine-tuning, on the validation dataset.
evaluate_st(val_dataset, "BAAI/bge-small-en-v1.5", name="bge")

0.7671918767507002

In [None]:
# Same evaluation for the fine-tuned model saved in the `llama_model_v1` directory.
evaluate_st(val_dataset, "llama_model_v1", name="finetuned")

0.7938725490196079

##  Sentence Window Retrieval

At a high level: 

1. We parse our document into sentence-wise nodes.
2. We find the most relevant sentence-wise nodes to our query.
3. We add additional context based on a "window" around that base sentence-wise node.
4. We use that enhanced context as context for our LLM!


In [None]:
from llama_index import ServiceContext, set_global_service_context
from llama_index.llms import OpenAI
from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
from llama_index.node_parser import SentenceWindowNodeParser, SimpleNodeParser

# window node parser: yields sentence-level nodes whose metadata carries the surrounding
# window under "window" and the sentence itself under "original_text"
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=6,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

# simple node parser with default settings, used for the non-windowed comparison
simple_node_parser = SimpleNodeParser.from_defaults()

# base Query Engine LLM (rebinds the `llm` name used earlier for question generation)
llm = OpenAI(model="gpt-3.5-turbo", temperature=0)

# fine-tuned Embeddings model, loaded from the fine-tuning output directory
embed_model = HuggingFaceEmbedding(
    model_name="llama_model_v1"
)

# base Embeddings model: the same checkpoint the fine-tune started from
# (BAAI/bge-small-en-v1.5), so the base-vs-fine-tuned comparison isolates the effect of
# fine-tuning rather than also comparing two different model releases
embed_model_base = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

# fine-tuned ServiceContext: query LLM + fine-tuned embeddings
ctx = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
)

# base ServiceContext: same query LLM, but the base (not fine-tuned) embeddings
ctx_base = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model_base
)

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Let's create nodes using our `node_parser` and `simple_node_parser` after loading our documents found in the `TRAIN_FILES` directory.

In [None]:
# Load every file in the training directory as llama_index documents.
documents = SimpleDirectoryReader(TRAIN_FILES).load_data()

In [None]:
# Sentence-window nodes (one per sentence, with the surrounding window in metadata).
nodes = node_parser.get_nodes_from_documents(documents)

In [None]:
# Default-parser nodes of the same documents, for the non-windowed comparison.
base_nodes = simple_node_parser.get_nodes_from_documents(documents)

Now we can create the respective `VectorStoreIndex` for each set of nodes.

In [None]:
from llama_index import VectorStoreIndex

# Index the sentence-window nodes using the fine-tuned-embedding service context.
sentence_index = VectorStoreIndex(nodes, service_context=ctx)

In [None]:
# Save the index to the local `sentence_index` directory.
sentence_index.storage_context.persist(persist_dir="sentence_index")

In [None]:
# Archive the persisted index directory for download.
!zip -r /content/sentence_index.zip sentence_index

  adding: sentence_index/ (stored 0%)
  adding: sentence_index/docstore.json (deflated 95%)
  adding: sentence_index/default__vector_store.json (deflated 65%)
  adding: sentence_index/graph_store.json (stored 0%)
  adding: sentence_index/index_store.json (deflated 68%)


In [None]:
# Uses `files` from the earlier `from google.colab import files` cell (Colab-only).
files.download("/content/sentence_index.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# NOTE(review): "base" here refers to the simple (non-windowed) nodes only; this index
# still embeds with `ctx`, i.e. the fine-tuned embedding model.
base_index = VectorStoreIndex(base_nodes, service_context=ctx)

In [None]:
# Save the index to the local `base_index` directory.
base_index.storage_context.persist(persist_dir="base_index")

In [None]:
# Archive the persisted index directory for download.
!zip -r /content/base_index.zip base_index

updating: base_index/ (stored 0%)
updating: base_index/docstore.json (deflated 78%)
updating: base_index/default__vector_store.json (deflated 60%)
updating: base_index/graph_store.json (stored 0%)
updating: base_index/index_store.json (deflated 67%)


In [None]:
# Uses `files` from the earlier `from google.colab import files` cell (Colab-only).
files.download("/content/base_index.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In the following step, we'll set up our `MetadataReplacementPostProcessor` which is what will replace our sentences (`original_text`) with our expanded contexts (`window`).

Remember, we're retrieving the `top_k` (3, in this case) sentences - and then converting them to their surrounding context.

In [None]:
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor

# Retrieve the 3 most similar sentence nodes, then let the post-processor swap in the
# text stored under each node's "window" metadata key.
query_engine = sentence_index.as_query_engine(
    similarity_top_k=3,
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
)

Let's look at a sample response!

In [None]:
# Sample query against the sentence-window engine.
window_response = query_engine.query("How do camelid genetics influence wool quality?")

In [None]:
# Display the generated answer text.
window_response.response

'Camelid genetics can influence wool quality in several ways. The selection process during the domestication of each species, such as llamas and alpacas, plays a role in determining the characteristics of their wool. For example, llamas were selected for greater body size and fiber weight rather than color uniformity or fiber fineness. Additionally, the large number of keratin genes and keratin-associated proteins that make up the fiber suggest that their relative composition and interactions are key determinants of fiber characteristics. The control of follicle cycling, which affects fiber growth, is also influenced by genetics, nutrition, and hormones. Overall, understanding camelid genetics can contribute to improving fleece characteristics and wool quality in these animals.'

We can also look at the visual representation of what happened, with our original sentence - and then our expanded context window.



In [None]:
# Pull both strings out of the first source node's metadata: the expanded window and
# the single sentence it was built around.
window, sentence = (
    window_response.source_nodes[0].node.metadata[key]
    for key in ("window", "original_text")
)

print(
    f"Window: {window}",
    "------------------",
    f"Original Sentence: {sentence}",
    sep="\n",
)

Window: 79 August 2022, Vol.  12, No.  4
be studied.  The purpose of this review is to update the reader 
on the current state of knowledge of fiber genetics in domestic South American camelids and to discuss how genomics and the emergence of modern technologies for sequencing and discovering genetic variants will contribute to the advancement in this field.
 Coat Color Genetics
Llamas and alpacas have more than 22 natural colors ran -
ging from black and brown through gray and fawn to white, including all intermediate shades.  Llamas present greater color variation compared to alpacas; tricolor phenotypes may be ob -
served and the presence of white spots is common in llamas.  Additionally, this variety of colors and patterns normally oc -
curs in the same herd, unlike alpaca’s herds that tend to be more homogeneous.  The difference can be attributed to the se -
lection process during the domestication of each species.  The 
llama, as a multipurpose animal, was selected for greater bo

Let's compare to the same query using the simple nodes.

In [None]:
# Same question against the simple-node index, with no window post-processing.
# NOTE(review): this rebinds `query_engine` and uses similarity_top_k=2, whereas the
# sentence-window engine above uses 3 -- confirm the difference is intentional.
query_engine = base_index.as_query_engine(similarity_top_k=2)
vector_response = query_engine.query("How do camelid genetics influence wool quality?")

In [None]:
# Display the generated answer text for comparison with the windowed answer.
vector_response.response

'Camelid genetics can influence wool quality in several ways. The genetic mechanisms underlying commercially important fiber traits, such as fleece type, color, and fineness, play a role in determining the quality of the wool produced. These traits are under genetic control and can be influenced by one or several genes. Additionally, the amount of fiber produced, known as fleece weight, is also impacted by camelid genetics. Breeders can use genetic understanding of these traits to select and improve productive characteristics, as well as to conserve and diversify the species.'

## Evaluating our Pipeline

We'll be leveraging LlamaIndex's evaluation tools to evaluate our pipeline today.

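We'll be relying on the [`DatasetGenerator`](https://github.com/run-llama/llama_index/blob/main/llama_index/evaluation/dataset_generation.py) to create our `QueryResponseDataset`. This walkthrough refers to `GPT-4`, but the evaluation service context in the cell below is configured with `gpt-3.5-turbo`; change the model name there if you want GPT-4.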

The dataset generated will be similar to before - which is a Question/Context dataset.

> NOTE: GPT-4 powered evaluation can be expensive and fairly time-consuming. Ensure you've scoped out cost before proceeding with evaluation.

In [None]:
import random
from llama_index.evaluation import (
    DatasetGenerator,
    QueryResponseDataset,
)

# the number of nodes to evaluate
num_nodes_eval = 10

# selecting a reproducible random sample of nodes: a locally seeded generator keeps the
# sample stable across kernel restarts without touching the global `random` state, and
# the size is clamped so a corpus smaller than `num_nodes_eval` cannot raise ValueError
sample_eval_nodes = random.Random(42).sample(
    base_nodes, min(num_nodes_eval, len(base_nodes))
)

# setting up our evaluation context; note this uses gpt-3.5-turbo -- swap the model
# name for "gpt-4" if you want GPT-4 powered generation and judging (slower, pricier)
eval_service_context = ServiceContext.from_defaults(llm=OpenAI(model="gpt-3.5-turbo"))

# creating our dataset generator: 2 questions are requested per sampled node
dataset_generator = DatasetGenerator(
    sample_eval_nodes,
    service_context=eval_service_context,
    show_progress=True,
    num_questions_per_chunk=2,
)

  dataset_generator = DatasetGenerator(


Now we can simply fire off our `dataset_generator` and wait!

In [None]:
# Asynchronously generate the evaluation dataset from the sampled nodes (top-level await).
eval_dataset = await dataset_generator.agenerate_dataset_from_nodes()


  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:03<00:28,  3.19s/it][A
 20%|██        | 2/10 [00:03<00:12,  1.52s/it][A
 30%|███       | 3/10 [00:03<00:06,  1.09it/s][A
 40%|████      | 4/10 [00:04<00:04,  1.42it/s][A
 60%|██████    | 6/10 [00:04<00:02,  1.82it/s][A
 70%|███████   | 7/10 [00:05<00:01,  1.77it/s][A
 80%|████████  | 8/10 [00:05<00:00,  2.23it/s][A
100%|██████████| 10/10 [00:05<00:00,  1.70it/s]

  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:02<00:02,  2.33s/it][A
100%|██████████| 2/2 [00:02<00:00,  1.37s/it]

  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:02<00:02,  2.19s/it][A
100%|██████████| 2/2 [00:04<00:00,  2.14s/it]

  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:02<00:02,  2.44s/it][A
100%|██████████| 2/2 [00:03<00:00,  1.84s/it]

  0%|          | 0/2 [00:00<?, ?it/s][A
100%|██████████| 2/2 [00:08<00:00,  4.25s/it]

  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1

In [None]:
# Persist the generated dataset so it can be reloaded instead of regenerated.
eval_dataset.save_json("llama_eval_qr_dataset.json")

In [None]:
# Reload the dataset from the JSON file written in the previous cell.
eval_dataset = QueryResponseDataset.from_json("llama_eval_qr_dataset.json")

  return cls(**data)


We'll be using the following standard evaluation metrics provided by LlamaIndex.

- CorrectnessEvaluator - [Code](https://github.com/run-llama/llama_index/blob/main/llama_index/evaluation/correctness.py)
- SemanticSimilarityEvaluator - [Code](https://github.com/run-llama/llama_index/blob/main/llama_index/evaluation/semantic_similarity.py)
- RelevancyEvaluator - [Code](https://github.com/run-llama/llama_index/blob/main/llama_index/evaluation/relevancy.py)
- FaithfulnessEvaluator - [Code](https://github.com/run-llama/llama_index/blob/main/llama_index/evaluation/faithfulness.py)

In [None]:
from llama_index.evaluation import (
    CorrectnessEvaluator,
    SemanticSimilarityEvaluator,
    RelevancyEvaluator,
    FaithfulnessEvaluator
)

# One evaluator per metric, all sharing the same evaluation service context.
evaluator_c = CorrectnessEvaluator(service_context=eval_service_context)
evaluator_s = SemanticSimilarityEvaluator(service_context=eval_service_context)
evaluator_r = RelevancyEvaluator(service_context=eval_service_context)
evaluator_f = FaithfulnessEvaluator(service_context=eval_service_context)

Next, we'll set up additional evaluation tools, these tools will mostly be used to make evaluating and collecting our evaluations a bit simpler. Thanks, LlamaIndex!

In [None]:
from llama_index.evaluation.eval_utils import get_responses, get_results_df
from llama_index.evaluation import BatchEvalRunner

# Upper bound on how many questions are sent through each pipeline below.
max_samples = 15

eval_qs = eval_dataset.questions
# Keep only the second element (the reference response) of each pair in `qr_pairs`.
ref_response_strs = [r for (_, r) in eval_dataset.qr_pairs]

Next up, we'll set up `QueryEngine`s for our two pipelines we wish to evaluate and let them predict!

First up is our SentenceWindow-MetaDataReplacement pipeline powered by fine-tuned embeddings.

In [None]:
# Rebuild the sentence-window engine (same settings as the earlier demo cell), since
# `query_engine` was rebound to the simple-node engine in between.
query_engine = sentence_index.as_query_engine(
    similarity_top_k=3,
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
)
# Answer the first `max_samples` evaluation questions with this pipeline.
pred_responses_finetuned_embeds = get_responses(
    eval_qs[:max_samples], query_engine, show_progress=True
)


  0%|          | 0/15 [00:00<?, ?it/s][A
  7%|▋         | 1/15 [00:03<00:53,  3.85s/it][A
 13%|█▎        | 2/15 [00:04<00:24,  1.85s/it][A
 27%|██▋       | 4/15 [00:04<00:08,  1.23it/s][A
 33%|███▎      | 5/15 [00:05<00:07,  1.37it/s][A
 40%|████      | 6/15 [00:06<00:07,  1.14it/s][A
 47%|████▋     | 7/15 [00:09<00:12,  1.55s/it][A
 53%|█████▎    | 8/15 [00:10<00:10,  1.51s/it][A
 60%|██████    | 9/15 [00:11<00:07,  1.27s/it][A
 67%|██████▋   | 10/15 [00:11<00:04,  1.08it/s][A
 73%|███████▎  | 11/15 [00:12<00:03,  1.16it/s][A
 80%|████████  | 12/15 [00:12<00:02,  1.41it/s][A
 87%|████████▋ | 13/15 [00:13<00:01,  1.30it/s][A
 93%|█████████▎| 14/15 [00:21<00:02,  2.94s/it][A
100%|██████████| 15/15 [00:23<00:00,  1.60s/it]


Next is our Simple Retrieval Base Embeddings pipeline.

In [None]:
# Comparison pipeline: simple nodes indexed with the base-embedding service context,
# no window post-processing, same top_k as the sentence-window engine.
base_index_base_embeddings = VectorStoreIndex(base_nodes, service_context=ctx_base)
base_embeddings_base_query_engine = base_index_base_embeddings.as_query_engine(
  similarity_top_k=3
)
# Answer the same first `max_samples` evaluation questions with this pipeline.
base_pred_responses_base_embedings = get_responses(
    eval_qs[:max_samples], base_embeddings_base_query_engine, show_progress=True
)


 50%|█████     | 1/2 [05:08<05:08, 308.61s/it]

  7%|▋         | 1/15 [00:02<00:34,  2.50s/it][A
 13%|█▎        | 2/15 [00:02<00:15,  1.19s/it][A
 27%|██▋       | 4/15 [00:03<00:07,  1.41it/s][A
 47%|████▋     | 7/15 [00:04<00:03,  2.14it/s][A
 53%|█████▎    | 8/15 [00:06<00:04,  1.41it/s][A
 60%|██████    | 9/15 [00:07<00:05,  1.20it/s][A
 67%|██████▋   | 10/15 [00:07<00:03,  1.52it/s][A
 73%|███████▎  | 11/15 [00:08<00:03,  1.20it/s][A
 80%|████████  | 12/15 [00:09<00:02,  1.38it/s][A
 87%|████████▋ | 13/15 [00:10<00:01,  1.22it/s][A
 93%|█████████▎| 14/15 [00:11<00:00,  1.11it/s][A
100%|██████████| 15/15 [00:18<00:00,  1.21s/it]


In [None]:
import numpy as np

# Plain-string versions of both pipelines' responses.
pred_response_strs_finetuned_embeds = list(map(str, pred_responses_finetuned_embeds))
base_pred_response_strs_base_embeds = list(map(str, base_pred_responses_base_embedings))

We'll create our evaluator dict, which will help create the appropriate `pd.DataFrame` in the final step, and set up our `BatchEvalRunner`, which will be used to evaluate our pipelines' responses with the evaluation LLM configured above (`gpt-3.5-turbo` as written)!

In [None]:
# Metric name -> evaluator. The keys match the metric names passed to
# `get_results_df` in the final step.
evaluator_dict = {
    "correctness": evaluator_c,
    "faithfulness": evaluator_f,
    "relevancy": evaluator_r,
    "semantic_similarity": evaluator_s,
}

# Runs every evaluator in the dict over a batch of responses.
batch_runner = BatchEvalRunner(evaluator_dict, workers=2, show_progress=True)

In [None]:
# Evaluate the base-embeddings pipeline; queries, responses and references are all cut
# to the same first `max_samples` entries.
base_eval_results_base_embeddings = await batch_runner.aevaluate_responses(
    queries=eval_qs[:max_samples],
    responses=base_pred_responses_base_embedings[:max_samples],
    reference=ref_response_strs[:max_samples],
)

100%|██████████| 60/60 [00:47<00:00,  1.27it/s]


In [None]:
# Evaluate the sentence-window / fine-tuned-embeddings pipeline on the same questions
# and references.
eval_results_finetuned_embeddings = await batch_runner.aevaluate_responses(
    queries=eval_qs[:max_samples],
    responses=pred_responses_finetuned_embeds[:max_samples],
    reference=ref_response_strs[:max_samples],
)

100%|██████████| 60/60 [01:02<00:00,  1.03s/it]


Finally we can look at our results, which I'll let speak for themselves!

In [None]:
# Arguments in order: the result sets, a display name for each (same order), and the
# metric keys to tabulate.
results_df = get_results_df(
    [
        base_eval_results_base_embeddings,
        eval_results_finetuned_embeddings],
    ["Base Retriever w Base Embeddings", "Sentence Window Retriever w FT Embeddings"],
    ["correctness", "relevancy", "faithfulness", "semantic_similarity"],
)

In [None]:
# Show the comparison table, highest semantic similarity first.
display(results_df.sort_values(by=['semantic_similarity'], ascending=False))

Unnamed: 0,names,correctness,relevancy,faithfulness,semantic_similarity
1,Sentence Window Retriever w FT Embeddings,4.133333,0.933333,0.666667,0.973979
0,Base Retriever w Base Embeddings,3.9,0.733333,0.266667,0.963818
