In [None]:
%pip install -q -U git+https://github.com/huggingface/transformers.git
%pip install accelerate
%pip install -i https://pypi.org/simple/ bitsandbytes
%pip install sentence-transformers

%pip install llama-index llama-index-llms-huggingface llama-index-embeddings-huggingface llama-index-readers-web

%pip install llama-index-embeddings-openai
%pip install llama-index-embeddings-adapter
%pip install llama-index-finetuning


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m57.4 MB/s

In [None]:
import torch
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM


import os
import warnings

warnings.filterwarnings('ignore')

# Getting data

In [None]:
training_file_name = "Scaling_Laws_for_Downstream_Task_Performance_of_Large_Language_Models.pdf"
validation_file_name = "Unraveling_the_Mystery_of_Scaling_Laws.pdf"

working_dir = "./"

!wget 'https://arxiv.org/pdf/2402.04177.pdf'  -O "Scaling_Laws_for_Downstream_Task_Performance_of_Large_Language_Models.pdf"
!wget 'https://arxiv.org/pdf/2403.06563.pdf' -O "Unraveling_the_Mystery_of_Scaling_Laws.pdf"

TRAIN_FILES = [os.path.join(working_dir, training_file_name)]
TRAIN_CORPUS_FPATH = "./train_corpus.json"

VAL_FILES = [os.path.join(working_dir, validation_file_name)]
VAL_CORPUS_FPATH = "./val_corpus.json"

--2024-03-26 04:59:23--  https://arxiv.org/pdf/2402.04177.pdf
Resolving arxiv.org (arxiv.org)... 151.101.3.42, 151.101.131.42, 151.101.195.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.3.42|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1382773 (1.3M) [application/pdf]
Saving to: ‘Scaling_Laws_for_Downstream_Task_Performance_of_Large_Language_Models.pdf’


2024-03-26 04:59:23 (35.7 MB/s) - ‘Scaling_Laws_for_Downstream_Task_Performance_of_Large_Language_Models.pdf’ saved [1382773/1382773]

--2024-03-26 04:59:23--  https://arxiv.org/pdf/2403.06563.pdf
Resolving arxiv.org (arxiv.org)... 151.101.67.42, 151.101.131.42, 151.101.3.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.67.42|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 481993 (471K) [application/pdf]
Saving to: ‘Unraveling_the_Mystery_of_Scaling_Laws.pdf’


2024-03-26 04:59:24 (18.4 MB/s) - ‘Unraveling_the_Mystery_of_Scaling_Laws.pdf’ saved [481993/481993]



# Generating Synthetic Data

In [None]:
import json

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode

def load_corpus(files):
    """Load the given files and split them into nodes.

    Args:
        files: List of file paths handed to ``SimpleDirectoryReader`` as
            ``input_files``.

    Returns:
        The nodes produced by a default-configured ``SentenceSplitter``.
    """
    documents = SimpleDirectoryReader(input_files=files).load_data()
    splitter = SentenceSplitter()
    nodes = splitter.get_nodes_from_documents(documents, show_progress=True)
    print(f"Parsed {len(nodes)} nodes")
    return nodes

# One node list per split: training paper and held-out validation paper.
train_nodes = load_corpus(files=TRAIN_FILES)
val_nodes = load_corpus(files=VAL_FILES)

Parsing nodes:   0%|          | 0/17 [00:00<?, ?it/s]

Parsed 23 nodes


Parsing nodes:   0%|          | 0/13 [00:00<?, ?it/s]

Parsed 17 nodes


In [None]:
# bitsandbytes settings for loading the LLM in 4-bit: NF4 quantization type,
# double quantization enabled, float16 as the compute dtype.
quantization_conf = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Rebuild the chat prompt by hand so that the messages follow the format
# zephyr-7b-beta expects, as described in
# https://github.com/run-llama/llama_index/issues/9277#issuecomment-1837545398

def messages_to_prompt(messages):
    """Render chat messages into a single Zephyr-style prompt string.

    Each message must expose ``role`` and ``content``. Messages whose role is
    'system', 'user' or 'assistant' are rendered as ``<|role|>\\n{content}</s>\\n``;
    messages with any other role are skipped.

    Args:
        messages: Iterable of message objects.

    Returns:
        The concatenated prompt. It always begins with a system section (an
        empty one is prepended when the first rendered section is not a system
        message) and always ends with an open ``<|assistant|>`` header.
    """
    sections = []
    for msg in messages:
        if msg.role == 'system':
            sections.append(f"<|system|>\n{msg.content}</s>\n")
        elif msg.role == 'user':
            sections.append(f"<|user|>\n{msg.content}</s>\n")
        elif msg.role == 'assistant':
            sections.append(f"<|assistant|>\n{msg.content}</s>\n")

    rendered = "".join(sections)

    # The template needs a leading system block; insert a blank one if missing.
    if not rendered.startswith("<|system|>\n"):
        rendered = "<|system|>\n</s>\n" + rendered

    # Leave an open assistant header for the model to complete.
    return rendered + "<|assistant|>\n"

In [None]:
def huggingface_llm(model_name="HuggingFaceH4/zephyr-7b-beta",
                    tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
                    context_window=3900,
                    max_new_tokens=256,
                    quantization_config = quantization_conf
                   ):
    """Create a ``HuggingFaceLLM`` configured for a Zephyr-style chat model.

    Args:
        model_name: Hugging Face model id passed as ``model_name``.
        tokenizer_name: Hugging Face tokenizer id passed as ``tokenizer_name``.
        context_window: Value forwarded as ``context_window``.
        max_new_tokens: Value forwarded as ``max_new_tokens``.
        quantization_config: Quantization config forwarded to the model loader
            through ``model_kwargs``; defaults to the notebook-level
            ``quantization_conf``.

    Returns:
        The constructed ``HuggingFaceLLM`` instance.
    """
    return HuggingFaceLLM(
        model_name=model_name,
        tokenizer_name=tokenizer_name,
        query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
        context_window=context_window,
        max_new_tokens=max_new_tokens,
        model_kwargs={"quantization_config": quantization_config},
        # tokenizer_kwargs={},
        # `do_sample=True` is needed for temperature / top_k / top_p to have any
        # effect: unless the model's generation config enables sampling,
        # `generate` decodes greedily and ignores these sampling parameters.
        generate_kwargs={"do_sample": True, "temperature": 0.7, "top_k": 50, "top_p": 0.95},
        messages_to_prompt=messages_to_prompt,
        device_map="auto",
    )

llm = huggingface_llm()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [None]:
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

In [None]:
# from llama_index.core import Settings
# from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Settings.llm = llm
# Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

## Preparing synthetic-training dataset

In [None]:
# Question generation with the local LLM takes several minutes, so the result
# is persisted and reused on later runs. Delete TRAIN_CORPUS_FPATH to force
# regeneration (e.g. after changing the training document or the LLM).
if os.path.exists(TRAIN_CORPUS_FPATH):
    train_dataset = EmbeddingQAFinetuneDataset.from_json(TRAIN_CORPUS_FPATH)
else:
    train_dataset = generate_qa_embedding_pairs(train_nodes, llm)
    train_dataset.save_json(TRAIN_CORPUS_FPATH)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  4%|▍         | 1/23 [00:21<07:54, 21.56s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  9%|▊         | 2/23 [00:29<04:48, 13.72s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 13%|█▎        | 3/23 [00:37<03:36, 10.82s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 17%|█▋        | 4/23 [00:43<02:48,  8.87s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 22%|██▏       | 5/23 [00:50<02:31,  8.43s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 26%|██▌       | 6/23 [01:00<02:29,  8.78s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 30%|███       | 7/23 [01:09<02:21,  8.84s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 35%|███▍      | 8/23 [01:19<02:18,  9.22s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 39%|███▉      | 9/2

# Finetuning an embedding model

**GIST Large Embedding v0**:


In [None]:
from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
from llama_index.core.embeddings import resolve_embed_model
import torch

def embedding_model(model="local:avsolatorio/GIST-large-Embedding-v0",
                    model_output_path="model_output_test",
                    bias=True,
                    no_of_epochs=4,
                    verbose=True,
                    optimizer=torch.optim.AdamW,
                    optimizer_params={"lr": 0.01},
                    dataset=None
                   ):
    """Build an ``EmbeddingAdapterFinetuneEngine`` on top of a base embedding model.

    Args:
        model: Embedding model spec resolved with ``resolve_embed_model``.
        model_output_path: Forwarded to the engine as ``model_output_path``.
        bias: Forwarded to the engine as ``bias``.
        no_of_epochs: Forwarded to the engine as ``epochs``.
        verbose: Forwarded to the engine as ``verbose``.
        optimizer: Optimizer class forwarded as ``optimizer_class``.
        optimizer_params: Keyword arguments for the optimizer. A copy is passed
            on, so the shared default dict is never handed to the engine.
        dataset: QA dataset to train on. When ``None`` (the default) the
            notebook-level ``train_dataset`` is used, as before.

    Returns:
        The configured (not yet trained) finetune engine.
    """
    if dataset is None:
        # Backward-compatible fallback to the notebook global.
        dataset = train_dataset

    # Copy so neither the default dict nor a caller's dict is shared with the engine.
    params = dict(optimizer_params) if optimizer_params is not None else None

    base_embed_model = resolve_embed_model(model)
    finetune_engine = EmbeddingAdapterFinetuneEngine(
        dataset,
        base_embed_model,
        model_output_path=model_output_path,
        bias=bias,
        epochs=no_of_epochs,
        verbose=verbose,
        optimizer_class=optimizer,
        optimizer_params=params
    )

    return finetune_engine

finetune_engine = embedding_model()
finetune_engine.finetune()

config.json:   0%|          | 0.00/754 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

[1;3;34m> Prepared optimizer, scheduler, and loss model.
[0m

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

[1;3;34m> [Epoch 0] Current loss: 2.1367762088775635
[0m[1;3;34m> [Epoch 0] Current loss: 1.7131267786026
[0m[1;3;34m> [Epoch 0] Current loss: 1.9732509851455688
[0m[1;3;34m> [Epoch 0] Current loss: 2.442192792892456
[0m[1;3;34m> [Epoch 0] Current loss: 3.8278870582580566
[0m[1;3;34m> [Epoch 0] Current loss: 1.5449743270874023
[0m

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

[1;3;34m> [Epoch 1] Current loss: 2.535337448120117
[0m[1;3;34m> [Epoch 1] Current loss: 1.9999969005584717
[0m[1;3;34m> [Epoch 1] Current loss: 2.1655898094177246
[0m[1;3;34m> [Epoch 1] Current loss: 2.420592784881592
[0m[1;3;34m> [Epoch 1] Current loss: 1.4123070240020752
[0m[1;3;34m> [Epoch 1] Current loss: 1.2483347654342651
[0m

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

[1;3;34m> [Epoch 2] Current loss: 2.097515821456909
[0m[1;3;34m> [Epoch 2] Current loss: 1.575049638748169
[0m[1;3;34m> [Epoch 2] Current loss: 1.556888461112976
[0m[1;3;34m> [Epoch 2] Current loss: 2.337671995162964
[0m[1;3;34m> [Epoch 2] Current loss: 1.304823398590088
[0m[1;3;34m> [Epoch 2] Current loss: 1.235952615737915
[0m

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

[1;3;34m> [Epoch 3] Current loss: 1.888196349143982
[0m[1;3;34m> [Epoch 3] Current loss: 1.4202884435653687
[0m[1;3;34m> [Epoch 3] Current loss: 1.3001612424850464
[0m[1;3;34m> [Epoch 3] Current loss: 2.202766180038452
[0m[1;3;34m> [Epoch 3] Current loss: 1.2072255611419678
[0m[1;3;34m> [Epoch 3] Current loss: 1.219875454902649
[0m[1;3;34m> Finished training, saving to model_output_test
[0m

In [None]:
# Bind the adapter-wrapped model to a name so it can be reused instead of only
# being displayed; the bare expression on the last line still shows its repr.
embed_model = finetune_engine.get_finetuned_model()
embed_model

AdapterEmbeddingModel(model_name='Adapter for avsolatorio/GIST-large-Embedding-v0', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x78435d0b29b0>)

# Validation

In [None]:
# Same persistence guard as any expensive generation step: reuse the saved
# validation dataset when present, otherwise generate it and save it. Delete
# VAL_CORPUS_FPATH to force regeneration.
if os.path.exists(VAL_CORPUS_FPATH):
    val_dataset = EmbeddingQAFinetuneDataset.from_json(VAL_CORPUS_FPATH)
else:
    val_dataset = generate_qa_embedding_pairs(val_nodes, llm)
    val_dataset.save_json(VAL_CORPUS_FPATH)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  6%|▌         | 1/17 [00:22<06:07, 22.95s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 12%|█▏        | 2/17 [00:33<03:53, 15.54s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 18%|█▊        | 3/17 [00:42<02:55, 12.55s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 24%|██▎       | 4/17 [00:49<02:15, 10.44s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 29%|██▉       | 5/17 [00:59<02:01, 10.10s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 35%|███▌      | 6/17 [01:06<01:41,  9.23s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 41%|████      | 7/17 [01:20<01:48, 10.85s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 47%|████▋     | 8/17 [01:26<01:24,  9.34s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 53%|█████▎    | 9/1

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

In [None]:
def evaluate(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
    """Measure retrieval hits of an embedding model on a QA dataset.

    The dataset's corpus is indexed with ``embed_model``; every query is then
    retrieved against that index and checked for whether the first relevant
    document id of the query appears among the retrieved ids.

    Args:
        dataset: Object exposing ``corpus`` (id -> text), ``queries``
            (query id -> query text) and ``relevant_docs``
            (query id -> list of corpus ids).
        embed_model: Embedding model passed to ``VectorStoreIndex``.
        top_k: Number of nodes retrieved per query.
        verbose: Accepted but not used by this function.

    Returns:
        A list with one dict per query, with keys ``is_hit``, ``retrieved``,
        ``expected`` and ``query`` (the latter holds the query id).
    """
    corpus, queries, relevant_docs = (
        dataset.corpus,
        dataset.queries,
        dataset.relevant_docs,
    )

    # Keep the corpus ids on the nodes so retrieved ids are comparable to relevant_docs.
    index = VectorStoreIndex(
        [TextNode(id_=doc_id, text=doc_text) for doc_id, doc_text in corpus.items()],
        embed_model=embed_model,
        show_progress=True,
    )
    retriever = index.as_retriever(similarity_top_k=top_k)

    def score_query(query_id, query):
        """Retrieve for one query and record whether the expected id was found."""
        retrieved_ids = [hit.node.node_id for hit in retriever.retrieve(query)]
        expected_id = relevant_docs[query_id][0]  # assume 1 relevant doc
        return {
            "is_hit": expected_id in retrieved_ids,
            "retrieved": retrieved_ids,
            "expected": expected_id,
            "query": query_id,
        }

    return [score_query(qid, text) for qid, text in tqdm(queries.items())]

In [None]:
# Baseline: default OpenAI embedding model. This calls the OpenAI API, so it
# needs network access (the recorded run failed with a connection error).
ada = OpenAIEmbedding()
ada_val_results = evaluate(dataset=val_dataset, embed_model=ada)

Generating embeddings:   0%|          | 0/17 [00:00<?, ?it/s]



APIConnectionError: Connection error.

In [None]:
# Fraction of validation queries whose expected chunk was retrieved by ada.
df_ada = pd.DataFrame(data=ada_val_results)
hit_rate_ada = df_ada.loc[:, "is_hit"].mean()
hit_rate_ada

In [None]:
# Baseline: the GIST model without the finetuned adapter, referenced by its
# "local:" model spec string.
GIST_model = "local:avsolatorio/GIST-large-Embedding-v0"
GIST_val_results = evaluate(dataset=val_dataset, embed_model=GIST_model)
df_embed_models = pd.DataFrame(data=GIST_val_results)
hit_rate_bge = df_embed_models.loc[:, "is_hit"].mean()

Generating embeddings:   0%|          | 0/17 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 160.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 9.06 MiB is free. Process 23802 has 14.74 GiB memory in use. Of the allocated memory 14.50 GiB is allocated by PyTorch, and 101.77 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:

# Display the hit rate computed for the un-finetuned GIST baseline.
hit_rate_bge

In [None]:
# Evaluate the adapter-finetuned model. The model is taken from the finetune
# engine directly, and the hit rate is computed from the finetuned results
# frame rather than the baseline frame.
val_results_finetuned = evaluate(val_dataset, finetune_engine.get_finetuned_model())
df_embed_models_finetuned = pd.DataFrame(val_results_finetuned)
hit_rate_bge_finetuned = df_embed_models_finetuned["is_hit"].mean()
hit_rate_bge_finetuned

In [None]:
# Evaluate the finetuned adapter model and compute its hit rate from its own
# results; the baseline frame `df_embed_models` is left untouched.
results = evaluate(val_dataset, finetune_engine.get_finetuned_model())
df_results_finetuned = pd.DataFrame(results)
hit_rate = df_results_finetuned["is_hit"].mean()
hit_rate

Generating embeddings:   0%|          | 0/17 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 9.06 MiB is free. Process 23802 has 14.74 GiB memory in use. Of the allocated memory 14.59 GiB is allocated by PyTorch, and 14.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)