# Finetune an embeddings model using ZenML Data

In this notebook, we generate a synthetic dataset of (query, relevant document) pairs from a corpus of documents *without human labelers* by leveraging an LLM, and then use it to fine-tune an embeddings model.

In [1]:
from llama_index import SimpleWebPageReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import MetadataMode
from zenml import step
from typing import Any, Annotated, List, Tuple, Dict

In [4]:
!zenml init

[1;35mNote: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.[0m
[1;35mNumExpr defaulting to 8 threads.[0m
[?25l[32m⠋[0m Initializing ZenML repository at /home/wjayesh/apps/zenml-rag-llm-finetune.
[2K[1A[2K[32m⠙[0m Initializing ZenML repository at /home/wjayesh/apps/zenml-rag-llm-finetune.
[2K[1A[2K[32m⠹[0m Initializing ZenML repository at /home/wjayesh/apps/zenml-rag-llm-finetune.
[2K[1A[2K[32m⠸[0m Initializing ZenML repository at /home/wjayesh/apps/zenml-rag-llm-finetune.
[1;35mSetting the repo active workspace to 'default'.[0m
[33mSetting the repo active stack to default.[0m
[2K[1A[2K[2;36mZenML repository initialized at [0m[2;35m/home/wjayesh/apps/[0m[2;95mzenml-rag-llm-finetune.[0m
[2;32m⠼[0m[2;36m Initializing ZenML repository at /home/wjayesh/apps/zenml-rag-llm-finetune.[0m
[2K[1A[2K[32m⠼[0m Initializing ZenML repository at /home/wjayesh/apps/zenml-rag-llm-finetune.

[1A[2K[1A[2K[2;36mThe l

#### Scrape all URLs

In [2]:
from steps.url_scraper.url_scraping_utils import get_all_pages, get_nested_readme_urls


@step(enable_cache=True)
def url_scraper(
    docs_url: str = "",
    repo_url: str = "",
    release_notes_url: str = "",
    website_url: str = "",
) -> Tuple[Annotated[List, "train_urls"], Annotated[List, "val_urls"]]:
    """Generates the lists of URLs to scrape for training and validation.

    NOTE: the full crawl is currently disabled. Only ``website_url`` is used,
    and the same single-element list is returned for both splits, so the
    validation set is not held out from the training set.

    Args:
        docs_url: URL to the documentation (currently unused).
        repo_url: URL to the repository (currently unused).
        release_notes_url: URL to the release notes (currently unused).
        website_url: URL to the website.

    Returns:
        A tuple ``(train_urls, val_urls)`` of URL lists.
    """
    # Disabled full crawl with an 80/20 train/val split, kept for reference:
    # examples_readme_urls = get_nested_readme_urls(repo_url)
    # docs_urls = get_all_pages(docs_url, finetuning=True)
    # website_urls = get_all_pages(website_url, finetuning=True)
    # all_urls = docs_urls + website_urls + [release_notes_url]
    # train_urls = all_urls[: int(0.8 * len(all_urls))]
    # val_urls = all_urls[int(0.8 * len(all_urls)) :]

    return [website_url], [website_url]

#### Load the contents of the URLs

In [3]:
@step()
def load_corpus(urls: List[str], verbose=False) -> Dict[str, str]:
    """Download the given pages and split them into text nodes.

    Args:
        urls: URLs of the web pages to load.
        verbose: When true, print progress messages and show the parser's
            progress bar.

    Returns:
        Mapping from node id to the node's content, fetched with
        ``MetadataMode.NONE``.
    """
    if verbose:
        print(f"Loading URLs {urls}")

    # Pages are converted from HTML to plain text while loading.
    documents = SimpleWebPageReader(html_to_text=True).load_data(urls)
    if verbose:
        print(f"Loaded {len(documents)} docs")

    node_parser = SimpleNodeParser.from_defaults()
    parsed_nodes = node_parser.get_nodes_from_documents(
        documents, show_progress=verbose
    )
    if verbose:
        print(f"Parsed {len(parsed_nodes)} nodes")

    corpus: Dict[str, str] = {}
    for parsed in parsed_nodes:
        corpus[parsed.node_id] = parsed.get_content(
            metadata_mode=MetadataMode.NONE
        )
    return corpus

#### Generate Queries

In [4]:
import re
import uuid

from llama_index.llms import OpenAI
from llama_index.schema import MetadataMode
from tqdm.notebook import tqdm


@step()
def generate_queries(
    corpus: Dict[str, str],
    num_questions_per_chunk: int = 2,
    prompt_template: str = "",
    verbose=False,
) -> Tuple[Dict[str, str], Dict[str, List[str]]]:
    """
    Automatically generate hypothetical questions that could be answered with
    doc in the corpus.

    Args:
        corpus: Mapping from node id to the text of that node.
        num_questions_per_chunk: Number of questions requested from the LLM
            for each node.
        prompt_template: Optional template with ``{context_str}`` and
            ``{num_questions_per_chunk}`` placeholders. The built-in template
            is used when this is empty.
        verbose: Currently unused.

    Returns:
        A tuple ``(queries, relevant_docs)``: ``queries`` maps a generated
        question id to the question text, and ``relevant_docs`` maps the same
        id to a one-element list holding the id of the source node.
    """
    # Local import so this step does not depend on another cell importing os.
    import os

    # Read the key from the environment rather than hard-coding a credential
    # in the notebook.
    llm = OpenAI(model="gpt-3.5-turbo", api_key=os.environ.get("OPENAI_API_KEY"))

    prompt_template = (
        prompt_template
        or """\
    Context information is below.
    
    ---------------------
    {context_str}
    ---------------------
    
    Given the context information and not prior knowledge.
    generate only questions based on the below query.
    
    You are a Teacher/ Professor. Your task is to setup \
    {num_questions_per_chunk} questions for an upcoming \
    quiz/examination. The questions should be diverse in nature \
    across the document. Restrict the questions to the \
    context information provided."
    """
    )

    queries = {}
    relevant_docs = {}
    for node_id, text in tqdm(corpus.items()):
        query = prompt_template.format(
            context_str=text, num_questions_per_chunk=num_questions_per_chunk
        )
        response = llm.complete(query)

        # One candidate question per response line; strip a leading
        # enumeration marker such as "1." or "2)" and drop empty lines.
        result = str(response).strip().split("\n")
        questions = [
            re.sub(r"^\d+[\).\s]", "", question).strip() for question in result
        ]
        questions = [question for question in questions if len(question) > 0]

        for question in questions:
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [node_id]
    return queries, relevant_docs

#### Merge Data

In [5]:
@step()
def merge_data(
    train_corpus: Dict[str, str],
    train_queries: Dict[str, str],
    train_relevant_docs: Dict[str, List[str]],
    val_corpus: Dict[str, str],
    val_queries: Dict[str, str],
    val_relevant_docs: Dict[str, List[str]],
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Bundle the corpus, queries and relevant docs of each split into a dataset.

    Args:
        train_corpus: Training corpus.
        train_queries: Training queries.
        train_relevant_docs: Relevant docs for the training queries.
        val_corpus: Validation corpus.
        val_queries: Validation queries.
        val_relevant_docs: Relevant docs for the validation queries.

    Returns:
        A tuple ``(train_dataset, val_dataset)``; each is a dict with the keys
        ``"queries"``, ``"corpus"`` and ``"relevant_docs"``.
    """

    def _bundle(queries, corpus, relevant_docs) -> Dict[str, Any]:
        # Assemble one split's dataset dict.
        return {
            "queries": queries,
            "corpus": corpus,
            "relevant_docs": relevant_docs,
        }

    return (
        _bundle(train_queries, train_corpus, train_relevant_docs),
        _bundle(val_queries, val_corpus, val_relevant_docs),
    )


#### Generate training examples

In [6]:
from torch.utils.data import DataLoader
from sentence_transformers import InputExample


@step()
def generate_training_examples(
    dataset: Dict[str, Any], batch_size: int = 10
) -> DataLoader:
    """Turn a dataset into a DataLoader of (query, text) training pairs.

    Args:
        dataset: Dict with the keys "corpus", "queries" and "relevant_docs".
        batch_size: Batch size passed to the DataLoader.

    Returns:
        DataLoader over one InputExample per query.
    """
    corpus, queries, relevant_docs = (
        dataset[key] for key in ("corpus", "queries", "relevant_docs")
    )

    # Each query is paired with the text of its first relevant node only.
    pairs = [
        InputExample(texts=[question, corpus[relevant_docs[question_id][0]]])
        for question_id, question in queries.items()
    ]

    return DataLoader(pairs, batch_size=batch_size)

#### Create an evaluator

In [7]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator


@step()
def create_evaluator(dataset: Dict[str, Any]) -> InformationRetrievalEvaluator:
    """Build an information-retrieval evaluator from the dataset.

    Args:
        dataset: Dict with the keys "corpus", "queries" and "relevant_docs".

    Returns:
        InformationRetrievalEvaluator constructed from the dataset's queries,
        corpus and relevant docs.
    """
    corpus, queries, relevant_docs = (
        dataset[key] for key in ("corpus", "queries", "relevant_docs")
    )
    return InformationRetrievalEvaluator(queries, corpus, relevant_docs)

#### Fine tune an embeddings model

In [2]:
from typing import Optional
from sentence_transformers import SentenceTransformer, losses


@step()
def finetune_sentencetransformer_model(
    loader: DataLoader,
    evaluator: InformationRetrievalEvaluator,
    EPOCHS: int = 2,
    model_id: Optional[str] = "BAAI/bge-small-en",
) -> SentenceTransformer:
    """Fine-tune a SentenceTransformer with MultipleNegativesRankingLoss.

    Args:
        loader: DataLoader yielding the training examples.
        evaluator: Evaluator passed to ``fit`` and run every 50 steps.
        EPOCHS: Number of training epochs.
        model_id: Identifier of the model to load and fine-tune.

    Returns:
        The fitted SentenceTransformer.
    """
    encoder = SentenceTransformer(model_id)
    ranking_loss = losses.MultipleNegativesRankingLoss(model=encoder)

    # Warm up for 10% of len(loader) * EPOCHS, truncated to an int.
    total_steps = len(loader) * EPOCHS
    n_warmup = int(total_steps * 0.1)

    encoder.fit(
        train_objectives=[(loader, ranking_loss)],
        epochs=EPOCHS,
        warmup_steps=n_warmup,
        show_progress_bar=True,
        evaluator=evaluator,
        evaluation_steps=50,
    )

    return encoder

#### Define a pipeline

In [10]:
from zenml import pipeline

@pipeline(name="finetuning_pipeline", enable_cache=True)
def finetuning_pipeline(
    docs_url: str = "",
    repo_url: str = "",
    release_notes_url: str = "",
    website_url: str = "",
):
    """Wire the steps together: scrape, load, generate queries, merge, train.

    Args:
        docs_url: URL to the documentation.
        repo_url: URL to the repository.
        release_notes_url: URL to the release notes.
        website_url: URL to the website.
    """
    train_urls, val_urls = url_scraper(
        docs_url, repo_url, release_notes_url, website_url
    )

    # The same step is invoked twice, so each invocation gets its own id.
    train_corpus = load_corpus(train_urls, id="train_loader")
    val_corpus = load_corpus(val_urls, id="val_loader")

    train_queries, train_relevant_docs = generate_queries(
        train_corpus, id="train_queries_generator"
    )
    val_queries, val_relevant_docs = generate_queries(
        val_corpus, id="val_queries_generator"
    )

    train_dataset, val_dataset = merge_data(
        train_corpus,
        train_queries,
        train_relevant_docs,
        val_corpus,
        val_queries,
        val_relevant_docs,
    )

    train_loader = generate_training_examples(train_dataset)
    ir_evaluator = create_evaluator(val_dataset)
    finetune_sentencetransformer_model(train_loader, ir_evaluator)
    

In [1]:
!zenml stack describe

[1;35mNote: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.[0m
[1;35mNumExpr defaulting to 8 threads.[0m
[?25l[3m        Stack Configuration        [0m
┏━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━┓
┃[1m [0m[1mCOMPONENT_TYPE[0m[1m [0m│[1m [0m[1mCOMPONENT_NAME[0m[1m [0m┃
┠────────────────┼────────────────┨
┃ ARTIFACT_STORE │ default        ┃
┠────────────────┼────────────────┨
┃ ORCHESTRATOR   │ default        ┃
┗━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━┛
[2;3m     'default' stack (ACTIVE)      [0m
[32m⠋[0m Describing the stack...
[2K[1A[2K[2;36mStack [0m[2;32m'default'[0m[2;36m with id [0m[2;32m'116dfeaa-40e5-47f2-b7c1-929aed5ee49f'[0m[2;36m is owned by user [0m
[2;36mdefault and is [0m[2;32m'private'[0m[2;36m.[0m
[2;32m⠋[0m[2;36m Describing the stack...[0m
[2K[1A[2K[32m⠋[0m Describing the stack...

[1A[2K[1A[2K[33mYou can display various ZenML entities including pipelines, runs, stacks and [0m
[33mmuch m

#### Call the pipeline

In [29]:
import os
from getpass import getpass

# Never store a real key in the notebook. Reuse OPENAI_API_KEY when it is
# already exported; otherwise prompt for it without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [30]:
# ZenML release used to build the docs, examples and release-notes URLs.
version = "0.47.0"

website_url = "https://zenml.io"
docs_url = f"https://docs.zenml.io/v/{version}/"
release_notes_url = f"https://github.com/zenml-io/zenml/blob/{version}/RELEASE_NOTES.md"
repo_url = f"https://github.com/zenml-io/zenml/tree/{version}/examples"

# Run the pipeline with the URLs defined above.
finetuning_pipeline(
    docs_url=docs_url,
    repo_url=repo_url,
    release_notes_url=release_notes_url,
    website_url=website_url,
)

[1;35mInitiating a new run for the pipeline: [0m[1;36mfinetuning_pipeline[1;35m.[0m
[1;35mRegistered new version: [0m[1;36m(version 5)[1;35m.[0m
[1;35mExecuting a new run.[0m
[1;35mUsing user: [0m[1;36mdefault[1;35m[0m
[1;35mUsing stack: [0m[1;36mdefault[1;35m[0m
[1;35m  artifact_store: [0m[1;36mdefault[1;35m[0m
[1;35m  orchestrator: [0m[1;36mdefault[1;35m[0m
[1;35mCaching [0m[1;36menabled[1;35m explicitly for [0m[1;36murl_scraper[1;35m.[0m
[1;35mUsing cached version of [0m[1;36murl_scraper[1;35m.[0m
[1;35mStep [0m[1;36murl_scraper[1;35m has started.[0m
[1;35mUsing cached version of [0m[1;36mtrain_loader[1;35m.[0m
[1;35mStep [0m[1;36mtrain_loader[1;35m has started.[0m
[1;35mUsing cached version of [0m[1;36mval_loader[1;35m.[0m
[1;35mStep [0m[1;36mval_loader[1;35m has started.[0m
[1;35mStep [0m[1;36mtrain_queries_generator[1;35m has started.[0m


  0%|          | 0/7 [00:00<?, ?it/s]

[1;35mStep [0m[1;36mtrain_queries_generator[1;35m has finished in [0m[1;36m10.090s[1;35m.[0m
[1;35mStep [0m[1;36mval_queries_generator[1;35m has started.[0m


  0%|          | 0/7 [00:00<?, ?it/s]

[1;35mStep [0m[1;36mval_queries_generator[1;35m has finished in [0m[1;36m10.446s[1;35m.[0m
[1;35mStep [0m[1;36mmerge_data[1;35m has started.[0m
[1;35mStep [0m[1;36mmerge_data[1;35m has finished in [0m[1;36m0.175s[1;35m.[0m
[1;35mStep [0m[1;36mcreate_evaluator[1;35m has started.[0m
[33mNo materializer is registered for type [0m[1;36m<class 'sentence_transformers.evaluation.InformationRetrievalEvaluator.InformationRetrievalEvaluator'>[33m, so the default Pickle materializer was used. Pickle is not production ready and should only be used for prototyping as the artifacts cannot be loaded when running with a different Python version. Please consider implementing a custom materializer for type [0m[1;36m<class 'sentence_transformers.evaluation.InformationRetrievalEvaluator.InformationRetrievalEvaluator'>[33m according to the instructions at https://docs.zenml.io/user-guide/advanced-guide/artifact-management/handle-custom-data-types[0m
[1;35mStep [0m[1;36m

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.2k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

[1;35mUse pytorch device: cpu[0m


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

: 

In [12]:
from zenml.client import Client

pipeline_model = Client().get_pipeline(
    name_id_or_prefix="finetuning_pipeline"
)

# you can additionally pass in the version if you want
# to move between different pipeline implementations.
# pipeline_model = Client().get_pipeline(
#     name_id_or_prefix="finetuning_pipeline", version="9"
# )

# Truthiness covers both "no runs attribute value" (None) and an empty list,
# so indexing [0] below cannot fail on a pipeline that has never run.
if pipeline_model.runs:
    # get the last run
    last_run = pipeline_model.runs[0]
    # get the step that generated the training queries
    queries_step = last_run.steps["train_queries_generator"]

    try:
        queries = queries_step.outputs["output_0"].load()
    except ValueError as err:
        # Report the failure instead of hitting a NameError on `queries`.
        print(f"Could not load the generated queries: {err}")
    else:
        print(queries)
else:
    print("No runs found for pipeline 'finetuning_pipeline'.")

{'a52951f0-9eae-4a53-b639-dd95f00c1e2e': 'What are some key features of ZenML that make it stand out from other ML orchestrators?', '6920463a-4630-4824-bcf1-f434567f9231': 'How does ZenML simplify the process of developing MLOps workflows?', 'c0a42c2b-d248-4cb2-8440-5450e99928b8': 'Which companies are featured in the context information provided?', '0675d6cd-eabc-4b6f-a574-a24363558b6a': 'What are the different types of logos or images included in the context information?', 'a9b76828-1b88-4d9d-abc6-f33e8ff4d548': 'What are some challenges that companies face when trying to implement machine learning internally?', '4a34ad70-df82-4791-912a-486e3ea41657': 'How does ZenML simplify the ML workflow for teams and bridge the gap between data science and operations?', '4db62f2e-e69d-4d27-8238-250fc885df77': 'What is the purpose of ZenML in the MLOps landscape? How does it differ from other tools?', 'e943b2d3-2c6c-4c57-8979-f516e8d750f7': 'How can ZenML be used to connect and organize data, mode