In [None]:
%pip install -q -U distilabel "farm-haystack[preprocessing]"
%pip install -q -U "distilabel[hf-inference-endpoints, argilla]"
%pip install -q -U ollama openai

In [None]:
import os
from typing import Dict

from distilabel.llm import OllamaLLM
from distilabel.pipeline import Pipeline, pipeline
from distilabel.tasks import TextGenerationTask, SelfInstructTask, Prompt

from datasets import Dataset
from haystack.nodes import PDFToTextConverter, PreProcessor

In [11]:
# Keep an OPENAI_API_KEY that is already exported in the environment; only fall
# back to the placeholder when nothing is set. Prefer exporting the real key
# before starting Jupyter over pasting it here, and never commit a real token.
os.environ.setdefault("OPENAI_API_KEY", "YOUR_TOKEN_MIGHT_GO_HERE")

In [None]:
class QuestionAnsweringTask(TextGenerationTask):
    """Text-generation task that maps a ``question`` input to an ``answer`` output."""

    @property
    def input_args_names(self) -> list[str]:
        """Names of the arguments the task consumes."""
        return ["question"]

    @property
    def output_args_names(self) -> list[str]:
        """Names of the arguments the task produces."""
        return ["answer"]

    def generate_prompt(self, question: str) -> str:
        """Build a prompt from the task's system prompt and ``question``.

        The prompt is returned already rendered in the "openai" format.
        """
        prompt = Prompt(system_prompt=self.system_prompt, formatted_prompt=question)
        return prompt.format_as("openai")  # type: ignore

    def parse_output(self, output: str) -> Dict[str, str]:
        """Return the raw model output, stripped of surrounding whitespace, as the answer."""
        answer = output.strip()
        return {"answer": answer}

In [None]:
from distilabel.llm import OllamaLLM

# Question-answering LLM backed by Ollama. The "mixtral" model has to be
# available to the Ollama server beforehand, e.g. via `ollama pull mixtral`.
llm = OllamaLLM(task=QuestionAnsweringTask(), model="mixtral", prompt_format="openai")

In [None]:
# Smoke test: send one question through the question-answering LLM.
questions = [{"question": "What's the second most populated city in Denmark?"}]
generation = llm.generate(questions)

# Drill into the first result and display its parsed answer
# (presumably indexed by input, then by generation — confirm).
first_generation = generation[0][0]
first_generation["parsed_output"]["answer"]

In [None]:
# Install the requirements of ZenML's S3 integration (presumably needed so the
# artifact loaded in the next cell can be read from S3-backed storage — confirm).
!zenml integration install s3 -y

In [None]:
from zenml.client import Client

# Look up one specific ZenML artifact version by its ID and load its contents.
ARTIFACT_VERSION_ID = "86ba966e-66d1-4c79-a464-8bfff65300a0"

zenml_client = Client()
artifact = zenml_client.get_artifact_version(ARTIFACT_VERSION_ID)
loaded_artifact = artifact.load()

In [None]:
# Cleaning and splitting settings: strip empty lines, extra whitespace and
# repeated headers/footers, then split into chunks of 150 words while
# respecting sentence boundaries.
preprocessor_options = dict(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=150,
    split_respect_sentence_boundary=True,
)
preprocessor = PreProcessor(**preprocessor_options)

# Wrap each loaded document's text in the dict form passed to the preprocessor.
raw_texts = [{"content": document.page_content} for document in loaded_artifact]
docs = preprocessor.process(raw_texts)

In [None]:
# Keep only the text of every preprocessed chunk, then preview the first 500
# characters of the first one.
inputs = [chunk.content for chunk in docs]
inputs[0][:500]

In [None]:
# Single-column dataset ("input") holding the text chunks that the
# instruction-generation pipeline will consume.
input_columns = {"input": inputs}
instructions_dataset = Dataset.from_dict(input_columns)

instructions_dataset

In [None]:
# Description of the target application, handed to the self-instruct task.
APPLICATION_DESCRIPTION = "An assistant that can answer questions about the open-source MLOps framework ZenML."

instructions_task = SelfInstructTask(application_description=APPLICATION_DESCRIPTION)

In [None]:
# Generator LLM (Ollama-served "mixtral") running the self-instruct task,
# wrapped in a generation-only pipeline (no labeller).
instructions_generator = OllamaLLM(task=instructions_task, model="mixtral")
instructions_pipeline = Pipeline(generator=instructions_generator)

In [None]:
# Run the instruction pipeline over every text chunk: one generation per
# input, processed in batches of three.
generation_options = {"num_generations": 1, "batch_size": 3}
generated_instructions = instructions_pipeline.generate(
    dataset=instructions_dataset, **generation_options
)

In [None]:
# Flatten the nested "instructions" column (rows -> generations -> instructions)
# into one flat list.
instructions = []
for generations in generated_instructions["instructions"]:
    # NOTE(review): a row, or a single generation inside it, is presumably
    # None/empty when the model output could not be parsed — confirm. Skipping
    # those entries keeps one bad generation from aborting the whole cell.
    for generation in generations or []:
        if generation:
            instructions.extend(generation)

print(f"Number of generated instructions: {len(instructions)}")

# Preview the first few generated instructions.
for instruction in instructions[:5]:
    print(instruction)

In [None]:
# Inspect the first row of the generated dataset.
generated_instructions[0]

In [None]:
# Convert the generated instructions to their Argilla representation and
# display the first record.
instructions_rag_dataset = generated_instructions.to_argilla()
instructions_rag_dataset[0]

In [4]:
import argilla as rg
from argilla._constants import DEFAULT_API_KEY

# Argilla credentials. Values exported as ARGILLA_API_URL / ARGILLA_API_KEY take
# precedence so real credentials never have to be written into the notebook;
# the fallbacks are the values this notebook originally hard-coded.
api_url = os.environ.get(
    "ARGILLA_API_URL", "https://strickvl-argilla.hf.space"
)  # "https://<YOUR-HF-SPACE>.hf.space"
api_key = os.environ.get("ARGILLA_API_KEY", "admin.apikey")
# # Huggingface credentials
# hf_token = "hf_..."

# Connect the Argilla client to the server configured above.
rg.init(api_url=api_url, api_key=api_key)

# # If you want to use your private HF Space
# rg.init(extra_headers={"Authorization": f"Bearer {hf_token}"})



In [None]:
# Upload the generated instructions to the "admin" workspace of the connected
# Argilla server.
instructions_rag_dataset.push_to_argilla(
    name="ollama_instructions_zenml_rag",
    workspace="admin",
)

In [5]:
from distilabel.tasks import TextGenerationTask, UltraFeedbackTask

# The generator and the labeller run the same Ollama-served model with the same
# sampling settings; only the task they perform differs.
ollama_settings = {
    "model": "mixtral",
    "max_new_tokens": 256,
    "num_threads": 2,
    "temperature": 0.3,
}

# Preference pipeline: the generator answers each instruction and the labeller
# rates the answers with the UltraFeedback instruction-following task.
preference_pipeline = pipeline(
    "preference",
    "instruction-following",
    generator=OllamaLLM(task=TextGenerationTask(), **ollama_settings),
    labeller=OllamaLLM(
        task=UltraFeedbackTask.for_instruction_following(),
        **ollama_settings,
    ),
    max_new_tokens=256,
    num_threads=2,
    # api_key=os.getenv("OPENAI_API_KEY", None),
    temperature=0.0,
)

In [6]:
# Fetch the instructions dataset back from the "admin" workspace in Argilla,
# pull a local copy and convert it to the "datasets" format.
remote_dataset = rg.FeedbackDataset.from_argilla(
    "ollama_instructions_zenml_rag",
    workspace="admin",
)
instructions_dataset = remote_dataset.pull().format_as("datasets")

instructions_dataset

Dataset({
    features: ['input', 'instructions', 'instruction-rating', 'instruction-rating-suggestion', 'instruction-rating-suggestion-metadata', 'external_id', 'metadata', 'vectors'],
    num_rows: 6154
})

In [7]:
# Inspect the first row of the dataset pulled from Argilla.
instructions_dataset[0]

{'input': 'An end-to-end project\n\nPut your new knowledge in action with an end-to-end project\n\nThat was awesome! We learned so many advanced MLOps production concepts:\n\nThe value of deploying ZenML\u200b\n\nAbstracting infrastructure configuration into stacks\u200b\n\n\u200bConnecting remote storage\u200b\n\n\u200bOrchestrating on the cloud\u200b\n\n\u200bConfiguring the pipeline to scale compute\u200b\n\n\u200bConnecting a git repository\u200b\n\nWe will now combine all of these concepts into an end-to-end MLOps project powered by ZenML.\n\nGet started\n\nStart with a fresh virtual environment with no dependencies. Then let\'s install our dependencies:\n\npip install "zenml[templates,server]" notebook\n\nzenml integration install sklearn -y\n\nWe will then use\n\nZenML templates\n\nto help us get the code we need for the project:\n\nmkdir zenml_batch_e2e\n\ncd zenml_batch_e2e\n\nzenml init --template e2e_batch --template-with-defaults\n\n# Just in case, we install the requiremen

In [8]:
# Shift the columns into the layout used by the preference pipeline: the source
# text becomes "context" and each generated instruction becomes the "input".
# The guard keeps the cell idempotent: once renamed there is no "instructions"
# column left, so running the rename a second time would raise.
if "instructions" in instructions_dataset.column_names:
    instructions_dataset = instructions_dataset.rename_columns(
        {"input": "context", "instructions": "input"}
    )

In [9]:
# Run the preference pipeline: two generations per instruction, in batches of
# eight, with a progress bar.
preference_options = {
    "num_generations": 2,
    "batch_size": 8,
    "display_progress_bar": True,
}
preference_dataset = preference_pipeline.generate(
    instructions_dataset,  # type: ignore
    **preference_options,
)

  return self._generate(


Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

Output()

In [10]:
# Inspect the first row of the generated preference dataset.
preference_dataset[0]

In [None]:
# Uploading the Preference Dataset
preference_rg_dataset = preference_dataset.to_argilla()

# Adding the context as a metadata property in the new Feedback dataset, as this
# information will be useful later. Records are paired with rows by position.
for record_feedback, record_huggingface in zip(
    preference_rg_dataset, preference_dataset
):
    record_feedback.metadata["context"] = record_huggingface["context"]

# Push under a dedicated name: "ollama_instructions_zenml_rag" in the "admin"
# workspace already holds the instructions dataset uploaded earlier in this
# notebook, so reusing that name would collide with it.
preference_rg_dataset.push_to_argilla(
    name="ollama_preference_zenml_rag", workspace="admin"
)

In [None]:
# Publish the Argilla preference dataset to Hugging Face under the given repo id.
preference_rg_dataset.push_to_huggingface("strickvl/ollama_instructions_zenml_rag")