<a href="https://colab.research.google.com/github/wsanjay/Interesting_notebooks_collection/blob/main/%F0%9F%A7%91%E2%80%8D%F0%9F%8C%BE_DataGrower_Grow_your_own_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🧑‍🌾 DataGrower - Grow your own datasets

You need a dataset for training LLMs. So why not grow one from scratch?

This notebook will help you build a synthetic dataset from scratch.

1. Define some instruction samples
2. Add your configuration and create the dataset.



In [None]:
# Seed instructions for the dataset, one prompt string per entry. Each entry
# is later wrapped as an {"instruction": ...} row when the pipeline is built.
DATA = [
    "What if the Beatles had never formed as a band?",
    "Given that f(x) = 5x^3 - 2x + 3, find the value of f(2).",
]

In [None]:
!pip install --upgrade -qqq git+https://github.com/argilla-io/distilabel.git

from distilabel.pipeline import Pipeline
from distilabel.llms.huggingface import InferenceEndpointsLLM
from distilabel.steps.generators.data import LoadDataFromDicts
from distilabel.steps.tasks import TextGeneration
from distilabel.steps.globals.huggingface import PushToHub
from google.colab import userdata

# Colab form cell: every `# @param` line below is an editable form field and
# the `# @markdown` lines are the labels rendered next to them. Keep each
# assignment and its `# @param` annotation together on a single line.
# @markdown ---
# @markdown ### 🌱 Specify the dataset to grow:

# @markdown 🤗 The Huggingface hub dataset repo to push to
REPO_ID = "burtenshaw/gone_and_growned_my_own_dataset" # @param {type:"string"}
# Name of the secret to look up below, not the token value itself.
HF_TOKEN_NAME = "HF_TOKEN" # @param {type:"string"}

# @markdown 💸 The repo id of an LLM with a free Inference API
# MODEL_ID is interpolated into the inference URL when the LLM is created.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"  # @param {type:"string"}

# @markdown 📖 Model Configuration
# Both values are forwarded as generation kwargs when the pipeline is run.
TEMPERATURE = 0.7 # @param {type:"slider", min:0, max:1, step:0.1}
MAX_TOKENS = 512 # @param {type:"slider", min:64, max:2048, step:64}
# Fetch the value stored under HF_TOKEN_NAME through google.colab's userdata;
# it is used as the API key for the inference LLM.
HF_TOKEN = userdata.get(HF_TOKEN_NAME)
# @markdown ---
# Assemble a two-step pipeline: a loader that emits one row per seed
# instruction, connected to a text-generation task backed by the model
# selected in MODEL_ID.
with Pipeline(
    description="A simple text generation pipeline",
    name="simple-text-generation-pipeline",
) as pipeline:

    # The LLM is addressed through the hosted inference URL for MODEL_ID and
    # authenticated with the token read from the notebook's secrets.
    inference_url = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
    llm = InferenceEndpointsLLM(base_url=inference_url, api_key=HF_TOKEN)

    # One {"instruction": ...} dict per entry of DATA, loaded in batches of one.
    instruction_rows = [{"instruction": sample} for sample in DATA]
    load_data = LoadDataFromDicts(
        name="load_data",
        data=instruction_rows,
        batch_size=1,
    )

    generate_with_mistral = TextGeneration(
        llm=llm,
        name="generate_with_mistral",
    )

    # Feed the loader's output into the generation task.
    load_data.connect(generate_with_mistral)


# Run the pipeline, supplying the LLM generation settings for the
# "generate_with_mistral" step from the values chosen in the form.
# NOTE(review): "num_generations" is passed among the LLM generation kwargs;
# confirm the installed distilabel accepts it there rather than as a
# task-level parameter.
generation_kwargs = {
    "temperature": TEMPERATURE,
    "max_new_tokens": MAX_TOKENS,
    "num_generations": 2,
}
run_parameters = {
    "generate_with_mistral": {
        "llm": {"generation_kwargs": generation_kwargs},
    },
}
distiset = pipeline.run(parameters=run_parameters)

# Publish the resulting dataset to the repo named by REPO_ID.
distiset.push_to_hub(REPO_ID)