In [None]:
# Install bonito first
# !pip install -e git+https://github.com/BatsResearch/bonito#egg=bonito

We can start by downloading all our documentation from our GitBook docs located
at docs.zenml.io. We'll use the LangChain scraper to make this easy. In this
notebook, the resulting documents are loaded from a stored ZenML artifact version.

In [2]:
from zenml.client import Client

# ID of the ZenML artifact version to load.
DOCS_ARTIFACT_VERSION_ID = 'fb39d530-6655-4cc4-a2dd-a3a85af973e2'

# Look the artifact version up through the ZenML client, then load its contents.
zenml_client = Client()
artifact = zenml_client.get_artifact_version(DOCS_ARTIFACT_VERSION_ID)
loaded_artifact = artifact.load()

In [6]:
from rich import inspect

# Inspect the first element of the loaded artifact with rich's `inspect`.
inspect(loaded_artifact[0])

In [8]:
# Download the spaCy `en_core_web_sm` model that `spacy.load` uses in the next cell.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [9]:
import spacy

nlp = spacy.load("en_core_web_sm")  # Load the English model once; split_into_sentences reuses this pipeline

def split_into_sentences(text):
    """Split raw text into a list of non-empty sentence strings.

    Args:
        text: The text to segment.

    Returns:
        Sentence texts in document order, each stripped of leading and
        trailing whitespace. Sentences that are empty after stripping
        are dropped.
    """
    doc = nlp(text)  # Run the module-level spaCy pipeline over the text
    stripped = (sent.text.strip() for sent in doc.sents)
    # A whitespace-only span strips down to "", which would become an empty
    # row in anything built from this list, so skip it.
    return [sentence for sentence in stripped if sentence]

def process_documents(documents):
    """Collect the sentences of every document into one flat list.

    Args:
        documents: Iterable of objects exposing their text as ``page_content``.

    Returns:
        A single list holding each document's sentences, in document order.
    """
    return [
        sentence
        for document in documents
        for sentence in split_into_sentences(document.page_content)
    ]

# `loaded_artifact` is our list of LangChain Document objects
# Run every loaded document through the sentence splitter, then preview the first five results.
sentences = process_documents(loaded_artifact)
sentences[:5]

['GitHub Container Registry\n\nStoring container images in GitHub.',
 'The GitHub container registry is a\n\ncontainer registry\n\nflavor that comes built-in with ZenML and uses the\n\nGitHub Container Registry\n\nto store container images.',
 'When to use it\n\nYou should use the GitHub container registry if:\n\none or more components of your stack need to pull or push container images.',
 "you're using GitHub for your projects.",
 "If you're not using GitHub, take a look at the other container registry flavors."]

In [10]:
from datasets import Dataset

# Wrap the flat list of sentence strings in a single-column dataset;
# the column is named "sentence".
data = {"sentence": sentences}
dataset = Dataset.from_dict(data)

print(dataset)

Dataset({
    features: ['sentence'],
    num_rows: 8190
})


In [11]:
from bonito import Bonito, SamplingParams
from datasets import load_dataset

# Load the Bonito model by its checkpoint name.
bonito = Bonito("BatsResearch/bonito-v1")

# Sampling settings handed to the generation call below.
sampling_params = SamplingParams(
    n=1,
    temperature=0.5,
    top_p=0.95,
    max_tokens=256,
)

# Generate tasks using the "sentence" column of `dataset` as context.
# NOTE(review): "qg" is presumably Bonito's question-generation task type — confirm against the Bonito docs.
synthetic_dataset = bonito.generate_tasks(
    dataset,
    context_col="sentence",
    task_type="qg",
    sampling_params=sampling_params,
)