In [1]:
# Load IPython's autoreload extension; mode 2 picks up edits to imported
# modules live, so the kernel does not need a restart during development.
%load_ext autoreload
%autoreload 2

In [4]:
from importlib import resources
from openai import OpenAI
from datasets import Dataset
from instructlab.sdg.pipeline import (
    FULL_PIPELINES_PACKAGE,
    Pipeline,
    PipelineContext,
)

### Steps to run the teacher model 
Note: This section describes using `ilab` to host the model, but you can use any serving package that exposes an OpenAI-compatible API.
* Install instructlab and do `ilab init`
* Run `ilab model download`
* Run model serve :
    ```bash
    ilab model serve --model-path $HOME/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf
    ```


In [10]:
## Once the teacher model above is running, initialize an OpenAI client that will make calls to our local teacher.
# NOTE: keep `api_key` a plain string. A trailing comma after the literal
# would silently turn it into the one-element tuple ("EMPTY",).
api_key = "EMPTY"  # placeholder; presumably the local server does not validate it - confirm for your host
base_url = "http://localhost:8000/v1"
client = OpenAI(
    api_key=api_key,
    base_url=base_url,
)

# Use the first model reported by the server as the teacher. Fail with a
# readable message instead of an IndexError when nothing is being served.
served_models = client.models.list().data
if not served_models:
    raise RuntimeError(
        f"No models are being served at {base_url}; start the teacher model first."
    )
teacher_model = served_models[0].id

In [24]:
## Build the instructlab knowledge generation pipeline
model_family = "mixtral"

# Context object handed to the pipeline: the client, the model identifiers
# and the generation limits.
context = PipelineContext(
    client=client,
    model_family=model_family,
    model_id=teacher_model,
    num_instructions_to_generate=0,
    max_num_tokens=2048,
)

# Resolve knowledge.yaml inside the package named by FULL_PIPELINES_PACKAGE,
# then load the pipeline definition from it.
full_pipelines_dir = resources.files(FULL_PIPELINES_PACKAGE)
yaml_path = full_pipelines_dir.joinpath("knowledge.yaml")
knowledge_pipeline = Pipeline.from_file(context, yaml_path)

### Adding your document
* Use docling to parse your document
* Once the document is parsed, select any section from it and put that text in the list below

In [21]:
# Document sections to generate knowledge question/answer pairs from.
# The text is written as adjacent string literals, which Python joins into a
# single string. A triple-quoted block would keep the inner quote characters,
# line breaks and indentation as part of the document text.
documents = [
    (
        "The **tonsils** are a set of lymphoid organs facing into the aerodigestive tract, which is known as Waldeyer's tonsillar ring and consists of the adenoid tonsil or "
        "pharyngeal tonsil, two tubal tonsils, two palatine tonsils, and the lingual tonsils. These organs play an important role in the immune system. When used unqualified, the term"
        " most commonly refers specifically to the palatine tonsils, which are two lymphoid organs situated at either side of the back of the human throat. The palatine tonsils and the"
        " adenoid tonsil are organs consisting of lymphoepithelial tissue located near the oropharynx and nasopharynx parts of the throat"
    ),
]


### Preparing QNA yaml's seed example
* Next copy the seed example from your `qna.yaml`. 
* In our `qna.yaml` we write a list of seed examples where each element is context + 3 Question-Answers.
* Copy one of the context + 3QA and put it below
* `icl_document` is the context and `icl_query_x` and `icl_response_x` are the 3 QAs
* We will also add the `domain` and `document_outline` fields from the qna file.

In [22]:
# One seed example taken from qna.yaml: a context passage (`icl_document`),
# three question/answer pairs (`icl_query_N` / `icl_response_N`), and the
# `domain` and `document_outline` fields.
qna_seed_example = dict(
    icl_document=(
        "The **tonsils** are a set of lymphoid organs facing into the aerodigestive tract, which is known as Waldeyer's"
        " tonsillar ring and consists of the adenoid tonsil or pharyngeal tonsil, two tubal tonsils, two palatine tonsils, and the lingual tonsils."
    ),
    # Question/answer pair 1
    icl_query_1="what is the location of the tubal tonsils?",
    icl_response_1="The location of the tubal tonsils is the roof of the pharynx.",
    # Question/answer pair 2
    icl_query_2="How long does the adenoid grow?",
    icl_response_2="The adenoid grows until the age of 5, starts to shrink at the age of 7 and becomes small in adulthood.",
    # Question/answer pair 3
    icl_query_3="What is the immune systems first line of defense against ingested or inhaled foreign pathogens?",
    icl_response_3="The tonsils are the immune systems first line of defense.",
    # Metadata copied from the qna file
    domain="Anatomy",
    document_outline="Medical description of tonsils",
)

In [None]:
## Now let's prepare the final dataset for running the generation.
# Each row is a copy of the seed example plus one document under the
# `document` key; the seed dictionary itself is left unmodified.
knowledge_rows = [
    {**qna_seed_example, 'document': document}
    for document in documents
]
knowledge_dataset = Dataset.from_list(knowledge_rows)

In [25]:
## Run the generation
# Feed the prepared dataset through the knowledge pipeline. The result is
# indexed by row and then by column name in the following cell
# (e.g. samples[0]['question']).
# NOTE(review): the progress output recorded below suggests this takes a few
# minutes against a local teacher model - expect it to be slow.
samples = knowledge_pipeline.generate(knowledge_dataset)

gen_spellcheck Prompt Generation: 100%|██████████| 1/1 [00:06<00:00,  6.61s/it]
gen_knowledge Prompt Generation: 100%|██████████| 2/2 [00:41<00:00, 21.00s/it]
eval_faithfulness_qa_pair Prompt Generation: 100%|██████████| 8/8 [00:32<00:00,  4.04s/it]
eval_faithfulness_qa_pair Prompt Generation: 100%|██████████| 5/5 [00:20<00:00,  4.03s/it]
Map (num_proc=8): 100%|██████████| 8/8 [00:00<00:00, 56.69 examples/s]
Filter (num_proc=8): 100%|██████████| 8/8 [00:00<00:00, 73.59 examples/s]
num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.
Map (num_proc=5): 100%|██████████| 5/5 [00:00<00:00, 42.70 examples/s]
num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.
Filter (num_proc=5): 100%|██████████| 5/5 [00:00<00:00, 48.04 examples/s]
eval_relevancy_qa_pair Prompt Generation: 100%|██████████| 8/8 [00:37<00:00,  4.73s/it]
eval_relevancy_qa_pair Prompt Generation: 100%|██████████| 3/3 [00:12<00:00,  4.03s/it]
Map (num_proc=8): 100%|██████████| 8/8 [00:00<00:00, 57

In [26]:
# Show the first generated sample: the document, then the question, then the
# response, each separated by a line of asterisks.
first_sample = samples[0]
divider = "****************************************"
print(
    first_sample['document'],
    first_sample['question'],
    first_sample['response'],
    sep=f"\n{divider}\n",
)

Document:
The tonsils are a set of lymphoid organs facing into the aerodigestive tract, which is known as Waldeyer's tonsillar ring and consists of the adenoid tonsil or pharyngeal tonsil, two tubal tonsils, two palatine tonsils, and the lingual tonsils. These organs play an important role in the immune system. When used unqualified, the term most commonly refers specifically to the palatine tonsils, which are two lymphoid organs situated at either side of the back of the human throat. The palatine tonsils and the adenoid tonsil are organs consisting of lymphoepithelial tissue located near the oropharynx and nasopharynx parts of the throat.
****************************************
Which part of the throat do the palatine tonsils reside in?
****************************************
The palatine tonsils are situated at either side of the back of the human throat.

