In [1]:
import itertools
from pathlib import Path
from pprint import pprint

import jsonlines
from datasets import load_dataset

In [2]:
# Open the Alpaca training split in streaming mode so records are yielded
# lazily while iterating.
instruction_tuned_dataset = load_dataset(
    "tatsu-lab/alpaca",
    split="train",
    streaming=True,
)

In [3]:
m = 5
print("Instruction-tuned dataset:")
# Materialize only the first m streamed records; later cells reuse top_m.
top_m = list(itertools.islice(instruction_tuned_dataset, m))
# Show just the first record as a sample (prints nothing if no records came back).
if top_m:
    pprint(top_m[0])

Instruction-tuned dataset:
{'input': '',
 'instruction': 'Give three tips for staying healthy.',
 'output': '1.Eat a balanced diet and make sure to include plenty of fruits '
           'and vegetables. \n'
           '2. Exercise regularly to keep your body active and strong. \n'
           '3. Get enough sleep and maintain a consistent sleep schedule.',
 'text': 'Below is an instruction that describes a task. Write a response that '
         'appropriately completes the request.\n'
         '\n'
         '### Instruction:\n'
         'Give three tips for staying healthy.\n'
         '\n'
         '### Response:\n'
         '1.Eat a balanced diet and make sure to include plenty of fruits and '
         'vegetables. \n'
         '2. Exercise regularly to keep your body active and strong. \n'
         '3. Get enough sleep and maintain a consistent sleep schedule.'}


### Prompt templates

In [4]:
# Template for records that come with extra context. It exposes `{instruction}`
# and `{input}` placeholders and stops right after the "### Response:" header,
# with no trailing newline.
prompt_template_with_input = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""

# Template for records without extra context: only an `{instruction}`
# placeholder, again ending right after the "### Response:" header.
prompt_template_without_input = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""

Hydrate prompts (fill each template with a record's instruction and input)

In [8]:
# processed_data = []
# for j in top_m:
def process_data(data_dict: dict) -> dict:
    """Turn one raw record into an input/output pair with a hydrated prompt.

    Args:
        data_dict: A record with "instruction", "input" and "output" keys.
            An empty (falsy) "input" means the record has no extra context.

    Returns:
        A new dict whose "input" is the filled-in prompt template and whose
        "output" is the record's "output" value, unchanged.

    Raises:
        KeyError: If one of the expected keys is missing from the record.
    """
    context = data_dict["input"]
    instruction = data_dict["instruction"]

    if context:
        # Extra context is present: use the template with the "### Input:" section.
        prompt = prompt_template_with_input.format(
            instruction=instruction, input=context
        )
    else:
        prompt = prompt_template_without_input.format(instruction=instruction)

    return {"input": prompt, "output": data_dict["output"]}

In [9]:
# Sanity check: confirm each sampled record is a plain dict, which is what
# process_data is annotated to accept.
type(top_m[0])

dict

In [10]:
# Hydrate every sampled record with its matching prompt template.
processed_data = [process_data(record) for record in top_m]

# Spot-check the first hydrated record.
pprint(processed_data[0])

{'input': 'Below is an instruction that describes a task. Write a response '
          'that appropriately completes the request.\n'
          '\n'
          '### Instruction:\n'
          'Give three tips for staying healthy.\n'
          '\n'
          '### Response:',
 'output': '1.Eat a balanced diet and make sure to include plenty of fruits '
           'and vegetables. \n'
           '2. Exercise regularly to keep your body active and strong. \n'
           '3. Get enough sleep and maintain a consistent sleep schedule.'}


Save data to JSON Lines (`.jsonl`)

In [11]:
# Persist the hydrated records as JSON Lines (one JSON object per line).
output_path = Path("json/alpaca_processed.jsonl")
# Opening the file for writing fails with FileNotFoundError when the target
# directory is missing, so create json/ first; this keeps a fresh
# Restart & Run All working.
output_path.parent.mkdir(parents=True, exist_ok=True)
with jsonlines.open(output_path, "w") as writer:
    writer.write_all(processed_data)

In [14]:
# Pssst! If you were curious how to upload your own dataset to Huggingface
# Here is how we did it

import pandas as pd
import datasets
from datasets import Dataset

# Build a DataFrame from the hydrated records (one row per record), wrap it
# in a Dataset, then publish it to the Hub repository named below.
# NOTE(review): pushing presumably requires Hub credentials for this
# namespace — confirm before re-running.
finetuning_frame = pd.DataFrame(data=processed_data)
finetuning_dataset = Dataset.from_pandas(finetuning_frame)
finetuning_dataset.push_to_hub("veeeeee/test-alpaca")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [22]:
# Load the uploaded dataset back from the Hub to check that the upload
# round-trips; the bare name on the last line displays its summary.
test = load_dataset(
    "veeeeee/test-alpaca",
    split="train",
)
test

Dataset({
    features: ['input', 'output'],
    num_rows: 5
})

In [25]:
# Inspect the row at index 4 — the last one, since the dataset summary
# displayed earlier reports 5 rows.
pprint(test[4])

{'input': 'Below is an instruction that describes a task. Write a response '
          'that appropriately completes the request.\n'
          '\n'
          '### Instruction:\n'
          'Describe a time when you had to make a difficult decision.\n'
          '\n'
          '### Response:',
 'output': 'I had to make a difficult decision when I was working as a project '
           'manager at a construction company. I was in charge of a project '
           'that needed to be completed by a certain date in order to meet the '
           'client’s expectations. However, due to unexpected delays, we were '
           'not able to meet the deadline and so I had to make a difficult '
           'decision. I decided to extend the deadline, but I had to stretch '
           'the team’s resources even further and increase the budget. '
           'Although it was a risky decision, I ultimately decided to go ahead '
           'with it to ensure that the project was completed on time and tha