In [None]:
%%bash
pip install datasets

## Load and prepare datasets

In [None]:
from datasets import load_dataset
import random
import re
import csv
from datetime import datetime
import random
import copy

In [None]:
# Load the source datasets that prompts are sampled from further down.
# The first three are loaded as a single held-out split (validation / test).
tiny_stories_dataset = load_dataset("roneneldan/TinyStories", split="validation")
# SlimOrca is not loaded here. The SlimOrca section at the end of the notebook
# reads `SlimOrca_dataset`, so that name must be defined (e.g. by re-enabling
# the line below) before those cells are run.
# SlimOrca_dataset = load_dataset("Open-Orca/SlimOrca")
truthful_qa_dataset = load_dataset("truthful_qa", "generation", split="validation")
sciq_dataset = load_dataset("sciq", split="test")
# No `split` is passed, so later cells index the result by split name
# (`wikipedia_dataset['train']`).
# NOTE(review): "20231101.en" presumably fetches the full English dump, which
# would be a very large download just to sample a few hundred prompts — confirm.
wikipedia_dataset = load_dataset("wikimedia/wikipedia", "20231101.en")

## Utils

In [None]:
def normalize_string(text):
  """
  Normalizes the whitespace of a string.

  Three steps are applied in order:
    1. Leading and trailing whitespace is stripped.
    2. Any run of whitespace directly before one of the symbols ' " . , ! ?
       is removed, e.g. "Tien !!!" -> "Tien!!!" and "I 'm" -> "I'm".
       This also glues opening quotes to the previous word
       ('said "hi"' -> 'said"hi"').
    3. Every remaining run of whitespace (spaces, tabs, newlines) is
       collapsed into a single space.

  Args:
    text: The string to normalize.
  Returns:
    The normalized string.
  """

  # Remove leading and trailing whitespace.
  text = text.strip()

  # Remove whitespace before symbols. `\s+` consumes the whole run in a single
  # pass, so there is no need to re-scan the string until nothing matches.
  text = re.sub(r'\s+([\'\".,!?])', r'\1', text)

  # Collapse every remaining whitespace run into one space.
  text = re.sub(r'\s+', ' ', text)

  return text

In [None]:
# Eyeball normalize_string on three hand-written inputs: one with detached
# symbols, one with only repeated spaces, one with a detached "!!!".
sr = normalize_string("This is an  ' s sentence      .  I 'm Tien  ")
sne01 = normalize_string("This   is an     sentence. I'm Tien      ")
sne02 = normalize_string("This   is an     sentence. I'm      Tien !!!    ")
print(sr, sne01, sne02, sep="\n")

In [None]:
def write_prompts_to_file(filename, dataset, num_samples, seq_len, prompt_key,
                          metadata_keys, is_question=False, seed=None):
  """
  Select random prompts from a dataset and write them to a file.

  Two files are written:
    * `<filename>_<num_samples>.txt`: the first line is `num_samples`, then
      one prompt per line.
    * `<filename>_<num_samples>_metadata.csv`: one row per prompt, with a
      "prompt" column followed by one column per entry of `metadata_keys`.

  Samples are drawn with replacement, so the same sample can appear more than
  once. When `is_question` is False, each prompt is a prefix of the sample
  text (newlines replaced by spaces) cut at a random word index in
  [5, min(word count, seq_len)); texts too short for that range are kept up
  to min(word count, seq_len) words instead of raising. When `is_question`
  is True, the text is written unchanged and `seq_len` is ignored.

  Args:
    filename: Path prefix of the two output files.
    dataset: A sized, integer-indexable collection of dict-like samples
      (e.g. a Hugging Face `Dataset` or a list of dicts). It is not modified.
    num_samples: The number of prompts to write.
    seq_len: The maximum length of a prompt, counted in space-separated words.
    prompt_key: The key in each sample that contains the prompt text.
    metadata_keys: The keys in each sample to copy into the metadata CSV.
    is_question: Whether the prompts are questions (written as-is) or free
      text (truncated to a random prefix).
    seed: Optional seed. When given, sampling uses a private
      `random.Random(seed)` so the output is reproducible; when None, the
      global `random` module state is used.
  """
  min_words = 5
  rng = random if seed is None else random.Random(seed)

  in_file_path = f"{filename}_{num_samples}.txt"
  meta_file_path = f"{filename}_{num_samples}_metadata.csv"

  # "prompt" first, then the metadata columns (order kept, duplicates dropped).
  # Computed up front so the header can be written even when num_samples is 0.
  fieldnames = list(dict.fromkeys(["prompt", *metadata_keys]))

  # Draw before opening the files so a failure here leaves no empty files.
  random_prompts = rng.choices(dataset, k=num_samples)

  # `newline=''` is what the csv module expects for files it writes to; an
  # explicit encoding keeps non-ASCII text portable across platforms.
  with open(in_file_path, 'w', encoding='utf-8') as in_file, \
       open(meta_file_path, 'w', encoding='utf-8', newline='') as meta_file:
    meta_writer = csv.DictWriter(meta_file, fieldnames=fieldnames)
    meta_writer.writeheader()
    in_file.write(f"{num_samples}\n")

    for sample in random_prompts:
      if not is_question:
        # Work on a shallow copy so the caller's dataset is never mutated.
        sample = dict(sample)
        # Remove "\n" characters so every prompt stays on one line. The
        # cleaned text is also what ends up in the metadata row when
        # prompt_key is listed in metadata_keys.
        sample[prompt_key] = sample[prompt_key].replace("\n", " ")
        # Split the prompt into words.
        sentence_words = sample[prompt_key].split(" ")
        ub = min(len(sentence_words), seq_len)
        # Cut at a random index of at least `min_words` words; if the text is
        # too short for that range, keep all `ub` words.
        cutoff_index = rng.randrange(min_words, ub) if ub > min_words else ub
        prompt = " ".join(sentence_words[:cutoff_index])
      else:
        prompt = sample[prompt_key]

      # Write the prompt to the prompt file.
      in_file.write(prompt + "\n")

      # Write the prompt together with its metadata.
      metadata = {"prompt": prompt}
      for key in metadata_keys:
        metadata[key] = sample[key]
      meta_writer.writerow(metadata)

### sciq dataset

In [None]:
# Print the dataset's summary representation to inspect it before sampling.
print(sciq_dataset)

In [None]:
# Preview the 'question' field of the first three SciQ samples.
sciq_preview = [sciq_dataset[row]['question'] for row in range(3)]
print(*sciq_preview, sep="\n")

In [None]:
# Sample 256 SciQ questions. Questions are written unchanged, so `seq_len`
# has no effect in this call.
write_prompts_to_file(
    filename="sciq_in",
    dataset=sciq_dataset,
    num_samples=256,
    seq_len=1024,
    prompt_key="question",
    metadata_keys=["correct_answer", "support"],
    is_question=True,
)

### TinyStories dataset

In [None]:
# Print the dataset's summary representation to inspect it before sampling.
print(tiny_stories_dataset)

In [None]:
# Preview the 'text' field of the first three TinyStories samples.
tiny_stories_preview = [tiny_stories_dataset[row]['text'] for row in range(3)]
print(*tiny_stories_preview, sep="\n")
# To inspect the column type instead: tiny_stories_dataset.features['text']

In [None]:
# Sample 256 stories and keep a random-length prefix of each (at most 1024
# words) as the prompt; the story text is stored as metadata.
write_prompts_to_file(
    filename="tinystories_in",
    dataset=tiny_stories_dataset,
    num_samples=256,
    seq_len=1024,
    prompt_key="text",
    metadata_keys=["text"],
    is_question=False,
)

### truthful_qa dataset

In [None]:
# Print the dataset's summary representation to inspect it before sampling.
print(truthful_qa_dataset)

In [None]:
# Preview the 'question' field of the first three TruthfulQA samples.
truthful_qa_preview = [truthful_qa_dataset[row]['question'] for row in range(3)]
print(*truthful_qa_preview, sep="\n")

In [None]:
# Sample 256 TruthfulQA questions. Questions are written unchanged, so
# `seq_len` has no effect in this call.
write_prompts_to_file(
    filename="truthful_qa_in",
    dataset=truthful_qa_dataset,
    num_samples=256,
    seq_len=1024,
    prompt_key="question",
    metadata_keys=["best_answer", "correct_answers", "source"],
    is_question=True,
)

### wikipedia dataset

In [None]:
# This dataset was loaded without a `split`, so this prints the top-level
# object returned by load_dataset; its rows are accessed via ['train'] below.
print(wikipedia_dataset)

In [None]:
# Preview the first three articles: a title banner followed by the full text.
wikipedia_train = wikipedia_dataset['train']
for idx in range(3):
  article = wikipedia_train[idx]  # fetch the row once instead of once per field
  print(f"===== {article['title']} =====")
  print(article['text'])

In [None]:
# Sample 256 articles and keep a random-length prefix of each (at most 1024
# words) as the prompt; the article text is stored as metadata.
write_prompts_to_file(
    filename="wikipedia_in",
    dataset=wikipedia_dataset['train'],
    num_samples=256,
    seq_len=1024,
    prompt_key="text",
    metadata_keys=["text"],
    is_question=False,
)

### SlimOrca dataset (Not useful)

In [None]:
# The SlimOrca load in the dataset-loading cell at the top of the notebook is
# commented out, so `SlimOrca_dataset` would be undefined on a fresh kernel.
# Load it here so this optional section is self-contained; skip the whole
# section if SlimOrca prompts are not needed.
SlimOrca_dataset = load_dataset("Open-Orca/SlimOrca")

# Show the dataset summary and the first turn of the first conversation.
print(SlimOrca_dataset)
print(SlimOrca_dataset['train'][0]['conversations'][0])

In [None]:
# Preview every turn of the first three conversations, one banner per sample.
slimorca_train = SlimOrca_dataset['train']
for i in range(3):
    print(f"===== {i} =====")
    for turn in slimorca_train[i]['conversations']:
        print(turn)

In [None]:
# Flatten each conversation into one {"instruction", "response"} record:
# all 'human' turns are concatenated (each prefixed with a space) into the
# instruction, and the last non-human turn becomes the response.
parse_SlimOrca_dataset = []
for sample in SlimOrca_dataset['train']:
  instruction = ""
  response = ""
  for turn in sample['conversations']:
    speaker = turn['from']
    # Prepending 'system' turns to the instruction is intentionally disabled.
    if speaker == 'human':
      instruction += " " + turn['value']
    else:
      # NOTE(review): this branch takes every non-human turn, not only the
      # GPT reply; a later turn overwrites an earlier one.
      response = turn['value']

  parse_SlimOrca_dataset.append({"instruction": instruction, "response": response})

In [None]:
# Preview the first three flattened instruction/response records.
parsed_preview = [parse_SlimOrca_dataset[row] for row in range(3)]
print(*parsed_preview, sep="\n")

In [None]:
# Sample 128 flattened SlimOrca instructions. They are written unchanged, so
# `seq_len` has no effect in this call.
write_prompts_to_file(
    filename="SlimOrca_in",
    dataset=parse_SlimOrca_dataset,
    num_samples=128,
    seq_len=1024,
    prompt_key="instruction",
    metadata_keys=["response"],
    is_question=True,
)