# 1. Setting up Environment

## 1.1 Installing required libraries

In [None]:
# Runtime dependencies for this notebook.
# NOTE(review): transformers is installed from the repository's default branch, so the
# installed version drifts over time; a previously captured run of this cell resolved
# commit 769a9542de4e8b23f0a551738e18760621f463e8 (transformers 4.37.0.dev0).
!pip install git+https://github.com/huggingface/transformers
!pip install datasets
# bitsandbytes is the only package pinned to an exact version here.
!pip install bitsandbytes==0.39.1 accelerate
!pip install kaggle
!pip install einops
!pip install ijson
!pip install scipy
!pip install better_profanity
!pip install ipywidgets
# NOTE(review): `filelock` is imported by the notebook but not installed explicitly here;
# presumably it arrives as a transitive dependency — confirm.

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-5mm4ig0s
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-5mm4ig0s
  Resolved https://github.com/huggingface/transformers to commit 769a9542de4e8b23f0a551738e18760621f463e8
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.37.0.dev0-py3-none-any.whl size=8271562 sha256=3bbc3d2be4a8d0310606aa67b71ae85821dcf9d9761c390deb013dbec1c29a7c
  Stored in directory: /tmp/pip-ephem-wheel-cache-vpxganrl/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformers

## 1.2 Importing modules

In [None]:
from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from datasets import Dataset, load_dataset
import transformers
import torch

import os
import gc
import concurrent.futures
from filelock import FileLock
import logging
import random
from random import shuffle
import json
import ijson
from time import time
import re
from tqdm import tqdm
from IPython.display import clear_output
from huggingface_hub import hf_hub_download
import subprocess

# 2. Setting Up Configurations

In [None]:
# Dedicated logger for the dataset-generation pipeline. At DEBUG the logger itself
# lets records of every standard severity through.
logging.getLogger("ds_gen").setLevel(logging.DEBUG)
logger = logging.getLogger("ds_gen")

In [None]:
# Thresholds that presumably govern how often generated responses are flushed to disk:
# by elapsed time and by number of buffered items.
# NOTE(review): confirm the unit of SAVE_INTERVAL (looks like seconds) against its consumer.
SAVE_INTERVAL = 15 * 60
SAVE_RESPONSE_COUNT = 20

## 2.1 Dataset Configurations

In [None]:
# Target size of the generated dataset; the instruction loop stops once this many
# examples have been collected.
dataset_length = 1_000
# Number of new examples requested from the model in each prompt.
N = 5
# Number of seed examples embedded in each prompt (also the number of topics listed
# when a prompt is topic-guided).
e = 3

In [None]:
# Load the seed dataset once up front.
# NOTE(review): the module-level `data` is not read by any cell visible in this notebook
# (generate_instruction_prompt loads its own copy). This call presumably exists to
# download/cache the dataset — confirm before removing it.
data = load_dataset("databricks/databricks-dolly-15k")

Downloading readme:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
## Seed datasets used as starting points
# format: (dataset, instruction_column, context_column, category_column, categories)
#   dataset            - dataset identifier passed to load_dataset
#   instruction_column - column holding the instruction text
#   context_column     - column holding the optional context/input text
#   category_column    - column holding each row's category label
#   categories         - category labels to cycle through when picking seed examples
datasets = [
    (
        "databricks/databricks-dolly-15k",
        "instruction",
        "context",
        "category",
        ['brainstorming', 'classification', 'closed_qa', 'creative_writing', 'general_qa', 'information_extraction', 'open_qa', 'summarization']
    ),
    # NOTE(review): the disabled entry below has only three fields, while
    # generate_instruction_prompt unpacks five per entry; it needs a category column and
    # a category list before it can be re-enabled.
    # (
    #     "SirNeural/flan_v2",
    #     "inputs",
    #     None
    # )
]

## 2.2 Model Configurations

In [None]:
# 4-bit quantization config: NF4 quant type, double quantization enabled, bfloat16 compute dtype.
# NOTE(review): no cell visible in this notebook passes this object to a model loader
# (generation runs a GGML file through the falcon_main binary) — confirm whether it is still needed.
quantization_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
# Model used for generating the dataset.
# eval_model_GGML is the Hugging Face Hub repository id and eval_model_file_GGML the file
# inside it; both are handed to hf_hub_download to obtain a local model path.
eval_model_GGML = "TheBloke/h2ogpt-gm-oasst1-en-2048-falcon-40b-v2-GGML"
eval_model_file_GGML = "h2ogpt-falcon-40b.ggmlv3.q4_k.bin"

# 3. Preparation

## 3.1 Model

In [None]:
# Fetch the GGML checkpoint from the Hugging Face Hub. The resulting local path is what
# later cells pass to falcon_main (`-m`) and use to locate the tokenizer directory.
model_path = hf_hub_download(repo_id=eval_model_GGML, filename=eval_model_file_GGML)

In [None]:
!git clone https://github.com/cmp-nct/ggllm.cpp
os.chdir("ggllm.cpp")
!rm -rf build && mkdir build && export LLAMA_CUBLAS=1 && make -DGGML_CUBLAS=1 .. && make falcon_main
os.chdir("..")

Cloning into 'ggllm.cpp'...
remote: Enumerating objects: 4261, done.[K
remote: Counting objects: 100% (1740/1740), done.[K
remote: Compressing objects: 100% (109/109), done.[K
remote: Total 4261 (delta 1673), reused 1631 (delta 1631), pack-reused 2521[K
Receiving objects: 100% (4261/4261), 107.40 MiB | 4.28 MiB/s, done.
Resolving deltas: 100% (2891/2891), done.
/bin/bash: cmake: command not found


## 3.2 Tokenizer

In [None]:
# Store tokenizer.json in the same directory as the downloaded model file.
# NOTE(review): falcon_main reports loading BPE merges from tokenizer.json for this older
# file format; it presumably looks for it next to the model — confirm.
tokenizer_download_path = os.path.join(os.path.dirname(model_path), 'tokenizer.json')
# Use an argument list instead of a shell string so the path cannot be split or
# interpreted by a shell, and check=True so a failed download raises instead of
# silently returning a non-zero status.
subprocess.run(
    [
        'wget',
        '-O',
        tokenizer_download_path,
        'https://huggingface.co/h2oai/h2ogpt-gm-oasst1-en-2048-falcon-40b-v2/resolve/main/tokenizer.json',
    ],
    check=True,
);

--2023-08-25 07:01:35--  https://huggingface.co/h2oai/h2ogpt-gm-oasst1-en-2048-falcon-40b-v2/resolve/main/tokenizer.json
Resolving huggingface.co (huggingface.co)... 18.165.122.11, 18.165.122.101, 18.165.122.120, ...
Connecting to huggingface.co (huggingface.co)|18.165.122.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2734158 (2.6M) [text/plain]
Saving to: ‘/home/user/.cache/huggingface/hub/models--TheBloke--h2ogpt-gm-oasst1-en-2048-falcon-40b-v2-GGML/snapshots/67bac2c70c558b2a01231392c86fd88c56aea064/tokenizer.json’

     0K .......... .......... .......... .......... ..........  1%  443K 6s
    50K .......... .......... .......... .......... ..........  3%  443K 6s
   100K .......... .......... .......... .......... ..........  5% 78.4M 4s
   150K .......... .......... .......... .......... ..........  7% 45.0M 3s
   200K .......... .......... .......... .......... ..........  9%  448K 3s
   250K .......... .......... .......... .......... .......... 11

0

## 3.3 Functions

### 3.3.1 Load Categories

In [None]:
def load_categories(poll_interval=1.0):
    """Load the Wikipedia category names from ``res/categories.txt``.

    If the file is missing, the ``res`` directory is created, a warning is logged and
    the function waits until the file appears (for example after a manual upload).

    Args:
        poll_interval: Seconds to sleep between checks while waiting for the file.

    Returns:
        A sorted list of the unique, non-empty, whitespace-stripped lines of the file.
    """
    # Local import: the notebook binds the name `time` to the time() function.
    from time import sleep

    folder_path = "res"
    file_name = "categories.txt"
    file_path = os.path.join(folder_path, file_name)

    if not os.path.isfile(file_path):
        os.makedirs(folder_path, exist_ok=True)

        logger.warning("The Wikipedia categories file could not be located. Please ensure that it has been uploaded to the './res' directory.")

        # Poll with a pause instead of spinning a CPU core at 100% while waiting.
        while not os.path.isfile(file_path):
            sleep(poll_interval)

    logger.info("Found the Wikipedia categories file.")

    with open(file_path, "r", encoding="utf-8") as file:
        categories = {line.strip() for line in file}
    # Blank lines would otherwise become an empty-string "topic" in generated prompts.
    categories.discard("")

    logger.info("Wikipedia categories have been loaded.")

    # Sorted so the result does not depend on set iteration order.
    return sorted(categories)

### 3.3.2 Generate Instruction Prompt

In [None]:
# Seed categories not yet used in the current cycle; refilled when exhausted.
available_categories = []

def generate_instruction_prompt(N, e, datasets, categories):
    """Build one instruction-generation prompt from randomly chosen seed examples.

    A seed dataset entry is picked at random, one of its categories is drawn from the
    module-level ``available_categories`` pool (refilled from the entry's category list
    when no usable category is left), and up to ``e`` rows of that category are embedded
    as ``<example>`` blocks. With probability 1/2, and only if ``categories`` is
    non-empty, up to ``e`` random topics are added as a theme hint.

    Args:
        N: Number of examples the model is asked to generate.
        e: Maximum number of seed examples, and of guiding topics, to embed.
        datasets: Sequence of ``(dataset, instruction_column, context_column,
            category_column, categories)`` tuples.
        categories: Topic names used for topic guidance. Not modified.

    Returns:
        ``(prompt, dataset, topic_guided, selected_category)`` where ``prompt`` has its
        double quotes backslash-escaped, ``dataset`` is the chosen dataset identifier
        and ``topic_guided`` is ``1`` or ``0``.
    """
    global available_categories

    dataset, instruction_cn, context_cn, category_cn, d_categories = random.choice(datasets)

    # Only draw categories that belong to the chosen dataset: the pool is shared across
    # calls, so with several seed datasets it may still hold another dataset's labels.
    candidates = [c for c in available_categories if c in d_categories]
    if not candidates:
        available_categories = list(d_categories)
        candidates = available_categories

    selected_category = random.choice(candidates)
    available_categories.remove(selected_category)

    data = load_dataset(dataset, split="train")
    data = data.filter(lambda example: example[category_cn] == selected_category)
    seed_instructions = data.shuffle(seed=random.randint(1, 100))[:e]

    examples = ""
    for instruction, context in zip(seed_instructions[instruction_cn], seed_instructions[context_cn]):
        examples += "<example>" + instruction

        if context:
            examples += "\n\nInput:" + context

        examples += "</example>\n"

        # Keep the seed section bounded; the check runs after appending, so the example
        # that crosses the limit is still included.
        if len(examples) >= 2000:
            break

    # Topic guidance needs at least one topic; otherwise the prompt would contain a
    # "themed on one of the topics of" sentence with nothing after it.
    topic_guided = random.randint(0, 1) if categories else 0
    topic_line = ""
    if topic_guided:
        # random.sample leaves the caller's list untouched (shuffle() reordered it in place).
        topic_count = max(0, min(e, len(categories)))
        topics = ",".join(random.sample(categories, topic_count))
        topic_line = f"Each generated example should be themed on one of the topics of {topics}"

    prompt_text = f"""### SYSTEM: You are an AI assistant. Answer as honestly and correctly as possible.
### YOUR TASK: Generate {N} diverse examples that are similar to the provided examples.
You do not need to provide a responses to the generated examples.
Do not repeat the provided examples.
Each generated example must include an instruction.
Each generated example may have an additional context, if necessary.
Each generated example can be either an imperative sentence or a question.
Each generated example must begin with "<example>" and end with "</example>"
{topic_line}

### PROVIDED EXAMPLES(Category: {selected_category}):
{examples}"""
    # Double quotes are backslash-escaped because generate() receives the prompt in
    # that form.
    prompt = f"{prompt_text}\n###RESPONSE:".replace("\"", "\\\"")

    return prompt, dataset, topic_guided, selected_category

### 3.3.3 Generate New Tokens

In [None]:
def generate(prompt, model_path, binary_path="/home/user/LaMini-LM/ggllm.cpp/falcon_main"):
    """Run the falcon_main binary on ``prompt`` and return the text it produced.

    Args:
        prompt: Prompt text with double quotes backslash-escaped (``\\"``), which is the
            form this function has always been called with.
        model_path: Path of the model file passed to the binary via ``-m``.
        binary_path: Location of the ``falcon_main`` executable.

    Returns:
        The decoded standard output with the echoed prompt removed.

    Raises:
        subprocess.CalledProcessError: If the binary exits with a non-zero status.
    """
    # Undo the caller's quote escaping: it existed only for the old shell command line.
    raw_prompt = prompt.replace("\\\"", "\"")

    # Pass an argument list without a shell, so `$`, backticks and backslashes in the
    # prompt (which contains dataset text) reach the model verbatim instead of being
    # expanded or executed by the shell.
    argv = [
        binary_path,
        "-t", "8",
        "-ngl", "100",
        "-b", "1",
        "-m", str(model_path),
        "-p", raw_prompt,
    ]
    outputs = subprocess.check_output(argv)

    # Decode the bytes; str(bytes) would return the "b'...'" repr with escaped newlines.
    text = outputs.decode("utf-8", errors="replace")

    # NOTE(review): the original length-based slice implies the binary echoes the prompt
    # before the generated text — confirm. Cut at the echo when it can be located,
    # otherwise fall back to the same length-based cut.
    echo_at = text.find(raw_prompt)
    if echo_at != -1:
        return text[echo_at + len(raw_prompt):]
    return text[len(raw_prompt):]

### 3.3.4 Extract from Output

In [None]:
# Three alternatives, one capture group each: <example>...</example> blocks,
# "-"/numbered list items followed by another item, and numbered items that run up to
# the next numbered item or the end of the text.
_EXAMPLE_PATTERN = re.compile(
    r'<example>(.*?)<\/example>|(?:(?<=\n)|^)(?:\s*)(?:-|\d+\.)\s+(.*?)(?=\n(?:\s*)(?:-|\d+\.|$))|(?:(?<=\n)|^)(?:\s*)\d+\.\s+(.*?)(?=\n(?:\s*)\d+\.|$)',
    re.DOTALL,
)

def extract_examples_from_model_output(model_output):
    """Extract the generated examples from raw model output.

    Args:
        model_output: Text produced by the model.

    Returns:
        A list of non-empty example strings, in order of appearance, with the escape
        sequences ``\\\\n``, ``\\'`` and ``\\"`` reduced to ``\\n``, ``'`` and ``"`` and
        surrounding whitespace stripped.
    """
    examples = []
    for groups in _EXAMPLE_PATTERN.findall(model_output):
        # Exactly one group is filled per match. The third group used to be ignored,
        # which turned a final numbered item into an empty example.
        captured = next((group for group in groups if group), "")
        cleaned = captured.replace("\\\\n", "\\n").replace("\\'", "'").replace("\\\"", "\"").strip()
        # Drop empty captures such as "<example></example>".
        if cleaned:
            examples.append(cleaned)

    return examples

### 3.3.5 Populate Examples into the Dataset

In [None]:
def write_to_file(data: dict, folder_path: str, file_name: str) -> None:
    """Serialize ``data`` as JSON to ``folder_path/file_name`` under a file lock.

    The JSON is written to a temporary sibling file and then moved over the target, so
    a failure during serialization or writing leaves the previous file content intact
    instead of a truncated file.

    Args:
        data: JSON-serializable object to store.
        folder_path: Directory containing the target file.
        file_name: Name of the target file; ``<file_name>.lock`` is used as the lock.

    Raises:
        TypeError: If ``data`` is not JSON-serializable.
        OSError: If the file cannot be written or replaced.
    """
    target_path = os.path.join(folder_path, file_name)
    temp_path = target_path + ".tmp"

    with FileLock(target_path + ".lock"):
        try:
            with open(temp_path, "w") as file:
                json.dump(data, file)
            # os.replace swaps the file in a single step.
            os.replace(temp_path, target_path)
        except BaseException:
            # Do not leave a half-written temporary file behind.
            if os.path.exists(temp_path):
                os.remove(temp_path)
            raise

def populate_instructions_onto_the_dataset(examples, dataset_name, is_topic_guided, category):
    """Append generated instructions and their metadata to ``out/dataset.json``.

    The existing file, if any, is loaded, the new rows are appended to the
    ``instruction``, ``source``, ``is_topic_guided`` and ``category`` columns, and the
    result is written back through ``write_to_file``.

    Args:
        examples: Generated instruction strings.
        dataset_name: Identifier of the seed dataset, stored once per example.
        is_topic_guided: Topic-guidance flag, stored once per example.
        category: Seed category, stored once per example.

    Returns:
        The total number of instructions in the dataset after the update.

    Raises:
        Any exception raised while reading or writing the dataset file.
    """
    folder_path = "out"
    file_name = "dataset.json"
    file_path = os.path.join(folder_path, file_name)

    dataset = {
        "instruction": [],
        "response": [],
        "source": [],
        "is_topic_guided": [],
        "category": []
    }

    if os.path.isfile(file_path):
        with open(file_path, "r") as file:
            dataset.update(json.load(file))
    else:
        # First run: make sure the output folder exists; the file is created on write.
        os.makedirs(folder_path, exist_ok=True)

    examples = list(examples)
    dataset["instruction"] += examples
    dataset["source"] += [dataset_name] * len(examples)
    dataset["is_topic_guided"] += [is_topic_guided] * len(examples)
    dataset["category"] += [category] * len(examples)

    # Write synchronously. The former executor.submit() call was waited on immediately
    # but its future was never inspected, so write errors were silently discarded.
    write_to_file(dataset, folder_path, file_name)

    return len(dataset["instruction"])

def populate_responses_onto_the_dataset(responses):
    """Append generated responses to the ``response`` column of ``out/dataset.json``.

    Args:
        responses: Response strings, in the same order as the instructions they answer.

    Raises:
        FileNotFoundError: If ``out/dataset.json`` does not exist yet.
    """
    folder_path = "out"
    file_name = "dataset.json"

    dataset = {
        "instruction": [],
        "response": [],
        "source": [],
        "is_topic_guided": [],
        "category": []
    }

    with open(os.path.join(folder_path, file_name), "r") as file:
        dataset.update(json.load(file))

    dataset["response"] += responses

    # Go through the shared writer so responses are saved under the same file lock as
    # instructions, instead of truncating the file directly.
    write_to_file(dataset, folder_path, file_name)

# 4. Running the Dataset Generation Process

In [None]:
# Topic pool handed to generate_instruction_prompt for topic-guided prompts.
# load_categories() does not return until res/categories.txt exists.
categories = load_categories()

INFO:ds_gen:Found the Wikipedia categories file.
INFO:ds_gen:Wikipedia categories have been loaded.


In [None]:
# Abort after this many failed cycles in a row, so a persistent problem (missing binary,
# corrupt dataset file, ...) surfaces instead of looping forever in silence.
MAX_CONSECUTIVE_FAILURES = 5

pbar = tqdm(total=dataset_length)

curr_example_count = 0
consecutive_failures = 0

try:
    # Each cycle builds a prompt, runs the model, extracts the examples and appends
    # them to out/dataset.json, until the dataset holds dataset_length instructions.
    while curr_example_count < dataset_length:
        try:
            logger.info("Generating prompt")
            prompt, dn, is_topic_guided, category = generate_instruction_prompt(N, e, datasets, categories)

            logger.info("Generating model outputs for prompts")
            model_output = generate(prompt, model_path)

            logger.info("Populating examples onto the dataset")
            examples = extract_examples_from_model_output(model_output)
            curr_example_count = populate_instructions_onto_the_dataset(examples, dn, is_topic_guided, category)

            clear_output(wait=True)
            pbar.n = curr_example_count
            pbar.refresh()
            consecutive_failures = 0
        except Exception:
            # Catch Exception only, so KeyboardInterrupt still stops the cell. Log the
            # traceback and retry a bounded number of times.
            consecutive_failures += 1
            logger.exception("Instruction generation cycle failed (%d in a row)", consecutive_failures)
            if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
                raise
finally:
    pbar.close()

 84%|████████▍ | 839/1000 [1:45:03<20:09,  7.51s/it]main: build = 883 (2b487f2)
falcon.cpp: loading model from /home/user/.cache/huggingface/hub/models--TheBloke--h2ogpt-gm-oasst1-en-2048-falcon-40b-v2-GGML/snapshots/67bac2c70c558b2a01231392c86fd88c56aea064/h2ogpt-falcon-40b.ggmlv3.q4_k.bin
falcon.cpp: file version 4
falcon.cpp: fallback for old file format. Loading BPE merges from tokenizer.json
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|          Info |     format | n_vocab |   n_bpe | n_ctx | n_embd |   n_head ; kv | n_layer | falcon | ftype |   n_ff |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|               |    ggml v3 |   65024 |   64784 |  2048 |   8192 |     128 ;   8 |      60 | 40;40B |    15 |  32768 |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+


In [None]:
# Kept for compatibility; generate() runs falcon_main with one prompt per call, so
# instructions are processed one at a time.
batch_list_size = 4
instructions = []
responses = []

def should_populate_responses(t0):
    """Return True when the buffered responses should be written to disk.

    That is the case once SAVE_RESPONSE_COUNT responses are buffered or SAVE_INTERVAL
    seconds have passed since ``t0`` (a ``time()`` timestamp of the last flush).
    """
    return len(responses) >= SAVE_RESPONSE_COUNT or time() - t0 >= SAVE_INTERVAL

def _flush_responses():
    """Append the buffered responses to out/dataset.json and reset the buffer."""
    global responses
    if responses:
        populate_responses_onto_the_dataset(responses)
        pbar.update(len(responses))
        responses = []

# Number of responses already stored; generation resumes after them.
with open("out/dataset.json", "r") as file:
    curr_response_count = sum(1 for _ in ijson.items(file, "response.item"))

# Read all instructions before generating: the dataset file is rewritten on every
# flush, so it must not be streamed from while the loop below is running.
with open("out/dataset.json", "r") as file:
    instructions = list(ijson.items(file, "instruction.item"))
instruction_count = len(instructions)

t0 = time()
pbar = tqdm(total=instruction_count)
pbar.update(curr_response_count)

try:
    for instruction in instructions[curr_response_count:]:
        # generate() expects double quotes to be backslash-escaped, as
        # generate_instruction_prompt does for its prompts.
        prompt = instruction.replace("\"", "\\\"")
        responses.append(generate(prompt, model_path))

        if should_populate_responses(t0):
            _flush_responses()
            t0 = time()
finally:
    # Persist whatever is still buffered (final partial batch, or progress made before
    # an error/interrupt) so a re-run resumes from the right position.
    _flush_responses()
    pbar.close()