# 1. Setting up Environment

## 1.1 Installing required libraries

In [None]:
!pip install -q git+https://github.com/huggingface/transformers
!pip install -q datasets
!pip install -q -U  bitsandbytes==0.39.1 accelerate
!pip install -q einops
!pip install -q ijson
!pip install nltk
# !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: nltk
Successfully installed nltk-3.8.1


## 1.2 Importing modules

In [None]:
# Standard library imports for system and file operations
import gc
import os
from getpass import getpass
from time import time
import subprocess
import multiprocessing
import threading

# Standard library imports for data manipulation and computation
import json
import random
import re
import string
from random import shuffle

# Imports for logging and progress tracking
import logging
from IPython.display import clear_output
from tqdm import tqdm

# Imports for natural language processing and machine learning
import ijson
import nltk
import torch
import transformers
from accelerate import Accelerator
from datasets import Dataset, load_dataset
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import hf_hub_download

## 1.3 Setting Environment Variables

# 2. Setting Up Configurations

In [None]:
logger = logging.getLogger("evolInstructLogger")
# Without an explicit level the logger inherits WARNING from the root logger, which
# silently discards every logger.info(...) call made by the evolution code.
logger.setLevel(logging.DEBUG)

# Re-running this cell must not stack duplicate handlers (each one would repeat
# every message), so drop whatever a previous run attached.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Create handlers: one for the notebook/console, one for a log file in the working directory
c_handler = logging.StreamHandler()
f_handler = logging.FileHandler('evolInstructLogger.log')
c_handler.setLevel(logging.DEBUG)
f_handler.setLevel(logging.DEBUG)

c_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
c_handler.setFormatter(c_format)
f_handler.setFormatter(f_format)

# Add handlers to the logger
logger.addHandler(c_handler)
logger.addHandler(f_handler)

## 2.1 Dataset Configurations

In [None]:
## Datasets
# Hugging Face dataset identifiers used as seed instructions.
# Only the first entry (datasets[0]) is loaded further down.
datasets = [
    "databricks/databricks-dolly-15k"
]

## 2.2 Model Configurations

In [None]:
# Model used for generating the dataset
# Hugging Face repo id and file name of the GGML Falcon checkpoint.
falcon_model_GGML = "TheBloke/h2ogpt-gm-oasst1-en-2048-falcon-40b-v2-GGML"
falcon_model_file_GGML = "h2ogpt-falcon-40b.ggmlv3.q6_k.bin"

# Download the file (or reuse the local Hugging Face cache) and keep its local path.
falcon_model_path = hf_hub_download(
    falcon_model_GGML,
    falcon_model_file_GGML
)

# Model used for evaluating evolved instructions
llama_model_GGML = "TheBloke/Llama-2-70B-chat-GGML"
llama_model_file_GGML = "llama-2-70b-chat.ggmlv3.q4_0.bin"

# File name for the GGUF conversion of the checkpoint above.
# NOTE(review): the name says q4_1 while the source file is q4_0, and the conversion
# command in this notebook passes no quantization option — confirm the intended name.
llama_model_file_GGUF = "llama-2-70b-chat.ggufv3.q4_1.bin"

llama_model_path_GGML = hf_hub_download(
    llama_model_GGML,
    llama_model_file_GGML
)

# The GGUF file is placed in the same directory as the downloaded GGML file.
llama_model_path_GGUF = os.path.join(os.path.dirname(llama_model_path_GGML), llama_model_file_GGUF)

# 3. Preparation

## 3.1 Falcon Model

In [None]:
!git clone https://github.com/cmp-nct/ggllm.cpp
%cd ggllm.cpp
os.environ['LLAMA_CUBLAS'] = '1'
os.environ['PATH'] = "/usr/local/cuda/bin:/usr/local/cuda/bin:/home/user/miniconda3/envs/jupyter/bin:/home/user/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin"
!make falcon_main falcon_quantize falcon_perplexity
%cd ..




Cloning into 'ggllm.cpp'...
remote: Enumerating objects: 4273, done.[K
remote: Counting objects: 100% (1561/1561), done.[K
remote: Compressing objects: 100% (99/99), done.[K
remote: Total 4273 (delta 1498), reused 1470 (delta 1462), pack-reused 2712[K
Receiving objects: 100% (4273/4273), 107.45 MiB | 15.41 MiB/s, done.
Resolving deltas: 100% (2889/2889), done.
/home/user/.local/share/Trash/files/evol-instruct/ggllm.cpp
I ggllm.cpp build info: 
I UNAME_S:  Linux
I UNAME_P:  x86_64
I UNAME_M:  x86_64
I CFLAGS:   -I.              -O3 -std=c11   -fPIC -DNDEBUG -DGGML_PERF=1 -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -pthread -march=native -mtune=native -DGGML_USE_K_QUANTS -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/targets/x86_64-linux/include
I CXXFLAGS: -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -DGGML_PERF=1 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=nat

--2023-09-08 18:13:28--  https://huggingface.co/h2oai/h2ogpt-gm-oasst1-en-2048-falcon-40b-v2/resolve/main/tokenizer.json
Resolving huggingface.co (huggingface.co)... 18.165.122.30, 18.165.122.120, 18.165.122.101, ...
Connecting to huggingface.co (huggingface.co)|18.165.122.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2734158 (2.6M) [text/plain]
Saving to: ‘/home/user/.cache/huggingface/hub/models--TheBloke--h2ogpt-gm-oasst1-en-2048-falcon-40b-v2-GGML/snapshots/67bac2c70c558b2a01231392c86fd88c56aea064/tokenizer.json’

     0K .......... .......... .......... .......... ..........  1% 37.5M 0s
    50K .......... .......... .......... .......... ..........  3% 38.9M 0s
   100K .......... .......... .......... .......... ..........  5%  154M 0s
   150K .......... .......... .......... .......... ..........  7%  123M 0s
   200K .......... .......... .......... .......... ..........  9% 63.4M 0s
   250K .......... .......... .......... .......... .......... 11

0

## 3.2 Llama 2 Model

In [None]:
# Compile llama.cpp
!git clone https://github.com/ggerganov/llama.cpp
%cd llama.cpp
!make LLAMA_CUBLAS=1
%cd ../

# Convert the model from GGML to GGUF
!python3 llama.cpp/convert-llama-ggml-to-gguf.py -i {llama_model_path_GGML} -o {llama_model_path_GGUF} --eps 1e-5 --context-length 4096 --gqa 8

Cloning into 'llama.cpp'...
remote: Enumerating objects: 8602, done.[K
remote: Counting objects: 100% (4256/4256), done.[K
remote: Compressing objects: 100% (366/366), done.[K
remote: Total 8602 (delta 4072), reused 3935 (delta 3890), pack-reused 4346[K
Receiving objects: 100% (8602/8602), 7.80 MiB | 24.95 MiB/s, done.
Resolving deltas: 100% (5976/5976), done.
/home/user/llama.cpp
I llama.cpp build info: 
I UNAME_S:  Linux
I UNAME_P:  x86_64
I UNAME_M:  x86_64
I CFLAGS:   -I. -Icommon -DNDEBUG -DGGML_USE_K_QUANTS -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/targets/x86_64-linux/include  -O3 -std=c11   -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function -pthread -march=native -mtune=native 
I CXXFLAGS: -I. -Icommon -DNDEBUG -DGGML_USE_K_QUANTS -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/targets/x86_64-linux/include  -O3 -s

## 3.3 Dataset

In [None]:
# Tokenizer used by preprocess_dataset to truncate seed instructions.
tokenizer = AutoTokenizer.from_pretrained("h2oai/h2ogpt-gm-oasst1-en-2048-falcon-40b-v2")

In [None]:
def preprocess_dataset(example):
    """Merge an example's instruction and optional context into one truncated string.

    The instruction and, when it is non-empty, the context are joined by a blank
    line. The text is then encoded with the module-level ``tokenizer`` (truncated to
    at most 1024 tokens) and decoded back to a string.

    Args:
        example: mapping with 'instruction' and 'context' entries.

    Returns:
        dict with a single 'instruction' key holding the merged, truncated text.
    """
    pieces = [f"{example['instruction']}"]
    if example['context']:
        pieces.append(f"{example['context']}")
    merged_text = "\n\n".join(pieces)

    token_ids = tokenizer.encode(merged_text, max_length=1024, truncation=True)
    return {'instruction': tokenizer.decode(token_ids)}

# Load the train split of the first configured dataset, rewrite every example's
# 'instruction' with preprocess_dataset, and drop the 'context' and 'response' columns.
data = load_dataset(
    datasets[0],
    split="train",
).map(
    preprocess_dataset,
    remove_columns=['context', 'response']
)

## 3.4 nltk Stopwords

In [None]:
# Resources needed by nltk.word_tokenize ('punkt') and by stopwords.words ('stopwords'),
# both of which the evolution filters call.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 3.5 Helpers

### 3.5.1 Generate New Tokens

In [None]:
def falcon_generate(prompt, temp=0.8, timeout=600):
    """Run the ggllm.cpp Falcon binary on ``prompt`` and return the text after ``<bot>:``.

    Args:
        prompt: full prompt text; it is expected to contain a ``<bot>:`` marker, because
            the reply is taken from whatever follows the first ``<bot>:`` in the output.
        temp: sampling temperature forwarded as ``--temp``.
        timeout: seconds to wait for the binary before it is killed.

    Returns:
        The generated text, or ``""`` when the binary cannot be started, exits with a
        non-zero status, exceeds ``timeout``, or prints no ``<bot>:`` marker.
    """
    global falcon_model_path

    # Pass an argument vector instead of a shell string: the prompt reaches the binary
    # verbatim, so quotes, `$`, backticks and backslashes in it are never interpreted
    # by a shell.
    cmd = [
        "ggllm.cpp/falcon_main",
        "-t", "22",
        "-ngl", "60",
        "-b", "512",
        "--temp", str(temp),
        "-m", str(falcon_model_path),
        "-p", prompt,
    ]

    try:
        # subprocess.run kills the child itself when the timeout expires, so no
        # orphaned inference process is left running.
        completed = subprocess.run(cmd, stdout=subprocess.PIPE, check=True, timeout=timeout)
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError):
        return ""

    text = completed.stdout.decode("utf-8", errors="replace").replace("<|endoftext|>", "").strip()

    _, marker, reply = text.partition("<bot>:")
    if not marker:
        # No reply marker in the output: report "no output" instead of raising IndexError.
        return ""

    # Kept from the original post-processing: un-escape any \" left in the reply.
    return reply.strip().replace("\\\"", "\"")

In [None]:
def llama_generate(prompt, temp=None, timeout=None):
    """Run the llama.cpp ``main`` binary on ``prompt`` and return the text after ``[/INST]``.

    The prompt is wrapped as ``<s>[INST]{prompt}[/INST]`` before it is passed to the binary.

    Args:
        prompt: instruction text.
        temp: optional sampling temperature; forwarded as ``--temp`` only when given, so
            the binary's own default applies otherwise. Accepting it lets this function
            be called the same way as ``falcon_generate`` (e.g. with ``temp=0.0``).
        timeout: optional number of seconds to wait before the binary is killed.

    Returns:
        The generated text, or ``""`` when the binary cannot be started, exits with a
        non-zero status, times out, or prints no ``[/INST]`` marker.
    """
    global llama_model_path_GGUF

    wrapped_prompt = f"<s>[INST]{prompt}[/INST]"

    # Argument vector instead of a shell string: the prompt is passed verbatim and is
    # never interpreted by a shell.
    cmd = ["llama.cpp/main", "-c", "2048", "-t", "22", "-m", str(llama_model_path_GGUF), "-ngl", "83"]
    if temp is not None:
        cmd += ["--temp", str(temp)]
    cmd += ["-p", wrapped_prompt]

    try:
        completed = subprocess.run(cmd, stdout=subprocess.PIPE, check=True, timeout=timeout)
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError):
        return ""

    text = completed.stdout.decode("utf-8", errors="replace").strip()

    _, marker, reply = text.partition("[/INST]")
    if not marker:
        # No reply marker in the output: report "no output" instead of raising IndexError.
        return ""

    # Kept from the original post-processing: un-escape any \" left in the reply.
    return reply.replace("\\\"", "\"")

### 3.5.2 InstructionEvolution Class

In [None]:
class InstructionEvolution:
    """Evolve a pool of instructions with randomly chosen rewriting strategies.

    Each epoch picks one strategy ("in-depth-evolving" or "in-breadth-evolving") and,
    for in-depth evolution, one operation. Every instruction in the pool is rewritten
    by a generator function, answered by the same generator, and kept only if it
    passes the filters in ``has_instruction_evolved``. Accepted examples are written
    to ``evolved/<category>/...json``; rejected ones are appended to
    ``unevolved_instructions.txt``.
    """

    # Save cadence used by check_and_save_dataset.
    SAVE_EVERY_N_EXAMPLES = 5
    SAVE_EVERY_SECONDS = 300

    # Number of times the equality question is asked before giving up.
    MAX_EQUALITY_CHECK_ATTEMPTS = 6

    STRATEGIES = [
        (0, "in-depth-evolving"),
        (1, "in-breadth-evolving"),
    ]

    IN_DEPTH_OPERATIONS = [
        (0, "add-constraints"),
        (1, "deepening"),
        (2, "concretizing"),
        (3, "increase-reasoning-steps"),
    ]

    # Text substituted for {operation} in IN_DEPTH_PROMPT, keyed by operation id.
    IN_DEPTH_OPERATION_TEXT = {
        0: "by adding one more constraints/requirements into #Given Prompt#",
        1: "if #Given Prompt# contains inquiries about certain issues, the depth and breadth of the inquiry can be increased.",
        2: "by replacing general concepts with more specific concepts.",
        3: "if #Given Prompt# can be solved with just a few simple thinking processes, you can rewrite it to explicitly request multiple-step reasoning.",
    }

    IN_DEPTH_PROMPT = """<human>: I want you to act as a prompt rewriter.
Your objective is to rewrite the #Given Prompt# into a more complex version.
But the rewritten prompt must be reasonable and must be understood and responded by humans.
Your rewriting cannot omit the non-text parts such as the table and code in #Given Prompt#:. Also, please do not omit the context in #Given Prompt#.
You should try your best not to make the #Rewritten Prompt# become verbose, #Rewritten Prompt# can only add 10 to 20 words into #Given Prompt#.
‘#Given Prompt#’, ‘#Rewritten Prompt#’, ‘given prompt’ and ‘rewritten prompt’ are not allowed to appear in #Rewritten Prompt#
You SHOULD complicate the given prompt {operation}
#Given Prompt#:
{instruction}
<bot>: #Rewritten Prompt#:"""

    IN_BREADTH_PROMPT = """<human>: I want you to act as a prompt creator.
Your goal is to draw inspiration from the #Given Prompt# to create a brand new prompt.
This new prompt should belong to the same domain as the #Given Prompt# but be even more rare.
The LENGTH and difficulty level of the #Created Prompt# should be similar to that of the #Given Prompt#.
The #Created Prompt# must be reasonable and must be understood and responded by humans.
‘#Given Prompt#’, ‘#Created Prompt#’, ‘given prompt’ and ‘created prompt’ are not allowed to appear in #Created Prompt#.
Your response only contains the #Created Prompt# and no explanation of the new prompt. Do not provide a response to either the #Given Prompt# or the #Created Prompt#.
#Given Prompt#:
{instruction}
<bot>: #Created Prompt#:"""

    def __init__(self, initial_instructions, config=None):
        """Create an evolver.

        Args:
            initial_instructions: sequence of instruction strings forming the first pool.
            config: optional mapping whose entries override the default
                "strategy" / "in_depth_evolution_operation" / "prompt" values.
        """
        self.pool = initial_instructions
        self.evolved_dataset = self._empty_dataset()
        self.config = {
            "strategy": None,
            "in_depth_evolution_operation": None,
            "prompt": None
        }
        if config:
            # The argument used to be accepted but ignored.
            self.config.update(config)
        # Timestamp of the last periodic save; initialised lazily / at the start of evolve().
        self._last_save_time = None

    @staticmethod
    def _empty_dataset():
        """Return a fresh, empty column-oriented container for accepted examples."""
        return {
            'instruction': [],
            'response': [],
            'category': [],
            'evolution_strategy': [],
            'in-depth-evolving_operation': [],
            'epoch': []
        }

    def select_evolution_strategy(self):
        """Pick a random strategy, store it in ``config["strategy"]`` and return ``self``."""
        logger.info("Selecting evolution strategy")

        self.config["strategy"] = random.choice(self.STRATEGIES)
        logger.info(f"Evolution strategy: {self.config['strategy'][1]}")

        return self

    def select_in_depth_evolution_operation(self):
        """Pick a random in-depth operation, store it in the config and return ``self``."""
        logger.info("Selecting in-depth evolution operation")

        self.config["in_depth_evolution_operation"] = random.choice(self.IN_DEPTH_OPERATIONS)
        logger.info(f"In-depth evolution operation: {self.config['in_depth_evolution_operation'][1]}")

        return self

    def format_prompt_with_in_depth_evolution_operation(self):
        """Fill the ``{operation}`` placeholder of ``config["prompt"]`` and return ``self``.

        A plain string replacement is used instead of ``str.format`` so that braces
        elsewhere in the prompt (for example inside an embedded instruction) are never
        interpreted as format fields. Unknown operation ids leave the prompt untouched.
        """
        operation_id = self.config['in_depth_evolution_operation'][0]
        operation_text = self.IN_DEPTH_OPERATION_TEXT.get(operation_id)
        if operation_text is not None:
            self.config["prompt"] = self.config["prompt"].replace("{operation}", operation_text, 1)

        return self

    def generate_prompt(self, instruction):
        """Build the rewriting prompt for ``instruction`` into ``config["prompt"]``.

        Requires ``config["strategy"]`` (and, for in-depth evolution,
        ``config["in_depth_evolution_operation"]``) to be set. Returns ``self``.
        """
        strategy_id = self.config["strategy"][0]
        if strategy_id == 0:
            # Substitute the operation while the template holds no user text, then
            # insert the instruction verbatim; instructions containing '{' or '}'
            # (code, JSON, ...) therefore no longer break prompt construction.
            self.config["prompt"] = self.IN_DEPTH_PROMPT
            self.format_prompt_with_in_depth_evolution_operation()
            self.config["prompt"] = self.config["prompt"].replace("{instruction}", str(instruction))
        elif strategy_id == 1:
            self.config["prompt"] = self.IN_BREADTH_PROMPT.replace("{instruction}", str(instruction))

        print(f"Prompt: {self.config['prompt']}")
        return self

    def example_generator(self, generate):
        """Generate an evolved instruction and a response to it.

        Args:
            generate: callable taking a prompt string and returning generated text.

        Returns:
            ``(instruction, response)`` tuple of strings.
        """
        logger.info("Generating example")
        instruction = generate(self.config["prompt"]).replace("#Rewritten Prompt#:", "").replace("#Created Prompt#:", "").strip().strip('\n')
        print(instruction)

        logger.info("Generating response")
        response = generate(f"<human>: {instruction}\n<bot>: ")
        print(response)

        return instruction, response

    def instruction_evolver(self, instruction, generate):
        """Build the prompt for ``instruction`` and return ``(evolved_instruction, response)``."""
        return self \
            .generate_prompt(instruction) \
            .example_generator(generate)

    def _has_information_gain(self, original_instruction, evolved_instruction, generate):
        """Ask ``generate`` whether the two instructions differ; True means "not equal".

        The question is repeated up to ``MAX_EQUALITY_CHECK_ATTEMPTS`` times when the
        answer contains neither "not equal" nor the word "equal"; False is returned
        when no attempt gives a usable answer.
        """
        equality_check_prompt = f"""<human>: Do you think the following two instructions are equal to each other in that they meet the following requirements:
1. They have same constraints and requirements.
2. They have same depth and breadth of the inquiry.
The First Prompt: {original_instruction}
The Second Prompt: {evolved_instruction}
Your response should be either equal or not equal.
<bot>: The two prompts are """

        for _ in range(self.MAX_EQUALITY_CHECK_ATTEMPTS):
            print(equality_check_prompt)
            model_output = generate(equality_check_prompt, temp=0.0).lower().replace("*", "")
            print(model_output)

            if "not equal" in model_output:
                return True
            if "equal" in nltk.word_tokenize(model_output):
                return False

        return False

    @staticmethod
    def _is_response_difficult(response):
        """True for short (< 80 tokens) responses containing "sorry", in any letter case."""
        return 'sorry' in response.lower() and len(nltk.word_tokenize(response)) < 80

    @staticmethod
    def _contains_only_punctuation_and_stop_words(response):
        """True when every token is an English stop word or consists only of punctuation.

        Tokens are lower-cased before the stop-word lookup, and multi-character
        punctuation tokens are checked character by character.
        """
        stop_words = set(stopwords.words('english'))
        tokens = nltk.word_tokenize(response)
        return all(
            token.lower() in stop_words or all(char in string.punctuation for char in token)
            for token in tokens
        )

    @staticmethod
    def _contains_disallowed_phrases(instruction):
        """True when the instruction leaks prompt-template wording (case-insensitive).

        The '#...#' variants are covered by the same check because they contain
        the plain phrases.
        """
        lowered = instruction.lower()
        disallowed_phrases = ("given prompt", "created prompt", "rewritten prompt")
        return any(phrase in lowered for phrase in disallowed_phrases)

    def has_instruction_evolved(self, original_instruction, evolved_instruction, response, generate):
        """Decide whether an evolved instruction/response pair should be kept.

        The pair is rejected when either text is empty, when the response looks like a
        refusal, when the response is only stop words/punctuation, when the instruction
        contains template wording, or when ``generate`` judges the evolved instruction
        equal to the original. The local checks run first so that the expensive model
        call is only made for candidates that can still be accepted.

        Args:
            original_instruction: instruction before evolution.
            evolved_instruction: instruction produced by the evolution prompt.
            response: generated answer to ``evolved_instruction``.
            generate: callable accepting ``(prompt, temp=...)`` and returning text.

        Returns:
            True when the pair passes every filter, otherwise False.
        """
        # A failed generation yields an empty string; never accept that as an example.
        if not evolved_instruction.strip() or not response.strip():
            return False

        if self._is_response_difficult(response):
            return False
        if self._contains_only_punctuation_and_stop_words(response):
            return False
        if self._contains_disallowed_phrases(evolved_instruction):
            return False

        return self._has_information_gain(original_instruction, evolved_instruction, generate)

    def save_dataset(self, epoch, category, file_name_manual_epoch="", file_name_append_tag=""):
        """Write the accepted examples collected so far to a JSON file.

        The file is ``evolved/<category>/<epoch>_<strategy>[_<operation>][_<tag>].json``,
        where ``file_name_manual_epoch`` replaces ``epoch`` when it is non-empty.
        """
        name_parts = [
            str(file_name_manual_epoch if file_name_manual_epoch else epoch),
            self.config['strategy'][1],
        ]
        if self.config['in_depth_evolution_operation']:
            name_parts.append(self.config['in_depth_evolution_operation'][1])
        if file_name_append_tag:
            name_parts.append(f"{file_name_append_tag}")

        filename = os.path.join("evolved", category, "_".join(name_parts) + ".json")
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        with open(filename, "w") as f:
            json.dump(self.evolved_dataset, f)

    def check_and_save_dataset(self, epoch, category, file_name_manual_epoch="", file_name_append_tag=""):
        """Save the dataset when a periodic checkpoint is due.

        A checkpoint is due when the number of accepted examples is a multiple of
        ``SAVE_EVERY_N_EXAMPLES`` or when ``SAVE_EVERY_SECONDS`` have passed since the
        previous save.

        Returns:
            True when the dataset was written, otherwise False.
        """
        now = time()
        if self._last_save_time is None:
            self._last_save_time = now

        collected = len(self.evolved_dataset['instruction'])
        # The original comparison was `% 5 >= 0`, which is always true.
        due_by_count = collected % self.SAVE_EVERY_N_EXAMPLES == 0
        due_by_time = now - self._last_save_time >= self.SAVE_EVERY_SECONDS

        if due_by_count or due_by_time:
            logger.info("Saving...")

            self.save_dataset(epoch, category, file_name_manual_epoch, file_name_append_tag)
            self._last_save_time = time()
            return True

        logger.debug(f"Seconds since last save: {now - self._last_save_time}")
        return False

    def clear_evolved_instructions(self):
        """Discard the accepted examples collected so far."""
        self.evolved_dataset = self._empty_dataset()

    def _record_example(self, evolved_instruction, response, category, epoch):
        """Append one accepted example, with its metadata, to the dataset columns."""
        operation = self.config["in_depth_evolution_operation"]

        self.evolved_dataset['instruction'].append(evolved_instruction)
        self.evolved_dataset['response'].append(response)
        self.evolved_dataset['category'].append(category)
        self.evolved_dataset['evolution_strategy'].append(self.config["strategy"][1])
        self.evolved_dataset['in-depth-evolving_operation'].append(operation[1] if operation else "")
        self.evolved_dataset['epoch'].append(epoch)

    @staticmethod
    def _log_unevolved(epoch, category, instruction, evolved_instruction, response):
        """Append a rejected attempt to ``unevolved_instructions.txt``."""
        with open("unevolved_instructions.txt", "a") as f:
            f.write("------------------------------------------------------------------------------\n")
            f.write(f"{epoch}, {category}\n")
            f.write("Instruction Not Evolved\n")
            f.write("------------------------------------------------------------------------------\n")

            f.write(f"{instruction}\n")
            f.write("========================================\n")
            f.write(f"{evolved_instruction}\n")
            f.write("========================================\n")
            f.write(f"{response}\n")
            f.write("\n\n\n")

    def evolve(self, example_generate, eval_generate, category, file_name_manual_epoch="", file_name_append_tag="", num_epochs=None):
        """Run the evolution loop over the current pool.

        Args:
            example_generate: callable used to produce evolved instructions and responses.
            eval_generate: callable used for the equality check; it is called with
                ``temp=0.0`` as a keyword argument.
            category: category label stored with every example and used in file paths.
            file_name_manual_epoch: optional replacement for the epoch in file names.
            file_name_append_tag: optional suffix for file names.
            num_epochs: number of epochs; defaults to the module-level ``NUM_EPOCHS``.

        After each epoch the accepted examples are saved, the pool is replaced by the
        evolved instructions (unevolved or failed ones are carried over unchanged) and
        the in-memory dataset is cleared.
        """
        if num_epochs is None:
            num_epochs = NUM_EPOCHS

        self._last_save_time = time()

        for epoch in tqdm(range(num_epochs), desc="Evolving", unit="epoch"):
            new_pool = []

            self.select_evolution_strategy()
            if self.config["strategy"][0] == 0:
                self.select_in_depth_evolution_operation()
            else:
                # Do not let an operation chosen in an earlier epoch leak into the
                # metadata and file names of an in-breadth epoch.
                self.config["in_depth_evolution_operation"] = None

            for instruction in tqdm(self.pool, desc="Instruction", unit="instruction"):
                # Whatever happens below, exactly one entry per instruction is carried
                # into the next epoch; it is the original unless evolution succeeds.
                carried_forward = instruction
                try:
                    evolved_instruction, response = self.instruction_evolver(instruction, example_generate)
                    if self.has_instruction_evolved(instruction, evolved_instruction, response, eval_generate):
                        logger.info("Instruction Evolved")
                        print(f"Instruction Evolved: {evolved_instruction}\n\nResponse: {response}")

                        self._record_example(evolved_instruction, response, category, epoch)
                        carried_forward = evolved_instruction

                        saved = self.check_and_save_dataset(epoch, category, file_name_manual_epoch, file_name_append_tag)
                        if saved:
                            logger.info("Saved")
                    else:
                        logger.info("Instruction Not Evolved")
                        print(f"Instruction Not Evolved: {evolved_instruction}")

                        self._log_unevolved(epoch, category, instruction, evolved_instruction, response)

                    clear_output(wait=True)
                except Exception:
                    # Best effort: log the failure and move on. KeyboardInterrupt is
                    # deliberately not caught so a long run can still be stopped.
                    logger.exception("Evolution failed for an instruction; keeping it unchanged")
                finally:
                    new_pool.append(carried_forward)

            self.save_dataset(epoch, category, file_name_manual_epoch, file_name_append_tag)
            self.pool = new_pool
            self.clear_evolved_instructions()

# 4. Evolutions

In [None]:
def evolve_category():
    """Evolve one slice of one category, configured through module-level globals.

    Required globals: ``category``, ``NUM_EPOCHS`` (read by ``InstructionEvolution.evolve``),
    ``start``, ``end`` and ``data``.
    Optional globals: ``file_name_manual_epoch`` (defaults to ``""``) and
    ``starting_data`` (defaults to an empty list). When ``starting_data`` is non-empty
    it is used as the pool; otherwise the pool is
    ``data`` filtered to ``category`` and sliced as ``['instruction'][start:end]``.

    ``falcon_generate`` is used both to generate examples and to evaluate them.
    Output files are tagged with ``"{start}-{end}"``.
    """
    # time0 has to be a real global: the original check_and_save_dataset reads it.
    global time0

    # Optional inputs: fall back to defaults instead of raising NameError when a run
    # cell (or a fresh kernel) has not defined them yet.
    module_globals = globals()
    manual_epoch = module_globals.get("file_name_manual_epoch", "")
    seed_instructions = module_globals.get("starting_data") or []

    file_name_append_tag = f"{start}-{end}"

    if seed_instructions:
        evolve_data = seed_instructions
    else:
        category_data = data.filter(lambda x: x['category'] == category)
        evolve_data = category_data['instruction'][start:end]

    if not evolve_data:
        # Typically a misspelled category or an out-of-range slice; there is nothing to evolve.
        print(f"No instructions selected for category {category!r} with slice [{start}:{end}]; nothing to do.")
        return

    category_evolver = InstructionEvolution(evolve_data)

    print(len(category_evolver.pool), category_evolver.pool[:10])

    time0 = time()
    category_evolver.evolve(
        falcon_generate,
        falcon_generate,
        category,
        file_name_manual_epoch=manual_epoch,
        file_name_append_tag=file_name_append_tag)

## brainstorming

In [None]:
# Select the category for the following run cells and show how many examples it has.
category = "brainstorming"

len(data.filter(lambda x: x['category'] == category))

Filter:   0%|          | 0/15011 [00:00<?, ? examples/s]

1766

In [None]:
# Evolve the first 100 brainstorming examples for one epoch.
# Every global that evolve_category() reads is set here, so the cell also works on a
# fresh kernel (file_name_manual_epoch / starting_data were otherwise only defined by
# later cells).
category = "brainstorming"
NUM_EPOCHS = 1
start = 0
end = 100
file_name_manual_epoch = ""
starting_data = []

evolve_category()

In [None]:
category = "brainstorming"

# Two epochs over the slice [-101:-1] of the category. evolve_category() slices with
# [start:end], so the exclusive end of -1 leaves out the very last example.
NUM_EPOCHS = 2
start = -101
end = -1
file_name_manual_epoch = ""  # empty: the loop's epoch number is used in file names
starting_data = []  # empty: the pool is taken from `data` instead

evolve_category()


Instruction: 100%|██████████| 100/100 [1:53:13<00:00, 67.93s/instruction][A
Evolving: 100%|██████████| 2/2 [4:13:28<00:00, 7604.22s/epoch]  


## open_qa

In [None]:
# Select the category for the following run cells and show how many examples it has.
category = "open_qa"

len(data.filter(lambda x: x['category'] == category))

Filter:   0%|          | 0/15011 [00:00<?, ? examples/s]

3742

In [None]:
# One epoch over examples [0:100] of the current category.
# Relies on `category`, `file_name_manual_epoch` and `starting_data` set by earlier cells.
NUM_EPOCHS = 1
start = 0
end = 100

evolve_category()

In [None]:
# Two epochs over the slice [-101:-1]; the exclusive end of -1 leaves out the last example.
NUM_EPOCHS = 2
start = -101
end = -1
file_name_manual_epoch = ""
starting_data = []

evolve_category()


Instruction: 100%|██████████| 100/100 [1:38:51<00:00, 59.31s/instruction][A
Evolving: 100%|██████████| 2/2 [3:15:02<00:00, 5851.25s/epoch]  


In [None]:
# Three epochs over examples [700:800] of the current category.
NUM_EPOCHS = 3
start = 700
end = 800
file_name_manual_epoch = ""
starting_data = []

evolve_category()


Instruction: 100%|██████████| 100/100 [2:07:47<00:00, 76.68s/instruction][A
Evolving: 100%|██████████| 3/3 [6:35:08<00:00, 7902.78s/epoch]  


## closed_qa

In [None]:
# The category label is "closed_qa"; the former spelling "close_qa" matched no
# examples, so the filter produced an empty pool.
category = "closed_qa"

# One epoch over examples [0:100] of the category.
NUM_EPOCHS = 1
start = 0
end = 100

evolve_category()

Filter:   0%|          | 0/15011 [00:00<?, ? examples/s]

In [None]:
category = "closed_qa"

# Two epochs over the slice [-101:-1]; the exclusive end of -1 leaves out the last example.
NUM_EPOCHS = 2
start = -101
end = -1
file_name_manual_epoch = ""
starting_data = []

evolve_category()

## classification

In [None]:
# Select the category for the following run cells and show how many examples it has.
category = "classification"

len(data.filter(lambda x: x['category'] == category))

Filter:   0%|          | 0/15011 [00:00<?, ? examples/s]

2136

In [None]:
category = "classification"

# One epoch over examples [18:100].
# NOTE(review): the start offset of 18 presumably resumes an interrupted [0:100] run — confirm.
NUM_EPOCHS = 1
start = 18
end = 100

evolve_category()


Instruction: 100%|██████████| 82/82 [1:09:21<00:00, 50.75s/instruction][A
Evolving: 100%|██████████| 1/1 [1:09:21<00:00, 4161.59s/epoch]


In [None]:
# Two epochs over the slice [-101:-1]; the exclusive end of -1 leaves out the last example.
NUM_EPOCHS = 2
start = -101
end = -1
file_name_manual_epoch = ""
starting_data = []

evolve_category()


Instruction: 100%|██████████| 100/100 [1:59:33<00:00, 71.74s/instruction][A
Evolving: 100%|██████████| 2/2 [3:48:49<00:00, 6864.95s/epoch]  


## information_extraction

In [None]:
# Select the category for the following run cells and show how many examples it has.
category = "information_extraction"

len(data.filter(lambda x: x['category'] == category))

1506

In [None]:
# One epoch over examples [88:100].
# NOTE(review): the start offset of 88 presumably resumes an interrupted [0:100] run — confirm.
NUM_EPOCHS = 1
start = 88
end = 100

evolve_category()


Instruction: 100%|██████████| 12/12 [11:39<00:00, 58.31s/instruction][A
Evolving: 100%|██████████| 1/1 [11:39<00:00, 699.70s/epoch]


In [None]:
# Two epochs over the slice [-101:-1]; the exclusive end of -1 leaves out the last example.
NUM_EPOCHS = 2
start = -101
end = -1
file_name_manual_epoch = ""
starting_data = []

evolve_category()

In [None]:
# Two epochs over examples [700:800] of the current category.
NUM_EPOCHS = 2
start = 700
end = 800
file_name_manual_epoch = ""
starting_data = []

evolve_category()

## summarization

In [None]:
# Select the category for the following run cells and show how many examples it has.
category = "summarization"

len(data.filter(lambda x: x['category'] == category))

1188

In [None]:
# One epoch over examples [18:100].
# NOTE(review): the "0 + 18" offset presumably resumes an interrupted [0:100] run — confirm.
NUM_EPOCHS = 1
start = 0 + 18
end = 100

evolve_category()

In [None]:
# Two epochs over the slice [-101:-1]; the exclusive end of -1 leaves out the last example.
NUM_EPOCHS = 2
start = -101
end = -1
file_name_manual_epoch = ""
starting_data = []

evolve_category()

In [None]:
# Two epochs over examples [800:900] of the current category.
NUM_EPOCHS = 2
start = 800
end = 900
file_name_manual_epoch = ""
starting_data = []

evolve_category()

In [None]:
# Three epochs over examples [700:800] of the current category.
NUM_EPOCHS = 3
start = 700
end = 800
file_name_manual_epoch = ""
starting_data = []

evolve_category()


Instruction:  71%|███████   | 71/100 [2:11:30<1:37:40, 202.10s/instruction][A

Prompt: <human>: I want you to act as a prompt rewriter.
Your objective is to rewrite the #Given Prompt# into a more complex version.
But the rewritten prompt must be reasonable and must be understood and responded by humans.
Your rewriting cannot omit the non-text parts such as the table and code in #Given Prompt#:. Also, please do not omit the context in #Given Prompt#.
You should try your best not to make the #Rewritten Prompt# become verbose, #Rewritten Prompt# can only add 10 to 20 words into #Given Prompt#.
‘#Given Prompt#’, ‘#Rewritten Prompt#’, ‘given prompt’ and ‘rewritten prompt’ are not allowed to appear in #Rewritten Prompt#
You SHOULD complicate the given prompt by replacing general concepts with more specific concepts.
#Given Prompt#:
Please give me a short bulleted list of key points about Mars.
<bot>: #Rewritten Prompt#:


main: build = 887 (e52ddf2)
falcon.cpp: loading model from /home/user/.cache/huggingface/hub/models--TheBloke--h2ogpt-gm-oasst1-en-2048-falcon-40b-v2-GGML/snapshots/67bac2c70c558b2a01231392c86fd88c56aea064/h2ogpt-falcon-40b.ggmlv3.q6_k.bin
falcon.cpp: file version 4
falcon.cpp: fallback for old file format. Loading BPE merges from tokenizer.json
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|          Info |     format | n_vocab |   n_bpe | n_ctx | n_embd |   n_head ; kv | n_layer | falcon | ftype |   n_ff |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|               |    ggml v3 |   65024 |   64784 |  2048 |   8192 |     128 ;   8 |      60 | 40;40B |    18 |  32768 |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
falcon_model_load_internal: ggml ctx size =    0.00 

Please give me a detailed list of notable facts about Mars, including its physical properties, geological features, and any significant events in its history. Also, please include any relevant cultural or social aspects of the planet that have been influenced by human exploration or colonization efforts.


main: build = 887 (e52ddf2)
falcon.cpp: loading model from /home/user/.cache/huggingface/hub/models--TheBloke--h2ogpt-gm-oasst1-en-2048-falcon-40b-v2-GGML/snapshots/67bac2c70c558b2a01231392c86fd88c56aea064/h2ogpt-falcon-40b.ggmlv3.q6_k.bin
falcon.cpp: file version 4
falcon.cpp: fallback for old file format. Loading BPE merges from tokenizer.json
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|          Info |     format | n_vocab |   n_bpe | n_ctx | n_embd |   n_head ; kv | n_layer | falcon | ftype |   n_ff |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|               |    ggml v3 |   65024 |   64784 |  2048 |   8192 |     128 ;   8 |      60 | 40;40B |    18 |  32768 |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
falcon_model_load_internal: ggml ctx size =    0.00 


<human>: Do you think the following two instructions are equal to each other in that they meet the following requirements:
1. They have same constraints and requirements.
2. They have same depth and breadth of the inquiry.
The First Prompt: Please give me a short bulleted list of key points about Mars.
The Second Prompt: Please give me a detailed list of notable facts about Mars, including its physical properties, geological features, and any significant events in its history. Also, please include any relevant cultural or social aspects of the planet that have been influenced by human exploration or colonization efforts.
Your response should be either equal or not equal.
<bot>: The two prompts are 


main: build = 887 (e52ddf2)
falcon.cpp: loading model from /home/user/.cache/huggingface/hub/models--TheBloke--h2ogpt-gm-oasst1-en-2048-falcon-40b-v2-GGML/snapshots/67bac2c70c558b2a01231392c86fd88c56aea064/h2ogpt-falcon-40b.ggmlv3.q6_k.bin
falcon.cpp: file version 4
falcon.cpp: fallback for old file format. Loading BPE merges from tokenizer.json
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|          Info |     format | n_vocab |   n_bpe | n_ctx | n_embd |   n_head ; kv | n_layer | falcon | ftype |   n_ff |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|               |    ggml v3 |   65024 |   64784 |  2048 |   8192 |     128 ;   8 |      60 | 40;40B |    18 |  32768 |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
falcon_model_load_internal: ggml ctx size =    0.00 


<human>: Do you think the following two instructions are equal to each other in that they meet the following requirements:
1. They have same constraints and requirements.
2. They have same depth and breadth of the inquiry.
The First Prompt: Please give me a short bulleted list of key points about Mars.
The Second Prompt: Please give me a detailed list of notable facts about Mars, including its physical properties, geological features, and any significant events in its history. Also, please include any relevant cultural or social aspects of the planet that have been influenced by human exploration or colonization efforts.
Your response should be either equal or not equal.
<bot>: The two prompts are 


main: build = 887 (e52ddf2)
falcon.cpp: loading model from /home/user/.cache/huggingface/hub/models--TheBloke--h2ogpt-gm-oasst1-en-2048-falcon-40b-v2-GGML/snapshots/67bac2c70c558b2a01231392c86fd88c56aea064/h2ogpt-falcon-40b.ggmlv3.q6_k.bin
falcon.cpp: file version 4
falcon.cpp: fallback for old file format. Loading BPE merges from tokenizer.json
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|          Info |     format | n_vocab |   n_bpe | n_ctx | n_embd |   n_head ; kv | n_layer | falcon | ftype |   n_ff |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|               |    ggml v3 |   65024 |   64784 |  2048 |   8192 |     128 ;   8 |      60 | 40;40B |    18 |  32768 |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
falcon_model_load_internal: ggml ctx size =    0.00 


<human>: Do you think the following two instructions are equal to each other in that they meet the following requirements:
1. They have same constraints and requirements.
2. They have same depth and breadth of the inquiry.
The First Prompt: Please give me a short bulleted list of key points about Mars.
The Second Prompt: Please give me a detailed list of notable facts about Mars, including its physical properties, geological features, and any significant events in its history. Also, please include any relevant cultural or social aspects of the planet that have been influenced by human exploration or colonization efforts.
Your response should be either equal or not equal.
<bot>: The two prompts are 


main: build = 887 (e52ddf2)
falcon.cpp: loading model from /home/user/.cache/huggingface/hub/models--TheBloke--h2ogpt-gm-oasst1-en-2048-falcon-40b-v2-GGML/snapshots/67bac2c70c558b2a01231392c86fd88c56aea064/h2ogpt-falcon-40b.ggmlv3.q6_k.bin
falcon.cpp: file version 4
falcon.cpp: fallback for old file format. Loading BPE merges from tokenizer.json
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|          Info |     format | n_vocab |   n_bpe | n_ctx | n_embd |   n_head ; kv | n_layer | falcon | ftype |   n_ff |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|               |    ggml v3 |   65024 |   64784 |  2048 |   8192 |     128 ;   8 |      60 | 40;40B |    18 |  32768 |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
falcon_model_load_internal: ggml ctx size =    0.00 


<human>: Do you think the following two instructions are equal to each other in that they meet the following requirements:
1. They have same constraints and requirements.
2. They have same depth and breadth of the inquiry.
The First Prompt: Please give me a short bulleted list of key points about Mars.
The Second Prompt: Please give me a detailed list of notable facts about Mars, including its physical properties, geological features, and any significant events in its history. Also, please include any relevant cultural or social aspects of the planet that have been influenced by human exploration or colonization efforts.
Your response should be either equal or not equal.
<bot>: The two prompts are 


main: build = 887 (e52ddf2)
falcon.cpp: loading model from /home/user/.cache/huggingface/hub/models--TheBloke--h2ogpt-gm-oasst1-en-2048-falcon-40b-v2-GGML/snapshots/67bac2c70c558b2a01231392c86fd88c56aea064/h2ogpt-falcon-40b.ggmlv3.q6_k.bin
falcon.cpp: file version 4
falcon.cpp: fallback for old file format. Loading BPE merges from tokenizer.json
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|          Info |     format | n_vocab |   n_bpe | n_ctx | n_embd |   n_head ; kv | n_layer | falcon | ftype |   n_ff |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|               |    ggml v3 |   65024 |   64784 |  2048 |   8192 |     128 ;   8 |      60 | 40;40B |    18 |  32768 |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
falcon_model_load_internal: ggml ctx size =    0.00 


<human>: Do you think the following two instructions are equal to each other in that they meet the following requirements:
1. They have same constraints and requirements.
2. They have same depth and breadth of the inquiry.
The First Prompt: Please give me a short bulleted list of key points about Mars.
The Second Prompt: Please give me a detailed list of notable facts about Mars, including its physical properties, geological features, and any significant events in its history. Also, please include any relevant cultural or social aspects of the planet that have been influenced by human exploration or colonization efforts.
Your response should be either equal or not equal.
<bot>: The two prompts are 


main: build = 887 (e52ddf2)
falcon.cpp: loading model from /home/user/.cache/huggingface/hub/models--TheBloke--h2ogpt-gm-oasst1-en-2048-falcon-40b-v2-GGML/snapshots/67bac2c70c558b2a01231392c86fd88c56aea064/h2ogpt-falcon-40b.ggmlv3.q6_k.bin
falcon.cpp: file version 4
falcon.cpp: fallback for old file format. Loading BPE merges from tokenizer.json
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|          Info |     format | n_vocab |   n_bpe | n_ctx | n_embd |   n_head ; kv | n_layer | falcon | ftype |   n_ff |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|               |    ggml v3 |   65024 |   64784 |  2048 |   8192 |     128 ;   8 |      60 | 40;40B |    18 |  32768 |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
falcon_model_load_internal: ggml ctx size =    0.00 

## creative_writing

In [None]:
# Pick the category to work on (a notebook-level variable; presumably read by
# evolve_category() — confirm) and display how many rows of `data` carry it.
category = "creative_writing"

len(data.filter(lambda row: row["category"] == category))

709

In [None]:
# Single-epoch run over the 0-100 window, starting at offset 18 (0 + 18).
# NOTE(review): presumably the first 18 rows were handled by an earlier run —
# confirm before re-running from a fresh kernel.
NUM_EPOCHS = 1
start, end = 18, 100

evolve_category()


Instruction: 100%|██████████| 82/82 [1:43:14<00:00, 75.55s/instruction][A
Evolving: 100%|██████████| 1/1 [1:43:14<00:00, 6194.87s/epoch]


In [None]:
# Two-epoch run over the window start=-101 / end=-1.
# NOTE(review): the negative bounds presumably count from the end of the
# category subset — confirm how evolve_category() interprets them.
NUM_EPOCHS = 2
start, end = -101, -1

# Clear the manual-resume variables so the run does not pick up earlier state
# (presumably consumed by evolve_category() — confirm).
starting_data = []
file_name_manual_epoch = ""

evolve_category()

In [None]:
# Three-epoch run over the start=500 / end=600 window of the current category.
# NOTE(review): these notebook-level variables are presumably read by
# evolve_category(), which takes no arguments — confirm.
NUM_EPOCHS = 3
start, end = 500, 600

# Clear the manual-resume variables so the run does not pick up earlier state.
starting_data = []
file_name_manual_epoch = ""

evolve_category()


Instruction: 100%|██████████| 100/100 [3:46:25<00:00, 135.85s/instruction][A
Evolving: 100%|██████████| 3/3 [9:23:36<00:00, 11272.33s/epoch]  


## general_qa

In [None]:
# Pick the category to work on (a notebook-level variable; presumably read by
# evolve_category() — confirm) and display how many rows of `data` carry it.
category = "general_qa"

len(data.filter(lambda row: row["category"] == category))

Filter:   0%|          | 0/15011 [00:00<?, ? examples/s]

2191

In [None]:
# Single-epoch run over the start=0 / end=100 window of the current category.
# NOTE(review): NUM_EPOCHS, start and end are notebook-level variables that
# evolve_category() presumably reads, since it takes no arguments — confirm.
NUM_EPOCHS = 1
start, end = 0, 100

evolve_category()


Instruction: 100%|██████████| 100/100 [1:47:31<00:00, 64.51s/instruction][A
Evolving: 100%|██████████| 1/1 [1:47:31<00:00, 6451.12s/epoch]


In [None]:
# Two-epoch run over the window start=-101 / end=-1.
# NOTE(review): the negative bounds presumably count from the end of the
# category subset — confirm how evolve_category() interprets them.
NUM_EPOCHS = 2
start, end = -101, -1

# Clear the manual-resume variables so the run does not pick up earlier state
# (presumably consumed by evolve_category() — confirm).
starting_data = []
file_name_manual_epoch = ""

evolve_category()

In [None]:
# Three-epoch run over the start=500 / end=600 window of the current category.
# NOTE(review): these notebook-level variables are presumably read by
# evolve_category(), which takes no arguments — confirm.
NUM_EPOCHS = 3
start, end = 500, 600

# Clear the manual-resume variables so the run does not pick up earlier state.
starting_data = []
file_name_manual_epoch = ""

evolve_category()


Instruction: 100%|██████████| 100/100 [2:15:24<00:00, 81.25s/instruction] [A
Evolving: 100%|██████████| 3/3 [6:06:39<00:00, 7333.22s/epoch]  


# Tests

In [None]:
prompt = """Do you think the following two instructions are equal to each other, which meet the following requirements:
1. They have same constraints and requirements.
2. They have same depth and breadth of the inquiry.
The First Prompt: What is the world view on timber cutting?
The Second Prompt: What is the future outlook for the carpentry industry?
Respond with either "equal" or "not equal"
I think the two prompts are"""

prompt = prompt.replace("\"", "\\\"")


!ggllm.cpp/falcon_main -t 22 -ngl 60 -b 1 --temp 0.0 -m {falcon_model_path} -p "{prompt}"

main: build = 887 (e52ddf2)
falcon.cpp: loading model from /home/user/.cache/huggingface/hub/models--TheBloke--h2ogpt-gm-oasst1-en-2048-falcon-40b-v2-GGML/snapshots/67bac2c70c558b2a01231392c86fd88c56aea064/h2ogpt-falcon-40b.ggmlv3.q6_k.bin
falcon.cpp: file version 4
falcon.cpp: fallback for old file format. Loading BPE merges from tokenizer.json
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|          Info |     format | n_vocab |   n_bpe | n_ctx | n_embd |   n_head ; kv | n_layer | falcon | ftype |   n_ff |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|               |    ggml v3 |   65024 |   64784 |  2048 |   8192 |     128 ;   8 |      60 | 40;40B |    18 |  32768 |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
falcon_model_load_internal: ggml ctx size =    0.00 

In [None]:
# Check the equality judge on two unrelated instructions, calling
# falcon_generate with temp=0.1; the returned text is shown as the cell output.
falcon_generate(
    """<human>: Do you think the following two instructions are equal to each other in that they meet the following requirements:
1. They have same constraints and requirements.
2. They have same depth and breadth of the inquiry.
The First Prompt: What are some good body weight exercises for core and abs?
The Second Prompt: Name something that is not seen in the human body, but essential for life.
Your response should be either equal or not equal.
<bot>: The two prompts are """,
    temp=0.1,
)

main: build = 887 (e52ddf2)
falcon.cpp: loading model from /home/user/.cache/huggingface/hub/models--TheBloke--h2ogpt-gm-oasst1-en-2048-falcon-40b-v2-GGML/snapshots/67bac2c70c558b2a01231392c86fd88c56aea064/h2ogpt-falcon-40b.ggmlv3.q6_k.bin
falcon.cpp: file version 4
falcon.cpp: fallback for old file format. Loading BPE merges from tokenizer.json
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|          Info |     format | n_vocab |   n_bpe | n_ctx | n_embd |   n_head ; kv | n_layer | falcon | ftype |   n_ff |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
|               |    ggml v3 |   65024 |   64784 |  2048 |   8192 |     128 ;   8 |      60 | 40;40B |    18 |  32768 |
+---------------+------------+---------+---------+-------+--------+---------------+---------+--------+-------+--------+
falcon_model_load_internal: ggml ctx size =    0.00 

'The two prompts are **not equal**.\n\nThe first prompt is asking for a list of body weight exercises for core and abs, while the second prompt is asking for something that is not seen in the human body but is essential for life. These two prompts have different requirements and constraints, and they do not meet the same depth and breadth of inquiry.'

In [None]:
# Build an InstructionEvolution over a tiny sample — the 'instruction' field of
# rows 5, 6 and 7 of `data` — for an end-to-end smoke test.
ie = InstructionEvolution(
    data.select(range(5, 8))["instruction"]
)

In [None]:
# Run the evolution on the sample built above, passing falcon_generate for both
# callable arguments and "test" as the third argument.
# NOTE(review): the roles of these arguments are defined by
# InstructionEvolution.evolve, which is not visible here — confirm.
ie.evolve(falcon_generate, falcon_generate, "test")

100%|██████████| 3/3 [03:53<00:00, 77.99s/it] 
