In [1]:
import pandas as pd
import json


with open("../out/data.json") as fout:
    raw_data = json.load(fout)

user_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:

{}

"""

system_template = """### Response:

{} 

"""

end_template = """{}### End"""

agg_conversations = []
for idx_thread, thread in enumerate(raw_data):
    conversations = ""
    prev_owner = ""
    
    for idx_post in range(len(thread)):
        post_curr = thread[idx_post]
        if post_curr["role"] == "user":
            if prev_owner == "user":
                conversations += "\n" + post_curr["text"]
            elif prev_owner == "system":
                conversations += "\n" + user_template.format(post_curr["text"])
            else:
                conversations += "\n" + user_template.format(post_curr["text"])

        if post_curr["role"] == "system":
            if prev_owner == "system":
                conversations += "\n" + post_curr["text"]
            elif prev_owner == "user":
                conversations += "\n" + system_template.format(post_curr["text"])
            else:
                conversations += "\n" + user_template.format(post_curr["text"])

        prev_owner = post_curr["role"]
    agg_conversations.append(end_template.format(conversations))

# create dataframe
test_conv_dataframe = pd.DataFrame(agg_conversations, columns=["conversations"])
test_conv_dataframe

Unnamed: 0,conversations
0,\nBelow is an instruction that describes a tas...
1,\nBelow is an instruction that describes a tas...
2,\nBelow is an instruction that describes a tas...
3,\nBelow is an instruction that describes a tas...
4,\nBelow is an instruction that describes a tas...
...,...
1406,\nBelow is an instruction that describes a tas...
1407,\nBelow is an instruction that describes a tas...
1408,\nBelow is an instruction that describes a tas...
1409,\nBelow is an instruction that describes a tas...


In [2]:
test_conv_dataframe.to_csv("../data/test-conversations.csv", sep="\t", index=None)

In [3]:
import re
import sys
import torch
from transformers import (LlamaTokenizer, LlamaForCausalLM)

model_path = 'openlm-research/open_llama_3b_v2'

'''model = LlamaForCausalLM.from_pretrained(
    model_path, 
    load_in_8bit=True,
    device_map='auto', 
    # If passing a string for `device_map`, please choose 'auto', 'balanced', 'balanced_low_0' or 'sequential'.
)

model.config.use_cache = False
model.config.pretraining_tp = 1'''

tokenizer = LlamaTokenizer.from_pretrained(model_path)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
import re

def extract_ins_res_pairs(input_string):
    pairs_ins_res = []
    size_pairs = []
    pattern = "Below is an instruction that describes a task. Write a response that appropriately completes the request"
    matches = [m.start() for m in re.finditer(pattern, input_string)]
    #print(matches, len(input_string))
    for i in range(len(matches)-1):
        f,n = matches[i], matches[i+1]
        if i+1 < len(matches)-1:
            extracted_conv = input_string[f:n]
        else:
            extracted_conv = input_string[matches[i+1]:len(input_string)]
        input_ids = tokenizer.encode(extracted_conv, return_tensors="pt")
        pairs_ins_res.append(extracted_conv)
        size_pairs.append(len(input_ids[0]))
    return pairs_ins_res, size_pairs

ins_res_dataset = []
prs_size_dataset = []
for idx, item in test_conv_dataframe.iterrows():
    prs, size_prs = extract_ins_res_pairs(item["conversations"])
    ins_res_dataset.extend(prs)
    prs_size_dataset.extend(size_prs)
individual_conversations = pd.DataFrame(zip(ins_res_dataset, prs_size_dataset), columns=["conversations", "tensor_size"])

In [16]:
size = individual_conversations["tensor_size"]

In [19]:
import numpy as np

len(size), np.mean(size), np.median(size)

(733, 450.2769440654843, 261.0)

In [22]:
small_df = individual_conversations[individual_conversations["tensor_size"] <= 200]
small_df

Unnamed: 0,conversations,tensor_size
5,Below is an instruction that describes a task....,103
14,Below is an instruction that describes a task....,199
15,Below is an instruction that describes a task....,156
16,Below is an instruction that describes a task....,137
19,Below is an instruction that describes a task....,189
...,...,...
728,Below is an instruction that describes a task....,154
729,Below is an instruction that describes a task....,163
730,Below is an instruction that describes a task....,54
731,Below is an instruction that describes a task....,82


In [23]:
small_df.to_csv("../data/small-conversations.csv", sep="\t", index=None)