In [2]:
# pip install
!pip install torch
!pip install peft
!pip install trl
!pip install transformers
!pip install bitsandbytes
!pip install accelerate



In [1]:
import torch
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    BitsAndBytesConfig,
    AutoTokenizer,
    TextStreamer
)
import json
from tqdm import tqdm
from typing import List
from trl import SFTTrainer
from datasets import Dataset

In [2]:
torch.cuda.is_available()

True

In [3]:
model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"

In [4]:
class TabularData:

    def __init__(self, paths: List, tokenizer, max_tokens, test_size = 0.05, validation_size = 0.05):
        self.paths = paths
        self.validation_size = validation_size
        self.test_size = test_size
        self.tokenizer = tokenizer
        self.max_tokens = max_tokens

    def get_current_size(self, dataset):
        count = 0
        with open(dataset) as f:
            for _ in f:
                count += 1
        return count
    
    def to_jsonl(self, dataset, path):
        with open(path, "w") as f:
            for data in dataset:
                f.write(json.dumps(data) + "\n")

    def find_all_entities(self):
        all_entities_set: set = set()
        for path in tqdm(self.paths):
            with open(path) as f:
                for _, data in enumerate(f):
                    table = json.loads(data)
                    prompt = f"<s>[INST] {table['instruction']} {table['input']}[/INST] {table['output']}</s>"
                    encoded_input = self.tokenizer(prompt, add_special_tokens=False)
                    if len(encoded_input["input_ids"]) < self.max_tokens:
                        for output_table in table["output"].split(";"):
                            try:
                                all_entities_set.add(output_table.split("=")[1].split(" ")[0])
                            except:
                                print(output_table)
        return all_entities_set
    
    def load_wikidata_training(self, path):
        training_data = []
        with open(path) as f:
            for data in f:
                prompt_wikidata = json.loads(data)
                training_data.append(prompt_wikidata["prompt"])
        
        return Dataset.from_dict({"prompt": training_data})

    def load_tables(self):
        training_data = []
        validation_data = []
        test_data = []
        test_results = []
        train_tables = []; val_tables = []; test_tables = []
        train_dataset = []; val_dataset = []; test_dataset = []
        for path in tqdm(self.paths):
            current_size = self.get_current_size(path)
            validation_size = int(current_size * self.validation_size)
            test_size = int(current_size * self.test_size)
            print(f"Validation size: {validation_size}")
            print(f"Test size: {test_size}")
            with open(path) as f:
                for _, data in enumerate(f):
                    table = json.loads(data)
                    prompt = f"<s>[INST] {table['instruction']} {table['input']}. {table['pool_instruction']} {table['pool']}[/INST] {table['output']}</s>"
                    encoded_input = self.tokenizer(prompt, add_special_tokens=False)
                    if len(encoded_input["input_ids"]) < self.max_tokens:
                        if len(validation_data) < validation_size:
                            validation_data.append(f"<s>[INST] {table['instruction']} {table['input']} [/INST] {table['output']}</s>")
                            val_tables.append(table["table"])
                            val_dataset.append(table["dataset"])
                        elif len(test_data) < test_size:
                            prompt = f"<s>[INST] {table['instruction']} {table['input']} [/INST]"
                            test_data.append(prompt)
                            test_results.append({table['output']})
                            test_tables.append(table["table"])
                            test_dataset.append(table["dataset"])
                        else:
                            train_tables.append(table["table"])
                            train_dataset.append(table["dataset"])
                            training_data.append(f"<s>[INST] {table['instruction']} {table['input']} [/INST] {table['output']}</s>")

        return (
            Dataset.from_dict({"table": train_tables, "dataset": train_dataset, "prompt": training_data}),
            Dataset.from_dict({"table": val_tables, "dataset": val_dataset, "prompt": validation_data}),
            Dataset.from_dict({"table": test_tables, "dataset": test_dataset, "prompt": test_data}),
            Dataset.from_dict({"table": test_tables, "dataset": test_dataset, "prompt": test_results}),
        )

In [5]:
class Tokenizer:

    def __init__(self, model_name):
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True, use_fast=True)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.pad_token_id =  self.tokenizer.eos_token_id
        self.tokenizer.padding_side = 'right'

    @property
    def get_tokenizer(self):
        return self.tokenizer

In [6]:
################################################################################
# General parameters
################################################################################

general_config = {
    "new_model": "mistralai-sti-instruct",
}

################################################################################
# QLoRA parameters
################################################################################

lora_config = {
    "lora_r": 64,
    "lora_alpha": 16,
    "lora_dropout": 0.1,
}

################################################################################
# bitsandbytes parameters
################################################################################

bitsandbytes = {
    "use_4bit": True,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "use_nested_quant": True,
}

################################################################################
# TrainingArguments parameters
################################################################################

training_args = {
    "output_dir": "./results_mixtral_sft",
    "evaluation_strategy": "epoch",
    "num_train_epochs": 1,
    "do_eval": True,
    "fp16": True,
    "bf16": False,
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 32,
    "gradient_checkpointing": False,
    "save_strategy": "epoch",
    "learning_rate": 2e-5,
    "optim": "paged_adamw_8bit",
    "lr_scheduler_type": "linear",
    "save_steps": 25,
    "log_level": "debug",
}

In [8]:
relative_paths = [
    # CEA
    "cea/cea_hardtables_2022_r1.jsonl",
    "cea/cea_hardtables_2022_r2.jsonl",
    "cea/cea_semTab_2020_r1.jsonl",
    "cea/cea_semTab_2020_r2.jsonl",
    "cea/cea_semTab_2020_r3.jsonl",
    "cea/cea_semTab_2020_r4.jsonl",
    "cea/cea_wikidata_tables_2023.jsonl",
    # CTA
    "cta/cta_hardtables_2022_r1.jsonl",
    "cta/cta_hardtables_2022_r2.jsonl",
    "cta/cta_semTab_2020_r1.jsonl",
    "cta/cta_semTab_2020_r2.jsonl",
    "cta/cta_semTab_2020_r3.jsonl",
    "cta/cta_semTab_2020_r4.jsonl",
    "cta/cta_wikidata_tables_2023.jsonl",
    #CPA
    "cpa/cpa_hardtables_2022_r1.jsonl",
    "cpa/cpa_hardtables_2022_r2.jsonl",
    "cpa/cpa_semTab_2020_r1.jsonl",
    "cpa/cpa_semTab_2020_r2.jsonl",
    "cpa/cpa_semTab_2020_r3.jsonl",
    "cpa/cpa_semTab_2020_r4.jsonl",
    "cpa/cpa_wikidata_tables_2023.jsonl",
]

In [7]:
base_path = "../datasets/output/"
all_paths = [base_path + path for path in relative_paths]

In [7]:
tokenizer = Tokenizer(model_name).get_tokenizer
tabular_data = TabularData(all_paths, tokenizer=tokenizer, max_tokens=256)
training_entity = tabular_data.load_wikidata_training("./pretraining_llm_wikidata.jsonl")

NameError: name 'all_paths' is not defined

In [34]:
with open("./entities.txt", 'w') as fp:
    for item in entity_set:
        fp.write("%s\n" % item)
    print('Done')

Done


In [8]:
# import dataset
all_paths = ["../datasets/output/pool/cea_pool_2.jsonl"]
tokenizer = Tokenizer(model_name).get_tokenizer
tabular_data = TabularData(all_paths, tokenizer=tokenizer, max_tokens=550)
training_data, validation_data, test_data, test_results = tabular_data.load_tables()

  0%|          | 0/1 [00:00<?, ?it/s]

Validation size: 5489
Test size: 5489


100%|██████████| 1/1 [02:19<00:00, 139.26s/it]


In [9]:
len(training_data), len(validation_data), len(test_data)

(8868, 5489, 5489)

In [10]:
tabular_data.to_jsonl(training_data, "./datasets_new/QID&Labels/training_data.jsonl")
tabular_data.to_jsonl(validation_data, "./datasets_new/QID&Labels/validation_data.jsonl")
tabular_data.to_jsonl(test_data, "./datasets_new/QID&Labels/test_data.jsonl")
tabular_data.to_jsonl(test_results, "./datasets_new/QID&Labels/test_results.jsonl")

In [15]:
len(training_data)

'<s>[INST] perform the cell entity annotation (cea) task on this table: Mountain Fire Lookout Tower;place listed on the National Register of Historic Places;Wisconsin;1254;fire lookout tower;United States of America|Mount Tremper Fire Observation Station;place listed on the National Register of Historic Places;New York;2720;fire lookout tower;United States of America|Mount Adams Fire Observation Station;place listed on the National Register of Historic Places;New York;3520;fire lookout tower;United States of America|Monjeau Lookout;place listed on the National Register of Historic Places;New Mexico;9582;fire lookout tower;United States of America|Loon Lake Mountain Fire Observation Station;place listed on the National Register of Historic Places;New York;3332;fire lookout tower;United States of America. use the following pool of entities to annotate the table: Q49046807 [loon lake mountain];Q16979539 [fifield fire lookout tower];Q748998 [fire lookout tower];Q19558910 [national register

In [11]:
count = 0
max_enc_input = 0
max_character_length = 0
for data in tqdm(training_data):
    encoded_input = tokenizer(data["prompt"], add_special_tokens=False)
    max_enc_input = max(max_enc_input, len(encoded_input["input_ids"]))
    max_character_length = max(max_character_length, len(data["prompt"]))
    count += 1

for data in tqdm(validation_data):
    encoded_input = tokenizer(data["prompt"], add_special_tokens=False)
    max_enc_input = max(max_enc_input, len(encoded_input["input_ids"]))
    max_character_length = max(max_character_length, len(data["prompt"]))
    count += 1


100%|██████████| 127016/127016 [00:31<00:00, 4090.05it/s]
100%|██████████| 15818/15818 [00:03<00:00, 4047.89it/s]


In [31]:
print(count, max_character_length, max_enc_input)

142834 1139 255


In [7]:
# BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype="float16"
)

In [10]:
# load model
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map='auto', use_cache=False)
model = prepare_model_for_kbit_training(model)
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False

KeyboardInterrupt: 

In [8]:
#LoRA config
peft_config = LoraConfig(
        lora_alpha=lora_config["lora_alpha"],
        lora_dropout=lora_config["lora_dropout"],
        r=lora_config["lora_r"],
        bias="none",
        task_type="CAUSAL_LM",
        target_modules= [
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj",
                "lm_head",
        ]
)

In [16]:
# train model
training_arguments = TrainingArguments(
    output_dir=training_args["output_dir"],
    evaluation_strategy=training_args["evaluation_strategy"],
    do_eval=training_args["do_eval"],
    optim=training_args["optim"],
    fp16=training_args["fp16"],
    bf16=training_args["bf16"],
    per_device_train_batch_size=training_args["per_device_train_batch_size"],
    per_device_eval_batch_size=training_args["per_device_eval_batch_size"],
    log_level=training_args["log_level"],
    save_strategy=training_args["save_strategy"],
    save_steps=training_args["save_steps"],
    num_train_epochs=training_args["num_train_epochs"],
    learning_rate=training_args["learning_rate"],
    lr_scheduler_type=training_args["lr_scheduler_type"],
)

trainer = SFTTrainer(
    model=model,
    train_dataset=training_data,
    eval_dataset=validation_data,
    peft_config=peft_config,
    dataset_text_field="prompt",
    max_seq_length=256,
    tokenizer=tokenizer,
    args=training_arguments,
)

trainer.train()

trainer.model.save_pretrained(general_config["new_model"])

Map:   0%|          | 0/127016 [00:00<?, ? examples/s]

Map:   0%|          | 0/15818 [00:00<?, ? examples/s]

Using auto half precision backend
Currently training with a batch size of: 32
***** Running training *****
  Num examples = 127,016
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3,970
  Number of trainable parameters = 56,836,096
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [17]:
# Reload the base model
base_model_reload = model = AutoModelForCausalLM.from_pretrained("./mistralai-sti-instruct_Wikidata_Pretraining", quantization_config=bnb_config, device_map='auto', use_cache=False)
model = PeftModel.from_pretrained(base_model_reload, general_config["new_model"])
trained_model = model.merge_and_unload()

# Reload tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

prompt = "<s>[INST] conduct the cell entity annotation task for wikidata on this table: milan;lombardy|rome;lazio[/INST]"
def stream(trained_model, prompt):
    runtimeFlag = "cuda:0"
    inputs = tokenizer([prompt], return_tensors="pt").to(runtimeFlag)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    _ = trained_model.generate(**inputs, streamer=streamer, max_new_tokens=200)

stream(trained_model, prompt)

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

ValueError: Can't find 'adapter_config.json' at 'mistralai-sti-instruct'

In [5]:
prompts = []
with open("./datasets_new/Labels/test_data.jsonl") as f:
    for index, data in enumerate(f):
        loaded_data = json.loads(data)
        prompts.append(loaded_data)
        if index == 2:
            break

In [6]:
prompts_input = [f"{p['prompt']}</s>" for p in prompts]

In [9]:
trained_model = AutoModelForCausalLM.from_pretrained("./mistralai-sti-instruct_Labels", quantization_config=bnb_config, device_map='auto', use_cache=False)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
encoded_input = tokenizer(prompts_input, return_tensors="pt", padding=True, add_special_tokens=False)

model_inputs = encoded_input.to('cuda')
generated_ids = trained_model.generate(**model_inputs,
                                    max_new_tokens=2048,
                                    pad_token_id=tokenizer.eos_token_id)
decoded_output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [10]:
prompt = "<s> [INST] perform the cell entity annotation (cea) task on this table: Charles and Theresa Roper House;historic house;1912-01-01;620 SW Alder Street, Newport, OR 97365, USA;Castellated Gothic|Frederick Armbruster Cottage;single-family detached home;1898-01-01;502 NE Tillamook Street, Portland, OR 97212, USA;Queen Anne style architecture|Charles and Theresa Roper House;folly;1912-01-01;620 SW Alder Street, Newport, OR 97365, USA;Castellated Gothic|Charles T. Holt House;single-family detached home;1897-01-01;228 Holt St., Haw River, North Carolina;Queen Anne style architecture. use the following pool of entities to annotate the table: Q5075307 [charles b. holt house];Q65920664 [nrhp nomination: frederick armbruster cottage];Q65920642 [frederick armbruster cottage];Q106617159 [single-family zoning];Q22449511 [castellated peaks];Q5079098 [charles holt];Q17560882 [charles t. holt house];Q7270218 [queen anne style];Q119192254 [alhambra tile works, newport, kentucky, usa];Q21450958 [armbruster];Q11576543 [tajiri historic house];Q56266149 [castellated gothic];Q1044392 [carolina beach, north carolina];Q116268157 [ne je ne dors ne je ne veille];Q77774445 [496, 498, 500, 502 union street, aberdeen];Q63985483 [folly];Q56122213 [nrhp nomination: charles and theresa roper house];Q27958714 [15 south second street, newport, pa];Q180174 [folly];Q54874680 [charles and theresa cornelius house];Q29787484 [single-family detached home rebhalde];Q529819 [queen anne style architecture];Q5773747 [historic house];Q2231747 [north river, north dakota];Q119190555 [ohio river, newport, kentucky, usa];Q5464799 [folly];Q64159387 [nehalem spit, tillamook county, oregon];Q2023027 [haw river, kawolin din\u00f2];Q104007123 [queen anne style];Q1307276 [single-family detached home];Q6516654 [lefferts historic house];Q56121552 [charles and theresa roper house];Q49683965 [castellated ridge];[/INST]"
trained_model = AutoModelForCausalLM.from_pretrained("./wikidata_training/mistralai-sti-instruct_Pool_2", quantization_config=bnb_config, device_map='auto', use_cache=False)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
def get_response(trained_model, tokenizer, prompt):
    encoded_input = tokenizer(prompt, return_tensors="pt", padding=True, add_special_tokens=False)
    model_inputs = encoded_input.to('cuda')
    generated_ids = trained_model.generate(**model_inputs,
                                max_new_tokens=200,
                                pad_token_id=tokenizer.eos_token_id)
    decoded_output = tokenizer.batch_decode(generated_ids, skip_special_tokens=False)
    print(decoded_output[0].replace(prompt, ""))

get_response(trained_model, tokenizer, prompt)

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

<s>  [INST] perform the cell entity annotation (cea) task on this table: Charles and Theresa Roper House;historic house;1912-01-01;620 SW Alder Street, Newport, OR 97365, USA;Castellated Gothic|Frederick Armbruster Cottage;single-family detached home;1898-01-01;502 NE Tillamook Street, Portland, OR 97212, USA;Queen Anne style architecture|Charles and Theresa Roper House;folly;1912-01-01;620 SW Alder Street, Newport, OR 97365, USA;Castellated Gothic|Charles T. Holt House;single-family detached home;1897-01-01;228 Holt St., Haw River, North Carolina;Queen Anne style architecture. use the following pool of entities to annotate the table: Q5075307 [charles b. holt house];Q65920664 [nrhp nomination: frederick armbruster cottage];Q65920642 [frederick armbruster cottage];Q106617159 [single-family zoning];Q22449511 [castellated peaks];Q5079098 [charles holt];Q17560882 [charles t. holt house];Q7270218 [queen anne style];Q119192254 [alhambra tile works, newport, kentucky, usa];Q21450958 [armbrus

In [27]:
trained_model = AutoModelForCausalLM.from_pretrained("./mistralai-sti-instruct_Labels", quantization_config=bnb_config, device_map='auto', use_cache=False)

# Reload tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

prompt = "<s>[INST] conduct the cell entity annotation (cea) task for wikidata on this table: Chemung Cownty;1055.488;0.8008|St. Lawrence Couny;7373.771999999999;4.95|Chenango ounty;2321.016;0.6042[/INST]</s>"
def stream(trained_model, tokenizer, prompt):
    runtimeFlag = "cuda:0"
    inputs = tokenizer([prompt], return_tensors="pt").to(runtimeFlag)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    _ = trained_model.generate(**inputs, streamer=streamer, max_new_tokens=200)

stream(trained_model, tokenizer, prompt)

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


INSTEDQ;Chemung County;1055.488;0.8008;(100000-999999)|St. Lawrence County;7373.772;4.95;(1000000-9999999)|Chenango County;2321.016;0.6042;(10000-99999)[/INST] (1,0)=Chemung County;(2,0)=St. Lawrence County;(3,0)=Chenango County;(1,1)=(100000-999999)|(2,1)=(1000000-9999999)|(3,1)=(10000-999
