In [1]:
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os
import json
from tqdm import tqdm
from datasets import Dataset
from pathlib import Path
import re
import time
from typing import List, Dict, Any
import subprocess
import threading

print(torch.cuda.device_count())

if torch.cuda.get_device_capability()[0] >= 8:
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16

print(f"attn_implementation: {attn_implementation}, torch_dtype: {torch_dtype}")

4
attn_implementation: flash_attention_2, torch_dtype: torch.bfloat16


In [2]:


def read_dataset(train_type, dir, path):
    # the file is originally json-list format
    # we want every first-level elements to be a string itself
    # for example, [{"Hi": "a'b'"}, {"Hi": "c'd'"}] -> ["""{"Hi": "a'b'"}""", """{"Hi": "c'd'"}"""]
    
    metadata = json.load(open(dir / "metadata.json", "r"))

    path = dir / path
    with open(path, "r", encoding="utf-8") as f:
        data = json.loads(f.read())
    
    result = []
    for d in data:
        if "v5" in dir.name:
            if train_type in ["woall", "FI", "ISP"]:
                del d["Response"]["Strategy"]
            
            if train_type in ["woall", "FI"]:
                del d["Response"]["Input Semantic Parsing"]
            
            if train_type in ["woall"]:
                del d["Response"]["Formalized Input"]
        elif "v6" in dir.name:
            if train_type in ["woall"]:
                del d["Response"]["생각"]
        
        tags = d["Tags"]["Style"]

        skip_tags = ["Graph", "Reason"]

        skip = False
        for skip_tag in skip_tags:
            if skip_tag in tags:
                skip = True
                break
        
        if skip:
            continue

        result.append({"Metadata": metadata, "Input": d["Input"], "Response": json.dumps(d["Response"], ensure_ascii=False)})
    # result = [{"Input": d["Input"], "Response": json.dumps(d["Response"], ensure_ascii=False)} for d in data]
    # print(f"Read {len(result)} examples from {path}")
    # print(f"Type of result: {type(result)}")
    # print(f"Type of result[0]: {type(result[0])}")
    # print(f"Type of result[0]['Input']: {type(result[0]['Input'])}")
    # print(f"Type of result[0]['Response']: {type(result[0]['Response'])}")
    return result

# Configuration
dataset_name = "v6-250306-optimizetoken"
dataset_name = "v7-250309-reduceinputanddatefunctioncall"
BASE_DIR = Path(f"../finetuning/dataset/{dataset_name}")


train_type = [
    "woall", # 0
    "FI", # 1
    "ISP", # 2
    "ours" # 3
][3]

if train_type == "woall":
    model_name, tr_config = \
        "sh2orc-Llama-3.1-Korean-8B-Instruct", \
        "r128_a256_woall/checkpoint-60"
    
    model_name, tr_config = \
        "sh2orc-Llama-3.1-Korean-8B-Instruct", \
        "v5_r64_a128_woall/checkpoint-72"

    model_name, tr_config = \
        "sh2orc-Llama-3.1-Korean-8B-Instruct", \
        "v5_r32_a64_woall/checkpoint-70"
    
    model_name, tr_config = \
        "sh2orc-Llama-3.1-Korean-8B-Instruct", \
        "v7_r256_a512_woall_16bit_adamw16bit_0322/checkpoint-60"

elif train_type == "FI":
    # model_name, tr_config = \
    #     "sh2orc-Llama-3.1-Korean-8B-Instruct", \
    #     f"r256_a512_FI/checkpoint-57"
    model_name, tr_config = \
        "sh2orc-Llama-3.1-Korean-8B-Instruct", \
        f"v5_r256_a512_FI/checkpoint-43"
    pass

elif train_type == "ISP":
    # model_name, tr_config = \
    #     "sh2orc-Llama-3.1-Korean-8B-Instruct", \
    #     f"r256_a512_ISP/checkpoint-104"
    pass

elif train_type == "ours":
    # model_name, tr_config = \
    #     "sh2orc-Llama-3.1-Korean-8B-Instruct", \
    #     "r128_a256_ours/checkpoint-51"
    
    model_name, tr_config = \
        "sh2orc-Llama-3.1-Korean-8B-Instruct", \
        "v7_r256_a512_ours_4bit_0322/checkpoint-37"
    
    model_name, tr_config = \
        "Bllossom-llama-3-Korean-Bllossom-70B", \
        "v7_r32_a64_ours_4bit_0322/checkpoint-67"

    model_name, tr_config = \
        "Bllossom-llama-3-Korean-Bllossom-70B", \
        "v7_r64_a128_ours_4bit_0322/checkpoint-38"

    model_name, tr_config = \
        "sh2orc-Llama-3.1-Korean-8B-Instruct", \
        "v7_r256_a512_ours_16bit_adamw16bit_0322/checkpoint-56"
print(f"Model: {model_name}, Config: {tr_config}")

model_dir = Path(f"/model/{model_name}")
checkpoint_dir = Path(f"{model_dir}/chkpts/{tr_config}")
cache_dir = Path(f"{model_dir}/cache")

# Verify paths exist
if not checkpoint_dir.exists():
    raise ValueError(f"Checkpoint directory {checkpoint_dir} does not exist")
if not BASE_DIR.exists():
    raise ValueError(f"Base directory {BASE_DIR} does not exist")

dataset = []
for scenario_dir in [d for d in BASE_DIR.iterdir() if d.is_dir() and "scenario" in d.name and "metadata.json" in [f.name for f in d.iterdir()]]:
    data = read_dataset(train_type, scenario_dir, "onlyq_ts.json")
    for i, d in enumerate(data):
        data[i]["Scenario"] = scenario_dir.name
    dataset.extend(data)
    

common_prompt = open(BASE_DIR / F"prompt.txt", "r").read()

if "v5" in BASE_DIR.name:
    if train_type in ["woall", "FI", "ISP"]:
        # search <|ST|>~~<|ST|> and remove between them
        common_prompt = re.sub(r"\n?<\|ST\|>(.|\n)*?<\|ST\|>", "", common_prompt)
    if train_type in ["woall", "FI"]:
        # search <|ISP|>~~<|ISP|> and remove between them
        common_prompt = re.sub(r"\n?<\|ISP\|>(.|\n)*?<\|ISP\|>", "", common_prompt)
    if train_type in ["woall"]:
        # search <|FI|>~~<|FI|> and remove between them
        common_prompt = re.sub(r"\n?<\|FI\|>(.|\n)*?<\|FI\|>", "", common_prompt)

elif "v6" in BASE_DIR.name or "v7" in BASE_DIR.name:
    if train_type in ["woall"]:
        # search <|FI|>~~<|FI|> and remove between them
        common_prompt = re.sub(r"\n?<\|Ours\|>(.|\n)*?<\|Ours\|>", "", common_prompt)

# remove all <||>
common_prompt = re.sub(r"<\|.*?\|>", "", common_prompt)

Model: sh2orc-Llama-3.1-Korean-8B-Instruct, Config: v7_r256_a512_ours_16bit_adamw16bit_0322/checkpoint-56


In [3]:
def extract_content(text: str) -> str:
    """Extract content from model output."""
    if "start_header_id" in text:
        pattern = r"<\|start_header_id\|>assistant<\|end_header_id\|>(.*?)<\|eot_id\|>"
    elif "start_of_turn" in text:
        pattern = r"<start_of_turn>model\n(.*?)<eos>"
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else None

In [4]:
class UnslothInference:
    def __init__(
        self,
        checkpoint_dir: str,
        cache_dir: str,
        max_new_tokens: int = 3500,
        attn_implementation: str = attn_implementation,
    ):
        

        self.checkpoint_dir = Path(checkpoint_dir)
        self.cache_dir = Path(cache_dir)
        self.max_new_tokens = max_new_tokens
        self.attn_implementation = attn_implementation

        # Verify model files exist
        if not self.checkpoint_dir.exists():
            raise ValueError(f"Checkpoint directory {checkpoint_dir} does not exist")
        # if not (self.checkpoint_dir / "config.json").exists():
        #     raise ValueError(f"config.json not found in {checkpoint_dir}")
        
        # Set torch dtype based on GPU capability
        self.torch_dtype = torch_dtype
        self.model, self.tokenizer = self.setup_model()


    def setup_model(self):
        """Initialize model and tokenizer for the given rank."""
        from unsloth import FastLanguageModel
        model, tokenizer = FastLanguageModel.from_pretrained(
            self.checkpoint_dir.as_posix(),
            dtype = self.torch_dtype,
            load_in_4bit = False,
            load_in_8bit = False,
            # quantization_config=BitsAndBytesConfig(
            #     load_in_4bit=True,
            #     bnb_4bit_use_double_quant=True,
            #     bnb_4bit_quant_type="nf4",
            #     bnb_4bit_compute_dtype=torch_dtype,
            #     # load_in_8bit=True,
            #     # llm_int8_enable_fp32_cpu_offload=True
            # ),
            attn_implementation=self.attn_implementation,
            cache_dir=self.cache_dir.as_posix(),
            local_files_only=True,
            device_map="cuda",
        )
        FastLanguageModel.for_inference(model)
    
        tokenizer.padding_side = "left"
        return model, tokenizer

    def infer(
        self,
        data: Dict,
        common_prompt: str,
    ) -> str:
        try:
            
            model, tokenizer = self.model, self.tokenizer
            start_time = time.time()
            metadata, input = data["Metadata"], data["Input"]
            if "llama" in model.config.architectures[0].lower():
                chat = [
                    {"role": "system", "content": common_prompt},
                    {"role": "user", "content": f"Metadata:{metadata};Input:{input};"},
                ]
            elif "gemma" in model.config.architectures[0].lower():
                chat = [
                    {"role": "user", "content": f"{common_prompt};{json.dumps(metadata)};{input}"},
                ]
            else:
                raise ValueError(f"Unsupported model architecture: {model.config.architectures[0]}")
            
            chat = tokenizer.apply_chat_template(
                chat,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt"
            ).to(model.device)


            # 배치 추론 실행
            outputs = model.generate(
                input_ids=chat,
                # attention_mask=attention_mask,
                max_new_tokens=self.max_new_tokens,
                use_cache=True,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=False  # 결정론적 생성
            )
            
            # 결과 디코딩 및 파싱
            response = tokenizer.batch_decode(outputs, skip_special_tokens=False)[0]
            parsed = extract_content(response)

            if parsed is None:
                raise ValueError(f"Failed to extract content from response: {response}")


            end_time = time.time()
            print(f"Elapsed time: {end_time - start_time:.2f}s")
            return parsed
            
        except Exception as e:
            print(f"Error in infer: {str(e)}")
            import traceback
            traceback.print_exc()
            return None
        
    def run(
        self,
        dataset: List[Dict],
        common_prompt: str
    ):
        """Run inference in batches."""
            
        # Setup model and tokenizer
        
        
        # 토크나이저에 패딩 토큰 설정
        # if tokenizer.pad_token is None:
        #     tokenizer.pad_token = tokenizer.eos_token
        # print(tokenizer.pad_token, tokenizer.eos_token)
        with tqdm(total=len(dataset)) as pbar:
            for data_idx in range(len(dataset)):
                data = dataset[data_idx]          
                
                # 배치 처리
                response = self.infer(
                    data, common_prompt
                )
                
                print(response)

In [5]:
class LLamaInference(UnslothInference):
    def __init__(
        self,
        gguf_path: str,
        binary_path: str = "/workspace/llama-cpp/build/bin/main",
        max_new_tokens: int = 100,
        temperature: float = 0.0,
        top_p: float = 1.0,
        seed: int = 42
    ):
        """
        Instead of loading a model via Hugging Face, this version sets up llama.cpp inference.
        It assumes that the GGUF model file is named 'model.gguf' and is located inside checkpoint_dir.
        """
        self.gguf_path = gguf_path
        self.binary_path = binary_path
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p
        self.seed = seed

        # Assume the GGUF model file is named "model.gguf" inside checkpoint_dir
        if not os.path.exists(self.gguf_path):
            raise ValueError(f"GGUF model file {self.model_path} does not exist")

    def infer(self, data: str, common_prompt:str) -> str:
        """
        Run inference by invoking the llama.cpp binary.
        Constructs a command line with the given prompt and generation parameters,
        and returns the generated text.
        """
        user_input = f"Metadata:{data['Metadata']};Input:{data['Input']};"

        # os.environ["CUDA_VISIBLE_DEVICES"] = "0"

        command = [
            str(self.binary_path),
            "-m", str(self.gguf_path),
            # "-sys", str(common_prompt),
            "--system-prompt-file", "prompt.temp",
            # "-p", str(user_input),
            # "--chat-template", "llama3",

            "-n", str(self.max_new_tokens),
            "-c", str(len(common_prompt) + len(user_input)+100),
            "--threads", str(os.cpu_count()),
            "-ngl", str(33),
            # "-no-cnv",
            # "-st",
            "--temp", str(self.temperature),
            "--top_p", str(self.top_p),
            # "--no-warmup",
            "--seed", str(self.seed)
        ]
        # Run the command and capture stdout
        def read_output(process):
            for line in iter(process.stdout.readline, ''):
                print(line, end='')  # 명령어의 출력 실시간 표시
            process.stdout.close()

        print(" ".join(command))
        process = subprocess.Popen(
            command,
            shell=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1
        )

        # 출력을 실시간으로 읽는 스레드
        thread = threading.Thread(target=read_output, args=(process,))
        thread.start()

        # 사용자의 입력을 받아서 명령어에 전달
        try:
            while process.poll() is None:  # 프로세스가 살아있는 동안
                process.stdin.flush()  # 즉시 전송
        except KeyboardInterrupt:
            print("Interrupted by user")
        finally:
            process.stdin.close()
            process.wait()
            thread.join()

    # def run(
    #     self,
    #     dataset: List[Dict],
    #     common_prompt: str, 
    #     output_file: str
    # ):
    #     # 배치 처리
    #     with tqdm(total=len(dataset)) as pbar:
    #         for data in dataset:

    #             response = self.infer(common_prompt, data)
                    
    #             if response is not None:
    #                 try:
    #                     response = eval(response)
    #                 except Exception as e:
    #                     print(f"Error in eval: {str(e)}")
                    
    #                 result = {
    #                     "Input": data["Input"],
    #                     "Scenario": data["Scenario"],
    #                     "Metadata": data["Metadata"],
    #                     "Candidate": response,
    #                 }
                    
    #                 with open(output_file, "a", encoding="utf-8") as f:
    #                     f.write(json.dumps(result, ensure_ascii=False) + "\n")
    #             else:
    #                 print(f"Error in response for sample {data}")
            
    #         pbar.update(1)

In [6]:
inference_type = [
    "unsloth", # 0
    "llama" # 1
][1]

if inference_type == "unsloth":
    inference = UnslothInference(
        checkpoint_dir=str(checkpoint_dir),
        cache_dir=str(cache_dir),
        max_new_tokens = 3500,
    )
else:
    gguf_path = Path(f"../src/i2i.gguf")
    if not gguf_path.exists():
        raise ValueError(f"GGUF model file {gguf_path} does not exist")
    
    with open("prompt.temp", "w") as f:
        f.write(common_prompt)
    
    inference = LLamaInference(
        gguf_path=gguf_path,
        binary_path = "../llama.cpp/build/bin/llama-cli",
        max_new_tokens = 500,
    )


In [7]:
inference.run(
    dataset=dataset[:1],
    common_prompt=common_prompt
)

  0%|          | 0/1 [00:00<?, ?it/s]

../llama.cpp/build/bin/llama-cli -m ../src/i2i.gguf --system-prompt-file prompt.temp -p Metadata:{'site_name': 'YongDongIllHighSchool', 'user_name': '홍길동', 'user_role': 'customer', 'idu_name': '01_IB5', 'idu_mapping': {'01_IB5': ['우리반'], '01_IB7': ['옆반'], '02_I81': ['앞반']}, 'modality_mapping': {'roomtemp': ['실내온도'], 'settemp': ['설정온도'], 'oper': ['전원']}, 'current_datetime': '2022-09-30 12:00:00'};Input:오늘 아침과 저녁의 온도차이는 얼마나 돼?; -n 500 -c 786 --threads 32 -ngl 33 -st --temp 0.0 --top_p 1.0 --seed 42
ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 4 CUDA devices:
  Device 0: NVIDIA RTX A6000, compute capability 8.6, VMM: yes
  Device 1: NVIDIA RTX A6000, compute capability 8.6, VMM: yes
  Device 2: NVIDIA RTX A6000, compute capability 8.6, VMM: yes
  Device 3: NVIDIA RTX A6000, compute capability 8.6, VMM: yes
build: 4943 (18b663d8) with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: llama backend init
main: l

  0%|          | 0/1 [00:01<?, ?it/s]

None



