# AviationQA 학습 전 응답

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tqdm import tqdm
import torch
import json
import glob
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0,2"

# Baseline (pre-fine-tuning) run: ask each local model the first few questions
# of every JSON QA file and write the responses next to the reference answers.

# Local checkpoints to query; commented-out entries are intentionally disabled.
model_list = [
    # "../model/LLM/gemma-7b", 
    # "../model/LLM/phi-mini-moe", 
    "../model/LLM/qwen2.5-1.5b",
    "../model/LLM/deepseek-qwen-bllossom-32b",
]

# NOTE(review): despite the ".csv" suffix this path is globbed below as a
# directory containing *.json files — confirm it really is a directory.
data_dir = "../data/AviationQA.csv"
result_base_dir = "../result/AviationQA"

# Only the first N items of each data file are sent to the model.
MAX_QUESTIONS_PER_FILE = 10

common_system_prompt = "You are a helpful AI assistant."

os.makedirs(result_base_dir, exist_ok=True)

json_files = glob.glob(os.path.join(data_dir, "*.json"))
if not json_files:
    # exit() does not abort a running notebook cell (the model loop below was
    # still executed with zero files), so fail loudly with an exception.
    raise FileNotFoundError(f"{data_dir}에 json파일 업슴")

for model_path in model_list:
    model_name = os.path.basename(model_path)
    model_result_dir = os.path.join(result_base_dir, model_name)
    os.makedirs(model_result_dir, exist_ok=True)

    if not os.path.isdir(model_path):
        print(f"오류: 로컬 경로 '{model_path}'에 모델 디렉토리가 존재하지 않습니다. 이 모델은 건너뜁니다.")
        print("-" * 60)
        print("\n")
        continue

    # Kept as None until loading succeeds so the finally-block can tell
    # whether there is anything to release.
    pipe = None
    try:
        pipe = pipeline(
            "text-generation",
            model=model_path,
            torch_dtype="auto",
            device_map="auto",
        )
        print(f"모델 '{model_name}'이 성공적으로 로드되었습니다.")

        # Loop-invariant: fall back to EOS when the tokenizer has no pad token.
        pad_token_id = pipe.tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = pipe.tokenizer.eos_token_id

        for json_file in tqdm(json_files, desc=f"Processing {model_name}"):
            # splitext only strips the trailing extension, unlike
            # str.replace which would also remove ".json" inside the name.
            file_name = os.path.splitext(os.path.basename(json_file))[0]
            output_file_path = os.path.join(model_result_dir, f"{file_name}_responses.txt")

            with open(output_file_path, 'w', encoding='utf-8') as outfile:
                outfile.write(f"--- Model: {model_name} ---\n")
                outfile.write(f"--- Source Data File: {os.path.basename(json_file)} ---\n\n")

                with open(json_file, 'r', encoding='utf-8') as f:
                    qa_data = json.load(f)

                for i, item in enumerate(qa_data):
                    if i >= MAX_QUESTIONS_PER_FILE:
                        break

                    question = item.get("question", "")
                    correct_answer = item.get("answer", "[정답 답변 없음]")
                    # The system prompt is folded into the user turn instead
                    # of being sent as a separate "system" message.
                    user_content = f"{common_system_prompt}\n\nQuestion: {question}"

                    messages = [
                        {"role": "user", "content": user_content}
                    ]

                    formatted_input = pipe.tokenizer.apply_chat_template(
                        messages,
                        tokenize=False,
                        add_generation_prompt=True
                    )

                    try:
                        outputs = pipe(
                            formatted_input,
                            max_new_tokens=512,
                            do_sample=True,
                            temperature=0.7,
                            top_p=0.9,
                            eos_token_id=pipe.tokenizer.eos_token_id,
                            pad_token_id=pad_token_id
                        )

                        # The pipeline output starts with the prompt; keep
                        # only the text generated after it.
                        model_response = outputs[0]['generated_text']
                        cleaned_response = model_response[len(formatted_input):].strip()

                        if not cleaned_response:
                            cleaned_response = "[모델이 답변을 생성하지 못했습니다]"

                    except Exception as gen_e:
                        # Best effort: record the failure for this question
                        # and continue with the next one.
                        cleaned_response = f"[답변 생성 중 오류 발생: {gen_e}]"
                        print(f"경고: '{model_name}' 모델의 '{file_name}' 파일 '{question}' 질문 답변 생성 중 오류: {gen_e}")

                    outfile.write(f"<질문 {i+1}>\n")
                    outfile.write(f"{question}\n\n")
                    outfile.write(f"<답변>\n")
                    outfile.write(f"{cleaned_response}\n")
                    outfile.write(f"<정답 답변>\n")
                    outfile.write(f"{correct_answer}\n")

                    outfile.write("-" * 40 + "\n\n")

            print(f"'{os.path.basename(json_file)}' 파일의 {MAX_QUESTIONS_PER_FILE}개 질문에 대한 답변을 저장했습니다: {output_file_path}")

    except Exception as e:
        # Any failure while loading or running this model is reported and the
        # loop moves on to the next model.
        print(f"오류: '{model_name}' 모델을 로드하거나 초기화하는 중 문제가 발생했습니다: {e}")

    finally:
        # Drop the pipeline reference and release cached GPU memory before
        # the next (possibly much larger) model is loaded.
        if pipe is not None:
            del pipe
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


../data/AviationQA.csv에 json파일 업슴


Device set to use cuda:0


모델 'qwen2.5-1.5b'이 성공적으로 로드되었습니다.


Processing qwen2.5-1.5b: 0it [00:00, ?it/s]
Loading checkpoint shards: 100%|██████████| 14/14 [03:56<00:00, 16.92s/it]
Device set to use cuda:0


모델 'deepseek-qwen-bllossom-32b'이 성공적으로 로드되었습니다.


Processing deepseek-qwen-bllossom-32b: 0it [00:00, ?it/s]


: 

# AviationQA 학습

In [None]:
import os
import gc
import json
import glob
import torch
import torch.nn as nn
from datasets import Dataset
from sklearn.model_selection import train_test_split
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, TrainingArguments, DataCollatorForLanguageModeling, Trainer

os.environ["CUDA_VISIBLE_DEVICES"] = "0,2"

In [None]:
class CustomTrainer(Trainer):
    """Trainer that computes the causal-LM cross-entropy loss itself."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the next-token prediction loss for one batch.

        Args:
            model: The model being trained; called with ``inputs`` minus labels.
            inputs: Batch dict; its ``"labels"`` entry is popped and used as
                the targets (positions equal to -100 are ignored by the loss).
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Extra arguments passed by ``Trainer``; unused here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # The logits at position t score the token at position t + 1, so drop
        # the last logit and the first label to align predictions with
        # targets. Without this shift the model is trained to echo its input.
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous().to(shift_logits.device)

        # CrossEntropyLoss ignores targets equal to -100 by default.
        loss_fct = nn.CrossEntropyLoss()
        # Use the real width of the logits rather than config.vocab_size; the
        # two can differ. Compute the loss in float32 for stability under fp16.
        loss = loss_fct(
            shift_logits.float().view(-1, shift_logits.size(-1)),
            shift_labels.view(-1),
        )
        return (loss, outputs) if return_outputs else loss
    
class QnADataset:
    """Loads question/context/answer records from JSON files and tokenizes them.

    Each record is turned into a prompt followed by the answer; only the
    answer tokens (and the trailing EOS, if any) carry real label ids, the
    prompt positions are labelled -100.
    """

    def __init__(self, data_paths, tokenizer, max_length=32786):
        """Read every file in ``data_paths`` and keep the tokenizer settings."""
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = self.load_data(data_paths)

    def load_data(self, data_paths):
        """Return the concatenation of the JSON arrays stored in ``data_paths``."""
        records = []
        for json_path in data_paths:
            with open(json_path, 'r', encoding="utf-8") as handle:
                records += json.load(handle)
        return records

    def prepare_input_output(self, item):
        """Build ``(prompt, answer)`` from one record's question/context/answer."""
        question, context = item['question'], item['context']
        prompt = f"질문 : {question}\n문서 : {context}\n답변 : "
        return prompt, item["answer"]

    def tokenize_data(self):
        """Tokenize all records and return them as a ``Dataset``.

        Every sample holds ``input_ids``, ``labels`` and ``attention_mask``
        of equal length, cut to at most ``self.max_length`` tokens.
        """
        samples = []
        for record in self.data:
            prompt, answer = self.prepare_input_output(record)

            # No special tokens here: EOS is appended explicitly below.
            prompt_ids = self.tokenizer(prompt, add_special_tokens=False)["input_ids"]
            answer_ids = self.tokenizer(answer, add_special_tokens=False)["input_ids"]

            input_ids = prompt_ids + answer_ids
            # Prompt positions are masked out of the loss with -100.
            labels = [-100] * len(prompt_ids) + answer_ids

            eos_id = self.tokenizer.eos_token_id
            if eos_id is not None:
                input_ids.append(eos_id)
                labels.append(eos_id)

            # Over-long samples are truncated from the right.
            if len(input_ids) > self.max_length:
                input_ids = input_ids[:self.max_length]
                labels = labels[:self.max_length]

            samples.append({
                "input_ids": input_ids,
                "labels": labels,
                "attention_mask": [1] * len(input_ids),
            })

        return Dataset.from_list(samples)
    
def setup_model_and_tokenizer(model_name):
    """Load a 4-bit (NF4) quantized causal LM and its tokenizer.

    Args:
        model_name: Path or identifier handed to ``from_pretrained``.

    Returns:
        ``(model, tokenizer)``; the tokenizer pads on the right and uses the
        EOS token as its padding token.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        # The option is spelled "bnb_4bit_use_double_quant"; the previous
        # "..._quants" spelling was not a recognised option, so double
        # quantization was never enabled.
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        # device_map={"":0},
        device_map="auto",
        trust_remote_code=True
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # EOS doubles as the padding token for every model loaded here.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return model, tokenizer

def setup_dora_config():
    """Build the adapter configuration (LoRA with DoRA enabled) for a causal LM."""
    # The adapter is attached to the four attention projection layers.
    attention_projections = ["q_proj", "v_proj", "k_proj", "o_proj"]
    return LoraConfig(
        task_type="CAUSAL_LM",
        use_dora=True,
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        bias="none",
        target_modules=attention_projections,
    )

def prepare_data(data_dir, train_path, test_path):
    """Create the train/test split files unless both already exist.

    All ``*.json`` files in ``data_dir`` are concatenated, split 80/20 with a
    fixed seed, and written to ``train_path`` and ``test_path``.

    Args:
        data_dir: Directory searched for ``*.json`` source files.
        train_path: Destination file for the training split.
        test_path: Destination file for the test split.

    Raises:
        FileNotFoundError: If ``data_dir`` contains no ``*.json`` file.
    """
    if os.path.exists(train_path) and os.path.exists(test_path):
        print(f"이미 분할된 파일 {train_path}, {test_path}를 사용")
        return

    all_json_files = glob.glob(os.path.join(data_dir, "*.json"))
    if not all_json_files:
        # The old message interpolated an undefined name ``e`` and therefore
        # raised NameError instead of the intended FileNotFoundError.
        raise FileNotFoundError(f"Error : no *.json files found in {data_dir}")

    all_data = []
    for path in all_json_files:
        with open(path, 'r', encoding="utf-8") as f:
            all_data.extend(json.load(f))

    # Fixed random_state keeps the split reproducible across runs.
    train_data, test_data = train_test_split(all_data, test_size=0.2, random_state=42)

    with open(train_path, 'w', encoding="utf-8") as f:
        json.dump(train_data, f, ensure_ascii=False, indent=2)
    with open(test_path, 'w', encoding="utf-8") as f:
        json.dump(test_data, f, ensure_ascii=False, indent=2)


def main():
    """Fine-tune every listed model with a DoRA adapter on the training split.

    For each model: load it 4-bit quantized, attach the adapter, tokenize the
    training file, train for three epochs, save the adapter and tokenizer to
    the model's output directory, then free GPU memory before the next model.
    """
    # Imported here because only this function needs it.
    from transformers import DataCollatorForSeq2Seq

    model_list = [
        "../model/LLM/gemma-7b",
        "../model/LLM/phi-mini-moe",
        "../model/LLM/qwen2.5-1.5b",
        "../model/LLM/deepseek-qwen-bllossom-32b",
    ]

    data_dir = "../data/AviationQA.csv"
    train_data_path = "./train_data.json"
    test_data_path = "./test_data.json"

    prepare_data(data_dir, train_data_path, test_data_path)

    for model_name in model_list:
        base_model_name_str = os.path.basename(model_name)
        # Must match the adapter directory the inference step looks for
        # ("finetuned-<model>-aviation"); the former "-harrypotter" suffix
        # made inference skip every model.
        output_dir = f"../model/finetuned-{base_model_name_str}-aviation"

        print(f"===== Training Start : {model_name} =====")

        model, tokenizer = setup_model_and_tokenizer(model_name)

        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = prepare_model_for_kbit_training(model)
        dora_config = setup_dora_config()
        model = get_peft_model(model, dora_config)
        model.print_trainable_parameters()

        dataset_handler = QnADataset([train_data_path], tokenizer)
        train_dataset = dataset_handler.tokenize_data()

        training_args = TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=1,
            gradient_accumulation_steps=8,
            num_train_epochs=3,
            learning_rate=2e-4,
            fp16=True,
            logging_steps=10,
            save_strategy="epoch",
            eval_strategy="no",
            warmup_steps=100,
            lr_scheduler_type="cosine",
            remove_unused_columns=False,
            dataloader_pin_memory=False
        )

        # DataCollatorForLanguageModeling(mlm=False) rebuilds "labels" from
        # "input_ids", which throws away the -100 prompt masking produced by
        # QnADataset and masks the EOS label (pad == EOS). This collator keeps
        # the dataset's labels and pads them with -100 instead.
        data_collator = DataCollatorForSeq2Seq(
            tokenizer=tokenizer,
            padding=True,
            label_pad_token_id=-100
        )

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            data_collator=data_collator,
            tokenizer=tokenizer
        )

        print("DoRA 시작")
        trainer.train()

        trainer.save_model()
        tokenizer.save_pretrained(output_dir)

        print(f"DoRA 파인튜닝 완료 : {output_dir}")

        # Release this model before loading the next one.
        del model, tokenizer, trainer, dataset_handler, train_dataset
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == "__main__":
    main()

이미 분할된 파일 ./train_data.json, ./test_data.json를 사용
===== Training Start : ../model/LLM/gemma-7b =====


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 13,275,136 || all params: 8,550,956,032 || trainable%: 0.1552


  trainer = CustomTrainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


DoRA 시작




Step,Training Loss


: 

# 추론

In [None]:
import os
import gc
import json
import torch
from datetime import datetime

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel

def main():
    """Generate answers for the test split with each fine-tuned adapter.

    Models whose adapter directory is missing are skipped. For every other
    model the quantized base model is loaded, the adapter is applied, each
    test item is answered, and the results are written to a JSON file in a
    result folder named after today's date.

    Raises:
        FileNotFoundError: If the test data file does not exist.
    """
    model_list = [
        "../model/LLM/gemma-7b",
        "../model/LLM/phi-mini-moe",
        "../model/LLM/qwen2.5-1.5b",
        "../model/LLM/deepseek-qwen-bllossom-32b",
    ]
    test_data_path = "./test_data.json"

    if not os.path.exists(test_data_path):
        raise FileNotFoundError(f"테스트 데이터 파일({test_data_path})이 없습니다. train.py를 먼저 실행하세요.")

    with open(test_data_path, 'r', encoding='utf-8') as f:
        test_data = json.load(f)

    today = datetime.now()
    date_folder_name = f"{today.month}월{today.day}일"
    result_dir = os.path.join("../result", date_folder_name)
    os.makedirs(result_dir, exist_ok=True)

    # Identical for every model, so it is built once. The option is spelled
    # "bnb_4bit_use_double_quant"; the previous "..._quants" spelling was not
    # a recognised option and left double quantization disabled.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    for model_name in model_list:
        base_model_name_str = os.path.basename(model_name)
        adapter_path = f"../model/finetuned-{base_model_name_str}-aviation"

        if not os.path.exists(adapter_path):
            print(f"\n===== 스킵: {adapter_path} 경로에 학습된 모델이 없습니다. =====")
            continue

        print(f"\n{'='*20} Inference Start : {adapter_path} {'='*20}")

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True
        )
        model = PeftModel.from_pretrained(model, adapter_path)
        tokenizer = AutoTokenizer.from_pretrained(adapter_path)

        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer
        )

        # Fall back to EOS when the tokenizer defines no padding token.
        pad_token_id = pipe.tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = pipe.tokenizer.eos_token_id

        inference_results = []

        for item in test_data:
            # NOTE(review): training samples are formatted as
            # "질문 : ...\n문서 : ...\n답변 : " without a chat template (see
            # QnADataset.prepare_input_output); this prompt differs from that
            # format — confirm the mismatch is intended.
            user_content = f"질문 : {item['question']}\n문서 : {item['context']}"
            messages = [
                {"role": "user", "content": user_content}
            ]

            # Apply the chat template using the pipeline's tokenizer.
            formatted_prompt = pipe.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )

            # Generate text with the pipeline.
            outputs = pipe(
                formatted_prompt,
                max_new_tokens=256,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                eos_token_id=pipe.tokenizer.eos_token_id,
                pad_token_id=pad_token_id
            )

            # Keep only the newly generated part (the output starts with the prompt).
            full_response = outputs[0]['generated_text']
            generated_answer = full_response[len(formatted_prompt):].strip()

            print("\n---")
            print(f"질문: {item['question']}")
            print(f"정답: {item['answer']}")
            print(f"모델 생성 답변: {generated_answer}")
            print("---")

            result_entry = {
                "question": item['question'],
                "ground_truth_answer": item['answer'],
                "generated_answer": generated_answer
            }
            inference_results.append(result_entry)

        file_name = f"{base_model_name_str}_results.json"
        save_path = os.path.join(result_dir, file_name)
        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump(inference_results, f, ensure_ascii=False, indent=4)
        print(f"\n추론 결과 저장 완료: {save_path}")

        # Release this model before loading the next one.
        del model, tokenizer, pipe
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == "__main__":
    main()




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

ValueError: Can't find 'adapter_config.json' at '../model/finetuned-gemma-7b-harrypotter'