# 기본적인 질문 답변 확인

In [None]:
import os
import torch
from transformers import pipeline

In [None]:
model_list = [
    "microsoft/Phi-mini-MoE-instruct",
    "Gensyn/Qwen2.5-1.5B-Instruct",
    "./model/LLM/deepseek-qwen-bllossom-32b"
    
]
question = "What is LLM?"

messages = [
    {"role" : "system", "content" : "You are a helpful AI assistant."},
    {"role" : "user", "content" : question}
]

for model_name in model_list :
    print(f"{model_name} 테스트")

    if not model_name :
        print("모델이 입력되지 않았습니다.")
        continue
    if model_name.startswith("./") and not os.path.isdir(model_name) :
        print(f"로컬 경로 {model_name}에 모델이 없음")
        continue
    try :
        pipe = pipeline(
            "text-generation",
            model=model_name,
            torch_dtype="auto",
            device_map="auto"
        )
        prompt = pipe.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        outputs = pipe(
            prompt,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9
        )

        print(f"모델 응답 : {outputs[0]['generated_text']}")

    except Exception as e :
        print(f"{model_name} 모델을 로드하거나 사용하는 중 오류 발생 : {e}")

    finally :
        print('-' * 20)
        print('\n')

./model/LLM/deepseek-qwen-bllossom-32b 테스트


Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

Device set to use cuda:0


모델 응답 : <｜begin▁of▁sentence｜>You are a helpful AI assistant.<｜User｜>What is LLM?<｜Assistant｜><think>
Okay, so I need to figure out what an LLM is. The user mentioned that it stands for Large Language Model, but I want to make sure I understand it thoroughly. Let me start by breaking down the term. "Large" probably refers to the size of the model in terms of parameters. "Language Model" suggests it's related to processing or generating human language. 

I remember hearing about models like GPT-3 or BERT. They're used for tasks like text generation, translation, and answering questions. So maybe LLMs are a category that includes these models. But how exactly do they work? I think they use machine learning, specifically deep learning, with neural networks. The architecture might involve transformers, which I've heard are a type of neural network architecture introduced by Vaswani et al. in 2017. Transformers use attention mechanisms to process sequences of data, which is useful for langua

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "google/gemma-7b-it" 

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)

if torch.cuda.is_available():
    model.to("cuda")
    print("모델이 GPU에 로드되었습니다.")
else:
    print("CUDA를 사용할 수 없어 모델이 CPU에 로드되었습니다. 추론 속도가 느릴 수 있습니다.")

prompt = "What is LLM?"

input_ids = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt}],
    tokenize=True,
    add_special_tokens=True,
    return_tensors="pt"
)

if torch.cuda.is_available():
    input_ids = input_ids.to("cuda")

outputs = model.generate(
    input_ids,
    max_new_tokens=500,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    pad_token_id=tokenizer.eos_token_id 
)

generated_text = tokenizer.decode(outputs[0, input_ids.shape[-1]:], skip_special_tokens=True)

print(f"프롬프트: {prompt}")
print(f"모델 답변: {generated_text}")

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


모델이 GPU에 로드되었습니다.
프롬프트: What is LLM?
모델 답변: Language Large Language Models (LLMs) are a type of deep learning model that are specifically designed to understand and generate human-like text. LLMs are trained on vast amounts of text data and are able to perform a wide range of tasks, including text summarization, translation, code generation, and information retrieval.

Here are some key features of LLMs:

* **Large-scale training:** LLMs are trained on massive datasets of text data, typically billions or even trillions of


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import os

question = "What is LLM?"

common_messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": question}
]

model_list = [
    "./model/LLM/gemma-7b", 
    "./model/LLM/phi-mini-moe", 
    "./model/LLM/qwen2.5-1.5b", 
    "./model/LLM/deepseek-qwen-bllossom-32b" 
]

for model_name in model_list:
    print(f"모델 테스트 시작: {model_name}")

    if model_name.startswith("./") and not os.path.isdir(model_name):
        print(f"오류: 로컬 경로 '{model_name}'에 모델 디렉토리가 존재하지 않습니다. 스킵합니다.")
        print("-" * 40)
        print("\n")
        continue

    try:
        pipe = pipeline(
            "text-generation",
            model=model_name,
            torch_dtype="auto",
            device_map="auto"
        )
        print(f"모델 '{model_name}'이 성공적으로 로드되었습니다.")

        if "Qwen" in model_name:
            formatted_input = pipe.tokenizer.apply_chat_template(
                common_messages,
                tokenize=False,
                add_generation_prompt=True
            )
        elif "Phi" in model_name:
             formatted_input = pipe.tokenizer.apply_chat_template(
                common_messages,
                tokenize=False,
                add_generation_prompt=True
            )
        elif "gemma" in model_name.lower():
            formatted_input = pipe.tokenizer.apply_chat_template(
                common_messages,
                tokenize=False,
                add_generation_prompt=True
            )
        else: 
            formatted_input = pipe.tokenizer.apply_chat_template(
                common_messages,
                tokenize=False,
                add_generation_prompt=True
            )

        outputs = pipe(
            formatted_input,
            max_new_tokens=500, 
            do_sample=True, 
            temperature=0.7, 
            top_k=50,
            top_p=0.9, 
            eos_token_id=pipe.tokenizer.eos_token_id 
        )

        
        full_generated_text = outputs[0]['generated_text']
        answer_start_index = len(formatted_input)
        
        cleaned_answer = full_generated_text[answer_start_index:].strip()
        
        print(f"프롬프트: {question}")
        print(f"모델 응답 ({model_name}): {cleaned_answer}")

    except Exception as e:
        print(f"오류: '{model_name}' 모델을 로드하거나 사용하는 중 문제가 발생했습니다: {e}")

    finally:
        if 'pipe' in locals():
            del pipe
        if 'model' in locals(): 
            del model
        if 'tokenizer' in locals(): 
            del tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            print("CUDA 캐시가 비워졌습니다.")
        print("-" * 40)
        print("\n")

모델 테스트 시작: google/gemma-7b-it


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


모델 'google/gemma-7b-it'이 성공적으로 로드되었습니다.
오류: 'google/gemma-7b-it' 모델을 로드하거나 사용하는 중 문제가 발생했습니다: System role not supported
CUDA 캐시가 비워졌습니다.
----------------------------------------


모델 테스트 시작: microsoft/Phi-mini-MoE-instruct


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


모델 'microsoft/Phi-mini-MoE-instruct'이 성공적으로 로드되었습니다.
프롬프트: What is LLM?
모델 응답 (microsoft/Phi-mini-MoE-instruct): LLM stands for "Master of Laws." It is a postgraduate academic degree in law, offered by law schools in many countries. The LLM is designed for individuals who have already completed an undergraduate law degree (such as a Juris Doctor, LLB, or JD) and wish to specialize in a particular area of law, gain advanced knowledge in a specific legal field, or prepare for a career in international law, law and business, or other specialized areas.

The LLM program typically takes one year to complete, although some programs may take up to two years. Candidates may need to have a certain number of years of work experience in the legal field, depending on the country and institution offering the program. The LLM program may include courses in various areas of law, such as international law, human rights, environmental law, intellectual property law, or commercial law, among others.

In

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Device set to use cuda:0


모델 'Gensyn/Qwen2.5-1.5B-Instruct'이 성공적으로 로드되었습니다.
프롬프트: What is LLM?
모델 응답 (Gensyn/Qwen2.5-1.5B-Instruct): LLM stands for Large Language Model. It refers to advanced artificial intelligence models that have the ability to generate human-like text across a wide range of topics, from writing stories and articles to answering questions on specific subjects. These models use deep learning algorithms to analyze vast amounts of data and learn patterns in language, allowing them to produce coherent and contextually appropriate responses.

Some popular examples of large language models include:

1. GPT-3 (Generative Pre-trained Transformer 3)
2. BERT (Bidirectional Encoder Representations from Transformers)
3. T5 (Text-to-Video)
4. DALL-E

These models have been trained using massive datasets and can perform tasks such as machine translation, question answering, summarization, and more. However, it's important to note that they may not always be accurate or fair, and their outputs should be re

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Device set to use cuda:0


모델 './model/LLM/deepseek-qwen-bllossom-32b'이 성공적으로 로드되었습니다.


: 

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda" 

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-7B-Instruct",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]


# 해리포터 QA 데이터 답변 결과

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


모델 'gemma-7b'이 성공적으로 로드되었습니다.


Processing gemma-7b:  14%|█▍        | 1/7 [00:23<02:18, 23.14s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


'01_Harry_Potter_and_the_Sorcerers_Stone.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/gemma-7b/01_Harry_Potter_and_the_Sorcerers_Stone_responses.txt


Processing gemma-7b:  29%|██▊       | 2/7 [00:43<01:48, 21.71s/it]

'02_Harry_Potter_and_the_Chamber_of_Secrets.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/gemma-7b/02_Harry_Potter_and_the_Chamber_of_Secrets_responses.txt


Processing gemma-7b:  43%|████▎     | 3/7 [01:17<01:48, 27.13s/it]

'03_Harry_Potter_and_the_Prisoner_of_Azkaban.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/gemma-7b/03_Harry_Potter_and_the_Prisoner_of_Azkaban_responses.txt


Processing gemma-7b:  57%|█████▋    | 4/7 [01:48<01:25, 28.53s/it]

'04_Harry_Potter_and_the_Goblet_of_Fire.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/gemma-7b/04_Harry_Potter_and_the_Goblet_of_Fire_responses.txt


Processing gemma-7b:  71%|███████▏  | 5/7 [02:00<00:45, 22.77s/it]

'05_Harry_Potter_and_the_Order_of_the_Phoenix.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/gemma-7b/05_Harry_Potter_and_the_Order_of_the_Phoenix_responses.txt


Processing gemma-7b:  86%|████████▌ | 6/7 [02:17<00:20, 20.60s/it]

'06_Harry_Potter_and_the_Half_Blood_Prince.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/gemma-7b/06_Harry_Potter_and_the_Half_Blood_Prince_responses.txt


Processing gemma-7b: 100%|██████████| 7/7 [02:26<00:00, 20.90s/it]

'07_Harry_Potter_and_the_Deathly_Hallows.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/gemma-7b/07_Harry_Potter_and_the_Deathly_Hallows_responses.txt





Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


모델 'phi-mini-moe'이 성공적으로 로드되었습니다.


Processing phi-mini-moe:  14%|█▍        | 1/7 [03:34<21:27, 214.54s/it]

'01_Harry_Potter_and_the_Sorcerers_Stone.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/phi-mini-moe/01_Harry_Potter_and_the_Sorcerers_Stone_responses.txt


Processing phi-mini-moe:  29%|██▊       | 2/7 [06:56<17:16, 207.35s/it]

'02_Harry_Potter_and_the_Chamber_of_Secrets.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/phi-mini-moe/02_Harry_Potter_and_the_Chamber_of_Secrets_responses.txt


Processing phi-mini-moe:  43%|████▎     | 3/7 [10:30<14:00, 210.19s/it]

'03_Harry_Potter_and_the_Prisoner_of_Azkaban.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/phi-mini-moe/03_Harry_Potter_and_the_Prisoner_of_Azkaban_responses.txt


Processing phi-mini-moe:  57%|█████▋    | 4/7 [14:04<10:35, 211.88s/it]

'04_Harry_Potter_and_the_Goblet_of_Fire.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/phi-mini-moe/04_Harry_Potter_and_the_Goblet_of_Fire_responses.txt


Processing phi-mini-moe:  71%|███████▏  | 5/7 [17:40<07:06, 213.28s/it]

'05_Harry_Potter_and_the_Order_of_the_Phoenix.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/phi-mini-moe/05_Harry_Potter_and_the_Order_of_the_Phoenix_responses.txt


Processing phi-mini-moe:  86%|████████▌ | 6/7 [21:19<03:35, 215.32s/it]

'06_Harry_Potter_and_the_Half_Blood_Prince.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/phi-mini-moe/06_Harry_Potter_and_the_Half_Blood_Prince_responses.txt


Processing phi-mini-moe: 100%|██████████| 7/7 [25:02<00:00, 214.68s/it]

'07_Harry_Potter_and_the_Deathly_Hallows.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/phi-mini-moe/07_Harry_Potter_and_the_Deathly_Hallows_responses.txt



Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Device set to use cuda:0


모델 'qwen2.5-1.5b'이 성공적으로 로드되었습니다.


Processing qwen2.5-1.5b:  14%|█▍        | 1/7 [00:14<01:29, 14.87s/it]

'01_Harry_Potter_and_the_Sorcerers_Stone.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/qwen2.5-1.5b/01_Harry_Potter_and_the_Sorcerers_Stone_responses.txt


Processing qwen2.5-1.5b:  29%|██▊       | 2/7 [00:27<01:08, 13.60s/it]

'02_Harry_Potter_and_the_Chamber_of_Secrets.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/qwen2.5-1.5b/02_Harry_Potter_and_the_Chamber_of_Secrets_responses.txt


Processing qwen2.5-1.5b:  43%|████▎     | 3/7 [00:45<01:02, 15.66s/it]

'03_Harry_Potter_and_the_Prisoner_of_Azkaban.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/qwen2.5-1.5b/03_Harry_Potter_and_the_Prisoner_of_Azkaban_responses.txt


Processing qwen2.5-1.5b:  57%|█████▋    | 4/7 [00:55<00:39, 13.20s/it]

'04_Harry_Potter_and_the_Goblet_of_Fire.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/qwen2.5-1.5b/04_Harry_Potter_and_the_Goblet_of_Fire_responses.txt


Processing qwen2.5-1.5b:  71%|███████▏  | 5/7 [01:06<00:25, 12.67s/it]

'05_Harry_Potter_and_the_Order_of_the_Phoenix.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/qwen2.5-1.5b/05_Harry_Potter_and_the_Order_of_the_Phoenix_responses.txt


Processing qwen2.5-1.5b:  86%|████████▌ | 6/7 [01:15<00:11, 11.33s/it]

'06_Harry_Potter_and_the_Half_Blood_Prince.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/qwen2.5-1.5b/06_Harry_Potter_and_the_Half_Blood_Prince_responses.txt


Processing qwen2.5-1.5b: 100%|██████████| 7/7 [01:26<00:00, 12.34s/it]

'07_Harry_Potter_and_the_Deathly_Hallows.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/qwen2.5-1.5b/07_Harry_Potter_and_the_Deathly_Hallows_responses.txt





Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

Device set to use cuda:0


모델 'deepseek-qwen-bllossom-32b'이 성공적으로 로드되었습니다.


Processing deepseek-qwen-bllossom-32b:  14%|█▍        | 1/7 [06:37<39:44, 397.42s/it]

'01_Harry_Potter_and_the_Sorcerers_Stone.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/deepseek-qwen-bllossom-32b/01_Harry_Potter_and_the_Sorcerers_Stone_responses.txt


Processing deepseek-qwen-bllossom-32b:  29%|██▊       | 2/7 [13:50<34:51, 418.34s/it]

'02_Harry_Potter_and_the_Chamber_of_Secrets.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/deepseek-qwen-bllossom-32b/02_Harry_Potter_and_the_Chamber_of_Secrets_responses.txt


Processing deepseek-qwen-bllossom-32b:  43%|████▎     | 3/7 [20:25<27:11, 407.86s/it]

'03_Harry_Potter_and_the_Prisoner_of_Azkaban.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/deepseek-qwen-bllossom-32b/03_Harry_Potter_and_the_Prisoner_of_Azkaban_responses.txt


Processing deepseek-qwen-bllossom-32b:  57%|█████▋    | 4/7 [27:05<20:14, 404.74s/it]

'04_Harry_Potter_and_the_Goblet_of_Fire.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/deepseek-qwen-bllossom-32b/04_Harry_Potter_and_the_Goblet_of_Fire_responses.txt


Processing deepseek-qwen-bllossom-32b:  71%|███████▏  | 5/7 [33:07<12:58, 389.10s/it]

'05_Harry_Potter_and_the_Order_of_the_Phoenix.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/deepseek-qwen-bllossom-32b/05_Harry_Potter_and_the_Order_of_the_Phoenix_responses.txt


Processing deepseek-qwen-bllossom-32b:  86%|████████▌ | 6/7 [39:52<06:34, 394.65s/it]

'06_Harry_Potter_and_the_Half_Blood_Prince.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/deepseek-qwen-bllossom-32b/06_Harry_Potter_and_the_Half_Blood_Prince_responses.txt


Processing deepseek-qwen-bllossom-32b: 100%|██████████| 7/7 [46:47<00:00, 401.12s/it]

'07_Harry_Potter_and_the_Deathly_Hallows.json' 파일의 10개 질문에 대한 답변을 저장했습니다: ./result/HarryPotter/deepseek-qwen-bllossom-32b/07_Harry_Potter_and_the_Deathly_Hallows_responses.txt





: 