In [1]:
%pip install -q transformers datasets loralib sentencepiece bitsandbytes accelerate langchain torch torchvision torchaudio peft numpy protobuf jupyter ipykernel

Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import LlamaTokenizer, LlamaForCausalLM, BitsAndBytesConfig, pipeline
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from peft import PeftModel, PeftConfig

import torch

model_id = "meta-llama/Llama-2-7b-chat-hf"
# peft_model_id = "quantumaikr/llama-2-ko-7b-chat-hf-4bit"
# config = PeftConfig.from_pretrained(peft_model_id)
model = LlamaForCausalLM.from_pretrained(
    model_id,
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    ),
    device_map="auto"
)
# model = PeftModel.from_pretrained(model, peft_model_id)
# model.to(device=f"cuda", non_blocking=True) # -> bitsandbytes를 사용하여 4비트로 양자화된 모델을 로드한 후, .to 메소드를 호출시 에러 발생

OSError: You are trying to access a gated repo.
Make sure to request access at https://huggingface.co/meta-llama/Llama-2-7b-chat-hf and pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`.

In [3]:
local_llm = HuggingFacePipeline(pipeline=pipeline(
    "text-generation",
    model=model,
    tokenizer=LlamaTokenizer.from_pretrained(model_id),
    max_length=1024,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.2,
    return_full_text=False,
))

llm_chain = LLMChain(
    llm=local_llm,
    prompt=PromptTemplate(input_variables=["instruction"], template="### 질문:\n{intruction}\n\n답변:")
)

conversation = ConversationChain(
    llm=local_llm,
    verbose=True,
    memory=ConversationBufferWindowMemory(k=4)
)

conversation.prompt.template = '''대화기록:
{history}
질문: {input}
답변:'''

def ask(question):
    print(llm_chain.invoke(question))

def chat(input):
    print(conversation.predict(input=input))

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonF

In [4]:
ask("나는 전재형이야. 오늘 뭐 먹었어?")

{'intruction': '나는 전재형이야. 오늘 뭐 먹었어?', 'text': '\n나는 프로그램으로 지정된 대상자에게만 존재하고, 일반인들에게는 접근이 제한됩니다. 따라서 오늘 나는 프로그램으로 지정된 대상자에게만 먹을 수 있습니다.'}


In [5]:
chat('안녕 나는 전재형이야')



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3m대화기록:

질문: 안녕 나는 전재형이야
답변:[0m

[1m> Finished chain.[0m
 안녕하세요, 전재형임


In [6]:
chat('내 이름이 뭔지 아니?')



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3m대화기록:
Human: 안녕 나는 전재형이야
AI:  안녕하세요, 전재형임
질문: 내 이름이 뭔지 아니?
답변:[0m

[1m> Finished chain.[0m
 내 이름은 '김철수'입니다.
