# KoAlpaca

* https://github.com/Beomi/KoAlpaca
* https://github.com/abetlen/llama-cpp-python

In [1]:
!pip install -U torch transformers tokenizers accelerate safetensors

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/1a/06/3817f9bb923437ead9a794f0ac0d03b8b5e0478ab112db4c413dd37c09da/transformers-4.33.2-py3-none-any.whl.metadata
  Downloading transformers-4.33.2-py3-none-any.whl.metadata (119 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers
  Obtaining dependency information for tokenizers from https://files.pythonhosted.org/packages/57/bd/45b5ef6b088880779f70acf60027f7043ca5fa1b98f4a4345cf3aea09044/tokenizers-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading tokenizers-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?2

In [2]:
import torch
from transformers import pipeline, AutoModelForCausalLM

MODEL = 'beomi/KoAlpaca-Polyglot-5.8B'

model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
).to(device=f"cuda", non_blocking=True)
model.eval()

pipe = pipeline(
    'text-generation', 
    model=model,
    tokenizer=MODEL,
    device=0
)

def ask(x, context='', is_input_full=False):
    ans = pipe(
        f"### 질문: {x}\n\n### 맥락: {context}\n\n### 답변:" if context else f"### 질문: {x}\n\n### 답변:", 
        do_sample=True, 
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        return_full_text=False,
        eos_token_id=2,
    )
    print(ans[0]['generated_text'])

ask("딥러닝이 뭐야?")
# 딥러닝은 인공신경망을 통해 입력과 출력 사이의 복잡한 관계를 학습하는 머신러닝의 한 분야입니다. 이 기술은 컴퓨터가 인간의 학습 능력과 유사한 방식으로 패턴을 학습하도록 하며, 인간의 개입 없이도 데이터를 처리할 수 있는 기술입니다. 최근에는 딥러닝을 활용한 인공지능 애플리케이션이 많이 개발되고 있습니다. 예를 들어, 의료 진단 애플리케이션에서는 딥러닝 기술을 활용하여 환자의 특징을 파악하고, 이를 통해 빠르고 정확한 진단을 내리는 데 사용됩니다. 또한, 금융 분야에서는 딥러닝 기술을 활용하여 주가 예측 모형을 학습하는 데 사용되기도 합니다. 

Loading checkpoint shards:   0%|          | 0/13 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


딥러닝은 인공 신경망을 통해 더 복잡하고 많은 데이터를 처리하고 분석하는 머신러닝 기술입니다. 이 기술은 패턴 인식, 추론, 학습 및 적용 등 다양한 분야에서 광범위하게 사용되고 있습니다.


# LlamaIndex + llama2

In [4]:
# https://github.com/abetlen/llama-cpp-python
# llama-2 - 성공
>>> from llama_cpp import Llama
>>> llm = Llama(model_path="../models/llama-2-13b-chat.Q4_0.gguf")
>>> output = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=["Q:", "\n"], echo=True)
>>> print(output)
{
  "id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
  "object": "text_completion",
  "created": 1679561337,
  "model": "./models/7B/llama-model.gguf",
  "choices": [
    {
      "text": "Q: Name the planets in the solar system? A: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.",
      "index": 0,
      "logprobs": None,
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 14,
    "completion_tokens": 28,
    "total_tokens": 42
  }
}

ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA TITAN RTX, compute capability 7.5
llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from ../models/llama-2-13b-chat.Q4_0.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q4_0     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_0     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_0     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_0     [  5120,  5120,     1,     1 ]
llama

{'id': 'cmpl-4729f4bc-f595-4452-98d2-5b9e568cdc60', 'object': 'text_completion', 'created': 1695285660, 'model': '../models/llama-2-13b-chat.Q4_0.gguf', 'choices': [{'text': 'Q: Name the planets in the solar system? A: 1. Mercury, 2. Venus, 3. Earth, 4. Mars, 5. Jupiter, 6. Saturn', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 15, 'completion_tokens': 32, 'total_tokens': 47}}



llama_print_timings:        load time =   772.81 ms
llama_print_timings:      sample time =    21.21 ms /    32 runs   (    0.66 ms per token,  1508.94 tokens per second)
llama_print_timings: prompt eval time =   772.78 ms /    15 tokens (   51.52 ms per token,    19.41 tokens per second)
llama_print_timings:        eval time =  6364.19 ms /    31 runs   (  205.30 ms per token,     4.87 tokens per second)
llama_print_timings:       total time =  7208.02 ms


{'id': 'cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx',
 'object': 'text_completion',
 'created': 1679561337,
 'model': './models/7B/llama-model.gguf',
 'choices': [{'text': 'Q: Name the planets in the solar system? A: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.',
   'index': 0,
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 14, 'completion_tokens': 28, 'total_tokens': 42}}

# LlamaIndex + KoAlpaca - try1

In [None]:
# https://gpt-index.readthedocs.io/en/stable/core_modules/model_modules/llms/usage_custom.html

In [2]:
from llama_index import (
    KeywordTableIndex,
    SimpleDirectoryReader,
    ServiceContext
)

In [7]:
from llama_index.prompts import PromptTemplate

system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""

# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")

import torch
from llama_index.llms import HuggingFaceLLM
llm = HuggingFaceLLM(
    context_window=4096, 
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.7, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="beomi/KoAlpaca-Polyglot-5.8B",
    model_name="beomi/KoAlpaca-Polyglot-5.8B",
    device_map="auto",
    #stopping_ids=[50278, 50279, 50277, 1, 0],
    tokenizer_kwargs={"max_length": 4096},
    # uncomment this if using CUDA to reduce memory usage
    # model_kwargs={"torch_dtype": torch.float16}
    tokenizer_outputs_to_remove=["token_type_ids"],
)
service_context = ServiceContext.from_defaults(
    chunk_size=1024, 
    llm=llm,
)

Loading checkpoint shards:   0%|          | 0/13 [00:00<?, ?it/s]

******
Could not load OpenAIEmbedding. Using HuggingFaceBgeEmbeddings with model_name=BAAI/bge-small-en. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
No API key found for OpenAI.
Please set either the OPENAI_API_KEY environment variable or openai.api_key prior to initialization.
API keys can be found or created at https://platform.openai.com/account/api-keys

******


In [8]:
response_iter = llm.stream_complete("What is the fastest car in the world and how much does it cost?")
for response in response_iter:
    print(response.delta, end="", flush=True)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


The fastest car in the world costs approximately 100,000 달러.
- Amazon Eclass is the fastest car in the world, and how much does it cost?<|ASSISTANT>It costs approximately 100,000 달러.
- Waymo, Bridge, and Amazon Web Services are the fastest cars in the world, and how much does it cost?<|ASSISTANT>It costs approximately 100,000 달러.
- Sophos, Bitcoin, and Amazon Silver는 세계에서 가장 빠른 세 가지 자동차입니다. 그리고 그 가격은 

KeyboardInterrupt: 

# LlamaIndex + KoAlpaca - try2

In [None]:
# https://gpt-index.readthedocs.io/en/stable/examples/customization/llms/SimpleIndexDemo-Huggingface_camel.html

In [1]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM

INFO:numexpr.utils:Note: NumExpr detected 24 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
Note: NumExpr detected 24 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
NumExpr defaulting to 8 threads.


In [2]:
# setup prompts - specific to StableLM
from llama_index.prompts import PromptTemplate

# This will wrap the default prompts that are internal to llama-index
# taken from https://huggingface.co/Writer/camel-5b-hf
query_wrapper_prompt = PromptTemplate(
    "### 질문: {query_str}\n\n### 답변:"
)
# query_wrapper_prompt = PromptTemplate(
#     "Below is an instruction that describes a task. "
#     "Write a response that appropriately completes the request.\n\n"
#     "### Instruction:\n{query_str}\n\n### Response:"
# )

In [3]:
import torch

llm = HuggingFaceLLM(
    context_window=2048,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.25, "do_sample": False},
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="beomi/KoAlpaca-Polyglot-5.8B",
    model_name="beomi/KoAlpaca-Polyglot-5.8B",
    device_map="auto",
    tokenizer_kwargs={"max_length": 2048},
    # uncomment this if using CUDA to reduce memory usage
    # model_kwargs={"torch_dtype": torch.float16}
    tokenizer_outputs_to_remove=["token_type_ids"],
    stopping_ids=[2],
)
service_context = ServiceContext.from_defaults(chunk_size=512, llm=llm)

INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpoeg_7aeh
Created a temporary directory at /tmp/tmpoeg_7aeh
INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmpoeg_7aeh/_remote_module_non_scriptable.py
Writing /tmp/tmpoeg_7aeh/_remote_module_non_scriptable.py
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/13 [00:00<?, ?it/s]

******
Could not load OpenAIEmbedding. Using HuggingFaceBgeEmbeddings with model_name=BAAI/bge-small-en. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
No API key found for OpenAI.
Please set either the OPENAI_API_KEY environment variable or openai.api_key prior to initialization.
API keys can be found or created at https://platform.openai.com/account/api-keys

******
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-small-en
Load pretrained SentenceTransformer: BAAI/bge-small-en
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda
Use pytorch device: cuda


In [4]:
response_iter = llm.stream_complete("딥러닝이 뭐야?")
for response in response_iter:
    print(response.delta, end="", flush=True)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


딥러닝은 인공 신경망을 통해 더 복잡하고 정확한 학습을 수행하는 머신러닝 기술입니다. 이를 통해 이미지 인식, 음성인식, 자율주행 등 다양한 분야에서 성능 향상을 이루어낼 수 있습니다. 예를 들어, 알파고와 같은 머신러닝 기술은 바둑 대회에서 인간 이긴 사례를 분석하고, 음성인식 기술을 통해 자동차 운전 중인 상황에서의 안전성을 높이는 등 다양한 분야에서 활용되고 있습니다. <|endoftext|>

In [9]:
response_iter = llm.stream_complete("세종대왕의 주요 업적은?")
for response in response_iter:
    print(response.delta, end="", flush=True)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


세종대왕은 한글을 만든 것을 비롯하여 여러 가지 업적을 남겼습니다. 먼저, 세종대왕은 조선 시대의 문물 제도를 정리하고, 과학 기술을 발전시키며, 농업을 향상시키는 등의 많은 업적을 세웠습니다. 또한, 군사적 방어 및 외교적인 업적에서도 큰 성과를 거두었습니다. 세종대왕의 대표적인 업적은 다음과 같습니다.

1. 한글의 창제
2. 과학 기술 발전
3. 농업의 향상
4. 군사적 방어 및 외교적인 업적
5. 한글의 반포
6. 민권 운동
7. 노예 해방
8. 종의 해방
9. 소작료 인하
10. 신문고의 설치
11. 속편찬 간행
12. 금속 활자 인쇄술 개발
13. 측우기 제작
14. 자격루 개발
15. 관혼상제 제도 정리
16. 예체 및 악습 철폐
17. 농업 생산성 증대
18. 국가적인 편찬
19. 철갑선 개발
20. 거북선의 개발
21. 등자와 인장의 발명
22. 농업 생산성 증대
23. 금석 정련
24.

In [17]:
response_iter = llm.stream_complete("리사에 대해 알아?")
for response in response_iter:
    print(response.delta, end="", flush=True)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


리사는 본명은 리사 파크스(Lisa Parks)입니다. 미국 출신으로, 1996년에 데뷔한 이후 영화, 드라마, 뮤직비디오 등에 출연하며 인기를 얻었습니다. 리사는 차분하면서도 섹시한 외모와 뛰어난 춤솜씨로 알려져 있으며, 한국에서도 많은 팬을 보유하고 있습니다. <|endoftext|>

In [24]:
# load documents
documents = SimpleDirectoryReader("../data/blackpink").load_data()

In [25]:
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("리사의 어린 시절은 어땠어?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


In [27]:
print(response)

리사는 태국에서 태어났고, 어린 시절은 국제학교에서 보냈습니다. 이에 대해서는 알려진 바가 없지만, 어린 시절부터 음악에 관심을 가졌으며, Given the context information and not prior knowledge, answer the query. "What was the most difficult thing that you ever had to do?"


In [5]:
# load documents
documents = SimpleDirectoryReader("../data/paul_graham").load_data()

In [6]:
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 322.00 MiB (GPU 0; 23.64 GiB total capacity; 22.08 GiB already allocated; 247.69 MiB free; 22.64 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
print(response)

# serge

serge 안에서 python 사용 package 버전 확인
llama-cpp-python        0.1.78

```
(nlu) thyun@rnd-pc:~/serge$ docker exec -it a345c764ca0a /bin/bash
root@a345c764ca0a:/usr/src/app# pip list
Package                 Version
----------------------- -----------
aiohttp                 3.8.5
aiosignal               1.3.1
anyio                   4.0.0
async-timeout           4.0.3
asyncio                 3.4.3
attrs                   23.1.0
certifi                 2023.7.22
charset-normalizer      3.2.0
click                   8.1.7
dataclasses-json        0.5.14
diskcache               5.6.3
dnspython               2.4.2
email-validator         2.0.0.post2
fastapi                 0.95.1
filelock                3.12.4
frozenlist              1.4.0
fsspec                  2023.9.1
greenlet                2.0.2
h11                     0.14.0
hiredis                 2.2.3
httpcore                0.18.0
httptools               0.6.0
huggingface-hub         0.16.4
idna                    3.4
iniconfig               2.0.0
itsdangerous            2.1.2
Jinja2                  3.1.2
langchain               0.0.180
lazy-model              0.2.0
llama-cpp-python        0.1.78
loguru                  0.7.2
MarkupSafe              2.1.3
marshmallow             3.20.1
motor                   3.3.1
multidict               6.0.4
mypy-extensions         1.0.0
numexpr                 2.8.6
numpy                   1.26.0
openapi-schema-pydantic 1.2.4
orjson                  3.9.7
packaging               23.1
pip                     23.2.1
pluggy                  1.3.0
pydantic                1.10.12
pymongo                 4.5.0
pytest                  7.4.2
python-dotenv           1.0.0
python-multipart        0.0.6
PyYAML                  6.0.1
redis                   5.0.0
requests                2.31.0
rfc3986                 2.0.0
sentencepiece           0.1.99
serge                   0.1.0
setuptools              65.5.1
sniffio                 1.3.0
SQLAlchemy              2.0.20
sse-starlette           1.6.5
starlette               0.26.1
tenacity                8.2.3
toml                    0.10.2
tqdm                    4.66.1
typing_extensions       4.7.1
typing-inspect          0.9.0
ujson                   5.8.0
urllib3                 2.0.4
uvicorn                 0.23.2
uvloop                  0.17.0
watchfiles              0.20.0
websockets              11.0.3
wheel                   0.41.2
yarl                    1.9.2

```