# 1. 라이브러리 선언

In [1]:
# Probe for the Colab runtime: a successful import of `google.colab` is
# treated as "running in Colab"; an ImportError means we are somewhere else.
try:
    import google.colab  # noqa: F401  (imported only as an availability probe)
except ImportError:
    inColab = False
else:
    inColab = True

In [2]:
# Hugging Face에서 개발한 도구로, 딥러닝 모델의 훈련과 추론 속도를 높이기 위한 최적화
# PEFT( Parameter-Efficient Fine-Tuning) 딥러닝 모델의 파라미터를 효율적으로 미세 조정하는 방법 제공
# bitsandbytes 딥러닝 모델을 더 작은 메모리 풋프린트로 실행할 수 있게 해주는 라이브러리
# transformers 사전 학습된 모델들을 제공하는 도구, 대형 모델을 쉽게 사용할 수 있게 해줌
# trl(Transformers Reinforcement Learning) 사전 학습된 트랜스포머 모델에 강화 학습 적용하여 특정 작업에 맞게 모델을 더 정밀하게 조정하는 데 사용
# dataset 다양한 데이터셋을 쉽게 로드, 처리, 변환, 분석할 수 있는 라이브러리
# -U 옵션: 지정된 패키지가 이미 설치되어 있는 경우에도 최신 버전으로 업그레이드
if inColab == True:
    !pip install -U pandas==2.2.2 numpy==2.0.2 scipy==1.14.1 accelerate==1.6.0 peft==0.15.2 bitsandbytes==0.45.5 transformers==4.51.3 trl==0.16.1 datasets==3.5.0 tensorboard==2.19.0

Collecting scipy==1.14.1
  Downloading scipy-1.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes==0.45.5
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl==0.16.1
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting datasets==3.5.0
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting tensorboard==2.19.0
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets==3.5.0)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets==3.5.0)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets==3.5.0)
  Downloading multiprocess-0.70.16-py311-none-any.w

In [3]:
!pip install uvicorn fastapi
!pip install nest-asyncio pyngrok

Collecting uvicorn
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Downloading uvicorn-0.34.2-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fastapi-0.115.12-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading starlette-0.46.2-py3-none-any.whl (72 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uvicorn, starlette, fastapi
Successfully installed fastapi-0.115.12 starlette-0.46.2 uvicorn-0.34.2
Collecting pyngrok
  Downloading pyngrok-7.2.8-py3-n

In [4]:
# 서버 관리용 fastapi 의존 라이브러리
import uvicorn

# fast api 라이브러리
from fastapi import FastAPI

# 머신러닝 모델 관리용 라이브러리 - 지금은 머신러닝 안 해서 안 필요함
# import pickle

# 데이터프레임 및 수 처리 라이브러리
import pandas as pd
import numpy as np

# 인터페이스 데이터 관리를 위한 라이브러리
from pydantic import BaseModel

# ngrok
import nest_asyncio
from pyngrok import ngrok
import uvicorn

# LLM
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging
)
from transformers import AutoConfig,AutoModel
import torch
from peft import PeftModel, PeftConfig

In [5]:
# CORS support, so that browser front-ends served from another origin can
# call this API.
from fastapi.middleware.cors import CORSMiddleware

# Wildcard entry: every origin is accepted.
origins = ["*"]

app = FastAPI(title="TOTAL_SUM API")

# Register the CORS middleware on the application.
# NOTE(review): a wildcard origin combined with allow_credentials=True is
# discouraged by the FastAPI CORS documentation - confirm whether credentialed
# cross-origin requests are really needed, or list the origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["GET", "POST", "PUT", "DELETE"],
    allow_headers=["*"],
)

# 2. 모델 불러오기

In [6]:
## Base model identifier on the Hugging Face Hub
base_model = "limjh12/beomi_law"

### Load the base model weights in half precision
baseModel = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    device_map="auto",  # setting used with a T4 GPU
    # device_map={"": 0},  # alternative for an L4-class or larger GPU
    low_cpu_mem_usage=True,
    return_dict=True,
)

### Load the tokenizer from the same repository
# NOTE(review): trust_remote_code=True permits code shipped with the model
# repository to run locally - confirm the repository is trusted.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Pad on the right and reuse the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/335 [00:00<?, ?B/s]

# 3. 인터페이스 데이터 정의

In [7]:
class InDataset(BaseModel):
    # Request payload model: the POST /chat handler (`predict_tf`) receives one
    # instance of this class as its body.
    # question: the user's question text; the handler forwards it to the model
    # as the instruction.
    question : str

# 4. 예측용 함수 정의

In [8]:
## LLM endpoint
import asyncio

# Allows only one generation at a time (requests were already handled one by
# one before, because the blocking call occupied the event loop), while the
# loop itself stays free to serve other routes.
_generation_lock = asyncio.Lock()


@app.post("/chat", status_code=200)
async def predict_tf(x: InDataset):
    """Answer a user question with the language model.

    Args:
        x: Request body carrying the user's ``question``.

    Returns:
        dict: ``{"result": <generated answer text>}``.
    """
    print(x)
    question = x.question

    # `generate_response` is a blocking call; awaiting it in a worker thread
    # keeps the event loop responsive (e.g. for the `/` route) while the model
    # is generating.
    # `system_message` and `input` are module-level values defined in another
    # cell; `input` there is a string that shadows the builtin of the same name.
    async with _generation_lock:
        response = await asyncio.to_thread(
            generate_response,
            system_message,
            instruction=question,
            input_text=input,
        )
    print(response)

    return {"result": response}

@app.get('/')
async def root():
    # Minimal status route: always reports that the service is online.
    status = {"message": "online"}
    return status

In [9]:
def generate_response(system_message: str, instruction: str, input_text: str = "", max_new_tokens=512):
    """Generate the model's reply to a single-turn chat prompt.

    Args:
        system_message: System prompt placed in the ``system`` role.
        instruction: User message placed in the ``user`` role.
        input_text: Optional extra context; when non-empty it is appended to
            the system prompt on a new line.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        str: The decoded reply with surrounding whitespace removed. Only the
        newly generated tokens are decoded, so the prompt is never part of
        the result.
    """
    if input_text:
        system_message = f"{system_message}\n{input_text}"

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": instruction},
    ]

    # Render the conversation with the tokenizer's chat template, ending with
    # the generation prompt so the model continues as the assistant.
    prompt_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = tokenizer(prompt_text, return_tensors="pt").to(baseModel.device)

    outputs = baseModel.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.1,
        top_p=0.95,
        eos_token_id=tokenizer.eos_token_id
    )

    # The returned sequence starts with the prompt tokens. Slice them off by
    # length instead of splitting the decoded text on the word "assistant":
    # that split cut off any answer that itself contained the word.
    prompt_length = inputs["input_ids"].shape[-1]
    completion_ids = outputs[0][prompt_length:]

    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

In [10]:
# System prompt: always answer the user's question kindly and accurately, and
# finish the answer with '감사합니다'.
system_message = "항상 사용자 질문에 친절하고 정확하게 답변하고, 답변 마지막에 '감사합니다'라고 말해야 해."

# Extra context that `generate_response` appends to the system prompt when it
# is non-empty; left empty here.
# NOTE(review): this module-level name shadows the builtin `input`, and the
# /chat handler reads it under this name, so it cannot be renamed here alone.
input = ""

# Quick manual check of the generation pipeline with a sample question.
print(generate_response(system_message, "언제 임금피크제가 시행됐어?", input))

2003년


# 5. 서버오픈 (colab & ngrok 용)

In [11]:
import os
from getpass import getpass

# Single source of truth for the port shared by the tunnel and the server.
SERVER_PORT = 9999

# SECURITY: the ngrok auth token must never be stored in the notebook.
# It is read from the NGROK_AUTH_TOKEN environment variable, falling back to
# an interactive prompt whose input is not echoed. The token that used to be
# hardcoded in this cell should be treated as leaked and revoked.
auth_token = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(auth_token)

# Open a public tunnel to the local server port and show its URL.
ngrokTunnel = ngrok.connect(SERVER_PORT)
print("공용 URL", ngrokTunnel.public_url)

# Applied before uvicorn.run so the server can be started from this cell.
nest_asyncio.apply()
try:
    uvicorn.run(app, port=SERVER_PORT)
finally:
    # Close the tunnel once the server stops so that re-running this cell
    # does not leave stale tunnels open.
    ngrok.disconnect(ngrokTunnel.public_url)



INFO:     Started server process [231]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:9999 (Press CTRL+C to quit)


공용 URL https://422f-34-126-140-206.ngrok-free.app
INFO:     58.232.163.125:0 - "OPTIONS /chat HTTP/1.1" 200 OK
question='임금피크제는 언제 시행됐어?'
2003년
INFO:     58.232.163.125:0 - "POST /chat HTTP/1.1" 200 OK
question='공무원 연금법상 연금을 받을 수 있는 연령은?'
65세 이상
INFO:     58.232.163.125:0 - "POST /chat HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [231]
