# LLM(거대언어모델)

## NLP(자연어처리)

In [None]:
# 사전 설치 : pip install konlpy (konlpy의 Okt는 Java가 설치되어 있어야 작동)
# pip install JPype1==1.4.1   // Jpype1 : 파이썬 코드 내에서 Java 코드 호출 (cf. konlpy okt가 Java 엔진 사용 )
# jdk17 사전설치필요(환경변수 설정 포함)
# 실습은 한글경로(X), 반드시 영문폴더 경로 지정
from konlpy.tag import Okt    # 한국어 자연어처리 라이브러리 (형태소 분석)
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

In [None]:
# 1. Text data (input sentences): three Korean sample sentences used by the steps below.
sentences = [
    "자연어 처리는 재미있는 분야입니다.",
    "딥러닝은 많은 데이터를 필요로 합니다.",
    "한국어 NLP는 정말 재미있어요!"
]

In [None]:
# 2. Tokenizing: split every sentence into morphemes with the Okt analyzer.
okt = Okt()
tokenized_sentences = list(map(okt.morphs, sentences))
print("토크나이징 결과:", tokenized_sentences)

토크나이징 결과: [['자연어', '처리', '는', '재미있는', '분야', '입니다', '.'], ['딥', '러닝', '은', '많은', '데이터', '를', '필요', '로', '합니다', '.'], ['한국어', 'NLP', '는', '정말', '재미있어요', '!']]


In [None]:
# 3. Encoding: map every token to an integer id.
tokenizer = Tokenizer()
# Build the vocabulary from the already-tokenized sentences (lists of morphemes).
tokenizer.fit_on_texts(tokenized_sentences)
# Replace each token with its id from the fitted vocabulary.
encoded_sentences = tokenizer.texts_to_sequences(tokenized_sentences)
print("인코딩 결과:", encoded_sentences)

인코딩 결과: [[3, 4, 1, 5, 6, 7, 2], [8, 9, 10, 11, 12, 13, 14, 15, 16, 2], [17, 18, 1, 19, 20, 21]]


In [None]:
# 4. Padding: bring every sequence to the same length by filling with 0.
max_len = 10  # target sequence length
# padding='post' appends the zeros after the tokens rather than before them.
padded_sentences = pad_sequences(encoded_sentences, maxlen=max_len, padding='post')
print("패딩 결과:", padded_sentences)

패딩 결과: [[ 3  4  1  5  6  7  2  0  0  0]
 [ 8  9 10 11 12 13 14 15 16  2]
 [17 18  1 19 20 21  0  0  0  0]]


In [None]:
# 5. Embedding
# +1 because id 0 is used for padding while the word ids start at 1.
vocab_size = len(tokenizer.word_index) + 1  # vocabulary size
embedding_dim = 8  # size of each embedding vector

In [None]:
# Build a minimal model that consists of a single Embedding layer.
model = Sequential()
# `input_length` is intentionally not passed: Keras 3 deprecates the argument
# (it only triggers a warning) and the sequence length is taken from the input.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
# Compiled with placeholder optimizer/loss; the model is only used for predict().
model.compile('rmsprop', 'mse')



In [None]:
# Pass the padded sentences through the embedding layer.
# No fit() call is visible before this point, so the vectors presumably come
# from the layer's initial (untrained) weights.
embeddings = model.predict(padded_sentences)
print("임베딩 결과 (첫 번째 문장):\n", embeddings[0])

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 167ms/step
임베딩 결과 (첫 번째 문장):
 [[-0.04125472 -0.01821507 -0.01394305 -0.04672713  0.0074228   0.01054945
   0.03111986  0.00029426]
 [ 0.02926726 -0.01578426 -0.03569571  0.00534181 -0.04906088  0.03512448
  -0.03559586  0.0223146 ]
 [ 0.03061527 -0.03420999 -0.01664606  0.02818399 -0.02243457 -0.01598765
  -0.02374363  0.03878769]
 [ 0.03877256  0.00805925 -0.04046869 -0.04127825 -0.04737352 -0.01836224
   0.02997616 -0.00887823]
 [ 0.00207663  0.00361109 -0.04349653  0.03285105  0.01747863 -0.03267775
  -0.00853581  0.00315981]
 [-0.02562077 -0.01019185  0.01029575 -0.01034442  0.04485333  0.04570898
  -0.00669671 -0.01878443]
 [ 0.0153535  -0.02555364  0.01312387 -0.0088132  -0.01409877 -0.00684059
  -0.00585586  0.0477985 ]
 [ 0.0402821   0.02068522 -0.00020845 -0.04897171  0.03199425 -0.01470655
  -0.01713451  0.00952457]
 [ 0.0402821   0.02068522 -0.00020845 -0.04897171  0.03199425 -0.01470655
  -0.01713451  0.00952457]

## 트랜스포머(Transformer)

### Hugging Face를 사용한 BERT 테스트

In [None]:
# 감정분석(sentiment analysis)
### transformers 라이브러리 사전 설치 : pip install transformers
### tf-keras 라이브러리 사전 설치 : pip install tf-keras
from transformers import pipeline

In [None]:
 classifier = pipeline("sentiment-analysis")
classifier("오늘 기분이 좋아요")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.8848781585693359}]

In [None]:
# Classify two English sentences in one call; a list input yields one result per sentence.
classifier(
    ["I've been waiting for a HuggingFace course my whole life.", "I hate this so much!"]
)

[{'label': 'POSITIVE', 'score': 0.9598049521446228},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

In [None]:
# Text generation: continue the given prompt with the pipeline's default model.
generator = pipeline("text-generation")
generator("In this course, we will teach you how to")

No model was supplied, defaulted to openai-community/gpt2 and revision 607a30d (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'In this course, we will teach you how to create a customized plugin to run on Linux, Windows and OS X.\n\nLearn how to use Java 3, 4 and 5 as well as the various technologies of Java.\n\nThe course will'}]

In [None]:
# Question answering: extract the answer to `question` from the given `context`.
question_answerer = pipeline("question-answering")
question_answerer(
    question="Where do I work?",
    context="My name is Sylvain and I work at Hugging Face in Brooklyn",
)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Device set to use cpu


{'score': 0.6949766278266907, 'start': 33, 'end': 45, 'answer': 'Hugging Face'}

In [None]:
# Summarization: condense the English passage below with the pipeline's default model.
summarizer = pipeline("summarization")
summarizer(
    """
    America has changed dramatically during recent years. Not only has the number of
    graduates in traditional engineering disciplines such as mechanical, civil,
    electrical, chemical, and aeronautical engineering declined, but in most of
    the premier American universities engineering curricula now concentrate on
    and encourage largely the study of engineering science. As a result, there
    are declining offerings in engineering subjects dealing with infrastructure,
    the environment, and related issues, and greater concentration on high
    technology subjects, largely supporting increasingly complex scientific
    developments. While the latter is important, it should not be at the expense
    of more traditional engineering.

    Rapidly developing economies such as China and India, as well as other
    industrial countries in Europe and Asia, continue to encourage and advance
    the teaching of engineering. Both China and India, respectively, graduate
    six and eight times as many traditional engineers as does the United States.
    Other industrial countries at minimum maintain their output, while America
    suffers an increasingly serious decline in the number of engineering graduates
    and a lack of well-educated engineers.
"""
)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Device set to use cpu


[{'summary_text': ' The number of engineering graduates in the United States has declined in recent years . China and India graduate six and eight times as many traditional engineers as the U.S. does . Rapidly developing economies such as China continue to encourage and advance the teaching of engineering . There are declining offerings in engineering subjects dealing with infrastructure, infrastructure, the environment, and related issues .'}]

## langchain 예제

In [None]:
# 사전설치 : pip install faiss-cpu sentence_transformers langchain-community langchain-core
from langchain_community.vectorstores import FAISS   # 벡터 저장소
from langchain_core.output_parsers import StrOutputParser    # 문자열로 출력
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough    #입력데이터 전체를 다음단계로 전달
from langchain_community.embeddings import HuggingFaceEmbeddings

In [None]:
# Index four Korean sentences in a FAISS vector store, embedding them with the
# 'jhgan/ko-sroberta-multitask' Hugging Face model.
vectorstore = FAISS.from_texts(
    [
        "영준은 랭체인 주식회사에서 근무를 하였습니다.",
        "설현은 테디와 같은 회사에서 근무하였습니다.",
        "영준의 직업은 개발자입니다.",
        "설현의 직업은 디자이너입니다.",
    ],
    embedding = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask'),
)

In [None]:
# Expose the vector store as a retriever so it can be plugged into a chain.
retriever = vectorstore.as_retriever()

In [None]:
# Prompt text with two placeholders: {context} and {question}.
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

In [None]:
# Turn the template string into a chat prompt template.
prompt = ChatPromptTemplate.from_template(template)

In [None]:
from langchain_community.llms import Ollama

In [None]:
# LLM accessed through the Ollama wrapper; the commented line is an alternative model.
# Note: this re-binds `model`, which earlier in the notebook held the Keras model.
# model = Ollama(model = "llama3:8b")
model = Ollama(model = "gemma2")

In [None]:
# Retrieval chain: the input is sent to the retriever (as "context") and passed
# through unchanged (as "question"); the result is piped through the prompt,
# the LLM, and finally a parser that returns a plain string.
retrieval_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [None]:
# Ask the chain a question (Korean: "What is Seolhyun's job?").
retrieval_chain.invoke("설현의 직업은?")

In [None]:
# Ask the chain a question (Korean: "Where did Youngjun work?").
retrieval_chain.invoke("영준이 근무한 곳은?")

In [None]:
# Ask an open-ended question (Korean: "If Youngjun is a developer, recommend promising SW technologies.").
retrieval_chain.invoke("영준이 개발자라면 유망한 SW 기술 추천해줘.")

## langchain + sql 예제 (대화이력 저장)

In [None]:
from langchain_community.chat_message_histories import SQLChatMessageHistory

In [None]:
# Chat history persisted through SQLAlchemy to a MySQL database named `test`.
# '비밀번호' ("password") in the URL is a placeholder to be replaced with the real one.
# NOTE(review): newer langchain-community releases appear to prefer `connection`
# over `connection_string` — confirm against the installed version.
chat_message_history = SQLChatMessageHistory(
    session_id = "sql_chat_history",
    connection_string = "mysql+pymysql://root:비밀번호@localhost:3306/test"
)

In [None]:
# Store a first user message (a self-introduction) in the history.
chat_message_history.add_user_message(
    "안녕. 나는 영준이야. 직업은 웹프로그래머이고 만나서 반가워!"
)

In [None]:
# Store a second user message in the history.
chat_message_history.add_user_message(
    "요즘 날씨가 추운데 건강 조심하고 즐거운 하루 보내!"
)

In [None]:
# Show the messages stored for this session.
chat_message_history.messages

## langchain + ollama 예제

In [None]:
# 사전 설치 : pip install langchain, pip install langchain-community
from langchain_community.llms import Ollama

In [None]:
# Create an Ollama LLM wrapper for the "gemma2" model.
llm = Ollama(model = "gemma2")

  llm = Ollama(model = "gemma2")


In [None]:
# Send a single prompt to the model (Korean: "Hi, briefly introduce yourself.").
llm.invoke("안녕, 간단하게 소개해줘.")

## RAG 예제: ollama + gemma2를 활용

In [None]:
# 사전 설치 : pip install langchain, pip install sentence-transformers torch tf-keras faiss-cpu
# ollama 설치 : https://ollama.com/download/windows
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import Ollama

In [None]:
# 1. Load the data
loader = TextLoader("./dataset/history.txt", encoding='UTF8')  # loader for the text file
documents = loader.load()  # read the file into documents

In [None]:
# 2. Create vector embeddings (Hugging Face model) and index the documents in FAISS.
# Note: this re-binds `embeddings` and `vectorstore` from earlier cells.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(documents, embeddings)

In [None]:
# 3. Set up the retriever on top of the vector store.
retriever = vectorstore.as_retriever()

In [None]:
# 4. Initialize the Ollama Gemma2 model.
llm = Ollama(model="gemma2", base_url="http://localhost:11434")  # address of the Ollama server

In [None]:
# 5. Build the RAG chain from the LLM and the retriever.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

In [None]:
# 6. Run the question through the RAG chain.
query = "고조선은 언제 설립되었는지 알려줘."
# Chain.run() is deprecated; invoke() takes the question under RetrievalQA's
# "query" input key and returns a dict whose "result" entry is the answer text.
response = qa_chain.invoke({"query": query})["result"]
print("Response:", response)

## 이미지 시각 특성 예제 : ollama + gemma2 + llava 를 활용

In [None]:
import json
from langchain_community.chat_models import ChatOllama
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# JSON Schema describing a "Person" object: `name` (string) and `age` (integer)
# are required, `occupation` (string) is optional.
json_schema = {
    "title" : "Person",
    "description": "Identifying information about a person.",
    "type": "object",
    "properties": {
        "name" : {"title": "Name", "description": "The person's name", "type": "string"},
        "age": {"title": "Age", "description": "The person's age", "type": "integer"},
        "occupation": {
            "title": "Occupation",
            "description": "The person's Occupation",
            "type": "string",
        },
    },
    "required": ["name", "age"],
}

In [None]:
# Chat model wrapper for the "gemma2" model in Ollama.
llm = ChatOllama(model="gemma2")

In [None]:
# Messages for the schema-guided prompt.
# The middle entry must be a ("human", template) tuple: a HumanMessage instance
# is treated by ChatPromptTemplate as a finished message and passed through
# unformatted, so "{dumps}" would reach the model literally instead of being
# replaced by the JSON schema supplied at invoke time.
messages = [
    HumanMessage(
        content = "Please tell me about a person using the following JSON schema:"
    ),
    ("human", "{dumps}"),
    HumanMessage(
        content="""Now, considering the schema, please describe following person:
        Her name is Eun-Chae Lee, she is 25 years old, and she is a software engineer.
        """
    ),
]

In [None]:
# Build the chat prompt from the message list.
prompt = ChatPromptTemplate.from_messages(
    messages
)

In [None]:
dumps = json.dumps(json_schema, indent=2) # serialize the schema to a JSON string; indent=2 sets the indentation level

In [None]:
# Pipe the prompt into the chat model and parse the reply into a plain string.
chain = (
    prompt | llm | StrOutputParser()
)

In [None]:
# Run the chain, supplying the serialized schema for the "dumps" prompt variable.
chain.invoke({"dumps": dumps})

In [None]:
# 사전설치 : pip install pillow
import base64
from io import BytesIO
from IPython.display import HTML, display
from PIL import Image

In [None]:
def convert_to_base64(pil_image):
    """Encode a PIL image as a base64 string of its JPEG bytes.

    Args:
        pil_image: A PIL image. Images in a mode the JPEG writer cannot store
            (for example RGBA or palette images, common for WebP/PNG sources)
            are converted to RGB first; the caller's image is not modified.

    Returns:
        The JPEG-encoded image as a base64 text string.
    """
    # Modes the Pillow JPEG encoder accepts; anything else (RGBA, P, LA, ...)
    # would make save() raise, so convert those to RGB first.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if pil_image.mode not in jpeg_modes:
        pil_image = pil_image.convert("RGB")

    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

In [None]:
def plt_img_base64(img_base64):
    """Render a base64-encoded JPEG inline as an HTML <img> element."""
    display(HTML(f'<img src="data:image/jpeg;base64,{img_base64}" />'))

In [None]:
# Open the sample image and resize it to a fixed 256x256 size.
file_path = "./imgs/sample01.webp"
pil_image = Image.open(file_path)
pil_image = pil_image.resize((256, 256))

In [None]:
# Encode the resized image as base64 and preview it inline.
image_b64 = convert_to_base64(pil_image)
plt_img_base64(image_b64)

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.messages import HumanMessage

In [None]:
# Chat model wrapper for the "llava:7b" model, with temperature set to 0.
llm = ChatOllama(model="llava:7b", temperature=0)

In [None]:
def prompt_func(data):
    """Build a one-element message list combining an image and a text prompt.

    Args:
        data: Mapping with a "text" entry (the instruction) and an "image"
            entry (base64 image data embedded into a JPEG data URL).

    Returns:
        A list holding one HumanMessage whose content is the image part
        followed by the text part.
    """
    text, image = data["text"], data["image"]

    # The image part comes first, then the text part.
    content_parts = [
        {"type": "image_url", "image_url": f"data:image/jpeg;base64,{image}"},
        {"type": "text", "text": text},
    ]
    return [HumanMessage(content=content_parts)]

In [None]:
# Chain: build the multimodal message, send it to the model, return the reply as a string.
chain = prompt_func | llm | StrOutputParser()

In [None]:
# Ask the model to describe the encoded image; despite its name,
# `query_chain` holds the chain's output, which is displayed below.
query_chain = chain.invoke(
    {"text": "Describe a picture in bullet points.", "image": image_b64}
)
query_chain

## 기타(워드클라우드 예제)

In [None]:
# 필요한 라이브러리 임포트
# 사전 설치 : pip install wordcloud
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import os

In [None]:
# Read the whole text file as UTF-8.
with open('./dataset/history.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [None]:
# Configure the word cloud and generate it from the text.
wordcloud = WordCloud(
    font_path='malgun',  # font for Korean text (Malgun Gothic)
    background_color='white',
    width=800,
    height=600,
    max_words=200,
    max_font_size=100,
    min_font_size=10,
    random_state=42
).generate(text)

In [None]:
# Draw the word cloud image with the axes hidden.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')

## 기타(워드클라우드 gradio 예제)

In [None]:
import gradio as gr
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# Word cloud generation function
def generate_wordcloud(file_obj):
    """Build a word cloud image from an uploaded UTF-8 text file.

    Args:
        file_obj: The value received from the file input: either an object
            exposing the file path as ``.name`` or a plain path string.
            ``None`` means nothing was uploaded.

    Returns:
        The path of the saved image ('wordcloud.png'), or ``None`` when no
        file was given or any step failed (the error is printed).
    """
    # Nothing uploaded: nothing to render.
    if file_obj is None:
        return None

    try:
        # Accept both file-like wrappers (path in `.name`) and plain path strings.
        file_path = getattr(file_obj, 'name', file_obj)

        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        wordcloud = WordCloud(
            font_path='malgun',  # font for Korean text (Malgun Gothic)
            background_color='white',
            width=800,
            height=600,
            max_words=200,
            max_font_size=100,
            min_font_size=10,
            random_state=42
        ).generate(text)

        # Use one dedicated figure and always close it, even if plotting or
        # saving fails, so repeated calls do not accumulate open figures.
        # (No plt.clf() beforehand: with no current figure it would create an
        # extra one that is never closed.)
        fig = plt.figure(figsize=(10, 5))
        try:
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.tight_layout()

            output_path = 'wordcloud.png'
            plt.savefig(output_path)
        finally:
            plt.close(fig)
        return output_path

    except Exception as e:
        # UI callback boundary: report the problem and show no image instead
        # of raising into the web handler.
        print(f"Error: {str(e)}")
        return None

In [None]:
# Create the Gradio interface: a file upload in, an image (given by file path) out.
iface = gr.Interface(
    fn=generate_wordcloud,
    inputs=gr.File(label="Upload a .txt file"),
    outputs=gr.Image(type="filepath", label="Word Cloud")
)

In [None]:
# Start the app on port 7861, listening on all network interfaces, with share=True.
# NOTE(review): share=True presumably requests a public share link — confirm this exposure is intended.
iface.launch(server_port=7861, share=True, server_name="0.0.0.0")

In [None]:
# Shut the running interface down again.
iface.close()

## 기타(이미지 분석 gradio 예제)

In [None]:
import ollama
import gradio as gr
import base64
from PIL import Image
import io   # io : 메모리 버퍼에 데이터 저장

In [None]:
def analyze_image_with_gemma(image):
    """Describe an image using the Ollama 'gemma3:4b' model.

    Args:
        image: Image data accepted by ``Image.fromarray`` (the Gradio input
            in this file is configured with type="numpy").

    Returns:
        The text content of the model's reply, or an error message string if
        any step raised an exception.
    """
    try:
        # Serialize the image as PNG into an in-memory buffer.
        png_buffer = io.BytesIO()
        Image.fromarray(image).save(png_buffer, format="PNG")

        # Base64-encode the PNG bytes as text for the chat request.
        image_b64 = base64.b64encode(png_buffer.getvalue()).decode('utf-8')

        user_message = {
            'role': 'user',
            'content': '이미지의 내용을 자세히 설명해줘. 무엇이 보이는지, 색상, 구성, 감정 등을 포함해서 분석해줘.',
            'images': [image_b64],
        }
        reply = ollama.chat(model='gemma3:4b', messages=[user_message])
        return reply['message']['content']
    except Exception as err:
        return f"이미지 분석 중 오류 발생: {str(err)}"

In [None]:
# Gradio interface for image analysis: a numpy image in, the analysis text out.
iface = gr.Interface(
        fn=analyze_image_with_gemma,
        inputs=gr.Image(type="numpy", label="이미지 업로드"),
        outputs=gr.Textbox(label="이미지 분석 결과"),
        title="Ollama Gemma3 이미지 분석기",
        description="이미지를 업로드하면 Gemma 3 4B 모델이 분석해드립니다."
    )

In [None]:
# Start the image analysis app with default launch settings.
iface.launch()