In [1]:
import os
import markdown
import torch
from bs4 import BeautifulSoup
from huggingface_hub import login
from sentence_transformers import SentenceTransformer

from langchain import FAISS
from langchain.text_splitter import SpacyTextSplitter 
from langchain.document_loaders import PyMuPDFLoader, DirectoryLoader, TextLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.chat_models.base import BaseChatModel
from langchain.schema import HumanMessage, AIMessage, SystemMessage

from langchain.docstore.in_memory import InMemoryDocstore
from langchain.docstore.document import Document
import faiss
import numpy as np

In [2]:
# Read the Hugging Face access token from a local key file and log in.
# NOTE(review): the path is machine-specific; adjust it when running elsewhere.
key_path = '/Users/jaesolshin/key/HF_TOKEN.txt'

# Use a context manager so the file handle is closed, and strip surrounding
# whitespace: key files usually end with a newline, which must not become
# part of the token value.
with open(key_path, 'r', encoding='utf-8') as token_file:
    os.environ["HF_TOKEN"] = token_file.read().strip()

login(os.environ["HF_TOKEN"])

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /Users/jaesolshin/.cache/huggingface/token
Login successful


In [3]:
# Pick the Apple-silicon GPU backend (MPS) when it is usable, otherwise the CPU.
# `is_available` must be *called*: the bare function object is always truthy,
# which would select "mps" even on machines without MPS support.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [4]:
from langchain import document_loaders

In [5]:
# Load the source documents.
#
# Every Markdown file matching the glob in the "readmes" folder is read as
# plain text with TextLoader (no Markdown parsing is applied).
loader = DirectoryLoader(
    "readmes",  # folder to scan
    glob="*.md",  # only load files of this type (Markdown)
    loader_cls=TextLoader,  # loader class used for each matched file
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding; the READMEs contain non-ASCII characters.
    loader_kwargs={"encoding": "utf-8"},
)

documents = loader.load()

In [6]:
# Print the text content of every loaded document, one after another.
for page_text in (entry.page_content for entry in documents):
    print(page_text)

<!---
Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<p align="center">
  <picture>
    <source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
    <source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
    <img alt="Hugging Face Transformers Library" src=

In [7]:
# Prerequisites (run once in the kernel's environment):
#   pip install spacy
#   spacy download ko_core_news_sm
from langchain.text_splitter import SpacyTextSplitter  # spaCy-backed text splitter

# Split the loaded documents using the Korean spaCy pipeline with a target
# chunk size of 300. The splitter may still emit longer chunks (it warns
# when it does).
text_splitter = SpacyTextSplitter(pipeline='ko_core_news_sm', chunk_size=300)
splitted_documents = text_splitter.split_documents(documents)

# Report the document count before and after splitting.
print(
    f'분할 전 문서 개수: {len(documents)}',
    f'분할 후 문서 개수: {len(splitted_documents)}',
    sep='\n',
)

Created a chunk of size 3544, which is longer than the specified 300
Created a chunk of size 467, which is longer than the specified 300
Created a chunk of size 5045, which is longer than the specified 300
Created a chunk of size 724, which is longer than the specified 300
Created a chunk of size 312, which is longer than the specified 300
Created a chunk of size 1191, which is longer than the specified 300
Created a chunk of size 661, which is longer than the specified 300
Created a chunk of size 363, which is longer than the specified 300
Created a chunk of size 366, which is longer than the specified 300
Created a chunk of size 596, which is longer than the specified 300
Created a chunk of size 374, which is longer than the specified 300
Created a chunk of size 378, which is longer than the specified 300
Created a chunk of size 365, which is longer than the specified 300
Created a chunk of size 329, which is longer than the specified 300
Created a chunk of size 315, which is longer 

분할 전 문서 개수: 3
분할 후 문서 개수: 447


In [8]:
# Load the sentence-embedding model.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Extract the raw text of each chunk for embedding.
texts = [doc.page_content for doc in splitted_documents]
if not texts:
    # Fail with a clear message instead of an IndexError further down.
    raise ValueError("No document chunks to index; check the loader and splitter output.")

# Embed every chunk. `embeddings` stays a plain list of vectors; the matrix
# handed to faiss is converted to float32 explicitly, which is the dtype
# faiss indexes operate on.
embeddings = embedding_model.embed_documents(texts)
embedding_matrix = np.asarray(embeddings, dtype=np.float32)

# Build a flat (exact) L2-distance index sized to the embedding dimension.
d = embedding_matrix.shape[1]  # embedding dimension
index = faiss.IndexFlatL2(d)
index.add(embedding_matrix)

# Connect the raw index to LangChain's FAISS wrapper. The split Document
# objects are stored as-is (rather than re-wrapped from their text) so any
# metadata they carry is kept on search results. Row i of the index maps to
# docstore id i.
docstore = InMemoryDocstore(dict(enumerate(splitted_documents)))
faiss_index = FAISS(embedding_function=embedding_model, index=index, docstore=docstore, index_to_docstore_id={i: i for i in range(len(texts))})

# The FAISS index is ready.
print("Documents added to the FAISS index.")

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Documents added to the FAISS index.


In [9]:
# NOTE(review): these two aliases are not used in this cell; they are kept in
# case other cells rely on them. Be aware that `embeddings` is rebound here
# from the list of vectors to the embedding model.
embeddings = embedding_model

database = faiss_index

query = "who are the authors of transformer?"

# Retrieve the 5 chunks most similar to the query.
results = faiss_index.similarity_search(query, k=5)

# Accumulate the retrieved chunks into a single context string.
context = "Relevant documents:\n"

for i, result in enumerate(results):
    context += f"""
-----------------------------------
Doc {i+1}:
{result.page_content}
"""

# Keep the template under its own name so `prompt` is only ever the final
# string; this makes the cell safe to re-run. `input_variables` must match the
# placeholders used in the template ({document} and {query}).
prompt_template = PromptTemplate(
    template="""Answer the question based on the given information.

Information:
{document}

Question:{query} 
""",
    input_variables=['document', 'query']
)

prompt = prompt_template.format(document=context, query=query)
print(prompt)

Answer the question based on the given information.

Information:
Relevant documents:

-----------------------------------
Doc 1:
1.

**[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1.

-----------------------------------
Doc 2:
1.

**[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.


1.

-----------------------------------
Doc 3:
**[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.


1.

-----------------------------------
Doc 4:
1.

**

## Language Model 호출

In [10]:
#!pip install --upgrade -q langchain langchain-google-vertexai

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers

# Instruction-tuned Gemma 2B checkpoint, loaded in bfloat16.
model_id = "google/gemma-2b-it"
dtype = torch.bfloat16

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # Place the model on the device selected earlier in the notebook instead
    # of hard-coding "mps", so the cell also runs on machines without MPS.
    device_map=device,
    torch_dtype=dtype,
)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# Tokenize the prompt and generate up to 150 new tokens on the model's device.
inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
outputs = model.generate(input_ids=inputs.to(model.device), max_new_tokens=150)

# Decode only the newly generated tokens (everything after the prompt), so
# text inside the prompt or retrieved context can never be mistaken for the
# answer.
generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:])

# The prompt does not end with an "Answer:" cue, so the model may or may not
# emit one. Print what follows the marker when present, otherwise the whole
# generation, instead of raising IndexError on a missing marker.
_, marker, after_marker = generated_text.partition("Answer:")
print(after_marker if marker else generated_text)

The authors of transformer are Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya, Iz Beltagy, Matthew E. Peters, Arman Cohan, Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.<eos>
