In [1]:
embed_model_name = "../models/bge-small-en-v1.5-strans"

In [2]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-small-en-v1.5")

model.save(embed_model_name)

## get documents

In [3]:
# import requests

# def get_wikipedia_page_links(title):
#     S = requests.Session()

#     URL = 'https://en.wikipedia.org/w/api.php'

#     params = {
#         'action': 'query',
#         'format': 'json',
#         'prop': 'links',
#         'titles': title,
#         "pllimit": "max"
#     }

#     all_links = []

#     while True:
#         response = S.get(url=URL, params=params).json()
#         pages = response.get('query', {}).get('pages', {})
#         for page_id, page_content in pages.items():
#             links = page_content.get('links', [])
#             for link in links:
#                 all_links.append(link['title'])
#         if 'continue' in response:
#             params['plcontinue'] = response['continue']['plcontinue']
#         else:
#             break

#     return all_links

In [4]:
# links_lm = get_wikipedia_page_links("Large language model")
# len(links_lm)

In [5]:
from langchain_community.document_loaders import DirectoryLoader

loader = DirectoryLoader("../dataset/llamaindex_data", glob="*", show_progress=True)
docs = loader.load()

100%|██████████| 2/2 [00:02<00:00,  1.36s/it]


In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_size=500, 
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

documents = text_splitter.split_documents(docs)

In [7]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# 아무런 모델을 전달하지 않아도 임베딩 모델을 initialize할 수 있다.
# 다른 모델을 사용하고 싶으면 인자를 다르게 전달하여 불러올 수 있다.
embed_model = HuggingFaceEmbeddings()

In [8]:
query = "what is deep learning?"
embed_model.embed_query(query)[:10]

[-0.02370470203459263,
 -0.002275307197123766,
 -0.04430978745222092,
 -0.00528702000156045,
 -0.011042587459087372,
 0.03278699889779091,
 0.006998020224273205,
 0.0012319524539634585,
 0.0028871202375739813,
 0.004598280880600214]

In [9]:
# 허깅페이스에서 특정 임베딩 모델을 로드
embed_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

In [10]:
query = "what is deep learning?"
embed_model.embed_query(query)[:10]

[-0.007744791451841593,
 -0.02521648071706295,
 -0.024298150092363358,
 -0.005299203097820282,
 0.05355462804436684,
 0.06392054259777069,
 0.041832830756902695,
 -0.02230319008231163,
 0.08070062845945358,
 0.0017158148111775517]

In [11]:
# 로컬에서 임베딩 모델을 불러오는 경우에는 모델의 경로를 인자로 전달하면 된다.
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_name,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

In [12]:
query = "what is deep learning?"
embed_model.embed_query(query)[:10]

[-0.007744791451841593,
 -0.02521648071706295,
 -0.024298150092363358,
 -0.005299203097820282,
 0.05355462804436684,
 0.06392054259777069,
 0.041832830756902695,
 -0.02230319008231163,
 0.08070062845945358,
 0.0017158148111775517]

In [13]:
# bge model을 사용할 때는 HuggingFaceBgeEmbeddings를 사용할 수 있다.
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

embed_model = HuggingFaceBgeEmbeddings(
    model_name=embed_model_name,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

In [14]:
query = "what is deep learning?"
embed_model.embed_query(query)[:10]

[-0.0032269565854221582,
 -0.016340699046850204,
 -0.037212371826171875,
 0.012990484945476055,
 0.05193881317973137,
 0.05887174606323242,
 0.039916761219501495,
 -0.021628746762871742,
 0.07608038187026978,
 -0.01793430931866169]

#### HuggingFaceEmbedding과 결과가 다른 이유?

* `HuggingFaceEmbedding`
```python
def embed_documents(self, texts: List[str]) -> List[List[float]]:
    import sentence_transformers

    texts = list(map(lambda x: x.replace("\n", " "), texts))
    if self.multi_process:
        pool = self.client.start_multi_process_pool()
        embeddings = self.client.encode_multi_process(texts, pool)
        sentence_transformers.SentenceTransformer.stop_multi_process_pool(pool)
    else:
        embeddings = self.client.encode(
            texts, show_progress_bar=self.show_progress, **self.encode_kwargs
        )

    return embeddings.tolist()

def embed_query(self, text: str) -> List[float]:

    return self.embed_documents([text])[0]
```

* `HuggingFaceBgeEmbeddings`
```python
def embed_query(self, text: str) -> List[float]:
    text = text.replace("\n", " ")
    embedding = self.client.encode(
        self.query_instruction + text, **self.encode_kwargs
    )
    return embedding.tolist()

```

- `HuggingFaceBgeEmbeddings`의 경우에는 임베딩을 진행할 때 query_intruction과 query를 결합하는 과정을 거친다.
- `HuggingFaceEmbeddings`는 이러한 query_instruction을 결합하는 과정이 들어있지 않기 때문에 두 임베딩 결과의 차이가 존재하는 것이다.
- query_intruction은 "Represent this quetion for searching relevant passages:"이며 이를 query 앞에 붙인다면 `HuggingFaceEmbeddings 또한 동일한 임베딩 결과가 나올 것이다.

In [15]:
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_name,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

In [16]:
query = "Represent this question for searching relevant passages: What is deep learning?"
embed_model.embed_query(query)[:10]

[-0.0032269565854221582,
 -0.016340699046850204,
 -0.037212371826171875,
 0.012990484945476055,
 0.05193881317973137,
 0.05887174606323242,
 0.039916761219501495,
 -0.021628746762871742,
 0.07608038187026978,
 -0.01793430931866169]

In [17]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

embed_model = HuggingFaceBgeEmbeddings(
    model_name=embed_model_name,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

In [18]:
query = "What is deep learning?"
embed_model.embed_query(query)[:10]

[-0.0032269565854221582,
 -0.016340699046850204,
 -0.037212371826171875,
 0.012990484945476055,
 0.05193881317973137,
 0.05887174606323242,
 0.039916761219501495,
 -0.021628746762871742,
 0.07608038187026978,
 -0.01793430931866169]

# Retrieval

In [24]:
from langchain_community.vectorstores import FAISS

vector_index = FAISS.from_documents(documents, embed_model)

In [25]:
retriever = vector_index.as_retriever(serach_type="mmr")

In [26]:
retrieved = retriever.get_relevant_documents("Waht country is Mistral AI based in?")

In [27]:
retrieved[0].page_content

'24. 4. 10. 오후 3:22\n\nMistral AI - Wikipedia\n\nMistral AI\n\nMistral AI is a French company selling artificial intelligence (AI) products. It was founded in April 2023 by previous employees of Meta Platforms and Google DeepMind.[1] The company raised €385 million in October 2023[2] and in December 2023 it was valued at more than $2 billion.[3][4][5]\n\nMistral AI'

# Local LLM

In [28]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("Writer/camel-5b-hf")
model = AutoModel.from_pretrained("Writer/camel-5b-hf")

tokenizer.save_pretrained("../models/camel-5b-hf")
model.save_pretrained("../models/camel-5b-hf")

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model-00001-of-00003.bin:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

pytorch_model-00002-of-00003.bin:   0%|          | 0.00/9.99G [00:00<?, ?B/s]

pytorch_model-00003-of-00003.bin:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [29]:
model_name = "../models/camel-5b-hf"

In [30]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained("../models/camel-5b-hf")
model = AutoModelForCausalLM.from_pretrained("../models/camel-5b-hf")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [31]:
pipe("Waht is OpenAI Sora?")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Waht is OpenAI Sora?\nOpenAI Sora is a deep learning platform developed by OpenAI, a leading AI company. It is designed to enable developers to create and train AI models using a wide range of data sources, including text, images, and videos. The platform offers a wide range of AI models, such as sentiment analysis, image recognition, and natural language processing, which can be used for various applications, such as customer reviews, product recommendations, and video summarization.'}]

In [35]:
# 허깅페이스의 pipeline을 그대로 chain에 사용하면 랭체인의 문법과 맞지 않아 오류가 발생한다.
from langchain.prompts import PromptTemplate

template = """Quation: {question}
Answer: """

prompt = PromptTemplate(template=template, input_variables=["question"])

chain = prompt | pipe

In [37]:
chain.invoke({"question" : "What is OpenAI Sora?"})

TypeError: can only concatenate str (not "StringPromptValue") to str

In [38]:
# 랭체인에서 제공하는 파이프라인으로 한번 감싸줘야 오류가 발생하지 않는다.
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

hf = HuggingFacePipeline(pipeline=pipe)

chain = prompt | hf

In [39]:
chain.invoke({"question" : "What is OpenAI Sora?"})

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


' OpenAI Sora is a language model developed by OpenAI, a company founded by Elon Musk. It is a state-of-the-art language model that can understand, interpret, and generate human-like text, making it suitable for various applications such as chatbots, virtual assistants, and text summarization.'

In [41]:
# 파이프라인을 먼저 만들지 않고 바로 HuggingFacePipeline을 통해서 파이프라인을 불러올 수 있다.
hf = HuggingFacePipeline.from_model_id(
    model_id=model_name,
    task='text-generation',
    pipeline_kwargs={'max_new_tokens': 512}
)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Device has 1 GPUs available. Provide device={deviceId} to `from_model_id` to use availableGPUs for execution. deviceId is -1 (default) for CPU and can be a positive integer associated with CUDA device id.


In [42]:
chain = prompt | hf

In [44]:
chain.invoke({"question" : "What is OpenAI Sora?"})

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


' OpenAI Sora is a language model developed by OpenAI, a company founded by Elon Musk. It is a state-of-the-art language model that can understand, interpret, and generate human-like text, making it suitable for various applications such as chatbots, virtual assistants, and text summarization.'

# Putting it all together

In [1]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
def merge_docs(retrieved_docs):
    return "\n\n".join([d.page_content for d in retrieved_docs])

loader = DirectoryLoader("../dataset/llamaindex_data", glob="*", show_progress=True)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_size=500,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False
)

documents = text_splitter.split_documents(docs)

100%|██████████| 2/2 [00:03<00:00,  1.73s/it]


In [3]:
embed_model_name = "../models/bge-small-en-v1.5-strans"
model_name = "../models/camel-5b-hf"

In [4]:
embed_model = HuggingFaceBgeEmbeddings(
    model_name=embed_model_name,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True}
)

In [5]:
vector_index = FAISS.from_documents(documents, embed_model)
retriever = vector_index.as_retriever(search_type="mmr")

In [6]:
hf = HuggingFacePipeline.from_model_id(
    model_id=model_name,
    task="text-generation",
    pipeline_kwargs={"max_new_tokens": 512}
)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Device has 1 GPUs available. Provide device={deviceId} to `from_model_id` to use availableGPUs for execution. deviceId is -1 (default) for CPU and can be a positive integer associated with CUDA device id.


In [7]:
template = """
Utilizing the context given below, answer the question
[context]
{context}

question: {query}
"""

prompt = ChatPromptTemplate.from_template(template)

In [8]:
chain = RunnableParallel({"context": retriever|merge_docs, "query": RunnablePassthrough()}) \
        | prompt \
        | hf \
        | StrOutputParser()

In [9]:
chain.invoke("What is OpenAI sora")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"Answer: OpenAI's Sora is a text-to-video model that can generate videos based on short descriptive prompts as well as extend existing videos forwards or backwards in time. It can generate videos with resolution up to 1920x1080 or 1080x1920. The maximal length of generated videos is unknown. Sora's technology is an adaptation of the technology behind the DALL·E 3 text-to-image model. OpenAI trained by adapting the DALL·E 3 technology to create a text-to-video model named Sora."

# vector index save & load

In [10]:
vector_index.save_local("../models/faiss_local.json")

In [11]:
vector_index = FAISS.load_local("../models/faiss_local.json", embeddings=embed_model, allow_dangerous_deserialization=True)

In [14]:
retriever = vector_index.as_retriever(search_type="mmr")
retrieved = retriever.get_relevant_documents("What county is Mistral AI based in?")
retrieved[0].page_content

"History\n\nScientist)\n\nMistral AI was co-founded in April 2023 by Arthur Mensch, Guillaume Lample and Timothée Lacroix. Prior to co-founding Mistral AI, Arthur Mensch worked at Google DeepMind which is Google's artificial intelligence laboratory, while Guillaume Lample and Timothée Lacroix worked at Meta Platforms.[11] The co-founders met while students at École polytechnique. Mistral is named for a strong wind that blows in France.[12]\n\nTimothée Lacroix (Co-Founder & CTO)"