<h1>Prepare Data</h1>

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma

In [None]:


# Text splitter: target chunk size of 600 (default length function), no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=0)

# One loader per keyword file; .load() returns a List[Document].
loader1 = TextLoader("data/nlp-keywords.txt")
loader2 = TextLoader("data/finance-keywords.txt")

# Load, then split explicitly. BaseLoader.load_and_split() performs the same
# two steps, but LangChain documents it as "should be considered to be
# deprecated", so the splitter is called directly here.
split_doc1 = text_splitter.split_documents(loader1.load())
split_doc2 = text_splitter.split_documents(loader2.load())

# Display the number of chunks produced from each file.
len(split_doc1), len(split_doc2)

<h1>Generate Vector Store (Chroma)</h1>


In [None]:

# Build a Chroma collection named "my_db" from the first set of chunks,
# embedding them with OpenAI embeddings. No persist directory is given here.
db = Chroma.from_documents(
    collection_name="my_db",
    embedding=OpenAIEmbeddings(),
    documents=split_doc1,
)


In [None]:
# Folder on disk where the Chroma collection is stored.
DB_PATH = "./chroma_db"

# Same documents and collection name as before, but with a persist directory.
persist_db = Chroma.from_documents(
    documents=split_doc1,
    embedding=OpenAIEmbeddings(),
    collection_name="my_db",
    persist_directory=DB_PATH,
)

In [None]:
# Check data: inspect what is stored in the persisted collection.
persist_db.get()


# Keys described for the returned data:
# documents: The raw text or content of the documents you added
# embeddings: The vector embeddings of the documents
#   NOTE(review): may come back empty unless explicitly requested from get() — confirm
# metadatas: Any associated metadata for the documents
# ids: Unique identifiers for each document in the database

<h1>Vector Store Retriever</h1>

In [None]:
# Rebuild `db`, this time from the chunks of both files, in a collection
# named "nlp".
db = Chroma.from_documents(
    collection_name="nlp",
    embedding=OpenAIEmbeddings(),
    documents=[*split_doc1, *split_doc2],
)

In [None]:
# Maximal Marginal Relevance (MMR) search.
#   k           - number of results to return
#   fetch_k     - number of results fetched before MMR is applied
#   lambda_mult - controls the trade-off between relevance and diversity
retriever = db.as_retriever(
    search_type="mmr",
    search_kwargs=dict(k=6, lambda_mult=0.25, fetch_k=10),
)
retriever.invoke("Word2Vec 에 대하여 알려줘")

In [None]:
# Similarity-score-threshold search: retrieval is limited by a score
# threshold of 0.8.
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=dict(score_threshold=0.8),
)
retriever.invoke("Word2Vec 에 대하여 알려줘")

In [None]:
# Filtered retrieval: pass a metadata filter so only documents whose
# "source" is "data/finance-keywords.txt" are considered, and ask for 2 results.
retriever = db.as_retriever(
    search_kwargs=dict(
        filter={"source": "data/finance-keywords.txt"},
        k=2,
    )
)
retriever.invoke("ESG 에 대하여 알려줘")

<h1>Multimodal</h1>

In [None]:
# Load the COCO dataset with the Hugging Face `datasets` library, save the
# first N_IMAGES images to IMAGE_FOLDER, and display them with their labels
# in a grid.

import os
from datasets import load_dataset
from matplotlib import pyplot as plt

# Load the COCO training split in streaming mode (streaming=True).
dataset = load_dataset(
    path="detection-datasets/coco", name="default", split="train", streaming=True
)

# Folder the images are written to, and how many images to take.
IMAGE_FOLDER = "tmp"
N_IMAGES = 20

# Grid layout for the preview figure.
plot_cols = 5
# Ceiling division so there are always at least N_IMAGES axes, even when
# N_IMAGES is not a multiple of plot_cols.
plot_rows = -(-N_IMAGES // plot_cols)
# figsize is (width, height): width scales with columns, height with rows.
# squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, so flatten() works.
fig, axes = plt.subplots(
    plot_rows, plot_cols, figsize=(plot_cols * 2, plot_rows * 2), squeeze=False
)
axes = axes.flatten()

# Save each image to the folder and draw it on the grid.
dataset_iter = iter(dataset)
os.makedirs(IMAGE_FOLDER, exist_ok=True)
n_shown = 0
# zip() with the range first stops after N_IMAGES items, or earlier if the
# dataset runs out, without raising StopIteration.
for i, data in zip(range(N_IMAGES), dataset_iter):
    # Extract the image and use the first object's category as the label.
    image = data["image"]
    categories = data["objects"]["category"]
    # Guard against samples with no annotated objects.
    label = categories[0] if categories else "n/a"

    # Draw the image with its label as the title.
    axes[i].imshow(image)
    axes[i].set_title(label, fontsize=8)
    axes[i].axis("off")

    # Save the image as <index>.jpg.
    image.save(os.path.join(IMAGE_FOLDER, f"{i}.jpg"))
    n_shown = i + 1

# Hide any grid cells that did not receive an image.
for ax in axes[n_shown:]:
    ax.axis("off")

# Tidy the layout and show the figure.
plt.tight_layout()
plt.show()

In [None]:
!pip install langchain open-clip-torch
import open_clip
import pandas as pd
from langchain_experimental.open_clip import OpenCLIPEmbeddings
from langchain_teddynote.models import MultiModal
from langchain_openai import ChatOpenAI

In [None]:

# First 10 (model_name, checkpoint) pairs reported by open_clip.list_pretrained().
(
    pd.DataFrame(
        data=open_clip.list_pretrained(),
        columns=["model_name", "checkpoint"],
    )
    .head(n=10)
)


In [None]:
# OpenCLIP image embedding function, built from an explicit
# model name / checkpoint pair.
image_embedding_function = OpenCLIPEmbeddings(
    model_name="ViT-H-14-378-quickgelu", checkpoint="dfn5b"
)

In [None]:
# Paths of the saved .jpg images, in sorted (lexicographic) order.
# IMAGE_FOLDER is used instead of a repeated "tmp" literal so this listing
# always reads from the same folder the images were written to.
image_uris = sorted(
    os.path.join(IMAGE_FOLDER, image_name)
    for image_name in os.listdir(IMAGE_FOLDER)
    if image_name.endswith(".jpg")
)


In [None]:

# Chat model that backs the multimodal wrapper.
llm = ChatOpenAI(model="gpt-4o-mini")

# Wrap the chat model together with the system and user prompts that are
# used when describing an image.
model = MultiModal(
    user_prompt="Description should be written in one sentence(less than 60 characters)",
    system_prompt="Your mission is to describe the image in detail",
    model=llm,
)

In [None]:
# Generate an image description for the first image with the model.
model.invoke(image_uris[0])

# Invoke the model once per image URI (with display_image=False) and keep
# the results keyed by URI.
descriptions = {
    uri: model.invoke(uri, display_image=False) for uri in image_uris
}