# Chroma DB

In [18]:
# 데이터셋 로드 
from datasets import load_dataset

dataset = load_dataset('sciq', split='train')    # sciq : 과학 관련 질문 
dataset = dataset.filter(lambda x: x['support'] != "")
dataset

Dataset({
    features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
    num_rows: 10481
})

In [19]:
# 필요한 모듈 임포트
import chromadb
from sentence_transformers import SentenceTransformer
import torch

# 1. 임베딩 모델 초기화 (올바른 인터페이스 사용)
class CustomEmbeddingFunction:
    def __init__(self, model_name):
        self.model = SentenceTransformer(model_name)
    
    def __call__(self, input):  # 'texts' 대신 'input' 사용
        # NumPy 변환을 건너뛰고 텐서에서 직접 리스트로 변환
        embeddings = self.model.encode(input, convert_to_numpy=False)
        return [tensor.cpu().detach().tolist() for tensor in embeddings]

# 2. 임베딩 함수 생성
embedding_function = CustomEmbeddingFunction('all-MiniLM-L6-v2')

# 3. ChromaDB 클라이언트 초기화
client = chromadb.Client()

# 4. 기존 컬렉션이 있다면 삭제
try:
    client.delete_collection("my_collection")
except:
    pass

# 5. 새 컬렉션 생성 (임베딩 함수 지정)
collection = client.create_collection(
    name="my_collection",
    embedding_function=embedding_function
)

# 6. 데이터 준비
supports = dataset['support'][:100]

# 7. 임베딩 계산 (같은 커스텀 임베딩 함수 사용)
support_embeddings = embedding_function(supports)

# 8. 데이터 추가
collection.add(
    ids=[str(i) for i in range(100)],
    embeddings=support_embeddings,
    metadatas=[{'type': 'support', 'text': text} for text in supports]
)

# 9. 쿼리 실행
query_text = ['This is a query document about vietnam']
query_embedding = embedding_function(query_text)

# 10. 임베딩을 직접 사용하여 쿼리
results = collection.query(
    query_embeddings=query_embedding,
    n_results=2
)

print(results)

{'ids': [['61', '80']], 'embeddings': None, 'documents': [[None, None]], 'uris': None, 'data': None, 'metadatas': [[{'text': 'Mariana Ruiz Villarreal (LadyofHats) for CK-12 Foundation. The nitrogen cycle tracks the flow of nitrogen through an ecosystem . CC BY-NC 3.0.', 'type': 'support'}, {'text': 'All of the changes of state that occur between solid, liquid and gas are summarized in the diagram in the figure below. Freezing is the opposite of melting and both represent the equilibrium between the solid and liquid states. Evaporation occurs when a liquid turns to a gas. Condensation is the opposite of vaporization and both represent the equilibrium between the liquid and gas states. Deposition is the opposite of sublimation and both represent the equilibrium between the solid and gas states.', 'type': 'support'}]], 'distances': [[1.6950156688690186, 1.695570707321167]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'me

### SciQ dataset 활용 ChromaDB 검색

In [20]:
from datasets import load_dataset

dataset = load_dataset('sciq', split='train')

In [21]:
# chroma db 클라이언트 객체 및 콜렉션 생성
import chromadb

client = chromadb.Client()
collection = client.create_collection(name="sciq_support")

In [22]:
# 임베딩 모델 로드
from sentence_transformers import SentenceTransformer
import torch

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

supports = dataset['support'][:100]

# NumPy 변환 과정을 건너뛰고 직접 텐서에서 리스트로 변환합니다
embeddings = embedding_model.encode(supports, convert_to_numpy=False)
support_embeddings = [tensor.cpu().detach().tolist() for tensor in embeddings]

In [23]:
len(support_embeddings)

100

In [24]:
collection.add(
    ids=[str(i) for i in range(0, 100)],
    embeddings=support_embeddings,
    metadatas=[{'type': 'support', 'text': text} for text in supports]
)

In [25]:
# 앞서 정의한 CustomEmbeddingFunction 사용
questions = dataset['question'][:3]
question_embeddings = embedding_function(questions)

results = collection.query(
    query_embeddings=question_embeddings,
    n_results=1
)

In [26]:
results

{'ids': [['38'], ['1'], ['2']],
 'embeddings': None,
 'documents': [[None], [None], [None]],
 'uris': None,
 'data': None,
 'metadatas': [[{'text': 'Agents of Decomposition The fungus-like protist saprobes are specialized to absorb nutrients from nonliving organic matter, such as dead organisms or their wastes. For instance, many types of oomycetes grow on dead animals or algae. Saprobic protists have the essential function of returning inorganic nutrients to the soil and water. This process allows for new plant growth, which in turn generates sustenance for other organisms along the food chain. Indeed, without saprobe species, such as protists, fungi, and bacteria, life would cease to exist as all organic carbon became “tied up” in dead organisms.',
    'type': 'support'}],
  [{'text': 'Without Coriolis Effect the global winds would blow north to south or south to north. But Coriolis makes them blow northeast to southwest or the reverse in the Northern Hemisphere. The winds blow north

In [27]:
for i, q in enumerate(questions):
    print('Question:', q)
    print('Support:', results['metadatas'][i][0]['text'])

Question: What type of organism is commonly used in preparation of foods such as cheese and yogurt?
Support: Agents of Decomposition The fungus-like protist saprobes are specialized to absorb nutrients from nonliving organic matter, such as dead organisms or their wastes. For instance, many types of oomycetes grow on dead animals or algae. Saprobic protists have the essential function of returning inorganic nutrients to the soil and water. This process allows for new plant growth, which in turn generates sustenance for other organisms along the food chain. Indeed, without saprobe species, such as protists, fungi, and bacteria, life would cease to exist as all organic carbon became “tied up” in dead organisms.
Question: What phenomenon makes global winds blow northeast to southwest or the reverse in the northern hemisphere and northwest to southeast or the reverse in the southern hemisphere?
Support: Without Coriolis Effect the global winds would blow north to south or south to north. B

### Chroma DB를 활용한 키워드 기반 검색

In [28]:
documents = [
    '인공지능은 인간의 작업을 자동화하는 기술이다.',
    '기계 학습은 데이터에서 패턴을 학습하여 예측하는 기술이다.',
    '벡터 데이터베이스는 유사도를 기반으로 데이터를 겁색하는 DB이다.'
    ]

In [29]:
import chromadb
from sentence_transformers import SentenceTransformer

# ChromaDB 클라이언트, 컬렉션 생성
client = chromadb.PersistentClient(path='./chroma_db')      # 해당 데이터에 경로를 지정
collection = client.get_or_create_collection(name='ai_documents')

# 텍스트 임베딩 모델 생성
model = SentenceTransformer('all-MiniLM-L6-v2')

In [30]:
# NumPy 변환을 건너뛰는 함수
def encode_to_list(text):
    # convert_to_numpy=False로 설정하여 텐서 반환
    embedding_tensor = model.encode(text, convert_to_numpy=False)
    # 텐서를 리스트로 직접 변환
    if isinstance(embedding_tensor, list):
        return [t.cpu().detach().tolist() for t in embedding_tensor]
    else:
        return embedding_tensor.cpu().detach().tolist()

# 문서 추가
for i, doc in enumerate(documents):
    embedding = encode_to_list(doc)
    collection.add(ids=[str(i)], embeddings=[embedding], metadatas=[{'text': doc}])

Add of existing embedding ID: 0
Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 0
Add of existing embedding ID: 1
Add of existing embedding ID: 2
Insert of existing embedding ID: 0
Add of existing embedding ID: 0
Insert of existing embedding ID: 1
Add of existing embedding ID: 1
Insert of existing embedding ID: 2
Add of existing embedding ID: 2


In [31]:
query_keyword = 'AI'
query_embedding = encode_to_list(query_keyword)
results = collection.query(query_embeddings=query_embedding, n_results=2)

for result in results['metadatas'][0]:
    print('검색된 문서:', result['text'])

검색된 문서: As humanity picks up the pieces, following the conclusion of "Transformers: Dark of the Moon," Autobots and Decepticons have all but vanished from the face of the planet. However, a group of powerful, ingenious businessman and scientists attempt to learn from past Transformer incursions and push the boundaries of technology beyond what they can control - all while an ancient, powerful Transformer menace sets Earth in his cross-hairs.
검색된 문서: WALL·E is the last robot left on an Earth that has been overrun with garbage and all humans have fled to outer space. For 700 years he has continued to try and clean up the mess, but has developed some rather interesting human-like qualities. When a ship arrives with a sleek new type of robot, WALL·E thinks he's finally found a friend and stows away on the ship when it leaves.


### 영화 추천 시스템


- title = 이걸로 찾기, overview = 유사도 파악(임베딩)
- vector db에서 검색 시 추천영화 나오도록 하게 함

In [32]:
# 데이터셋 로드
import pandas as pd
df = pd.read_csv('./data/tmdb_5000_movies.csv')
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [33]:
# chroma db 클라이언트 객체 및 콜렉션 생성
import chromadb
from sentence_transformers import SentenceTransformer

client = chromadb.PersistentClient()
model = SentenceTransformer('all-MiniLM-L6-v2')

In [34]:
# 임베딩 모델 로드
movies = [
    {
        'id': str(index),
        'title': row['title'],
        'overview': row['overview'] if pd.notna(row['overview']) else ""  
    } for index, row in df.iterrows()    
]

In [48]:
for movie in movies:
    if movie['overview']:
        overview_embedding = model.encode(movie['overview'], convert_to_numpy=False).tolist()
        collection.add(
            ids=[movie['id']],
            embeddings=[overview_embedding],
            metadatas=[{'title': movie['title'], 'text': movie['overview']}]    
        )

In [47]:
# 1. 제목 입력 -> 줄거리를 찾아 -> 줄거리로 유사도 검색
input_title = 'Inception'
query_text = df.loc[df['title'] == input_title, 'overview'].iloc[0]

query_embedding = model.encode(query_text, convert_to_numpy=False).tolist()

results = collection.query(query_embeddings = [query_embedding], n_results=5)

for result in results['metadatas'][0]:
    print(result['title'])
    print(result['text'])
    print()

In [None]:
# 2. 원하는 줄거리 입력 -> 유사도 검색
query_text = 'Korea'

query_embedding = model.encode(query_text, convert_to_numpy=False).tolist()

results = collection.query(query_embeddings = [query_embedding], n_results=5)

for result in results['metadatas'][0]:
    print(result['title'])
    print(result['text'])
    print()

Insert of existing embedding ID: 0
Insert of existing embedding ID: 1
Insert of existing embedding ID: 2
Add of existing embedding ID: 0
Add of existing embedding ID: 1
Add of existing embedding ID: 2


### 논문 PDF 내용 검색

In [None]:
# !pip install PyPDF2

In [54]:
import chromadb
from sentence_transformers import SentenceTransformer

client = chromadb.PersistentClient(path='./chroma_db')
# client.delete_collection('papers')    # 컬렉션 삭제
collection = client.get_or_create_collection(name='pdf_documents')

model = SentenceTransformer('all-MiniLM-L6-v2')

In [50]:
papers = [
    {'id': '1', 'title': '딥러닝', 'path': './data/deep_learning.pdf'},
    {'id': '2', 'title': '자연어처리', 'path': './data/nlp_paper.pdf'},
]

In [51]:
import PyPDF2

def extract_text_from_pdf(path):
    with open(path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        text = ' '.join([page.extract_text() for page in reader.pages if page.extract_text()])

        return text

In [55]:
for paper in papers:
    text = extract_text_from_pdf(paper['path'])
    embedding = model.encode(text).tolist()
    collection.add(
        ids=[paper['id']],    
        embeddings=[embedding],
        metadatas=[{'title': paper['title']}],
        documents=[text]
    )

In [56]:
collection.get()

{'ids': ['1', '2'],
 'embeddings': None,
 'documents': ['Deep Learning:\nMethods and Applications\nLi Deng\nMicrosoft Research\nOne Microsoft Way\nRedmond, WA 98052; USA\ndeng@microsoft.com\nDong Yu\nMicrosoft Research\nOne Microsoft Way\nRedmond, WA 98052; USA\nDong.Yu@microsoft.com\nBoston — Delft\nFull text available at: http://dx.doi.org/10.1561/2000000039 Foundations and TrendsR/circlecopyrtin Signal Processing\nPublished, sold and distributed by:\nnow Publishers Inc.\nPO Box 1024Hanover, MA 02339\nUnited States\nTel. +1-781-985-4510www.nowpublishers.com\nsales@nowpublishers.com\nOutside North America:\nnow Publishers Inc.PO Box 179\n2600 AD Delft\nThe NetherlandsTel. +31-6-51115274\nThe preferred citation for this publication is\nL. Deng and D. Yu. Deep Learning: Methods and Applications . Foundations and\nTrends\nR/circlecopyrtin Signal Processing, vol. 7, nos. 3–4, pp. 197–387, 2013.\nThis Foundations and TrendsR/circlecopyrtissue was typeset in LATEX using a class ﬁle designed

In [58]:
query_text = 'Natural Language'
query_embedding = model.encode(query_text).tolist()
results = collection.query(query_embeddings=[query_embedding], n_results=1)

results['metadatas'][0][0]['title']
results

{'ids': [['2']],
 'embeddings': None,
 'documents': [['48    IEEE COMPUTATIONAL INTELLIGENCE MAGAZINE | MAY 2014 1556-603X/14/$31.00©2014IEEENatural language processing (NLP) is a theory-motivated range of computational tech-niques for the automatic analysis and representation of human language. NLP research has evolved from the era of punch cards and batch processing (in which the analysis of a sentence could take up to 7 minutes) to the era of Google and the likes of it (in which millions of webpages can be processed in less than a second). This review paper draws on recent developments in NLP research to look at the past, pres-ent, and future of NLP technology in a new light. Borrowing the paradigm of ‘jumping curves’ from the field of  business management and marketing prediction, this survey article reinter-prets the evolution of NLP research as the intersection of three overlapping curves-namely Syntactics, Semantics, and Pragmatics Curves- which will eventually lead NLP research