# Chroma DB

In [1]:
import chromadb

# Default Chroma client: no storage path is given here, unlike the
# PersistentClient(path=...) used in later sections of this notebook.
chroma_client = chromadb.Client()

In [2]:
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when a collection with the same name already exists.
collection = chroma_client.get_or_create_collection(name="my_collection")

In [3]:
# Three toy documents; no embeddings are passed, so Chroma embeds the text itself.
sample_documents = [
    'This is a document about pineapple',
    'This is a document about mango',
    'This is a document about strawberry',
]
sample_ids = ['id1', 'id2', 'id3']

collection.add(documents=sample_documents, ids=sample_ids)

C:\Users\USER\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:08<00:00, 9.35MiB/s]  


In [8]:
# Ask for the three stored documents closest to a single query text.
sample_query_texts = ['This is a query document about vietnam']
results = collection.query(query_texts=sample_query_texts, n_results=3)

In [9]:
# Echo the raw query response (ids, documents, metadatas, distances, ...).
results

{'ids': [['id1', 'id2', 'id3']],
 'embeddings': None,
 'documents': [['This is a document about pineapple',
   'This is a document about mango',
   'This is a document about strawberry']],
 'uris': None,
 'data': None,
 'metadatas': [[None, None, None]],
 'distances': [[1.2225853204727173, 1.2783520221710205, 1.3223111629486084]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

### SciQ dataset 활용 ChromaDB 검색

In [11]:
# Load the SciQ training split and keep only rows with a non-empty support passage
from datasets import load_dataset

sciq_train = load_dataset("sciq", split="train")
dataset = sciq_train.filter(lambda row: row["support"] != "")

dataset

Filter: 100%|██████████| 11679/11679 [00:00<00:00, 103824.38 examples/s]


Dataset({
    features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
    num_rows: 10481
})

In [None]:
# Create the Chroma client and the collection for the SciQ support passages
import chromadb

client = chromadb.Client()
# get_or_create_collection (the call the later sections use) lets this cell be
# re-run; create_collection raises once "sciq_support" already exists.
collection = client.get_or_create_collection(name="sciq_support")

In [13]:
# Load the embedding model (its vectors have 384 dimensions, as checked below)
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [14]:
# Embed the first 100 support passages; .tolist() turns the encoder output
# into plain nested Python lists.
supports = dataset["support"][:100]
support_vectors = embedding_model.encode(supports)
support_embeddings = support_vectors.tolist()

In [16]:
# Length of a single embedding vector
len(support_embeddings[0])

384

In [18]:
# Store the precomputed support embeddings. The ids are derived from
# len(supports) so they always line up with the embeddings and metadatas,
# instead of relying on a hard-coded 100 that must match the slice size.
collection.add(
    ids=[str(i) for i in range(len(supports))],
    embeddings=support_embeddings,
    metadatas=[{"type": "support", "text": text} for text in supports],
)

In [19]:
# Embed the first three questions and fetch the single closest stored entry for each.
questions = dataset["question"][:3]
question_vectors = embedding_model.encode(questions)
question_embeddings = question_vectors.tolist()

results = collection.query(query_embeddings=question_embeddings, n_results=1)

In [20]:
# Echo the raw response: one result list per query embedding.
results

{'ids': [['36'], ['1'], ['2']],
 'embeddings': None,
 'documents': [[None], [None], [None]],
 'uris': None,
 'data': None,
 'metadatas': [[{'text': 'Agents of Decomposition The fungus-like protist saprobes are specialized to absorb nutrients from nonliving organic matter, such as dead organisms or their wastes. For instance, many types of oomycetes grow on dead animals or algae. Saprobic protists have the essential function of returning inorganic nutrients to the soil and water. This process allows for new plant growth, which in turn generates sustenance for other organisms along the food chain. Indeed, without saprobe species, such as protists, fungi, and bacteria, life would cease to exist as all organic carbon became “tied up” in dead organisms.',
    'type': 'support'}],
  [{'text': 'Without Coriolis Effect the global winds would blow north to south or south to north. But Coriolis makes them blow northeast to southwest or the reverse in the Northern Hemisphere. The winds blow north

In [22]:
# Pair every question with its result list; each list holds exactly one hit (n_results=1).
for question, hits in zip(questions, results['metadatas']):
    print("Question:", question)
    print("Support:", hits[0]['text'])
    print()

Question: What type of organism is commonly used in preparation of foods such as cheese and yogurt?
Support: Agents of Decomposition The fungus-like protist saprobes are specialized to absorb nutrients from nonliving organic matter, such as dead organisms or their wastes. For instance, many types of oomycetes grow on dead animals or algae. Saprobic protists have the essential function of returning inorganic nutrients to the soil and water. This process allows for new plant growth, which in turn generates sustenance for other organisms along the food chain. Indeed, without saprobe species, such as protists, fungi, and bacteria, life would cease to exist as all organic carbon became “tied up” in dead organisms.

Question: What phenomenon makes global winds blow northeast to southwest or the reverse in the northern hemisphere and northwest to southeast or the reverse in the southern hemisphere?
Support: Without Coriolis Effect the global winds would blow north to south or south to north. 

### Chroma DB를 활용한 키워드 기반 검색

In [23]:
# Sample corpus for the keyword-search demo: short Korean sentences about AI,
# machine learning, vector databases and language models. Each sentence is
# embedded and stored in the 'ai_documents' collection by the cells below.
documents = [
    "인공지능은 인간의 작업을 자동화하는 기술이다.",
    "기계 학습은 데이터에서 패턴을 학습하여 예측하는 기술이다.",
    "벡터 데이터베이스는 유사도를 기반으로 데이터를 검색하는 DB이다.",
    "AI는 사람들이 하는 일을 대신 수행할 수 있도록 돕는 기술이다.",
    "머신 러닝은 컴퓨터가 데이터를 분석하여 스스로 배우는 알고리즘이다.",
    "벡터 DB는 데이터를 빠르게 검색하기 위해 유사도 기반으로 동작한다.",
    "인공지능은 알고리즘을 사용하여 문제를 해결하는 시스템이다.",
    "기계 학습을 통해 컴퓨터는 경험을 바탕으로 더 나은 결정을 내릴 수 있다.",
    "벡터 데이터베이스는 고속 검색과 유사도 검색에 강점을 가진 데이터베이스이다.",
    "AI 기술은 자동화된 시스템을 통해 사람들의 작업을 효율적으로 돕는다.",
    "기계 학습 모델은 주어진 데이터를 학습하여 미래의 결과를 예측한다.",
    "벡터 DB는 데이터의 유사도를 계산하여 관련성이 높은 정보를 빠르게 찾는다.",
    "인공지능은 복잡한 문제를 해결하기 위해 인간의 지능을 모방하는 시스템이다.",
    "기계 학습 알고리즘은 데이터를 통해 패턴을 발견하고, 이를 통해 예측을 개선한다.",
    "벡터 데이터베이스는 대량의 데이터를 효율적으로 처리하고 빠른 검색을 가능하게 한다.",
    "대형 언어 모델(LLM)은 자연어를 이해하고 생성할 수 있는 인공지능 모델이다.",
    "트랜스포머 모델은 대형 언어 모델에서 주로 사용되는 네트워크 아키텍처이다.",
    "자연어 처리(NLP)는 텍스트 데이터에서 의미를 추출하고 분석하는 기술이다.",
    "파인튜닝(fine-tuning)은 사전 학습된 모델을 특정 작업에 맞게 조정하는 과정이다.",
    "GPT는 대형 언어 모델 중 하나로, 텍스트 생성에 매우 뛰어난 성능을 보인다.",
    "언어 모델은 문법적 의미를 이해하고, 자연스러운 문장을 생성하는데 사용된다.",
    "BERT는 텍스트의 양방향 문맥을 학습하여 자연어 처리에서 뛰어난 성능을 발휘하는 모델이다.",
    "자연어 생성(NLG)은 기계가 인간처럼 문장을 생성하는 기술이다.",
    "LLM은 방대한 양의 텍스트 데이터를 학습하여 질문에 답하고 정보를 제공할 수 있다.",
    "어텐션 메커니즘은 트랜스포머 모델에서 핵심적인 역할을 하는 기술로, 입력 문장에 대한 중요한 부분에 집중한다.",
    "GPT 모델은 대화형 AI 시스템에서 자연스러운 대화 생성을 가능하게 한다."
]

In [24]:
import chromadb
from sentence_transformers import SentenceTransformer

# Create the ChromaDB client (stored under ./chroma_db) and get or create the collection
client = chromadb.PersistentClient(path='./chroma_db')
collection = client.get_or_create_collection(name='ai_documents')

# Load the text embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [25]:
# Encode all documents in one batch and store them with a single add() call,
# the same way the SciQ section does, instead of one encode()/add() round trip
# per document.
document_embeddings = model.encode(documents).tolist()
collection.add(
    ids=[str(i) for i in range(len(documents))],
    embeddings=document_embeddings,
    metadatas=[{"text": doc} for doc in documents],
)

In [29]:
# Keyword search: embed the keyword and fetch the two nearest documents.
query_keyword = "NLP"

query_embedding = model.encode(query_keyword).tolist()

# Pass a list of query vectors (one per query), as every other query cell in
# this notebook does, rather than a bare vector.
results = collection.query(query_embeddings=[query_embedding], n_results=2)

# results["metadatas"][0] holds the hits for the first (and only) query.
for result in results["metadatas"][0]:
    print('검색된 문서:', result['text'])

검색된 문서: 자연어 처리(NLP)는 텍스트 데이터에서 의미를 추출하고 분석하는 기술이다.
검색된 문서: 자연어 생성(NLG)은 기계가 인간처럼 문장을 생성하는 기술이다.


### 영화 추천 시스템

In [33]:
import pandas as pd
# Load the TMDB 5000 movies CSV; the 'title' and 'overview' columns are read by the cells below.
df = pd.read_csv('./data/tmdb_5000_movies.csv')
# df.columns  (uncomment to list the available columns)

In [34]:
import chromadb
from sentence_transformers import SentenceTransformer

# Persistent client on the ./chroma_db store, with a separate 'movies' collection
client = chromadb.PersistentClient(path='./chroma_db')
collection = client.get_or_create_collection(name="movies")

# Embedding model used for both the movie overviews and the queries below
model = SentenceTransformer('all-MiniLM-L6-v2')

In [35]:
# Build one record per CSV row; a missing overview becomes an empty string so
# it can be skipped with a simple truthiness check later.
movies = []
for index, row in df.iterrows():
    overview = row["overview"]
    movies.append({
        "id": str(index),
        "title": row["title"],
        "overview": overview if pd.notna(overview) else "",
    })

In [37]:
# Only movies with a non-empty overview can be embedded.
movies_with_overview = [movie for movie in movies if movie['overview']]

# Encode and insert in batches instead of one encode()/add() round trip per
# movie; 100 is the batch size the SciQ section already uses for a single add().
ADD_BATCH_SIZE = 100
for start in range(0, len(movies_with_overview), ADD_BATCH_SIZE):
    batch = movies_with_overview[start:start + ADD_BATCH_SIZE]
    overview_embeddings = model.encode([movie['overview'] for movie in batch]).tolist()
    collection.add(
        ids=[movie['id'] for movie in batch],
        embeddings=overview_embeddings,
        metadatas=[{'title': movie['title'], 'text': movie['overview']} for movie in batch],
    )

In [38]:
# 1. Take a title -> look up its overview -> similarity search with that overview
input_title = 'Inception'
title_matches = df['title'] == input_title
query_text = df.loc[title_matches, 'overview'].iloc[0]

query_embedding = model.encode(query_text).tolist()
results = collection.query(query_embeddings=[query_embedding], n_results=5)

# One block per hit: title, overview, then a blank separator line.
for result in results['metadatas'][0]:
    print(result['title'], result['text'], '', sep='\n')

Inception
Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: "inception", the implantation of another person's idea into a target's subconscious.

Identity Thief
When a mild-mannered businessman learns his identity has been stolen, he hits the road in an attempt to foil the thief -- a trip that puts him in the path of a deceptively harmless-looking woman.

The Master of Disguise
A sweet-natured Italian waiter named Pistachio Disguisey at his father Fabbrizio's restaurant, who happens to be a member of a family with supernatural skills of disguise. But moments later the patriarch of the Disguisey family is kidnapped Fabbrizio's former arch-enemy, Devlin Bowman, a criminal mastermind in an attempt to steal the world's most precious treasures from around the world. And it's up to Pistachio to track down Bowman and save his family before Bowman ki

In [40]:
# 2. Enter a free-text plot description -> similarity search
# query_text = "a movie that penetrates the subconscious and saves the world"
query_text = "korea"

query_embedding = model.encode(query_text).tolist()
results = collection.query(query_embeddings=[query_embedding], n_results=5)

# Hits for the single query; print title, overview and a blank separator line.
top_hits = results['metadatas'][0]
for result in top_hits:
    print(f"{result['title']}\n{result['text']}\n")

Silmido
On 31 January 1968, 31 North Korean commandos infiltrated South Korea in a failed mission to assassinate President Park Chung-hee. In revenge, the South Korean military assembled a team of 31 criminals on the island of Silmido to kill Kim Il-sung for a suicide mission to redeem their honor, but was cancelled, leaving them frustrated. It is loosely based on a military uprising in the 1970s.

Tae Guk Gi: The Brotherhood of War
In 1950, in South Korea, shoe-shiner Jin-tae Lee and his 18-year-old old student brother, Jin-seok Lee, form a poor but happy family with their mother, Jin-tae's fiancé Young-shin Kim, and her young sisters. Jin-tae and his mother are tough workers, who sacrifice themselves to send Jin-seok to the university. When North Korea invades the South, the family escapes to a relative's house in the country, but along their journey, Jin-seok is forced to join the army to fight in the front, and Jin-tae enlists too to protect his young brother. The commander promise

### 논문 pdf 내용 검색

In [41]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [55]:
import chromadb
from sentence_transformers import SentenceTransformer

# Start from an empty 'papers' collection every time this cell runs.
client = chromadb.PersistentClient(path='./chroma_db')
# Make sure the collection exists before deleting it: delete_collection fails
# on a missing collection, e.g. on the first run against a fresh ./chroma_db.
client.get_or_create_collection(name='papers')
client.delete_collection('papers')    # delete the collection
collection = client.get_or_create_collection(name='papers')

model = SentenceTransformer('all-MiniLM-L6-v2')

In [43]:
# Papers to index: 'id' becomes the Chroma record id, 'title' is stored as
# metadata, and 'path' is the PDF file the text is extracted from.
papers = [
    {'id': '1', 'title': '딥러닝', 'path': './data/deep_learning.pdf'},
    {'id': '2', 'title': '자연어처리', 'path': './data/nlp_paper.pdf'},
]

In [44]:
import PyPDF2

def extract_text_from_pdf(path):
    """Return the text of every page of a PDF, joined by single spaces.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        str: The extracted page texts joined with " ". Pages whose
        extraction result is empty or None are skipped.
    """
    with open(path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # Extract each page only once (not once for the filter and again for
        # the value), and consume the generator while the file is still open.
        page_texts = (page.extract_text() for page in reader.pages)
        text = " ".join(page_text for page_text in page_texts if page_text)

    return text

In [56]:
# Embed each paper's full extracted text and store it with its title and raw text.
# NOTE(review): the entire paper goes through a single encode() call; if the model
# truncates long inputs, only the beginning of each paper is represented. Confirm
# and consider chunking.
for paper in papers:
    text = extract_text_from_pdf(paper['path'])
    embedding = model.encode(text).tolist()
    record = {
        'ids': [paper['id']],
        'embeddings': [embedding],
        'metadatas': [{'title': paper['title']}],
        'documents': [text],
    }
    collection.add(**record)

In [57]:
# Fetch every stored record; with no `include` argument the response's
# 'embeddings' field is None (see the output below).
collection.get()

{'ids': ['1', '2'],
 'embeddings': None,
 'documents': ['HAL Id: hal-04206682\nhttps://hal.science/hal-04206682v1\nSubmitted on 14 Sep 2023\nHAL is a multi-disciplinary open access\narchive for the deposit and dissemination of sci-\nentific research documents, whether they are pub-\nlished or not. The documents may come from\nteaching and research institutions in F rance or\nabroad, or from public or private research centers.L’archive ouverte pluridisciplinaire HAL , est\ndestinée au dépôt et à la diffusion de documents\nscientifiques de niveau recherche, publiés ou non,\némanant des établissements d’enseignement et de\nrecherche français ou étrangers, des laboratoires\npublics ou privés.\nDeep learning\nY ann Lecun, Y oshua Bengio, Geoffrey Hinton\nT o cite this version:\nY ann Lecun, Y oshua Bengio, Geoffrey Hinton. Deep learning. Nature, 2015, 521 (7553), pp.436-444.\n\uffff10.1038/nature14539\uffff. \uffffhal-04206682\uffff 1Facebook AI Research, 770 Broadway, New York, New York 10

In [59]:
# Fetch every stored record again, this time asking for the embeddings too.
results = collection.get(
    include=['embeddings', 'documents', 'metadatas']
)

# Print each embedding's dimensionality rather than the full vector, which
# matches the recorded output (384 per paper) and avoids dumping raw floats.
for emb in results['embeddings']:
    print(len(emb))

384
384


In [61]:
# Embed a free-text query and fetch the single closest paper.
query_text = 'Natural Language'
query_embedding = model.encode(query_text).tolist()
results = collection.query(query_embeddings=[query_embedding], n_results=1)

# NOTE(review): this bare expression is not the cell's last one, so its value
# (the matched title) is not echoed; only `results` below is displayed.
results['metadatas'][0][0]['title']
results

{'ids': [['2']],
 'embeddings': None,
 'documents': [[" \n \n \nChowdhury, G. (2003) Natural language processing. Annual Review of \nInformation Science and Technology, 37. pp. 51-89. ISSN 0066-4200 \n \n \n \nhttp://eprints.cdlr.strath.ac.uk/2611/\n \n \n \nThis is an author-produced versio n of a paper published in The \nAnnual Review of Information Science and Technology  ISSN 0066-4200 . \nThis version has been peer-reviewed, but does not include the \nfinal publisher proof corrections, published layout, or pagination. \n \nStrathprints is designed to allow users to access the research \noutput of the University of St rathclyde. Copyright © and Moral \nRights for the papers on this site are retained by the individual \nauthors and/or other copyright  owners. Users may download \nand/or print one copy of any articl e(s) in Strathprints to facilitate \ntheir private study or for non-co mmercial research. You may not \nengage in further distribution of th e material or use it for any 