# Chroma DB

In [58]:
# 데이터셋 로드 
from datasets import load_dataset

dataset = load_dataset('sciq', split='train')    # sciq : 과학 관련 질문 
dataset = dataset.filter(lambda x: x['support'] != "")
dataset

Filter: 100%|██████████| 11679/11679 [00:00<00:00, 61945.05 examples/s]


Dataset({
    features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
    num_rows: 10481
})

In [59]:
# 필요한 모듈 임포트
import chromadb
from sentence_transformers import SentenceTransformer
import torch

# 1. 임베딩 모델 초기화 (올바른 인터페이스 사용)
class CustomEmbeddingFunction:
    def __init__(self, model_name):
        self.model = SentenceTransformer(model_name)
    
    def __call__(self, input):  # 'texts' 대신 'input' 사용
        # NumPy 변환을 건너뛰고 텐서에서 직접 리스트로 변환
        embeddings = self.model.encode(input, convert_to_numpy=False)
        return [tensor.cpu().detach().tolist() for tensor in embeddings]

# 2. 임베딩 함수 생성
embedding_function = CustomEmbeddingFunction('all-MiniLM-L6-v2')

# 3. ChromaDB 클라이언트 초기화
client = chromadb.Client()

# 4. 기존 컬렉션이 있다면 삭제
try:
    client.delete_collection("my_collection")
except:
    pass

# 5. 새 컬렉션 생성 (임베딩 함수 지정)
collection = client.create_collection(
    name="my_collection",
    embedding_function=embedding_function
)

# 6. 데이터 준비
supports = dataset['support'][:100]

# 7. 임베딩 계산 (같은 커스텀 임베딩 함수 사용)
support_embeddings = embedding_function(supports)

# 8. 데이터 추가
collection.add(
    ids=[str(i) for i in range(100)],
    embeddings=support_embeddings,
    metadatas=[{'type': 'support', 'text': text} for text in supports]
)

# 9. 쿼리 실행
query_text = ['This is a query document about vietnam']
query_embedding = embedding_function(query_text)

# 10. 임베딩을 직접 사용하여 쿼리
results = collection.query(
    query_embeddings=query_embedding,
    n_results=2
)

print(results)

{'ids': [['61', '80']], 'embeddings': None, 'documents': [[None, None]], 'uris': None, 'data': None, 'metadatas': [[{'text': 'Mariana Ruiz Villarreal (LadyofHats) for CK-12 Foundation. The nitrogen cycle tracks the flow of nitrogen through an ecosystem . CC BY-NC 3.0.', 'type': 'support'}, {'text': 'All of the changes of state that occur between solid, liquid and gas are summarized in the diagram in the figure below. Freezing is the opposite of melting and both represent the equilibrium between the solid and liquid states. Evaporation occurs when a liquid turns to a gas. Condensation is the opposite of vaporization and both represent the equilibrium between the liquid and gas states. Deposition is the opposite of sublimation and both represent the equilibrium between the solid and gas states.', 'type': 'support'}]], 'distances': [[1.6950156688690186, 1.695570707321167]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'me

### SciQ dataset 활용 ChromaDB 검색

In [13]:
from datasets import load_dataset

dataset = load_dataset('sciq', split='train')

Generating train split: 100%|██████████| 11679/11679 [00:00<00:00, 222182.65 examples/s]
Generating validation split: 100%|██████████| 1000/1000 [00:00<00:00, 163100.95 examples/s]
Generating test split: 100%|██████████| 1000/1000 [00:00<00:00, 147127.26 examples/s]


In [None]:
# chroma db 클라이언트 객체 및 콜렉션 생성
import chromadb

client = chromadb.Client()
collection = client.create_collection(name="sciq_support")

In [37]:
# 임베딩 모델 로드
from sentence_transformers import SentenceTransformer
import torch

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

supports = dataset['support'][:100]

# NumPy 변환 과정을 건너뛰고 직접 텐서에서 리스트로 변환합니다
embeddings = embedding_model.encode(supports, convert_to_numpy=False)
support_embeddings = [tensor.cpu().detach().tolist() for tensor in embeddings]

In [38]:
len(support_embeddings)

100

In [43]:
collection.add(
    ids=[str(i) for i in range(0, 100)],
    embeddings=support_embeddings,
    metadatas=[{'type': 'support', 'text': text} for text in supports]
)

Insert of existing embedding ID: 0
Insert of existing embedding ID: 1
Insert of existing embedding ID: 2
Insert of existing embedding ID: 3
Insert of existing embedding ID: 4
Insert of existing embedding ID: 5
Insert of existing embedding ID: 6
Insert of existing embedding ID: 7
Insert of existing embedding ID: 8
Insert of existing embedding ID: 9
Insert of existing embedding ID: 10
Insert of existing embedding ID: 11
Insert of existing embedding ID: 12
Insert of existing embedding ID: 13
Insert of existing embedding ID: 14
Insert of existing embedding ID: 15
Insert of existing embedding ID: 16
Insert of existing embedding ID: 17
Insert of existing embedding ID: 18
Insert of existing embedding ID: 19
Insert of existing embedding ID: 20
Insert of existing embedding ID: 21
Insert of existing embedding ID: 22
Insert of existing embedding ID: 23
Insert of existing embedding ID: 24
Insert of existing embedding ID: 25
Insert of existing embedding ID: 26
Insert of existing embedding ID: 27
In

In [None]:
# 앞서 정의한 CustomEmbeddingFunction 사용
questions = dataset['question'][:3]
question_embeddings = embedding_function(questions)

results = collection.query(
    query_embeddings=question_embeddings,
    n_results=1
)

In [46]:
results

{'ids': [['38'], ['1'], ['2']],
 'embeddings': None,
 'documents': [[None], [None], [None]],
 'uris': None,
 'data': None,
 'metadatas': [[{'text': 'Agents of Decomposition The fungus-like protist saprobes are specialized to absorb nutrients from nonliving organic matter, such as dead organisms or their wastes. For instance, many types of oomycetes grow on dead animals or algae. Saprobic protists have the essential function of returning inorganic nutrients to the soil and water. This process allows for new plant growth, which in turn generates sustenance for other organisms along the food chain. Indeed, without saprobe species, such as protists, fungi, and bacteria, life would cease to exist as all organic carbon became “tied up” in dead organisms.',
    'type': 'support'}],
  [{'text': 'Without Coriolis Effect the global winds would blow north to south or south to north. But Coriolis makes them blow northeast to southwest or the reverse in the Northern Hemisphere. The winds blow north

In [52]:
for i, q in enumerate(questions):
    print('Question:', q)
    print('Support:', results['metadatas'][i][0]['text'])

Question: What type of organism is commonly used in preparation of foods such as cheese and yogurt?
Support: Agents of Decomposition The fungus-like protist saprobes are specialized to absorb nutrients from nonliving organic matter, such as dead organisms or their wastes. For instance, many types of oomycetes grow on dead animals or algae. Saprobic protists have the essential function of returning inorganic nutrients to the soil and water. This process allows for new plant growth, which in turn generates sustenance for other organisms along the food chain. Indeed, without saprobe species, such as protists, fungi, and bacteria, life would cease to exist as all organic carbon became “tied up” in dead organisms.
Question: What phenomenon makes global winds blow northeast to southwest or the reverse in the northern hemisphere and northwest to southeast or the reverse in the southern hemisphere?
Support: Without Coriolis Effect the global winds would blow north to south or south to north. B

### Chroma DB를 활용한 키워드 기반 검색

In [48]:
documents = [
    '인공지능은 인간의 작업을 자동화하는 기술이다.',
    '기계 학습은 데이터에서 패턴을 학습하여 예측하는 기술이다.',
    '벡터 데이터베이스는 유사도를 기반으로 데이터를 겁색하는 DB이다.'
    ]

In [None]:
import chromadb
from sentence_transformers import SentenceTransformer

# ChromaDB 클라이언트, 컬렉션 생성
client = chromadb.PersistentClient(path='./chroma_db')      # 해당 데이터에 경로를 지정
collection = client.get_or_create_collection(name='ai_documents')

# 텍스트 임베딩 모델 생성
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# NumPy 변환을 건너뛰는 함수
def encode_to_list(text):
    # convert_to_numpy=False로 설정하여 텐서 반환
    embedding_tensor = model.encode(text, convert_to_numpy=False)
    # 텐서를 리스트로 직접 변환
    if isinstance(embedding_tensor, list):
        return [t.cpu().detach().tolist() for t in embedding_tensor]
    else:
        return embedding_tensor.cpu().detach().tolist()

# 문서 추가
for i, doc in enumerate(documents):
    embedding = encode_to_list(doc)
    collection.add(ids=[str(i)], embeddings=[embedding], metadatas=[{'text': doc}])

Insert of existing embedding ID: 0
Add of existing embedding ID: 0
Insert of existing embedding ID: 1
Add of existing embedding ID: 1
Insert of existing embedding ID: 2
Add of existing embedding ID: 2


In [100]:
query_keyword = 'AI'
query_embedding = encode_to_list(query_keyword)
results = collection.query(query_embeddings=query_embedding, n_results=2)

for result in results['metadatas'][0]:
    print('검색된 문서:', result['text'])

검색된 문서: As humanity picks up the pieces, following the conclusion of "Transformers: Dark of the Moon," Autobots and Decepticons have all but vanished from the face of the planet. However, a group of powerful, ingenious businessman and scientists attempt to learn from past Transformer incursions and push the boundaries of technology beyond what they can control - all while an ancient, powerful Transformer menace sets Earth in his cross-hairs.
검색된 문서: WALL·E is the last robot left on an Earth that has been overrun with garbage and all humans have fled to outer space. For 700 years he has continued to try and clean up the mess, but has developed some rather interesting human-like qualities. When a ship arrives with a sleek new type of robot, WALL·E thinks he's finally found a friend and stows away on the ship when it leaves.


### 영화 추천 시스템


- title = 이걸로 찾기, overview = 유사도 파악(임베딩)
- vector db에서 검색 시 추천영화 나오도록 하게 함

In [None]:
# 데이터셋 로드
import pandas as pd
df = pd.read_csv('./data/tmdb_5000_movies.csv')
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [None]:
# chroma db 클라이언트 객체 및 콜렉션 생성
import chromadb

client = chromadb.Client()
collection = client.create_collection(name='movie_search')

In [None]:
# 임베딩 모델 로드
from sentence_transformers import SentenceTransformer
import torch

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

overviews = df['overview'][:100]

# NumPy 변환 과정 건너 뜀
embeddings = embedding_model.encode(overviews, convert_to_numpy=False)
overview_embeddings = [tensor.cpu().detach().tolist() for tensor in embeddings]

In [None]:
# 내용들 벡터화 시킴
collection.add(
    ids=[str(i) for i in range(0, 100)],
    embeddings=overview_embeddings,
    metadatas=[{'type': 'overview', 'text': text} for text in overviews]
)

In [90]:
# chroma db로 벡터화 사용
titles = df['title'][:10]
title_embeddings = embedding_function(titles)

results = collection.query(
    query_embeddings=title_embeddings,
    n_results=1
)

results

{'ids': [['81'],
  ['17'],
  ['2'],
  ['3'],
  ['4'],
  ['5'],
  ['95'],
  ['26'],
  ['8'],
  ['9']],
 'embeddings': None,
 'documents': [[None],
  [None],
  [None],
  [None],
  [None],
  [None],
  [None],
  [None],
  [None],
  [None]],
 'uris': None,
 'data': None,
 'metadatas': [[{'text': "The untold story of Disney's most iconic villain from the 1959 classic 'Sleeping Beauty'. A beautiful, pure-hearted young woman, Maleficent has an idyllic life growing up in a peaceable forest kingdom, until one day when an invading army threatens the harmony of the land.  Maleficent rises to be the land's fiercest protector, but she ultimately suffers a ruthless betrayal – an act that begins to turn her heart into stone. Bent on revenge, Maleficent faces an epic battle with the invading King's successor and, as a result, places a curse upon his newborn infant Aurora. As the child grows, Maleficent realizes that Aurora holds the key to peace in the kingdom - and to Maleficent's true happiness as we

In [91]:
for i, q in enumerate(titles):
    print('Question:', q)
    print('Support:', results['metadatas'][i][0]['text'])

Question: Avatar
Support: The untold story of Disney's most iconic villain from the 1959 classic 'Sleeping Beauty'. A beautiful, pure-hearted young woman, Maleficent has an idyllic life growing up in a peaceable forest kingdom, until one day when an invading army threatens the harmony of the land.  Maleficent rises to be the land's fiercest protector, but she ultimately suffers a ruthless betrayal – an act that begins to turn her heart into stone. Bent on revenge, Maleficent faces an epic battle with the invading King's successor and, as a result, places a curse upon his newborn infant Aurora. As the child grows, Maleficent realizes that Aurora holds the key to peace in the kingdom - and to Maleficent's true happiness as well.
Question: Pirates of the Caribbean: At World's End
Support: Captain Jack Sparrow crosses paths with a woman from his past, and he's not sure if it's love -- or if she's a ruthless con artist who's using him to find the fabled Fountain of Youth. When she forces hi

In [96]:
movie_name = input('당신이 원하는 영화는 무엇인가요?')
for i in range(len(df)):
    if movie_name == df['title'][i]:
        cnt = i
        break
    movie_embedding = embedding_model.encode(movie_name, convert_to_numpy=False).cpu().detach().tolist()
    
result = collection.query(
    query_embeddings=[movie_embedding],
    n_results=1
)

print(f"{movie_name}의 줄거리입니다. \n -> {result['metadatas'][0][0]['text']}")

Harry Potter and the Half-Blood Prince의 줄거리입니다. 
 -> As Harry begins his sixth year at Hogwarts, he discovers an old book marked as 'Property of the Half-Blood Prince', and begins to learn more about Lord Voldemort's dark past.


In [101]:
def recommend_movies(movie_name, top_n=5):
    """
    사용자가 입력한 영화와 유사한 영화를 추천하는 함수
    
    Args:
        movie_name (str): 사용자가 입력한 영화 제목
        top_n (int): 추천할 영화 수
    
    Returns:
        list: 추천 영화 정보 리스트
    """
    # DataFrame에서 영화 제목 찾기
    movie_idx = df[df['title'] == movie_name].index
    
    if not movie_idx.empty:
        # 찾은 영화의 인덱스
        idx = movie_idx[0]
        
        # 찾은 영화의 overview
        movie_overview = df.loc[idx, 'overview']
        
        # 영화 줄거리로 임베딩 생성
        movie_embedding = embedding_model.encode(movie_overview, convert_to_numpy=False).cpu().detach().tolist()
        
        # ChromaDB로 유사한 영화 검색 (top_n+1개 가져옴 - 자기 자신 포함될 수 있으므로)
        results = collection.query(
            query_embeddings=[movie_embedding],
            n_results=top_n+1
        )
        
        # 추천 영화 정보 저장할 리스트
        recommendations = []
        
        # 결과 중에서 입력 영화와 다른 영화들만 선택
        for i, metadata in enumerate(results['metadatas'][0]):
            # 검색된 영화의 줄거리
            found_overview = metadata['text']
            
            # 검색된 영화의 제목 찾기
            found_idx = df[df['overview'] == found_overview].index
            
            if not found_idx.empty:
                found_title = df.loc[found_idx[0], 'title']
                
                # 입력한 영화와 다른 경우에만 추가
                if found_title != movie_name:
                    # 유사도 점수 계산 (거리가 작을수록 유사함)
                    similarity_score = 1 - results['distances'][0][i]  # 거리를 유사도로 변환
                    
                    recommendations.append({
                        'title': found_title,
                        'overview': found_overview,
                        'similarity': similarity_score
                    })
                    
                    # top_n개 채웠으면 종료
                    if len(recommendations) >= top_n:
                        break
        
        return recommendations
    else:
        return []

# 사용 예시
movie_name = input('어떤 영화와 비슷한 영화를 추천받고 싶으신가요? ')
recommendations = recommend_movies(movie_name)

if recommendations:
    print(f"\n'{movie_name}'와(과) 비슷한 영화 추천:")
    for i, movie in enumerate(recommendations, 1):
        print(f"\n{i}. {movie['title']} (유사도: {movie['similarity']:.2f})")
        print(f"줄거리: {movie['overview'][:150]}...")  # 줄거리는 150자만 보여주기
else:
    print(f"'{movie_name}' 영화를 찾을 수 없거나 추천할 영화가 없습니다.")


'John Carter'와(과) 비슷한 영화 추천:

1. Battleship (유사도: -0.10)
줄거리: When mankind beams a radio signal into space, a reply comes from ‘Planet G’, in the form of several alien crafts that splash down in the waters off Ha...

2. Pirates of the Caribbean: At World's End (유사도: -0.10)
줄거리: Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with Will Turner and Elizabeth Swann. But not...

3. Waterworld (유사도: -0.21)
줄거리: In a futuristic world where the polar ice caps have melted and made Earth a liquid planet, a beautiful barmaid rescues a mutant seafarer from a floati...

4. Iron Man 3 (유사도: -0.24)
줄거리: When Tony Stark's world is torn apart by a formidable terrorist called the Mandarin, he starts an odyssey of rebuilding and retribution....

5. Terminator Salvation (유사도: -0.24)
줄거리: All grown up in post-apocalyptic 2018, John Connor must lead the resistance of humans against the increasingly dominating militaristic robots. But whe...
