### title을 unique하게 DB 갱신 그리고 여기 ID를 PK로 사용

In [3]:
import getpass
import os
from urllib.parse import quote_plus

import pandas as pd
import pymysql
import sqlalchemy
from sqlalchemy import create_engine

# Connection settings are read from the environment so that credentials are not
# stored in the notebook. Host, user and database fall back to the values this
# notebook has always used; the password has no fallback.
# NOTE(review): the password that used to be written here should be rotated.
MYSQL_HOSTNAME = os.environ.get('MYSQL_HOSTNAME', '103.57.61.85')
MYSQL_USER = os.environ.get('MYSQL_USER', 'zioni')
# Prompt interactively when MYSQL_PASSWORD is not set (or is empty).
MYSQL_PASSWORD = os.environ.get('MYSQL_PASSWORD') or getpass.getpass('MySQL password: ')
MYSQL_DATABASE = os.environ.get('MYSQL_DATABASE', 'perfume_db')

# User and password are URL-escaped so characters such as '@', ':' or '/' in a
# credential cannot corrupt the connection URL.
connection_string = (
    f'mysql+pymysql://{quote_plus(MYSQL_USER)}:{quote_plus(MYSQL_PASSWORD)}'
    f'@{MYSQL_HOSTNAME}/{MYSQL_DATABASE}'
)
con = create_engine(connection_string)

In [4]:
# Read three tables in full through the engine `con`.
# The statements are fixed SQL with no placeholders, so no string formatting is
# applied to them; the database is selected by the connection itself.
query_1 = """
SELECT * FROM PERFUME_GROUPS
"""

perfume_by_grp = pd.read_sql(query_1, con)

query_2 = """
SELECT * FROM PERFUME_NOTES
"""

perfume_by_nt = pd.read_sql(query_2, con)

query_3 = """
SELECT * FROM NOTES_MAP
"""

nt_map = pd.read_sql(query_3, con)

In [5]:
# For each frame keep one row per `title` (drop_duplicates keeps the first
# occurrence by default) and renumber the index from 0.
perfume_by_nt = (
    perfume_by_nt
    .drop_duplicates(subset='title')
    .reset_index(drop=True)
)

perfume_by_grp = (
    perfume_by_grp
    .drop_duplicates(subset='title')
    .reset_index(drop=True)
)

In [6]:
# Write both frames back over their tables. With if_exists='replace' pandas
# drops the existing table before inserting, and index=False leaves the
# DataFrame index out of the written columns.
_replace_opts = dict(con=con, index=False, if_exists='replace')

perfume_by_grp.to_sql('PERFUME_GROUPS', **_replace_opts)
perfume_by_nt.to_sql('PERFUME_NOTES', **_replace_opts)

30968

##### Embedding (top, middle, base)

In [7]:
from sentence_transformers import SentenceTransformer

# Load the SBERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange
2024-09-23 13:26:23.221808: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-23 13:26:23.234291: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-23 13:26:23.251079: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-23 13:26:23.256004: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-23 13:26:23.268440

In [8]:
# Note-group name -> slot index (0..11) used when counting groups per perfume.
# NOTE(review): this name shadows the builtin `map`; it is kept because the
# feature-building loop below looks groups up through it.
map = {
    group: slot
    for slot, group in enumerate((
        'citrus',
        'fruit',
        'floral',
        'white_floral',
        'green',
        'spicy',
        'sweet',
        'woody',
        'beverage',
        'synthetic',
        'animal',
        'balsamic',
    ))
}

In [9]:
from tqdm import tqdm
import numpy as np

# Per-perfume feature vectors, appended in the row order of `perfume_by_grp`:
#   dists     - share of each note group among the row's groups (one slot per `map` entry)
#   embs      - concatenated SBERT embeddings of the three note-group segments
#   name_embs - SBERT embedding of the perfume name
dists = []
embs = []
name_embs = []

for row_idx in tqdm(range(len(perfume_by_grp))):
    row = list(perfume_by_grp.iloc[row_idx])
    # NOTE(review): positional layout is assumed here - column 1 is the name and
    # columns 2..-2 hold note-group labels; confirm against the table schema.
    name = row[1]
    groups = [item for item in row[2:-1] if item is not None]

    # group distribution vector
    counts = np.zeros(len(map))
    for group in groups:
        counts[map[group]] += 1
    # A row with no groups keeps an all-zero vector instead of dividing by zero.
    dist = counts / len(groups) if groups else counts
    dists.append(dist)

    # note group embedding vector: split the groups into three consecutive
    # segments; the remainder of n / 3 goes to the middle segment.
    n = len(groups)
    first_end = n // 3
    second_end = first_end + (n // 3 + n % 3)
    parts = (groups[:first_end], groups[first_end:second_end], groups[second_end:])

    # De-duplicate each segment and sort it: iterating a bare set of strings
    # yields an order that can change between kernel sessions, which would make
    # the encoded sentence - and therefore the embedding - irreproducible.
    segment_vectors = [model.encode(' '.join(sorted(set(part)))) for part in parts]

    name_embs.append(model.encode(name))
    embs.append(np.concatenate(segment_vectors))
    

  2%|▏         | 496/30968 [00:10<09:51, 51.51it/s]

##### Chroma DB에 저장

In [2]:
# Create the DB client
import chromadb
from chromadb.config import Settings

# PersistentClient stores collections on disk under `path`. Passing only
# Settings(persist_directory=...) to chromadb.Client() does not switch
# persistence on in current Chroma releases, so data added through such a
# client is not available to a client created later.
client = chromadb.PersistentClient(
    path="./chroma"  # directory where the data is stored
)

In [None]:

# Open the three collections, creating any that do not exist yet so this cell
# also works against an empty store (get_collection raises for a missing name).
# NOTE(review): newly created collections use Chroma's default settings -
# confirm the default distance metric is the intended one.
perfume_name = client.get_or_create_collection("perfume_name")
perfume_note = client.get_or_create_collection("perfume_note")
perfume_dist = client.get_or_create_collection("perfume_dist")

In [11]:
names = list(perfume_by_grp['title'])

# Every perfume needs all three vectors; fail before writing anything if a
# feature list is shorter or longer than the title list.
if not (len(names) == len(name_embs) == len(embs) == len(dists)):
    raise ValueError(
        f"vector counts do not match titles: titles={len(names)}, "
        f"name_embs={len(name_embs)}, embs={len(embs)}, dists={len(dists)}"
    )

# Row position is used as the unique ID, identical across the three collections.
ids = [str(idx) for idx in range(len(names))]

# Insert in batches instead of issuing three add() calls per perfume.
BATCH_SIZE = 1000
targets = (
    (perfume_name, name_embs),
    (perfume_note, embs),
    (perfume_dist, dists),
)

for start in range(0, len(names), BATCH_SIZE):
    batch = slice(start, start + BATCH_SIZE)
    for collection, vectors in targets:
        collection.add(
            documents=names[batch],  # the perfume title is stored as the document
            embeddings=[vec.tolist() for vec in vectors[batch]],  # NumPy arrays -> plain lists
            ids=ids[batch],
        )


##### 유사도 비교 테스트

In [2]:
import chromadb
from sentence_transformers import SentenceTransformer

# Load the SBERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Create the client. chromadb.Client() builds an in-memory client that starts
# with no collections, so get_collection('perfume_name') cannot succeed on it.
# Open the on-disk store instead; the path must be the directory the
# collections were written to ("./chroma" is the persist directory named in the
# storage section of this notebook).
client = chromadb.PersistentClient(path="./chroma")

# Load the collection
collection = client.get_collection('perfume_name')

InvalidCollectionException: Collection perfume_name does not exist.