### title이 unique하도록 DB를 갱신하고, 여기의 ID를 PK로 사용

In [8]:
import sqlalchemy
from sqlalchemy import create_engine
import pymysql
import pandas as pd

import os
from urllib.parse import quote

# Connection settings for the perfume MySQL database. Each value can be
# overridden through an environment variable of the same name; the literals
# are only fallbacks that keep the notebook runnable exactly as before.
# NOTE(review): the fallback credentials are committed in plain text --
# consider rotating them and supplying them through the environment only.
MYSQL_HOSTNAME = os.environ.get('MYSQL_HOSTNAME', '103.57.61.85')
MYSQL_USER = os.environ.get('MYSQL_USER', 'zioni')
MYSQL_PASSWORD = os.environ.get('MYSQL_PASSWORD', 'kitty')
MYSQL_DATABASE = os.environ.get('MYSQL_DATABASE', 'perfume_db')

# Percent-encode user and password so that characters such as '@', ':' or '/'
# cannot break the structure of the connection URL.
connection_string = (
    f"mysql+pymysql://{quote(MYSQL_USER, safe='')}:{quote(MYSQL_PASSWORD, safe='')}"
    f"@{MYSQL_HOSTNAME}/{MYSQL_DATABASE}"
)
con = create_engine(connection_string)

In [9]:
# Load the three source tables in full. The queries contain no placeholders,
# so they are plain literals: the former ``.format(MYSQL_DATABASE)`` calls had
# nothing to substitute and only suggested a parameterisation that never
# happened. The database itself is selected by the connection behind ``con``.
query_1 = "SELECT * FROM PERFUME_GROUPS"
perfume_by_grp = pd.read_sql(query_1, con)

query_2 = "SELECT * FROM PERFUME_NOTES"
perfume_by_nt = pd.read_sql(query_2, con)

query_3 = "SELECT * FROM NOTES_MAP"
nt_map = pd.read_sql(query_3, con)

In [10]:
# Keep only the first row for every distinct ``title`` in both tables and
# renumber the surviving rows from 0 (the old index is discarded).
perfume_by_nt = (
    perfume_by_nt
    .drop_duplicates(subset='title')
    .reset_index(drop=True)
)

perfume_by_grp = (
    perfume_by_grp
    .drop_duplicates(subset='title')
    .reset_index(drop=True)
)

In [11]:
# Write the de-duplicated frames back over the existing tables.
# ``if_exists='replace'`` replaces the table contents wholesale and
# ``index=False`` keeps the DataFrame index out of the written columns.
to_sql_options = dict(con=con, index=False, if_exists='replace')
perfume_by_grp.to_sql('PERFUME_GROUPS', **to_sql_options)
perfume_by_nt.to_sql('PERFUME_NOTES', **to_sql_options)

30968

##### Embedding (top, middle, base)

In [71]:
from sentence_transformers import SentenceTransformer

# Load the SBERT sentence-embedding model
SBERT_MODEL_NAME = 'paraphrase-MiniLM-L6-v2'
model = SentenceTransformer(SBERT_MODEL_NAME)

In [72]:
# Note-group name -> position in the 12-slot group distribution vector.
# NOTE: this name shadows the built-in ``map``; it is kept because the
# feature-building loop further down looks groups up through ``map[...]``.
map = {
    group_name: position
    for position, group_name in enumerate([
        'citrus',
        'fruit',
        'floral',
        'white_floral',
        'green',
        'spicy',
        'sweet',
        'woody',
        'beverage',
        'synthetic',
        'animal',
        'balsamic',
    ])
}

In [73]:
from tqdm import tqdm
import numpy as np

# Build, for every row of ``perfume_by_grp``:
#   names     - the value of column 1 (used as the perfume name)
#   name_embs - SBERT embedding of that name
#   dists     - share of each note group among the row's groups
#   embs      - three SBERT embeddings (first / middle / last third of the
#               row's groups) concatenated into one flat vector
dists = []
embs = []
name_embs = []
names = []

# ``map`` is the note-group -> index dict defined in an earlier cell.
n_groups = len(map)

# itertuples yields every row as a plain tuple, which avoids converting the
# same row Series to a list twice per iteration.
rows = perfume_by_grp.itertuples(index=False, name=None)
for row in tqdm(rows, total=len(perfume_by_grp)):
    name = row[1]
    # Columns 2 .. second-to-last hold the note groups; drop missing entries.
    groups = [item for item in row[2:-1] if item is not None]

    # Group distribution vector
    counts = np.zeros(n_groups)
    for group in groups:
        counts[map[group]] += 1
    # max(..., 1): a row without any group yields an all-zero vector instead
    # of 0/0 = NaN.
    dists.append((counts / max(len(groups), 1)).tolist())

    # Note group embedding vector: split the groups into three consecutive
    # parts; the remainder of the division by 3 goes to the middle part.
    first_end = len(groups) // 3
    second_end = 2 * first_end + len(groups) % 3
    parts = (
        groups[:first_end],
        groups[first_end:second_end],
        groups[second_end:],
    )

    emb = []
    for part in parts:
        # De-duplicate while keeping first-seen order. A plain set() iterates
        # strings in an order that changes between interpreter runs, which
        # made the encoded sentence (and so the embedding) non-reproducible.
        unique_groups = list(dict.fromkeys(part))
        emb.extend(model.encode(' '.join(unique_groups)))

    names.append(name)
    name_embs.append(model.encode(name))
    embs.append(emb)
    

100%|██████████| 30968/30968 [09:41<00:00, 53.26it/s]


In [74]:
import pickle

# Persist the computed lists so later cells can reload them without
# recomputing the embeddings. Files are written in the listed order.
pickle_targets = {
    'names.pkl': names,
    'name_embs.pkl': name_embs,
    'embs.pkl': embs,
    'dists.pkl': dists,
}

for pickle_path, payload in pickle_targets.items():
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

##### Chroma DB에 저장

In [85]:
# Create the DB client
import chromadb
from chromadb.config import Settings

client = chromadb.HttpClient(host='localhost', port=8000)

# Start from empty collections. delete_collection() raises when the named
# collection does not exist (e.g. on a fresh server), so each collection is
# first ensured to exist and only then dropped.
for _collection_name in ("perfume_name", "perfume_note", "perfume_dist"):
    client.get_or_create_collection(_collection_name)
    client.delete_collection(_collection_name)

perfume_name = client.get_or_create_collection("perfume_name")
perfume_note = client.get_or_create_collection("perfume_note")
perfume_dist = client.get_or_create_collection("perfume_dist")

In [63]:
##### Approach 1: attach an embedding function to the collections

from langchain.embeddings.openai import OpenAIEmbeddings
from chromadb.utils import embedding_functions
from langchain_openai import AzureOpenAIEmbeddings

from openai import AzureOpenAI
import yaml

# Azure OpenAI settings are read from the "config" section of openai.yaml.
with open('openai.yaml') as f:
    config = yaml.safe_load(f)

api_version = config["config"]["api_version"]
endpoint = config["config"]["endpoint"]
api_key = config["config"]["api_key"]


emb_func = embedding_functions.OpenAIEmbeddingFunction(
            api_key = api_key,
            model_name = "text-embedding-3-small",
            api_type = 'azure',
            api_version = api_version,
            api_base = endpoint
)

# client.delete_collection("perfume_name")
# client.delete_collection("perfume_note")
# client.delete_collection("perfume_dist")

# The keyword is ``embedding_function`` (singular); the previous spelling
# ``embedding_functions`` is not a parameter of get_or_create_collection and
# made these calls fail with a TypeError.
perfume_name = client.get_or_create_collection("perfume_name", embedding_function=emb_func)
perfume_note = client.get_or_create_collection("perfume_note", embedding_function=emb_func)
perfume_dist = client.get_or_create_collection("perfume_dist", embedding_function=emb_func)

                    azure_endpoint was transferred to model_kwargs.
                    Please confirm that azure_endpoint is what you intended.


In [64]:
##### embedding function을 지정하는 방식 

from tqdm import tqdm 

names = perfume_by_nt['title'].tolist()

# Only the first ten titles are added. No embeddings are passed, so the
# collection is left to embed each document itself. The id of a record is
# its position in ``names`` as a string.
for doc_id, title in enumerate(names[:10]):
    perfume_name.add(documents=[title], ids=[str(doc_id)])

In [76]:
##### embedding 값을 밀어넣는 방식 

import pickle

def _load_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as file:
        return pickle.load(file)


# Reload the lists written by the feature-building step.
names = _load_pickle('names.pkl')
dists = _load_pickle('dists.pkl')
embs = _load_pickle('embs.pkl')
name_embs = _load_pickle('name_embs.pkl')

In [86]:

##### Approach 2: push precomputed embedding values into the collections

for idx, name in enumerate(tqdm(names)):

    # Convert the three vectors of this record to flat lists of Python
    # floats before anything is written.
    name_embedding, note_embedding, dist_embedding = (
        [float(value) for value in vectors[idx]]
        for vectors in (name_embs, embs, dists)
    )

    # The same document text and id are stored in all three collections;
    # only the embedding differs. Insert order: name, note, dist.
    destinations = (
        (perfume_name, name_embedding),
        (perfume_note, note_embedding),
        (perfume_dist, dist_embedding),
    )
    for collection, embedding in destinations:
        collection.add(
            documents=[name],        # document text
            embeddings=[embedding],  # plain list of floats
            ids=[str(idx)],          # unique id: position in ``names``
        )


100%|██████████| 30968/30968 [14:26<00:00, 35.74it/s]


In [97]:
# Fetch the collection's records with their embedding vectors included
# (no ids or filter are given).
embeddings_data = perfume_name.get(include=['embeddings'])

# Print the first five embeddings, numbered from 1
first_five = embeddings_data['embeddings'][:5]
for position, vector in enumerate(first_five, start=1):
    print(f"Embedding {position}: {vector}")

Embedding 1: [0.12044352293014526, 0.6100152134895325, 0.061317916959524155, -0.06964324414730072, -0.1281007081270218, 0.187954381108284, 0.6017221808433533, -0.59052973985672, 0.016505518928170204, 0.09139701724052429, 0.4567641317844391, -0.14289559423923492, 0.014559010975062847, -0.3271092474460602, 0.581083357334137, -0.030272841453552246, 0.07918926328420639, -0.3867795467376709, 0.26964032649993896, 0.4166233241558075, 0.2720031440258026, -0.12849010527133942, 0.20846430957317352, -0.17366564273834229, 0.15252164006233215, -0.39548876881599426, 0.2576245367527008, 0.23566289246082306, 0.27224233746528625, -0.2006712555885315, 0.0911150798201561, 0.4868374764919281, 0.4327421486377716, 0.3971332013607025, -0.3933189809322357, 1.002736210823059, -0.030878841876983643, 0.19656549394130707, -0.0338793508708477, 0.06550727039575577, 0.27918511629104614, -0.35250118374824524, 0.6771685481071472, -0.29038745164871216, -0.1375468522310257, -0.13096001744270325, 0.48537516593933105, 0.0

##### 유사도 비교 테스트

In [1]:
# Create the DB client
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

client = chromadb.HttpClient(host='localhost', port=8000)

# Load the SBERT model
SBERT_MODEL_NAME = 'paraphrase-MiniLM-L6-v2'
model = SentenceTransformer(SBERT_MODEL_NAME)

# Load the already existing collections (nothing is created here)
perfume_name, perfume_note, perfume_dist = (
    client.get_collection(collection_name)
    for collection_name in ("perfume_name", "perfume_note", "perfume_dist")
)

  from tqdm.autonotebook import tqdm, trange
2024-09-30 13:32:54.551415: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-30 13:32:54.877128: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-30 13:32:54.998511: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-30 13:32:55.033631: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-30 13:32:55.279963

In [None]:
##### name



In [3]:
# Encode a single perfume title; passing a one-element list keeps the result
# batched (one row per input text).
query_title = 'Neroli Portofino Acqua Tom Ford for women and men'
enc = model.encode([query_title])

In [7]:
# Ask the name collection for the three closest stored records to the
# encoded query.
res = perfume_name.query(query_embeddings=enc, n_results=3)

In [12]:
# Id of the first returned record for the first (and only) query; left as a
# bare expression so the notebook displays its value.
res['ids'][0][0]

'1856'