## text embedding

In [19]:
import pandas as pd
import os
from time import time
from tqdm import tqdm
import psycopg2
from langchain_openai import OpenAIEmbeddings

# Helper for displaying query results
def show(rows):
    """Print each element of ``rows`` on its own line, in iteration order.

    ``rows`` may be any iterable (e.g. the list returned by
    ``cursor.fetchall()``). Nothing is printed for an empty iterable.
    """
    for line in map(str, rows):
        print(line)

# Configure the OpenAI API key.
# Read it through a context manager so the file handle is closed, and strip
# surrounding whitespace: a trailing newline in the key file would otherwise
# become part of the key value.
with open('API_KEY', 'r') as key_file:
    os.environ['OPENAI_API_KEY'] = key_file.read().strip()
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Load and preprocess the data: drop rows with any missing value, then take
# the plot keywords of the first 1000 remaining rows.
imdb = pd.read_csv('movie_metadata.csv')
imdb = imdb.dropna()
texts = imdb['plot_keywords'][:1000].tolist()

# Generate one embedding per keyword string (one API call per text).
embeddings_list = [embeddings.embed_query(text) for text in tqdm(texts)]

# Check the embedding dimension (used later to size the vector column).
emb_dim = len(embeddings_list[0])
print(emb_dim)

# Save the embeddings to a temporary CSV file.
# Each embedding is written as the str() of a Python list, e.g. "[0.1, 0.2]".
import csv
filename = "imdb_embedding.csv"
# Explicit UTF-8 so the file contents do not depend on the platform locale.
with open(filename, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["keywords", "embedding"])
    for text, vector in zip(texts, embeddings_list):
        writer.writerow([text, vector])

100%|██████████| 1000/1000 [07:34<00:00,  2.20it/s]


In [20]:
# Connect to PostgreSQL
conn_config = {'host': 'localhost', 'dbname': 'my_vec_db', 'user': 'jaesolshin', 'port': 5432}
conn = psycopg2.connect(**conn_config)
try:
    cursor = conn.cursor()

    # (Re)create the table. The vector column width must match the embedding
    # dimension computed earlier; int() guarantees that only a number is
    # ever interpolated into the DDL statement.
    cursor.execute("DROP TABLE IF EXISTS imdb;")
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS imdb "
        f"(id serial primary key, keywords text, embedding vector({int(emb_dim)}));"
    )
    conn.commit()
finally:
    # Always release the connection, even if a statement above failed.
    conn.close()

# This cell only creates the (empty) table; no data is loaded here.
print("Table imdb successfully created in PostgreSQL.")

Data successfully loaded into PostgreSQL using COPY.


In [27]:
# Run psql as a subprocess and execute a \COPY command
import subprocess

# Provide the password to psql through the environment.
# NOTE(review): the password is hard-coded in the notebook; consider moving
# it to ~/.pgpass or an externally supplied environment variable.
os.environ["PGPASSWORD"] = "postgres1016"

# Location of the file to upload
csv_path = os.path.join(os.getcwd(), "imdb_embedding.csv")

# Load the data with \COPY.
# The command is passed as an argument list (no shell), so spaces or shell
# metacharacters in the path or connection settings cannot break or alter
# the command line.
copy_sql = (
    f"\\COPY imdb (keywords, embedding) FROM '{csv_path}' "
    "WITH (FORMAT CSV, HEADER TRUE);"
)
copy_command = [
    "psql",
    "-h", str(conn_config['host']),
    "-p", str(conn_config['port']),
    "-d", str(conn_config['dbname']),
    "-U", str(conn_config['user']),
    "-c", copy_sql,
]
# check=True raises CalledProcessError if psql exits with a non-zero status.
subprocess.run(copy_command, check=True)

print("Data successfully loaded into PostgreSQL using \\COPY.")

COPY 1000
Data successfully loaded into PostgreSQL using \COPY.


In [28]:
# Connect to PostgreSQL
conn = psycopg2.connect(host='localhost', dbname='my_vec_db', user='jaesolshin', port=5432)
try:
    cursor = conn.cursor()

    # Query string to search for
    test_words = "cyberpunk|dystopia|paraplegic|marine|warrior"
    test_vec = embeddings.embed_query(test_words)
    emb_dim = len(test_vec)
    k = 5  # number of nearest vectors to return

    # Similarity search.
    # <-> is the L2 distance operator, so smaller values mean closer vectors
    # (the column is aliased "similarity" but holds a distance).
    # The query vector and the limit are bound as parameters instead of being
    # formatted into the SQL text; the vector is sent in its text form
    # ("[x, y, ...]") and cast to the vector type by the server.
    # NOTE(review): this reads from "imdb2", while the cells visible here
    # create and load "imdb" — confirm the intended table.
    s = time()
    cursor.execute(
        """
        SELECT id,
            keywords,
            embedding <-> %s::vector AS similarity
        FROM imdb2
        ORDER BY similarity
        LIMIT %s
        """,
        (str(test_vec), k),
    )

    time_spent = time() - s
    print("time:", time_spent, "\n")

    result = cursor.fetchall()
    show(result)

    # Commit to end the transaction opened by the SELECT
    conn.commit()
finally:
    # Always release the connection, even if the query failed.
    conn.close()

time: 0.0241241455078125 

(11, 'avatar|future|marine|native|paraplegic', 0.7837149072848963)
(1, 'avatar|future|marine|native|paraplegic', 0.7837149072848963)
(313, 'alien|cyborg|pirate|planet|treasure', 0.9300232609536802)
(115, 'alternate timeline|cyborg|future|robot|time machine', 0.9371269119485395)
(673, 'betrayal|futuristic|gladiator|robot|scientist', 0.9467197693842733)
