In [7]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from supabase import create_client, Client
import json

# Initialize required components: download the NLTK resources used below,
# then build the English stop-word set and the WordNet lemmatizer.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Initialize the Supabase client.
# Credentials are taken from the SUPABASE_URL / SUPABASE_KEY environment
# variables when they are set, so they do not have to live in the notebook.
# The literals are kept only as a fallback so existing runs keep working.
# NOTE(review): the fallback key is committed in plain text; consider rotating
# it and removing the fallback once the environment variables are configured.
url = os.environ.get('SUPABASE_URL', 'https://nhcmippskpgkykwsumqp.supabase.co')
key = os.environ.get(
    'SUPABASE_KEY',
    'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6Im5oY21pcHBza3Bna3lrd3N1bXFwIiwicm9sZSI6ImFub24iLCJpYXQiOjE3MjE2MjYyNzEsImV4cCI6MjAzNzIwMjI3MX0.quApu8EwzqcTgcxdWezDvpZIHSX9LKVQ_NytpLBeAiY',
)
supabase: Client = create_client(url, key)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# GloVe model loader
def load_glove_model(glove_file_path):
    """Load GloVe word vectors from a text file into a dictionary.

    Each non-empty line is split on whitespace: the first field is the word
    and the remaining fields are parsed as floats to form its vector.

    Args:
        glove_file_path: Path to the UTF-8 encoded GloVe text file.

    Returns:
        dict mapping each word (str) to its embedding (1-D numpy array).
        If a word occurs more than once, the last occurrence wins.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    glove_model = {}
    with open(glove_file_path, 'r', encoding='utf8') as f:
        for line in f:
            split_line = line.split()
            # Blank lines (e.g. a trailing newline at the end of the file)
            # have no fields; indexing them would raise IndexError.
            if not split_line:
                continue
            word = split_line[0]
            embedding = np.array([float(val) for val in split_line[1:]])
            glove_model[word] = embedding
    return glove_model

# Text vectorization helper
def get_sentence_vector(sentence, glove_model, embedding_dim=100):
    """Return the mean GloVe vector of the words in ``sentence``.

    The sentence is lower-cased and tokenized with ``word_tokenize``. Tokens
    found in the module-level ``stop_words`` set are dropped and the remaining
    tokens are lemmatized with the module-level ``lemmatizer``. Lemmas that
    have no entry in ``glove_model`` are skipped.

    Args:
        sentence: Text to vectorize (must support ``.lower()``).
        glove_model: Mapping from word to embedding vector.
        embedding_dim: Length of the zero vector returned when no lemma is
            found in ``glove_model``.

    Returns:
        The element-wise mean of the vectors that were found, or
        ``np.zeros(embedding_dim)`` when none were found.
    """
    lemmas = (
        lemmatizer.lemmatize(token)
        for token in word_tokenize(sentence.lower())
        if token not in stop_words
    )

    found_vectors = []
    for lemma in lemmas:
        try:
            vector = glove_model[lemma]
        except KeyError:
            # Out-of-vocabulary lemma: it contributes nothing to the average.
            continue
        found_vectors.append(vector)

    if found_vectors:
        return np.mean(found_vectors, axis=0)
    return np.zeros(embedding_dim)

# Helper that derives the value of the emst column
def set_emst(dp, summary):
    """Return a code describing which of ``dp`` and ``summary`` are present.

    Presence is decided by truthiness of each argument.

    Args:
        dp: Value of the ``dp`` field.
        summary: Value of the ``summary`` field.

    Returns:
        'dpsm' when both are truthy, 'dp' when only ``dp`` is, 'sm' when only
        ``summary`` is, and None when neither is.
    """
    codes_by_presence = {
        (True, True): 'dpsm',
        (True, False): 'dp',
        (False, True): 'sm',
    }
    # (False, False) is deliberately absent, so .get() yields None for it.
    return codes_by_presence.get((bool(dp), bool(summary)))

# Fetch the required rows from the database (several rows at a time, and only
# the rows whose emst value needs updating)
def fetch_data(batch_size=1000):
    """Return up to ``batch_size`` rows whose stored ``emst`` is out of date.

    Selects appid, name, genre, dp, summary, keyphrase and emst from the
    'steamsearcher_duplicate' table, computes the expected emst value for
    each row with ``set_emst`` and keeps the rows where the stored value
    differs from the expected one.

    Args:
        batch_size: Maximum number of rows to return.

    Returns:
        pandas.DataFrame with the selected columns plus 'new_emst'. An empty
        DataFrame is returned when the query yields no rows or when nothing
        needs updating.
    """
    # NOTE(review): the query sets no explicit range/limit. If the API caps
    # the number of rows per response, rows beyond that cap are never
    # examined here - confirm against the Supabase project settings.
    response = supabase.table('steamsearcher_duplicate').select(
        'appid', 'name', 'genre', 'dp', 'summary', 'keyphrase', 'emst'
    ).execute()
    data = response.data
    if not data:
        # An empty result produces a DataFrame without columns, so the column
        # lookups below would fail; an empty frame lets the caller stop.
        return pd.DataFrame()
    df = pd.DataFrame(data)

    # Keep only the rows whose emst column needs updating.
    df['new_emst'] = df.apply(lambda row: set_emst(row['dp'], row['summary']), axis=1)
    # pandas' element-wise != reports missing values as different even when
    # both sides are missing, which would select rows with neither dp nor
    # summary on every call. Treat "both missing" as equal explicitly.
    both_missing = df['emst'].isna() & df['new_emst'].isna()
    unchanged = (df['emst'] == df['new_emst']) | both_missing
    df_to_update = df[~unchanged]

    return df_to_update.head(batch_size)

# Store embedding vectors in the database (several rows at a time)
def save_embeddings_to_db(glove_model):
    """Compute and store embeddings for every row that ``fetch_data`` returns.

    Batches are fetched repeatedly until ``fetch_data`` returns an empty
    frame. For each row two vectors are computed with ``get_sentence_vector``:
    one for the concatenation of the non-empty name, genre, dp, summary and
    keyphrase fields, and one for the name alone. Both are written as JSON
    strings together with the row's emst value, matching the row by appid.

    The loop also stops when a fetched batch contains only rows that were
    already processed during this call. Without that check the loop would
    never end if an update does not change what ``fetch_data`` returns.

    Args:
        glove_model: Mapping from word to embedding vector.

    Returns:
        None.
    """
    processed_appids = set()
    while True:
        df = fetch_data()
        if df.empty:
            print("No more data to process")
            break

        # Drop rows already handled in this run; if they come back, the
        # update did not clear them and retrying would loop forever.
        df = df[~df['appid'].isin(processed_appids)]
        if df.empty:
            print("Remaining rows were already processed in this run. Stopping.")
            break

        for _, row in tqdm(df.iterrows(), total=len(df), desc="Calculating embeddings", leave=False):
            # Embedding of the text built from all non-empty fields.
            text = ' '.join([str(row[field]) for field in ['name', 'genre', 'dp', 'summary', 'keyphrase'] if row[field]])
            embedding_vector = get_sentence_vector(text, glove_model)
            name_embedding_vector = get_sentence_vector(str(row['name']), glove_model)

            # Value for the emst column.
            emst_value = set_emst(row['dp'], row['summary'])

            # Store the embedding vectors and update the emst column.
            supabase.table('steamsearcher_duplicate').update({
                'embedding': json.dumps(embedding_vector.tolist()),  # stored as JSON
                'name_embedding': json.dumps(name_embedding_vector.tolist()),  # stored as JSON
                'emst': emst_value
            }).eq('appid', row['appid']).execute()
            processed_appids.add(row['appid'])

        print(f"Processed {len(df)} rows. Fetching next batch...")

# Path to the GloVe vector file, relative to the working directory; main()
# passes it to load_glove_model.
glove_file_path = 'glove.6B.100d.txt'

# Main function (stores the embedding vectors)
def main():
    """Load the GloVe vectors from ``glove_file_path`` and store embeddings.

    The loaded model is handed to ``save_embeddings_to_db``; a confirmation
    message is printed once that call returns.
    """
    save_embeddings_to_db(load_glove_model(glove_file_path))
    print("Embeddings saved to database.")

# Entry point: run the pipeline when this code is executed directly.
if __name__ == "__main__":
    main()


                                                                           

Processed 1000 rows. Fetching next batch...


Calculating embeddings:  30%|██▉       | 296/1000 [00:21<00:46, 15.22it/s]