In [1]:
import json
import os
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from supabase import create_client, Client

# Initialize required components: download the NLTK resources used by the
# tokenizer, stop-word filter and lemmatizer below.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Shared text-normalization helpers used by the vectorization functions.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
punctuations = set(string.punctuation)

# Initialize the Supabase client.
# The SUPABASE_URL / SUPABASE_KEY environment variables take precedence so
# credentials can be supplied without editing the notebook; when unset, the
# values that were previously hardcoded here are used as a fallback.
# NOTE(review): the fallback key is still committed in source — consider
# rotating it and removing the literal once the environment is configured.
url = os.environ.get('SUPABASE_URL', 'https://nhcmippskpgkykwsumqp.supabase.co')
key = os.environ.get(
    'SUPABASE_KEY',
    'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6Im5oY21pcHBza3Bna3lrd3N1bXFwIiwicm9sZSI6ImFub24iLCJpYXQiOjE3MjE2MjYyNzEsImV4cCI6MjAzNzIwMjI3MX0.quApu8EwzqcTgcxdWezDvpZIHSX9LKVQ_NytpLBeAiY',
)
supabase: Client = create_client(url, key)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/downtown/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/downtown/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/downtown/nltk_data...


In [3]:

# GloVe model loader
def load_glove_model(glove_file_path):
    """Load GloVe embeddings from a whitespace-separated text file.

    Every non-blank line is split on whitespace; the first field is taken as
    the token and the remaining fields are parsed as floats.

    Args:
        glove_file_path: Path to the embeddings file (opened as UTF-8 text).

    Returns:
        dict mapping each token to a 1-D numpy array of its float components.
        If a token appears more than once, the last occurrence wins.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a component cannot be parsed as a float.
    """
    glove_model = {}
    with open(glove_file_path, 'r', encoding='utf8') as f:
        for line in f:
            fields = line.split()
            # Blank / whitespace-only lines (e.g. a trailing empty line) have
            # no token; indexing fields[0] on them would raise IndexError.
            if not fields:
                continue
            glove_model[fields[0]] = np.array([float(val) for val in fields[1:]])
    return glove_model

# Sentence vectorization (out-of-vocabulary words are ignored)
def get_sentence_vector(sentence, glove_model, embedding_dim=100):
    """Return the mean GloVe vector of the content words in ``sentence``.

    The sentence is lower-cased and tokenized; stop words and punctuation
    tokens are removed, and the surviving tokens are lemmatized. Tokens that
    are missing from ``glove_model`` are skipped.

    Args:
        sentence: Text to embed.
        glove_model: Mapping from token to embedding vector.
        embedding_dim: Length of the zero vector returned when no token of
            the sentence has an embedding.

    Returns:
        numpy array: the element-wise mean of the matched embeddings, or
        ``np.zeros(embedding_dim)`` when nothing matched.
    """
    tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(sentence.lower())
        if not (token in stop_words or token in punctuations)
    ]
    # Keep only tokens the model knows; unknown tokens are silently dropped.
    matched_vectors = [glove_model[token] for token in tokens if token in glove_model]
    if len(matched_vectors) == 0:
        return np.zeros(embedding_dim)
    return np.mean(matched_vectors, axis=0)

# Fetch the search-field data from the database
def fetch_search_data():
    """Fetch ``steamsearcher_duplicate`` rows and decode their embeddings.

    Selects every column of the table through the module-level ``supabase``
    client, drops rows whose ``embedding`` or ``name_embedding`` is null, and
    parses the remaining values of both columns from JSON text into numpy
    arrays. A falsy value (e.g. an empty string) becomes ``np.zeros(100)``.

    Returns:
        pandas.DataFrame: the filtered rows, with ``embedding`` and
        ``name_embedding`` holding numpy arrays.
    """
    def _decode_embedding(raw):
        # JSON text -> numpy array; falsy values fall back to a zero vector.
        return np.array(json.loads(raw)) if raw else np.zeros(100)

    response = supabase.table("steamsearcher_duplicate").select("*").execute()
    df = pd.DataFrame(response.data)

    # Keep only rows that have both embeddings. The explicit .copy() makes the
    # result an independent frame, so the column assignments below do not
    # raise pandas' SettingWithCopyWarning on a filtered slice.
    has_both_embeddings = df['embedding'].notna() & df['name_embedding'].notna()
    df = df[has_both_embeddings].copy()

    # Convert the JSON strings into numpy arrays.
    for column in ('embedding', 'name_embedding'):
        df[column] = df[column].apply(_decode_embedding)

    return df

# Similarity search over the stored row embeddings
def find_most_similar_rows(query, glove_model, top_n=10):
    """Return the ``top_n`` rows whose ``embedding`` is closest to ``query``.

    The query is embedded with ``get_sentence_vector`` and compared, by cosine
    similarity, against the ``embedding`` column of the frame returned by
    ``fetch_search_data``.

    Args:
        query: Search text.
        glove_model: Mapping from token to embedding vector.
        top_n: Maximum number of rows to return.

    Returns:
        pandas.DataFrame: the fetched rows with an added ``similarity``
        column, limited to the ``top_n`` highest similarities (an empty frame
        with a ``similarity`` column if no rows were fetched).
    """
    df = fetch_search_data()
    query_vector = get_sentence_vector(query, glove_model)

    # np.vstack cannot stack an empty list, so handle "no rows" explicitly.
    if df.empty:
        df['similarity'] = pd.Series(dtype=float)
        return df

    # Stack all row embeddings into one (n_rows, dim) matrix and score them in
    # a single cosine_similarity call instead of one call per row.
    embedding_matrix = np.vstack(df['embedding'].tolist())
    df['similarity'] = cosine_similarity([query_vector], embedding_matrix)[0]

    return df.nlargest(top_n, 'similarity')

# Keyword similarity ranking
def find_most_similar_keywords(query, row, glove_model, top_n=10):
    """Rank the words of a row's text fields by similarity to ``query``.

    The truthy values of the ``name``, ``genre``, ``dp``, ``summary`` and
    ``keyphrase`` fields are joined and tokenized; stop words, punctuation
    tokens and tokens missing from ``glove_model`` are discarded. Each
    remaining distinct word is scored by the cosine similarity between its
    embedding and the query's sentence vector.

    Args:
        query: Search text.
        row: Mapping-like record providing the five text fields above.
        glove_model: Mapping from token to embedding vector.
        top_n: Maximum number of keywords to return.

    Returns:
        list of ``(word, similarity)`` tuples sorted by descending similarity,
        at most ``top_n`` long; an empty list if no word has an embedding.
    """
    text_fields = ['name', 'genre', 'dp', 'summary', 'keyphrase']
    keywords = ' '.join([str(row[field]) for field in text_fields if row[field]])
    # NOTE(review): tokens are matched case-sensitively here, whereas
    # get_sentence_vector lower-cases its input — confirm this is intended.
    keyword_list = [
        word for word in word_tokenize(keywords)
        if word not in stop_words and word not in punctuations
    ]
    keyword_vectors = {word: glove_model[word] for word in keyword_list if word in glove_model}

    if not keyword_vectors:
        return []  # no keyword has an embedding, so there is nothing to rank

    query_vector = get_sentence_vector(query, glove_model)

    # Score every keyword once, in a single batched call, rather than calling
    # cosine_similarity twice per word (once to sort, once to report).
    words = list(keyword_vectors)
    keyword_matrix = np.vstack([keyword_vectors[word] for word in words])
    scores = cosine_similarity([query_vector], keyword_matrix)[0]

    # sorted() is stable, so equally-scored words keep first-seen order.
    ranked = sorted(zip(words, scores), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

# Entry point: interactive similarity search
def main():
    """Prompt for a query and print the closest games with their keywords.

    Loads the GloVe vectors from ``glove.6B.100d.txt``, reads a query from
    standard input, and for every row returned by ``find_most_similar_rows``
    prints the game name with its similarity followed by the keywords
    returned by ``find_most_similar_keywords``.
    """
    glove_model = load_glove_model('glove.6B.100d.txt')
    query = input("Enter your search query: ")
    matches = find_most_similar_rows(query, glove_model)

    for _, row in matches.iterrows():
        print(f"Most similar game: {row['name']} (Similarity: {row['similarity']:.4f})")
        keywords_with_scores = [
            f"{keyword} ({score:.4f})"
            for keyword, score in find_most_similar_keywords(query, row, glove_model)
        ]
        print(f"Most similar keywords: {keywords_with_scores}")


if __name__ == "__main__":
    main()


Most similar game: Birthday Boy and Benny (Similarity: 0.7332)
Most similar keywords: ['friend (1.0000)', 'mother (0.7818)', 'mom (0.6288)', 'best (0.5370)', 'happy (0.5341)', 'birthday (0.4955)', 'therapist (0.4653)', 'year (0.3862)', 'party (0.3777)', 'sad (0.3364)']
Most similar game: William's Love Prelude (Similarity: 0.7008)
Most similar keywords: ['friend (1.0000)', 'man (0.7171)', 'boy (0.6790)', 'love (0.6468)', 'william (0.5418)', 'childhood (0.5393)', "'s (0.5328)", 'turn (0.4650)', 'strong (0.3895)', 'style (0.3430)']
Most similar game: A Date for the Ages (Similarity: 0.6922)
Most similar keywords: ['man (0.7171)', 'lady (0.5946)', 'alice (0.5712)', 'paul (0.5614)', 'divorced (0.4942)', 'things (0.4764)', 'pair (0.4564)', 'chair (0.3995)', 'herman (0.3365)', 'date (0.3336)']
Most similar game: A World of Wishes (Similarity: 0.6920)
Most similar keywords: ['father (0.8364)', 'mother (0.7818)', 'life (0.5967)', 'nicole (0.4809)', 'choice (0.4794)', 'murder (0.4757)', 'image 