In [10]:
import json
import os

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sqlalchemy import create_engine

# Download the NLTK resources used below for tokenization and stop-word filtering.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)

# Connect to the Supabase (PostgreSQL) database.
# SECURITY: the connection string contains a password. Supply it through the
# DATABASE_URL environment variable instead of editing this file. The literal
# below is kept only as a fallback so existing runs keep working; this
# credential should be rotated and then removed from source control.
DATABASE_URL = os.environ.get(
    "DATABASE_URL",
    "postgresql://postgres.nhcmippskpgkykwsumqp:123$tiger_BJs@aws-0-ap-northeast-2.pooler.supabase.com:6543/postgres",
)
engine = create_engine(DATABASE_URL)

# Load the GloVe vectors into a word -> float32 vector lookup table.
glove_file = "glove.6B.100d.txt"
embeddings_index = {}
with open(glove_file, encoding="utf-8") as glove_handle:
    for raw_line in glove_handle:
        # Each line is "<word> <c1> <c2> ..." separated by whitespace.
        fields = raw_line.split()
        token = fields[0]
        embeddings_index[token] = np.asarray(fields[1:], dtype='float32')

# Word-vector lookup helper.
def get_vector(word):
    """Return the stored vector for *word*, or a 100-element zero vector if absent."""
    try:
        return embeddings_index[word]
    except KeyError:
        # Unknown words contribute a zero vector.
        return np.zeros(100)

# Sentence-vector helper.
def sentence_vector(sentence):
    """Return the mean word vector of the content words in *sentence*.

    The sentence is tokenized with ``nltk.word_tokenize``; tokens that are not
    purely alphanumeric are dropped, the rest are lower-cased, and English
    stop words are removed. Each remaining word is looked up with
    ``get_vector`` and the vectors are averaged element-wise.

    Args:
        sentence: Text to embed.

    Returns:
        The element-wise mean of the word vectors, or a 100-element zero
        vector when no words remain after filtering.
    """
    # Fetch the stop-word list once per call (not once per token) and use a
    # set so each membership test is O(1).
    stop_words = set(stopwords.words('english'))
    words = [
        token.lower()
        for token in nltk.word_tokenize(sentence)
        if token.isalnum()
    ]
    words = [word for word in words if word not in stop_words]
    if not words:
        return np.zeros(100)
    vectors = [get_vector(word) for word in words]
    return np.mean(vectors, axis=0)

# Load the stored vectors and their metadata from the database.
vector_query = (
    "SELECT appid, embedding, name, recommendation_count, "
    "description_phrases, name_vector, genre_vector "
    "FROM steamsearcher_duplicate"
)
vector_df = pd.read_sql(vector_query, con=engine)

# Decode embeddings that arrive as JSON text; anything else passes through.
def parse_embedding(embedding):
    """Return *embedding* decoded from JSON if it is a string, else unchanged."""
    return json.loads(embedding) if isinstance(embedding, str) else embedding

def _json_to_array(raw):
    """Decode a JSON-encoded value and wrap it in a NumPy array."""
    return np.array(json.loads(raw))


# The description embedding may already be decoded; the name and genre
# vectors are always decoded from JSON here.
vector_df['embedding'] = vector_df['embedding'].apply(parse_embedding)
for _column in ('name_vector', 'genre_vector'):
    vector_df[_column] = vector_df[_column].apply(_json_to_array)

# Ensure every embedding has the expected size and replace None values.
embedding_size = 100  # matches the GloVe vector size


def check_and_fix_embedding(embedding):
    """Return *embedding* if it has ``embedding_size`` items, else a zero list.

    ``None`` and wrongly sized embeddings are replaced by a list of
    ``embedding_size`` zeros (floats).
    """
    if embedding is not None and len(embedding) == embedding_size:
        return embedding
    return [0.0] * embedding_size

# Replace missing or wrongly sized embeddings with zero vectors.
vector_df = vector_df.assign(
    embedding=vector_df['embedding'].apply(check_and_fix_embedding)
)

# Decode description_phrases that arrive as JSON text; anything else passes through.
def parse_description_phrases(description_phrases):
    """Return *description_phrases* decoded from JSON if it is a string, else unchanged."""
    if not isinstance(description_phrases, str):
        return description_phrases
    return json.loads(description_phrases)

vector_df['description_phrases'] = vector_df['description_phrases'].apply(parse_description_phrases)


def _column_as_array(column):
    """Stack the values of one ``vector_df`` column into a NumPy array."""
    return np.array(vector_df[column].tolist())


# Arrays compared against the query vector.
sentence_vectors = _column_as_array('embedding')
name_vectors = _column_as_array('name_vector')
genre_vectors = _column_as_array('genre_vector')

# Per-row metadata, index-aligned with the arrays above.
sentences = vector_df['description_phrases'].to_list()
names = vector_df['name'].to_list()
recommendation_counts = vector_df['recommendation_count'].to_list()

# Rank the stored rows by similarity to a free-text query.
def find_similar_sentences(query, top_n=5, weights=(0.5, 0.25, 0.25)):
    """Return the *top_n* stored rows most similar to *query*.

    The query is embedded with ``sentence_vector`` and compared by cosine
    similarity against the description, name and genre vectors. Each
    similarity is rescaled to [0, 1] and the three are combined as a
    weighted sum.

    Args:
        query: Search text.
        top_n: Maximum number of results to return.
        weights: Weights applied to the (description, name, genre)
            similarities, in that order.

    Returns:
        A list of dicts, best match first, each with the keys ``name``,
        ``recommendation_count``, ``similar_keywords`` (the row's first five
        description phrases with digit-only entries removed) and
        ``similarity_score`` (the combined similarity multiplied by 100).
    """
    # The same query embedding is compared against all three vector sets, so
    # compute it once.
    query_matrix = [sentence_vector(query)]

    # Cosine similarity of the query against every stored row.
    description_similarities = cosine_similarity(query_matrix, sentence_vectors)[0]
    name_similarities = cosine_similarity(query_matrix, name_vectors)[0]
    genre_similarities = cosine_similarity(query_matrix, genre_vectors)[0]

    # Rescale each similarity with (x + 1) / 2.
    description_similarities = (description_similarities + 1) / 2
    name_similarities = (name_similarities + 1) / 2
    genre_similarities = (genre_similarities + 1) / 2

    # Combine the three similarities using the given weights.
    final_similarities = (
        weights[0] * description_similarities +
        weights[1] * name_similarities +
        weights[2] * genre_similarities
    )

    # Highest combined score first.
    sorted_indices = np.argsort(final_similarities)[::-1][:top_n]
    results = []
    for i in sorted_indices:
        # Rows with no description phrases (None) yield no keywords instead
        # of raising.
        phrases = sentences[i] or []
        similar_keywords = [
            phrase for phrase in phrases[:5] if not phrase.isdigit()
        ]
        results.append({
            'name': names[i],
            'recommendation_count': recommendation_counts[i],
            'similar_keywords': similar_keywords,
            'similarity_score': final_similarities[i] * 100,  # as a percentage
        })
    return results

# Read a search query from the user and print the ranked results.
query = input("검색어를 입력하세요: ")
similar_sentences = find_similar_sentences(query)
print(f"검색어: {query}")
for result in similar_sentences:
    print(
        f"Name: {result['name']}, "
        f"Recommendation Count: {result['recommendation_count']}, "
        f"Similar Keywords: {result['similar_keywords']}, "
        f"Similarity Score: {result['similarity_score']:.2f}%"
    )


[nltk_data] Downloading package punkt to /home/downtown/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/downtown/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


검색어: gun
Name: Nuclear Option, Recommendation Count: 1745.0, Similar Keywords: ['accurately', 'advanced', 'aerodynamic', 'affects'], Similarity Score: 0.5626903787551667
Name: Wanderlost, Recommendation Count: 0.0, Similar Keywords: ['advanced', 'amass', 'america', 'apocalyptic', 'bad'], Similarity Score: 0.5483831629435686
Name: Madden NFL 23, Recommendation Count: 4723.0, Similar Keywords: ['accurately', 'activities', 'add', 'addition'], Similarity Score: 0.5402115118454782
Name: 不二臣The Only  Master, Recommendation Count: 0.0, Similar Keywords: ['bl游戏在国内真的好难', 'ps', 'resdiy素材自助生成平台', '一个he', '上架时间一再推迟'], Similarity Score: 0.5272921459301934
Name: Legends of Astravia, Recommendation Count: 0.0, Similar Keywords: ['abilities', 'age', 'allows', 'arc'], Similarity Score: 0.5237884038914411
