In [12]:
import os
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import pke
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from supabase import create_client, Client

# Initialise required components.
# NLTK resources used by the stop-word list, tokenizer and lemmatizer below.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Initialise the Supabase client.
# The API key is read from the environment instead of being committed with the
# notebook. SUPABASE_URL may override the project URL; SUPABASE_KEY is required.
# NOTE(review): the key that used to be hard-coded here has been published with
# this notebook and should be treated as exposed (rotate it).
url = os.environ.get('SUPABASE_URL', 'https://nhcmippskpgkykwsumqp.supabase.co')
key = os.environ.get('SUPABASE_KEY')
if not key:
    raise RuntimeError(
        "SUPABASE_KEY is not set; export it before running this notebook."
    )
supabase: Client = create_client(url, key)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
# Fetch data from the database
def fetch_data(batch_size=1000):
    """Fetch one batch of rows that still need keyphrases.

    Queries the 'steamsearcher_duplicate' table for rows whose 'dp' column is
    NULL and whose 'detailed_description' is not NULL.

    Args:
        batch_size: Maximum number of rows requested from the API.

    Returns:
        pandas.DataFrame built from the response data, with the selected
        'appid' and 'detailed_description' fields; empty when nothing matches.
    """
    response = (
        supabase.table('steamsearcher_duplicate')
        .select('appid', 'detailed_description')
        .is_('dp', None)
        # `.neq(column, None)` is sent as a comparison against the text "None",
        # not as a NULL test; `not_.is_(column, 'null')` is the NOT NULL filter.
        .not_.is_('detailed_description', 'null')
        .limit(batch_size)
        .execute()
    )
    return pd.DataFrame(response.data)

# Strip HTML tags, links and literal src="..." fragments from a description
def clean_html(raw_html):
    """Convert an HTML description to plain text.

    Args:
        raw_html: HTML string, or None.

    Returns:
        The extracted text with URLs and any literal ``src="..."`` fragments
        removed, or None when the input is None or nothing but whitespace
        remains.
    """
    if raw_html is None:
        return None
    soup = BeautifulSoup(raw_html, "html.parser")
    # Join text nodes with a space: without a separator, words from adjacent
    # tags fuse together ("<h2>Title</h2>Text" -> "TitleText"), which corrupts
    # the tokens seen by the later preprocessing steps.
    text = soup.get_text(separator=' ')
    text = re.sub(r'http\S+', '', text)  # remove links
    text = re.sub(r'\s*src="[^"]*"', '', text)  # remove src attributes left in the text
    return text if text.strip() else None

# Text preprocessing
def preprocess_text(text):
    """Tokenize `text`, drop stop words and all-digit tokens, and lemmatize.

    Stop-word matching is case-insensitive (tokens are lower-cased for the
    lookup only); surviving tokens are lemmatized in their original case and
    re-joined with single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if token.lower() in stop_words or token.isdigit():
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Keyphrase extraction
def extract_keyphrases(text, n=5):
    """Extract the top keyphrases from `text` with pke's MultipartiteRank.

    Args:
        text: English text to analyse.
        n: Number of best-ranked keyphrases to keep (default 5, the value
            that was previously hard-coded).

    Returns:
        The selected keyphrases joined into one string with ', '.
    """
    extractor = pke.unsupervised.MultipartiteRank()
    extractor.load_document(input=text, language='en')
    # Only nouns, proper nouns and adjectives may form candidates.
    extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
    extractor.candidate_weighting()
    keyphrases = extractor.get_n_best(n=n)
    # The first element of each result is the phrase text.
    return ', '.join(keyphrase[0] for keyphrase in keyphrases)

# Process a single row and write the result back
def process_and_update_row(row):
    """Compute keyphrases for one row and store them in its 'dp' column.

    `row` must expose 'appid' and 'detailed_description'. When the cleaned
    description is empty, None is written to 'dp'; otherwise the keyphrase
    string produced from the preprocessed text is written.
    """
    description_text = clean_html(row['detailed_description'])
    dp = (
        None
        if description_text is None
        else extract_keyphrases(preprocess_text(description_text))
    )
    update_query = supabase.table('steamsearcher_duplicate').update({'dp': dp})
    update_query.eq('appid', row['appid']).execute()

# Main entry point
def main():
    """Process rows without keyphrases batch by batch until none are left.

    Each iteration fetches a batch with fetch_data() and runs
    process_and_update_row() on every row not yet attempted in this run.
    Per-row errors are reported and do not abort the batch.

    Rows whose 'dp' is still NULL after an attempt (no usable text, or an
    error while processing) are returned by fetch_data() again. They are
    skipped, and the loop stops once a fetched batch contains nothing new;
    previously such rows made the loop run forever.
    Limitation: if a whole batch consists of such rows, any untouched rows
    beyond the batch limit are not reached in this run.
    """
    attempted = set()  # appids already handed to process_and_update_row
    while True:
        df = fetch_data()
        if df.empty:
            print("No more data to process")
            break

        pending = df[~df['appid'].isin(attempted)]
        if pending.empty:
            print(
                f"Stopping: all {len(df)} fetched rows were already attempted "
                "and still have no 'dp' value"
            )
            break
        attempted.update(pending['appid'].tolist())

        for _, row in tqdm(pending.iterrows(), total=len(pending), desc="Processing rows"):
            try:
                process_and_update_row(row)
            except Exception as e:
                # Best effort: report the failure and continue with the next row.
                print(f"Error processing appid {row['appid']}: {e}")
        print("Batch processed. Fetching next batch...")

if __name__ == "__main__":
    main()


Processing rows:   8%|▊         | 82/1000 [01:01<11:13,  1.36it/s]