In [None]:
from platform import uname
from urllib.parse import quote_plus

import pandas as pd
import requests

# Load the preprocessed song table; later cells read its 'singer', 'title'
# and 'lyric' columns.
data = 'data/preprocessed_data.csv'
df = pd.read_csv(filepath_or_buffer=data, encoding='utf-8')

# Preview the first five rows.
print(df.head(n=5))

In [None]:
# One (singer, title) tuple per row of df, in row order.
singer_title_list = [
    (record['singer'], record['title'])
    for _, record in df.iterrows()
]

print(singer_title_list[0:5])

In [None]:
# Naver search URL template. The three placeholders are singer, title and
# title again; "%EA%B3%A1%EC%A0%95%EB%B3%B4" is the percent-encoded form of
# "곡정보", appended to both the query and the oquery parameter.
# NOTE(review): the tqi/ackey values look like they were copied from a single
# browser session - confirm the search still behaves the same without them.
base_url = 'https://search.naver.com/search.naver?sm=tab_hty.top&where=nexearch&ssc=tab.nx.all&query={}+{}+%EA%B3%A1%EC%A0%95%EB%B3%B4&oquery={}+%EA%B3%A1%EC%A0%95%EB%B3%B4&tqi=juVnQsqVN8wssvMPvflssssstDG-260394&ackey=cb7tpj47'

# Singer and title are percent-encoded before being placed in the query
# string. Without this, characters such as '&', '#', '+', '%' or '=' inside a
# name would be read as URL syntax and truncate or corrupt the search query.
# str() keeps non-string cells (e.g. NaN from an empty CSV field) formatting
# the same way str.format() rendered them before.
url_list = [
    base_url.format(
        quote_plus(str(singer)),
        quote_plus(str(title)),
        quote_plus(str(title)),
    )
    for singer, title in singer_title_list
]

print(url_list[:5])

In [None]:
# Sanity check: number of search URLs built (one per entry of singer_title_list).
print(len(url_list))

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, WebDriverException
import time

def has_valid_song_info_card(url):
    """Report whether the page at ``url`` shows an info box mentioning "장르".

    A new headless Chrome session is started for every call. The page is
    loaded with a 15 second page-load timeout, then every element with the
    CSS class ``cm_info_box`` is inspected.

    Args:
        url: Address of the search result page to open.

    Returns:
        True if the text of at least one ``cm_info_box`` element contains
        "장르". False if no such element exists, or if anything went wrong
        while starting the browser, loading the page or reading the elements.
        Failures are printed so they can be told apart from a page that
        simply has no card.
    """
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    driver = None

    print(f"[시작] {url}")
    try:
        driver = webdriver.Chrome(options=options)
        driver.set_page_load_timeout(15)
        driver.get(url)

        # Fixed pause after navigation before the DOM is queried.
        time.sleep(2)

        info_boxes = driver.find_elements(By.CLASS_NAME, 'cm_info_box')
        return any("장르" in box.text for box in info_boxes)

    except Exception as err:
        # Timeouts and other WebDriver errors are Exception subclasses, so a
        # single handler covers them. The error type is logged because a
        # silent False would be indistinguishable from "no card on this page".
        print(f"  - [오류] {type(err).__name__}: {err}")
        return False

    finally:
        # Always release the browser process, even on the early returns above.
        print("  - 드라이버 종료 시도 중...")
        if driver:
            try:
                driver.quit()
            except Exception as qerr:
                print(f"드라이버 종료 실패: {qerr}")

In [None]:
def extract_valid_urls(url_list):
    """Return the URLs for which ``has_valid_song_info_card`` is truthy.

    URLs are checked one at a time in input order, with a progress line
    printed for each. If a check raises, the error is printed and that URL
    is skipped.

    Args:
        url_list: Sequence of URLs to check.

    Returns:
        A new list holding the accepted URLs in their original order.
    """
    total = len(url_list)
    accepted = []

    for position, candidate in enumerate(url_list, start=1):
        print(f"[{position}/{total}] 검사 중: {candidate}")
        try:
            if not has_valid_song_info_card(candidate):
                print("곡정보 없음")
                continue
            print("유효한 곡정보 페이지")
            accepted.append(candidate)
        except Exception as loop_err:
            print(f"[오류] {candidate}\n{loop_err}")

    return accepted

In [None]:
# Keep only the URLs whose page passed has_valid_song_info_card.
# Each check starts its own headless Chrome session and sleeps for 2 seconds,
# so this cell takes a long time on a large url_list.
valid_url_list = extract_valid_urls(url_list)

In [None]:
# Show how many URLs passed validation, then the first five of them.
print(len(valid_url_list), valid_url_list[:5], sep='\n')

In [None]:
# Look up the (singer, title) pair behind every validated URL.
#
# The result is built by walking valid_url_list itself, so it always has the
# same length and order as valid_url_list (and therefore as anything derived
# from it). Filtering url_list with "url in valid_url_list" could instead
# yield extra entries when the same URL occurs more than once in url_list but
# was accepted only some of those times, and it rescanned the whole list for
# every row.
# Every URL is formatted from one (singer, title) pair, so repeated URLs are
# expected to come from rows with the same singer and title.
url_to_singer_title = dict(zip(url_list, singer_title_list))
valid_singer_title_list = [url_to_singer_title[url] for url in valid_url_list]

print(len(valid_singer_title_list))
print(valid_singer_title_list[:5])

In [None]:
# Index the lyrics by (singer, title). When several rows share a key, the
# last row's lyric is the one that is kept.
lyric_dict = dict(
    ((record['singer'], record['title']), record['lyric'])
    for _, record in df.iterrows()
)

# Lyrics in the same order as valid_singer_title_list; a pair that is missing
# from lyric_dict yields None.
valid_lyric_list = list(map(lyric_dict.get, valid_singer_title_list))

print(len(valid_lyric_list))
print(valid_lyric_list[0:5])

In [None]:
def _genre_from_card_text(text):
    """Return the stripped line following the first line that contains "장르".

    Returns None when no line contains "장르", or when the only such line is
    the last one and therefore has nothing after it.
    """
    lines = text.split('\n')
    # The final line has no successor, so it is never treated as the label.
    for i, line in enumerate(lines[:-1]):
        if "장르" in line:
            return lines[i + 1].strip()
    return None


def extract_genre_from_url(url):
    """Open ``url`` in headless Chrome and read the genre from its info boxes.

    Every element with the CSS class ``cm_info_box`` is scanned in document
    order. The first box whose text has a line containing "장르" followed by
    another line supplies the result.

    Uses the Selenium names (``webdriver``, ``By``, ``Options``) and ``time``
    imported at notebook level by the earlier import cell.

    Args:
        url: Address of the search result page to open.

    Returns:
        The stripped text of the line after the "장르" line, or None when no
        box provides one or when loading/reading the page fails (the failure
        is printed).
    """
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    driver = None

    try:
        driver = webdriver.Chrome(options=options)
        driver.set_page_load_timeout(15)

        driver.get(url)
        # Fixed pause after navigation before the DOM is queried.
        time.sleep(2)

        for box in driver.find_elements(By.CLASS_NAME, 'cm_info_box'):
            genre = _genre_from_card_text(box.text)
            if genre is not None:
                return genre
        return None

    except Exception as e:
        print(f"[장르 추출 실패] {url}\n{e}")
        return None

    finally:
        if driver:
            try:
                driver.quit()
            except Exception as quit_err:
                # Still best-effort, but no longer a bare except: a failed
                # quit is reported, and KeyboardInterrupt/SystemExit are not
                # swallowed during a long crawl.
                print(f"드라이버 종료 실패: {quit_err}")

In [None]:
# Fetch the genre for each validated URL, in order, printing a progress
# counter before each request. Entries are None where extraction returned None.
total_valid = len(valid_url_list)
genre_list = []

for position, url in enumerate(valid_url_list, start=1):
    print(f"[{position}/{total_valid}]")
    genre_list.append(extract_genre_from_url(url))

In [None]:
# Preview the first five extracted genres (None where extract_genre_from_url
# found no genre or failed).
print(genre_list[:5])

In [None]:
import pandas as pd

# Assemble the output table column by column. The four lists must have the
# same length, otherwise the DataFrame constructor raises.
final_columns = {
    "singer": [pair[0] for pair in valid_singer_title_list],
    "title": [pair[1] for pair in valid_singer_title_list],
    "lyric": valid_lyric_list,
    "genre": genre_list,
}
df_final = pd.DataFrame(final_columns)

# Write the result as UTF-8 with a BOM ("utf-8-sig"), without the index column.
df_final.to_csv(
    "data/crawling_data.csv",
    index=False,
    encoding="utf-8-sig",
)