In [38]:
!pip install feedparser



In [50]:
import re, time, requests
from bs4 import BeautifulSoup
import pandas as pd

# Request headers passed to the requests.get calls in this file: a desktop
# Firefox User-Agent, a Korean-first Accept-Language and a Naver Referer.
HEADERS = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0",
    "Accept-Language": "ko-KR,ko;q=0.9,en;q=0.8",
    "Referer": "https://www.naver.com/"
}

# Compiled regexes for the Naver news article URL shapes treated as articles
# (n.news desktop/mnews paths, the legacy read.naver query form, mnews host).
ARTICLE_PATTERNS = [
    re.compile(pattern)
    for pattern in (
        r"https?://n\.news\.naver\.com/article/\d+/\d+",
        r"https?://n\.news\.naver\.com/mnews/article/\d+/\d+",
        r"https?://news\.naver\.com/main/read\.naver.*[?&]oid=\d+.*[?&]aid=\d+",
        r"https?://mnews\.naver\.com/article/\d+/\d+",
    )
]

def is_article_url(href: str) -> bool:
    """Return True when *href* contains one of the known article URL shapes.

    Empty or missing values yield False. Matching uses ``search``, so the
    article URL may appear anywhere inside *href*.
    """
    return bool(href) and any(rx.search(href) for rx in ARTICLE_PATTERNS)

def get_news_links_html(query, pages=10, sleep_sec=0.25, issue_keywords=None):
    """
    Collect article links and titles from Naver's integrated news search.

    Anchors are gathered broadly first (known selectors, then every
    ``a[href]`` as a safety net), kept only when the URL matches an article
    pattern, and optionally filtered by title keywords.

    Args:
        query: Search term sent as the ``query`` request parameter.
        pages: Number of result pages to request; the ``start`` offset
            advances by 10 per page (1, 11, 21, ...).
        sleep_sec: Pause between page requests, in seconds.
        issue_keywords: Optional keywords; when given, only titles containing
            at least one of them (case-insensitive) are kept. Leaving this as
            None for a first test run is recommended.

    Returns:
        DataFrame with columns ``title``, ``url``, ``query``, de-duplicated on
        ``url``. The columns are present even when nothing was collected.
    """
    base = "https://search.naver.com/search.naver"
    columns = ["title", "url", "query"]
    # Lower-case the keywords once instead of once per anchor.
    keywords = [k.lower() for k in issue_keywords] if issue_keywords else []
    rows = []
    for start_idx in range(1, pages*10, 10):  # 1, 11, 21, ...
        params = {"where":"news","query":query,"start":start_idx}
        res = requests.get(base, headers=HEADERS, params=params, timeout=10)
        soup = BeautifulSoup(res.text, "html.parser")

        # 1) Try the known title-link selectors first.
        anchors = soup.select("a.title_link") or soup.select("a.news_tit")

        # 2) Safety net: fall back to every a[href]; the article-pattern
        #    check below discards everything that is not an article link.
        if not anchors:
            anchors = soup.find_all("a", href=True)

        kept = 0
        for a in anchors:
            href = a.get("href","")
            if not is_article_url(href):
                continue

            title = a.get_text(strip=True)
            # Optional title keyword filter.
            if keywords:
                lowered = title.lower()
                if not any(k in lowered for k in keywords):
                    continue

            rows.append({"title": title, "url": href, "query": query})
            kept += 1

        print(f"page {start_idx}: kept {kept}")
        if kept == 0 and start_idx == 1:
            # Nothing on the very first page: stop right away (likely a
            # keyword or page-structure problem).
            break

        time.sleep(sleep_sec)

    # Explicit columns keep drop_duplicates(subset=["url"]) from raising
    # KeyError when no rows were collected.
    links = pd.DataFrame(rows, columns=columns)
    return links.drop_duplicates(subset=["url"]).reset_index(drop=True)


In [None]:
def get_naver_comments(oid, aid, max_pages=20, sleep_sec=0.2):
    """
    Return the list of comments for a single article.

    Pages of the comment API are requested one by one until ``max_pages`` is
    reached, a request fails, the response cannot be parsed, or a page has no
    comments. Whatever was collected up to that point is returned.

    Args:
        oid: First id of the article; combined as ``news{oid},{aid}``.
        aid: Second id of the article.
        max_pages: Maximum number of comment pages to request.
        sleep_sec: Pause between page requests, in seconds.

    Returns:
        [{'user': ..., 'contents': ..., 'regTime': ...}, ...]
    """
    # Local import: the file's visible import cell does not bring in json.
    import json

    api = "https://apis.naver.com/commentBox/cbox/web_naver_list_jsonp.json"
    # JSONP -> JSON: capture the object inside "callback(...)". DOTALL lets
    # the payload span lines; a trailing ";" after ")" is tolerated.
    jsonp_re = re.compile(r"\((\{.*\})\)\s*;?\s*$", re.DOTALL)

    out = []
    for page in range(1, max_pages+1):
        params = {
            "ticket": "news",
            "templateId": "default_news",
            "pool": "cbox5",
            "lang": "ko",
            "country": "KR",
            "objectId": f"news{oid},{aid}",
            "pageSize": 20,
            "pageType": 1,
            "page": page,
            "sort": "FAVORITE"
        }
        try:
            res = requests.get(api, headers=HEADERS, params=params, timeout=10)
        except Exception as e:
            # Deliberately best-effort: report and keep what we have so far.
            print("  comments req err:", e)
            break

        m = jsonp_re.search(res.text.strip())
        if not m:
            break
        try:
            data = json.loads(m.group(1))
        except ValueError as e:
            print("  comments parse err:", e)
            break

        # "result" may be missing or null; treat both as "no comments".
        lst = (data.get("result") or {}).get("commentList") or []
        if not lst:
            break
        for c in lst:
            out.append({
                "user": c.get("userNameMasked"),
                "contents": c.get("contents"),
                "regTime": c.get("regTime")
            })
        time.sleep(sleep_sec)
    return out


In [43]:
from pathlib import Path

def collect_comments_for_links(df_links, brand_code, max_articles=20, max_comment_pages=10):
    """
    Fetch comments for the first ``max_articles`` links and save them as CSV.

    Args:
        df_links: DataFrame with columns ['title', 'url', 'query'].
        brand_code: Brand identifier such as 'romand'; stored in the ``brand``
            column and used in the output file name.
        max_articles: Maximum number of rows of ``df_links`` to process.
        max_comment_pages: Forwarded to ``get_naver_comments`` as ``max_pages``.

    Returns:
        DataFrame with columns ``brand``, ``title``, ``url``, ``contents``,
        ``regTime``; the same data is written to
        ``../data/raw/comments_{brand_code}.csv``.
    """
    columns = ["brand", "title", "url", "contents", "regTime"]
    total = min(max_articles, len(df_links))
    rows = []
    # Count positions with enumerate so progress output does not depend on
    # the DataFrame's index labels.
    for pos, (_, r) in enumerate(df_links.head(max_articles).iterrows(), start=1):
        # extract_oid_aid is defined outside this block; as used here it is
        # expected to return an (oid, aid) pair with a falsy oid when the URL
        # cannot be parsed - TODO confirm against its definition.
        oid, aid = extract_oid_aid(r["url"])
        if not oid:
            continue
        print(f"[{pos}/{total}] comments for oid={oid}, aid={aid}")
        cmts = get_naver_comments(oid, aid, max_pages=max_comment_pages)
        for c in cmts:
            rows.append({
                "brand": brand_code,
                "title": r["title"],
                "url": r["url"],
                "contents": c["contents"],
                "regTime": c["regTime"]
            })
        time.sleep(0.2)

    # Explicit columns keep the schema (and CSV header) stable when no
    # comments were collected.
    out = pd.DataFrame(rows, columns=columns)
    out_path = Path("../data/raw") / f"comments_{brand_code}.csv"
    # to_csv does not create missing directories.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(out_path, index=False, encoding="utf-8-sig")
    print("저장:", out_path, "rows:", len(out))
    return out

# Example run for the romand brand: collect comments only when the link
# DataFrame (html_df, built elsewhere) has at least one row.
brand_code = "romand"
if len(html_df) != 0:
    comments_romand = collect_comments_for_links(html_df, brand_code, max_articles=10, max_comment_pages=10)
    comments_romand.head()
else:
    print("링크가 0건 → issue_keywords=None 로 다시 시도해보세요.")


링크가 0건 → issue_keywords=None 로 다시 시도해보세요.
