In [9]:
# 라이브러리 임포트 및 버전 확인
# # 기본 라이브러리
import os, math, time
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Dict, Any, List, Optional
from urllib.parse import urlparse

# 외부 라이브러리
import requests
import pandas as pd

# 버전 출력 (환경 재현성 확인용)
print("requests:", requests.__version__)
print("pandas:", pd.__version__)


requests: 2.32.5
pandas: 3.0.0


In [10]:
# 디렉터리 구조 생성
# 원천(raw) 데이터와 가공(processed) 데이터 저장 경로
RAW_DIR = os.path.join("data", "raw")
PROCESSED_DIR = os.path.join("data", "processed")

# 디렉터리가 없으면 생성
os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(PROCESSED_DIR, exist_ok=True)

print("RAW_DIR:", RAW_DIR)
print("PROCESSED_DIR:", PROCESSED_DIR)


RAW_DIR: data\raw
PROCESSED_DIR: data\processed


In [11]:
# 설정(Config)과 공용 유틸 함수
# GDELT DOC API 엔드포인트
GDELT_DOC = "https://api.gdeltproject.org/api/v2/doc/doc"

@dataclass
class Config:
    """
    GDELT 수집 설정값을 한 곳에서 관리하기 위한 설정 클래스
    """
    query: str = "(bitcoin OR btc)"
    sourcelang: str = "eng"
    maxrecords: int = 250
    sort: str = "HybridRel"
    timeout_sec: int = 30
    retries: int = 3
    backoff_base_sec: float = 1.5
    max_pages_per_day: int = 40  # 하루 최대 수집량 제한(안전장치)

cfg = Config()

def ymdhms(dt: datetime) -> str:
    """GDELT API가 요구하는 YYYYMMDDHHMMSS 포맷"""
    return dt.strftime("%Y%m%d%H%M%S")

def date_range_days(start_day: datetime, end_day: datetime):
    """시작일부터 종료일까지 하루 단위로 순회"""
    cur = start_day
    while cur <= end_day:
        yield cur
        cur += timedelta(days=1)

def domain_from_url(u: str) -> Optional[str]:
    """URL에서 도메인만 추출 (중복/출처 분석용)"""
    try:
        return urlparse(u).netloc.lower()
    except Exception:
        return None


In [12]:
# 기사 단위 정규화 
def normalize_article(a: Dict[str, Any], day_utc: datetime) -> Dict[str, Any]:
    """
    GDELT 원본 기사 JSON을
    분석/병합에 쓰기 쉬운 스키마로 정규화
    """
    url = a.get("url")
    title = (a.get("title") or "").strip()

    # 게시 시각은 필드가 들쭉날쭉하므로 여러 후보를 순차 시도
    published_raw = (
        a.get("seendate")
        or a.get("sourceCollectionDate")
        or a.get("date")
        or a.get("datetime")
    )

    published_at_utc = None
    if isinstance(published_raw, str):
        for fmt in ("%Y%m%d%H%M%S", "%Y%m%dT%H%M%SZ"):
            try:
                dt = datetime.strptime(published_raw, fmt)
                published_at_utc = dt.replace(tzinfo=timezone.utc).isoformat()
                break
            except Exception:
                pass

    # 파싱 실패 시 해당 날짜 00:00 UTC로 대체
    if not published_at_utc:
        published_at_utc = day_utc.replace(
            hour=0, minute=0, second=0, microsecond=0
        ).isoformat()

    return {
        "published_at_utc": published_at_utc,
        "date_day": day_utc.date().isoformat(),  # 이후 병합 키
        "title": title,
        "url": url,
        "domain": domain_from_url(url) if isinstance(url, str) else None,
        "source": a.get("source") or a.get("sourceName"),
        "sourceCountry": a.get("sourceCountry"),
        "language": a.get("language"),
    }


In [13]:
def request_with_retries(params: Dict[str, Any], cfg: Config) -> Dict[str, Any]:
    last_err = None

    for attempt in range(1, cfg.retries + 1):
        try:
            r = requests.get(GDELT_DOC, params=params, timeout=cfg.timeout_sec)

            # 429: 너무 빨라서 막힘 → 오래 쉬었다가 재시도
            if r.status_code == 429:
                wait = max(6, int(cfg.backoff_base_sec ** attempt))  # 최소 6초
                time.sleep(wait)
                raise RuntimeError(f"HTTP 429 (rate limit). waited={wait}s body_head={r.text[:120]!r}")

            # 서버 오류 계열도 재시도
            if r.status_code in (500, 502, 503, 504):
                raise RuntimeError(f"HTTP {r.status_code} body_head={r.text[:120]!r}")

            r.raise_for_status()

            try:
                return r.json()
            except Exception as je:
                raise RuntimeError(f"Non-JSON response. head={r.text[:200]!r}") from je

        except Exception as e:
            last_err = e
            # 일반 실패 backoff
            time.sleep(cfg.backoff_base_sec ** attempt)

    raise RuntimeError(f"Request failed after retries: {last_err}")


In [14]:
def fetch_day_articles(day_utc: datetime, cfg: Config) -> pd.DataFrame:
    start = day_utc.replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=timezone.utc)
    end = day_utc.replace(hour=23, minute=59, second=59, microsecond=0, tzinfo=timezone.utc)

    params = dict(
        query=cfg.query,
        mode="artlist",
        format="json",
        maxrecords=cfg.maxrecords,
        startdatetime=ymdhms(start),
        enddatetime=ymdhms(end),
        sort=cfg.sort,
        sourcelang=cfg.sourcelang,
        startrecord=1,
    )

    data = request_with_retries(params, cfg)
    time.sleep(6)  # ✅ 레이트리밋 준수(5초에 1번)

    articles = data.get("articles", []) or []
    total = int(data.get("totalArticles", len(articles)) or 0)

    rows: List[Dict[str, Any]] = [normalize_article(a, day_utc) for a in articles]

    pages = max(1, math.ceil(total / cfg.maxrecords)) if total else 1
    pages = min(pages, cfg.max_pages_per_day)

    for p in range(2, pages + 1):
        params["startrecord"] = 1 + (p - 1) * cfg.maxrecords
        d2 = request_with_retries(params, cfg)
        time.sleep(6)  # ✅ 페이지 요청도 간격 준수

        for a in (d2.get("articles", []) or []):
            rows.append(normalize_article(a, day_utc))

    df = pd.DataFrame(rows)
    if not df.empty:
        df = df.dropna(subset=["url"]).copy()
        df["title"] = df["title"].fillna("").astype(str)

    return df


In [15]:
from urllib.parse import urlencode

test_day = datetime(2025, 9, 1, tzinfo=timezone.utc)

params = {
    "query": cfg.query,
    "mode": "artlist",
    "format": "json",
    "maxrecords": cfg.maxrecords,
    "startdatetime": ymdhms(test_day.replace(hour=0, minute=0, second=0, tzinfo=timezone.utc)),
    "enddatetime": ymdhms(test_day.replace(hour=23, minute=59, second=59, tzinfo=timezone.utc)),
    "sort": cfg.sort,
    "sourcelang": cfg.sourcelang,
    "startrecord": 1,
}

url = f"{GDELT_DOC}?{urlencode(params)}"
print("URL:", url)

r = requests.get(GDELT_DOC, params=params, timeout=30)
print("status:", r.status_code)
print("content-type:", r.headers.get("content-type"))
print("body head:", r.text[:300])


URL: https://api.gdeltproject.org/api/v2/doc/doc?query=%28bitcoin+OR+btc%29&mode=artlist&format=json&maxrecords=250&startdatetime=20250901000000&enddatetime=20250901235959&sort=HybridRel&sourcelang=eng&startrecord=1
status: 200
content-type: application/json; charset=utf-8
body head: {"articles": [ { "url": "https://www.fxstreet.com/cryptocurrencies/news/bitcoin-price-forecast-btc-recovers-above-109-000-as-fed-rate-cut-bets-offset-cautious-sentiment-202509011005", "url_mobile": "https://www.fxstreet.com/amp/cryptocurrencies/news/bitcoin-price-forecast-btc-recovers-above-109-000-


In [16]:
test_day = datetime(2025, 9, 1, tzinfo=timezone.utc)
df_test = fetch_day_articles(test_day, cfg)

print("기사 수:", len(df_test))
display(df_test.head())
# 1일 샘플 테스트

기사 수: 207


Unnamed: 0,published_at_utc,date_day,title,url,domain,source,sourceCountry,language
0,2025-09-01T11:00:00+00:00,2025-09-01,Can BTC rebound from its recent correction as ...,https://www.fxstreet.com/cryptocurrencies/news...,www.fxstreet.com,,,English
1,2025-09-01T20:15:00+00:00,2025-09-01,Investors Buy $13M : Bitcoin Hyper Continues P...,https://bravenewcoin.com/partner/bitcoin-hyper...,bravenewcoin.com,,,English
2,2025-09-01T05:00:00+00:00,2025-09-01,Bitcoin Price Warning : Is a Major Sell - Off ...,https://www.newsbtc.com/analysis/btc/bitcoin-p...,www.newsbtc.com,,,English
3,2025-09-01T16:00:00+00:00,2025-09-01,"Cryptocurrencies Price Prediction : Ethereum ,...",https://www.fxstreet.com/cryptocurrencies/news...,www.fxstreet.com,,,English
4,2025-09-01T10:30:00+00:00,2025-09-01,"Metaplanet ( 3350 ) Hits 20K BTC , Overtakes R...",https://www.coindesk.com/markets/2025/09/01/me...,www.coindesk.com,,,English


In [None]:
# 전체 기간 수집
start_day = datetime(2025, 9, 1, tzinfo=timezone.utc)
end_day = datetime(2025, 10, 31, tzinfo=timezone.utc)

for day in date_range_days(start_day, end_day):
    path = f"{RAW_DIR}/gdelt_articles_{day.date()}.csv"
    if os.path.exists(path):
        continue

    try:
        df = fetch_day_articles(day, cfg)
        df.to_csv(path, index=False, encoding="utf-8-sig")
        print(f"[OK] {day.date()} -> {len(df)} articles")
    except Exception as e:
        print(f"[ERR] {day.date()} -> {e}")


In [None]:
# 합본 + URL 중복 제거
dfs = []
for day in date_range_days(start_day, end_day):
    path = f"{RAW_DIR}/gdelt_articles_{day.date()}.csv"
    if os.path.exists(path):
        dfs.append(pd.read_csv(path))

all_df = pd.concat(dfs, ignore_index=True)

before = len(all_df)
all_df = all_df.drop_duplicates(subset=["url"])
after = len(all_df)

print(f"URL dedup: {before} → {after}")

out_path = f"{PROCESSED_DIR}/gdelt_articles_all.csv"
all_df.to_csv(out_path, index=False, encoding="utf-8-sig")


In [None]:
# 일자별 기사 수 QC
daily_counts = (
    all_df.groupby("date_day")
    .agg(n=("url", "count"))
    .reset_index()
)

daily_counts.to_csv(
    f"{PROCESSED_DIR}/gdelt_daily_counts.csv",
    index=False,
    encoding="utf-8-sig"
)

daily_counts.head()
