In [1]:
import pandas as pd
import re
import requests
from dateutil import parser

In [2]:
# Load the news CSV and drop every row that has a missing value in any column.
# The index is not reset afterwards, so row labels keep their original positions.
df = pd.read_csv('yahoo_news.csv').dropna()

In [3]:
# Display the raw publication-date strings.
df['pubdate']

0       Fri, September 12, 2025 at 1:55 PM GMT+9
1       Fri, September 12, 2025 at 7:00 AM GMT+9
2       Fri, September 12, 2025 at 4:17 AM GMT+9
3       Fri, September 12, 2025 at 3:16 AM GMT+9
4       Fri, September 12, 2025 at 3:00 AM GMT+9
                          ...                   
7237     Thu, October 30, 2025 at 11:00 PM GMT+9
7238      Thu, October 30, 2025 at 9:37 PM GMT+9
7239      Thu, October 30, 2025 at 9:29 PM GMT+9
7240      Thu, October 30, 2025 at 9:28 PM GMT+9
7241      Thu, October 30, 2025 at 8:00 PM GMT+9
Name: pubdate, Length: 7172, dtype: object

In [4]:
# Pattern dictionary: regexes that pick the date portion out of a raw string.
# Keys keep their insertion order, which is the order consumers iterating
# over this dict will see them in.
patterns = dict(
    # Month name (full or abbreviated), day, comma, four-digit year,
    # e.g. "September 12, 2025".
    pattern1=r'(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2},\s+\d{4}',
    # Day, three-letter token, two- to four-digit year, e.g. "12-Sep-25".
    pattern2=r'\d{1,2}-[A-Za-z]{3}-\d{2,4}',
)

In [5]:
# Convert a raw pubdate string to a datetime
def normalize_pubdate(date_str):
    """Extract the date portion of a raw publication string and parse it.

    Each regex in the module-level ``patterns`` dict is tried in order; the
    first one that matches selects the substring handed to
    ``dateutil.parser.parse``. Only the matched substring is parsed, so any
    text outside the match (for example a time of day or a timezone suffix)
    does not influence the result.

    Args:
        date_str: Raw publication-date text.

    Returns:
        The ``datetime`` parsed from the matched substring, or ``None`` when
        ``date_str`` is not a non-blank string, when no pattern matches, or
        when the matched text cannot be parsed as a date.
    """
    if not isinstance(date_str, str) or not date_str.strip():
        return None

    # First matching pattern wins; the dict's insertion order sets priority.
    candidates = (re.search(pattern, date_str) for pattern in patterns.values())
    match = next((found for found in candidates if found), None)
    if match is None:
        return None

    # Guard only the parse call, and only against the errors dateutil raises
    # for bad input. Programming errors (e.g. a NameError because the
    # `patterns` cell was never run, or an invalid regex) must surface instead
    # of silently turning every row into None.
    try:
        return parser.parse(match.group(0))
    except (ValueError, OverflowError):
        return None

In [6]:
if __name__ == "__main__":
    # Parse every raw pubdate string and store the result next to the original.
    normalized = df["pubdate"].apply(normalize_pubdate)
    df["pubdate_norm"] = normalized

    # Show the last 20 rows, raw value beside parsed value, as a spot check.
    preview_columns = ["pubdate", "pubdate_norm"]
    print(df[preview_columns].tail(20))

                                      pubdate pubdate_norm
7222   Sat, November 1, 2025 at 8:00 PM GMT+9   2025-11-01
7223   Sat, November 1, 2025 at 8:00 PM GMT+9   2025-11-01
7224   Sat, November 1, 2025 at 5:30 PM GMT+9   2025-11-01
7225   Sat, November 1, 2025 at 2:37 PM GMT+9   2025-11-01
7226   Sat, November 1, 2025 at 9:30 PM GMT+9   2025-11-01
7227   Sat, November 1, 2025 at 9:08 PM GMT+9   2025-11-01
7228   Sat, November 1, 2025 at 8:02 PM GMT+9   2025-11-01
7229   Sat, November 1, 2025 at 8:00 PM GMT+9   2025-11-01
7230   Sat, November 1, 2025 at 5:19 PM GMT+9   2025-11-01
7231   Sat, November 1, 2025 at 4:00 PM GMT+9   2025-11-01
7232   Sat, November 1, 2025 at 8:39 AM GMT+9   2025-11-01
7233   Sat, November 1, 2025 at 7:37 AM GMT+9   2025-11-01
7234   Sat, November 1, 2025 at 8:38 AM GMT+9   2025-11-01
7235  Fri, October 31, 2025 at 11:50 PM GMT+9   2025-10-31
7236   Fri, October 31, 2025 at 8:31 PM GMT+9   2025-10-31
7237  Thu, October 30, 2025 at 11:00 PM GMT+9   2025-10-

In [7]:
# List each column's dtype, including the newly added pubdate_norm column.
df.dtypes

ticker                     object
link                       object
headline                   object
pubdate                    object
related_tickers            object
article                    object
pubdate_norm       datetime64[ns]
dtype: object