# 讲话数据清洗及预处理

@AUTHOR: WBZHANG / 01208663

@DATE: Oct. 31, 2024

In [63]:
import os
import sys

sys.path.append("../")
import json
import glob
import numpy as np
import pandas as pd
from utils.file_saver import json_load, json_dump
from utils.common import parse_datestring

import locale

# Remember the current LC_TIME setting so it can be restored later.
# locale.getlocale() defaults to the LC_CTYPE category, so the category is
# passed explicitly to capture the one that is actually changed below.
original_locale = locale.getlocale(locale.LC_TIME)
# Switch LC_TIME to English so locale-dependent date names (e.g. %B) are English.
locale.setlocale(locale.LC_TIME, "en_US.UTF-8")

FED_SPEECHES_PATH = "../data/fed_speeches/"

# glob returns matches in arbitrary order; sort so the listing (and anything
# that indexes into it) is deterministic across runs and platforms.
speech_dirs = sorted(glob.glob(FED_SPEECHES_PATH + "*_speeches"))
print(speech_dirs)

['../data/fed_speeches\\atlanta_fed_speeches', '../data/fed_speeches\\bog_fed_speeches', '../data/fed_speeches\\boston_fed_speeches', '../data/fed_speeches\\chicago_fed_speeches', '../data/fed_speeches\\cleveland_fed_speeches', '../data/fed_speeches\\dallas_fed_speeches', '../data/fed_speeches\\kansascity_fed_speeches', '../data/fed_speeches\\minneapolis_fed_speeches', '../data/fed_speeches\\newyork_fed_speeches', '../data/fed_speeches\\philadelphia_fed_speeches', '../data/fed_speeches\\richmond_fed_speeches', '../data/fed_speeches\\sanfrancisco_fed_speeches', '../data/fed_speeches\\stlouis_fed_speeches']


## 1. Chicago联储讲话数据日期清洗

In [73]:
# Pick the Chicago directory by name rather than by position: the order of
# glob results is not guaranteed and the set of directories may change, so a
# fixed index could silently select a different bank's folder.
chicago_speech_path = next(
    (
        directory
        for directory in speech_dirs
        if os.path.basename(os.path.normpath(directory)) == "chicago_fed_speeches"
    ),
    None,
)
if chicago_speech_path is None:
    raise FileNotFoundError(
        f"chicago_fed_speeches directory not found among: {speech_dirs}"
    )
print(f"chicago_speech_path: {chicago_speech_path}")
# Collect every "*_speech_infos.json" file in that directory, in a stable order.
chicago_speech_filenames = sorted(
    glob.glob(os.path.join(chicago_speech_path, "*_speech_infos.json"))
)
chicago_speech_filenames

chicago_speech_path: ../data/fed_speeches\chicago_fed_speeches


['../data/fed_speeches\\chicago_fed_speeches\\chicago_failed_speech_infos.json',
 '../data/fed_speeches\\chicago_fed_speeches\\chicago_speech_infos.json']

In [42]:
# Scratch check of the target date display format ("%B %d, %Y"),
# which renders 2012-08-15 as "August 15, 2012".
pd.to_datetime("2012-08-15").strftime(format="%B %d, %Y")

'August 15, 2012'

In [74]:
def extract_speech_date(speech: dict):
    """Derive a speech's date from the trailing segments of its URL.

    The second-to-last ``/``-separated segment of ``speech["href"]`` is used
    as the year; the first two ``-``-separated pieces of the last segment are
    used as month and day. A purely numeric month is parsed as
    ``"<year>-<month>-<day>"``; any other month token is parsed as
    ``"<month>. <day>, <year>"``.

    Args:
        speech: Speech record; only its ``"href"`` entry is read.

    Returns:
        The date formatted with ``"%B %d, %Y"`` (e.g. ``"August 15, 2012"``),
        or ``None`` if any step of the extraction or parsing fails.
    """
    try:
        segments = speech["href"].split("/")
        year = segments[-2]
        pieces = segments[-1].split("-")
        month, day = pieces[0], pieces[1]
        if month.isdigit():
            raw_date = f"{year}-{month}-{day}"
        else:
            raw_date = f"{month}. {day}, {year}"
        return pd.to_datetime(raw_date).strftime("%B %d, %Y")
    except Exception:
        # Best effort: an href that does not follow the expected layout simply
        # yields no date, and the caller leaves the record untouched.
        return None
    
def amend_record_speech_date(speeches: list):
    """Fill in or correct the ``"date"`` field of each speech record in place.

    A record keeps its date when that date is non-empty and its year (as
    parsed by ``parse_datestring``) equals the year segment of the record's
    ``"href"``. Otherwise the date is re-derived from the URL with
    ``extract_speech_date`` and, if that succeeds, written to the record.
    Afterwards, every record whose date is still empty is printed between two
    separator lines.

    Args:
        speeches: Speech records, each with ``"href"`` and ``"date"`` entries.

    Returns:
        The same list object, with its records updated in place.
    """
    for record in speeches:
        # Year as encoded in the URL (second-to-last path segment).
        href_year = record["href"].split("/")[-2]
        recorded = record["date"]
        keep_recorded = (
            recorded and parse_datestring(recorded).strftime("%Y") == href_year
        )
        if not keep_recorded:
            derived = extract_speech_date(record)
            # Only overwrite when the URL-based extraction succeeded.
            if derived:
                record["date"] = derived

    # Report records that still have no date.
    rule = "-" * 50
    print(rule)
    for record in speeches:
        if record["date"]:
            continue
        print("{} - {}".format(record["href"], record["date"]))
    print(rule)
    return speeches


def amend_speech_date(speeches_filepath: str):
    """Repair speech dates in a JSON file and write the result back to it.

    The file content is loaded with ``json_load``. A top-level list is passed
    to ``amend_record_speech_date`` directly; for a top-level dict, each value
    is passed to ``amend_record_speech_date`` and stored back under its key.
    Any other top-level type is left as loaded. The result is then saved to
    the same path with ``json_dump`` and a completion message is printed.

    Args:
        speeches_filepath (str): Path of the JSON file to read and overwrite.
    """
    payload = json_load(speeches_filepath)
    if isinstance(payload, list):
        payload = amend_record_speech_date(payload)
    elif isinstance(payload, dict):
        # Keys are only reassigned, never added or removed, so iterating the
        # dict while updating it is safe.
        for key in payload:
            payload[key] = amend_record_speech_date(payload[key])

    json_dump(payload, speeches_filepath)
    print(f"{speeches_filepath} was Done!")

# Run the date repair on every Chicago speech-info JSON file collected in
# chicago_speech_filenames; amend_speech_date saves each result back to the
# path it was loaded from.
for filepath in chicago_speech_filenames:
    amend_speech_date(filepath)

--------------------------------------------------
--------------------------------------------------
../data/fed_speeches\chicago_fed_speeches\chicago_failed_speech_infos.json was Done!
--------------------------------------------------
--------------------------------------------------
--------------------------------------------------
--------------------------------------------------
--------------------------------------------------
--------------------------------------------------
--------------------------------------------------
--------------------------------------------------
--------------------------------------------------
--------------------------------------------------
--------------------------------------------------
--------------------------------------------------
--------------------------------------------------
--------------------------------------------------
--------------------------------------------------
------------------------------------------------