# FED演讲稿数据集清洗
@author: zhangwubin

@date: 2024/11/28

In [26]:
import os
import sys
sys.path.append("../")
import glob
import pandas as pd

from utils.file_saver import (
    json_load, json_dump, json_update, 
    sort_speeches_records, sort_speeches_dict, 
    update_dict, update_records
)

import locale
from datetime import datetime

# 设置区域设置为英文（美国）
locale.setlocale(locale.LC_TIME, "en_US.UTF-8")

'en_US.UTF-8'

## FED Speeches数据集分析

In [27]:
speeches_dir = glob.glob("../data/fed_speeches/*_fed_speeches")
speeches_dir = [x.replace("\\", "/") for x in speeches_dir]
speeches_dir

['../data/fed_speeches/atlanta_fed_speeches',
 '../data/fed_speeches/bog_fed_speeches',
 '../data/fed_speeches/boston_fed_speeches',
 '../data/fed_speeches/chicago_fed_speeches',
 '../data/fed_speeches/cleveland_fed_speeches',
 '../data/fed_speeches/dallas_fed_speeches',
 '../data/fed_speeches/kansascity_fed_speeches',
 '../data/fed_speeches/minneapolis_fed_speeches',
 '../data/fed_speeches/newyork_fed_speeches',
 '../data/fed_speeches/philadelphia_fed_speeches',
 '../data/fed_speeches/richmond_fed_speeches',
 '../data/fed_speeches/sanfrancisco_fed_speeches',
 '../data/fed_speeches/stlouis_fed_speeches']

### FED SPEEHES数据集分析

In [28]:
speeches_exp = json_load(speeches_dir[1] + "/bog_speeches.json")

def speechees_dict2df(speeches: dict):
    records = []
    for year, single_year in speeches.items():
        single_year = pd.DataFrame.from_records(single_year)
        single_year["year"] = year
        records.append(single_year)
    records = pd.concat(records)
    return records

speeches_exp = speechees_dict2df(speeches_exp)
speeches_exp

Unnamed: 0,title,date,speaker,url,content,position,href,year
0,New Year’s Resolutions for Bank Regulatory Pol...,"January 08, 2024",Governor Michelle W. Bowman,https://www.federalreserve.gov/newsevents/spee...,"January 08, 2024\n\nGovernor Michelle W. Bowma...",,,2024
1,Almost as Good as It Gets…But Will It Last?,"January 16, 2024",Governor Christopher J. Waller,https://www.federalreserve.gov/newsevents/spee...,"January 16, 2024\n\nGovernor Christopher J. Wa...",,,2024
2,The Path Forward for Bank Capital Reform,"January 17, 2024",Governor Michelle W. Bowman,https://www.federalreserve.gov/newsevents/spee...,"January 17, 2024\n\nGovernor Michelle W. Bowma...",,,2024
3,Opening Remarks,"January 17, 2024",Vice Chair for Supervision Michael S. Barr,https://www.federalreserve.gov/newsevents/spee...,"January 17, 2024\n\nVice Chair for Supervision...",,,2024
4,The Future of Banking,"February 02, 2024",Governor Michelle W. Bowman,https://www.federalreserve.gov/newsevents/spee...,"February 02, 2024\n\nGovernor Michelle W. Bowm...",,,2024
...,...,...,...,...,...,...,...,...
68,The Benefits of Price Stability,2/24/2006,,,"February 24, 2006\n\nChairman Ben S. Bernanke\...",Chairman,https://www.federalreserve.gov/newsevents/spee...,2006
69,"Globalization, Insurers, and Regulators: Share...",2/23/2006,,,"February 23, 2006\n\nVice Chairman Roger W. Fe...",Vice Chairman,https://www.federalreserve.gov/newsevents/spee...,2006
70,Remarks at ceremonial swearing-in by President...,2/6/2006,,,"February 06, 2006\n\nChairman Ben S. Bernanke\...",Chairman,https://www.federalreserve.gov/newsevents/spee...,2006
71,The Continuous Challenges of Risk Management,2/2/2006,,,"February 02, 2006\n\nGovernor Susan Schmidt Bi...",Governor,https://www.federalreserve.gov/newsevents/spee...,2006


In [29]:
# 1) 日期格式统一化
def unify_speech_date(date_str: str):
    try: 
        pdate = pd.to_datetime(date_str)
        return pdate.strftime("%B %d, %Y")
    except: 
        return date_str

speeches_exp["date"] = speeches_exp['date'].transform(lambda x: unify_speech_date(x))
# 2) BOG演讲人职位清洗.
# 识别出所有的职称
position = [
    "Governor",
    "Chairman",
    "Vice Chairman",
    "Vice Chair for Supervision",
    "Vice Chair",
    "Chair",
]


def split_position(applenation: str):
    if not isinstance(applenation, str):
        return "", applenation
    # 识别出所有的职称
    position = [
        "Vice Chairman",
        "Vice Chair for Supervision",
        "Vice Chair",
        "Chairman",
        "Chair",
        "Governor",
    ]
    for p in position:
        if p in applenation:
            # 命中
            position = p.strip().title()
            speaker = applenation.replace(p, "").strip().title()
            return position, speaker
    return "Other", applenation


speeches_exp[["position", "name"]] = speeches_exp["speaker"].apply(
    lambda x: pd.Series(split_position(x))
)

# 3) 统一用url，取消href
cond_non_url = pd.isna(speeches_exp["url"]) & pd.notna(speeches_exp["href"])
speeches_exp.loc[cond_non_url, "url"] = speeches_exp.loc[cond_non_url, "href"]
speeches_exp.drop("href", axis=1, inplace=True)
speeches_exp

Unnamed: 0,title,date,speaker,url,content,position,year,name
0,New Year’s Resolutions for Bank Regulatory Pol...,"January 08, 2024",Governor Michelle W. Bowman,https://www.federalreserve.gov/newsevents/spee...,"January 08, 2024\n\nGovernor Michelle W. Bowma...",Governor,2024,Michelle W. Bowman
1,Almost as Good as It Gets…But Will It Last?,"January 16, 2024",Governor Christopher J. Waller,https://www.federalreserve.gov/newsevents/spee...,"January 16, 2024\n\nGovernor Christopher J. Wa...",Governor,2024,Christopher J. Waller
2,The Path Forward for Bank Capital Reform,"January 17, 2024",Governor Michelle W. Bowman,https://www.federalreserve.gov/newsevents/spee...,"January 17, 2024\n\nGovernor Michelle W. Bowma...",Governor,2024,Michelle W. Bowman
3,Opening Remarks,"January 17, 2024",Vice Chair for Supervision Michael S. Barr,https://www.federalreserve.gov/newsevents/spee...,"January 17, 2024\n\nVice Chair for Supervision...",Vice Chair For Supervision,2024,Michael S. Barr
4,The Future of Banking,"February 02, 2024",Governor Michelle W. Bowman,https://www.federalreserve.gov/newsevents/spee...,"February 02, 2024\n\nGovernor Michelle W. Bowm...",Governor,2024,Michelle W. Bowman
...,...,...,...,...,...,...,...,...
68,The Benefits of Price Stability,"February 24, 2006",,https://www.federalreserve.gov/newsevents/spee...,"February 24, 2006\n\nChairman Ben S. Bernanke\...",,2006,
69,"Globalization, Insurers, and Regulators: Share...","February 23, 2006",,https://www.federalreserve.gov/newsevents/spee...,"February 23, 2006\n\nVice Chairman Roger W. Fe...",,2006,
70,Remarks at ceremonial swearing-in by President...,"February 06, 2006",,https://www.federalreserve.gov/newsevents/spee...,"February 06, 2006\n\nChairman Ben S. Bernanke\...",,2006,
71,The Continuous Challenges of Risk Management,"February 02, 2006",,https://www.federalreserve.gov/newsevents/spee...,"February 02, 2006\n\nGovernor Susan Schmidt Bi...",,2006,


### 如何清洗并规范化Fed Speeches

In [31]:
district_speeches = {}
for speech_dir in speeches_dir:
    # 匹配该区域名称
    district = speech_dir.split("/")[-1].split("_")[0]
    print(f"district: {district}")
    # 去重排序
    # 1) 按年份进行整理的报告
    speech_files_by_year = glob.glob(speech_dir + f"/{district}_speeches_*.json")
    all_speeches = {}
    for filename in speech_files_by_year:
        filename = filename.replace("\\", "/")
        print(filename)
        # 获取年份
        year = filename.split('/')[-1].split('_')[-1].replace('.json','')
        if not year.isdigit():
            continue
        
        speech_single_year = json_load(filename)
        if not speech_single_year:
            continue
        if district in ['atlanta']:
            # 通过highlights获取speaker
            for speech in speech_single_year:
                if speech['speaker']:
                    continue
                else:
                    hl = speech['highlights']
                    splits = hl.split(',')
                    try:
                        speech['speaker'] = splits[0].replace('Remarks by', '').strip()
                        speech["position"] = splits[1].strip()
                    except:
                        pass
        _before_len = len(speech_single_year)
        print(f"original: {len(speech_single_year)}")
        speech_single_year = sort_speeches_records(speech_single_year, sort_filed='date',
                                                   required_keys=['speaker', 'date', 'title'])
        if len(speech_single_year)==_before_len:
            json_dump(speech_single_year, filename)
        else:
            print("-"*10 + f"many speeches drop after sorting. {filename}" + "-"*10)
        print(f"sorted: {len(speech_single_year)}")
        all_speeches[year] = speech_single_year
        print(f"{filename} sorted")
    # 2) speech info
    speech_info_files = speech_dir + f"/{district}_speech_infos.json"
    failed_speech_info_files = speech_dir + f"/{district}_failed_speech_infos.json"
    # 3) 所有的speeches
    # all_speeches_filename = speech_dir + f"/{district}_speeches.json"
    # all_speeches = json_load(all_speeches_filename)
    all_speeches = sort_speeches_dict(all_speeches)
    print(f"District: {district} has {len(all_speeches)} speeches")
    district_speeches[district] = all_speeches

district: atlanta
../data/fed_speeches/atlanta_fed_speeches/atlanta_speeches_1995.json
original: 3
sorted: 3
../data/fed_speeches/atlanta_fed_speeches/atlanta_speeches_1995.json sorted
../data/fed_speeches/atlanta_fed_speeches/atlanta_speeches_1996.json
original: 5
sorted: 5
../data/fed_speeches/atlanta_fed_speeches/atlanta_speeches_1996.json sorted
../data/fed_speeches/atlanta_fed_speeches/atlanta_speeches_1997.json
original: 10
sorted: 10
../data/fed_speeches/atlanta_fed_speeches/atlanta_speeches_1997.json sorted
../data/fed_speeches/atlanta_fed_speeches/atlanta_speeches_1998.json
original: 8
sorted: 8
../data/fed_speeches/atlanta_fed_speeches/atlanta_speeches_1998.json sorted
../data/fed_speeches/atlanta_fed_speeches/atlanta_speeches_1999.json
original: 7
sorted: 7
../data/fed_speeches/atlanta_fed_speeches/atlanta_speeches_1999.json sorted
../data/fed_speeches/atlanta_fed_speeches/atlanta_speeches_2000.json
original: 5
sorted: 5
../data/fed_speeches/atlanta_fed_speeches/atlanta_spee

In [None]:
# # 筛选出某个时间段的所有演讲
# BEGIN, END = '2024-06-17', '2024-06-22'
BEGIN, END = '2024-11-11', '2024-11-15'
_count = 0
df = []
for speech_dir in speeches_dir:
    # 匹配该区域名称
    district = speech_dir.split("/")[-1].split("_")[0]
    print(f"district: {district}")
    # 获取区域所有讲话列表
    single_district_speeches = speechees_dict2df(district_speeches[district])
    df.append(single_district_speeches)
    single_district_speeches['pdate'] = single_district_speeches['date'].transform(lambda x: pd.to_datetime(x))
    condition = single_district_speeches["pdate"].between(pd.to_datetime(BEGIN), pd.to_datetime(END))
    single_district_speeches = single_district_speeches.loc[condition]
    # print(single_district_speeches.head(10))
    _count += single_district_speeches.shape[0]
    print(single_district_speeches.shape[0])
print(_count)

district: atlanta
0
district: bog
5
district: boston
1
district: chicago
0
district: cleveland
0
district: dallas
1
district: kansascity
0
district: minneapolis
1
district: newyork
2
district: philadelphia
1
district: richmond
1
district: sanfrancisco
0
district: stlouis
1
13


In [None]:
all_speeches_df = pd.concat(df, axis=0)
all_speeches_df = all_speeches_df.sort_values(by=['date'], ascending=False).reset_index(drop=True)
# all_speeches_df = all_speeches_df.set_index('date')
# all_speeches_df.index = pd.to_datetime(all_speeches_df.index)
# all_speeches_df = all_speeches_df.sort_index(ascending=False)
# all_speeches_df["date"] = pd.to_datetime(all_speeches_df["date"])
# condition = (all_speeches_df['date'] >=pd.to_datetime(BEGIN)) & (all_speeches_df['date']<= pd.to_datetime(END))
# all_speeches_df = all_speeches_df.loc[condition]
all_speeches_df

Unnamed: 0,speaker,position,date,title,highlights,href,content,year,pdate,summary,item_id,text_url,api_url,pdf_url,youtube_link
0,"Susan M. Collins, President & Chief Executive ...",,2024-11-15,Remarks at “The Future of Finance: Implication...,68th Economic Conference Federal Reserve Bank ...,https://www.bostonfed.org/news-and-events/spee...,Opening Remarks at the \nFederal Reserve Ban...,2024,2024-11-15,68th Economic Conference Federal Reserve Bank ...,,,,,
1,John C. Williams,President and Chief Executive Officer,2024-11-15,100 Years at 33 Liberty Street,,,Introduction\n\nWelcome back!\n\nIt’s wonderfu...,2024,2024-11-15,,,,,,
2,Chair Jerome H. Powell,,2024-11-14,Economic Outlook,,,"November 14, 2024\n\nChair Jerome H. Powell\n\...",2024,2024-11-14,,,,,,
3,Governor Adriana D. Kugler,,2024-11-14,Central Bank Independence and the Conduct of M...,,,"November 14, 2024\n\nGovernor Adriana D. Kugle...",2024,2024-11-14,,,,,,
4,Chair Jerome H. Powell,Chair,2024-11-14,Economic Outlook,,https://www.federalreserve.gov/newsevents/spee...,"November 14, 2024\n\nChair Jerome H. Powell\n\...",2024,2024-11-14,,,,,,
5,Governor Adriana D. Kugler,Governor,2024-11-14,Central Bank Independence and the Conduct of M...,,https://www.federalreserve.gov/newsevents/spee...,"November 14, 2024\n\nGovernor Adriana D. Kugle...",2024,2024-11-14,,,,,,
6,John C. Williams,President and Chief Executive Officer,2024-11-14,X Marks the Spot: Making Missing Markets,,,Introduction\n\nLet me add my welcome to the N...,2024,2024-11-14,,,,,,
7,Lorie K. Logan,,2024-11-13,Navigating in shallow waters: Monetary policy ...,,https://www.dallasfed.org/news/speeches/logan/...,"Thank you, Kunal [Patel], for the kind introdu...",2024,2024-11-13,,,,,,
8,Alberto G. Musalem’,,2024-11-13,View video of the event and photos from his trip,,https://www.stlouisfed.org/from-the-president/...,"Nov. 13, 2024\nEvent\n\nSt. Louis Fed Presiden...",2024,2024-11-13,,,,,,
9,Governor Christopher J. Waller,,2024-11-12,What Roles Should the Private Sector and the F...,,,"November 12, 2024\n\nGovernor Christopher J. W...",2024,2024-11-12,,,,,,


In [72]:
print(all_speeches_df.iloc[1]['content'])

Introduction

Welcome back!

It’s wonderful to see so many of our alumni joining us today as we celebrate the 110th anniversary of the New York Fed—and the centennial of our Liberty Street home.

Buildings tell stories. And this unique building—made of sandstone and limestone, with vaulted ceilings and a gold vault in the basement—has much to say about where we’ve been, where we are, and where we’re going.

It stood as a beacon of strength during crises: Black Thursday, Black Monday, 9/11, the Global Financial Crisis, and the COVID-19 pandemic. And it’s witnessed remarkable change—in the global economy and financial markets, in technology and the ways we work, and, as you see in the photos, in hair styles and office attire.

But just as 33 Liberty Street kept its distinctive stone exterior as glass skyscrapers grew around it, our dedication to the mission of serving the American public has remained constant.

And throughout our history, our ability to be at the forefront of anticipatin

In [None]:
def drop_duplicates_speech_info_app(
    filepath: str=None, tag_fields: list = ["speaker", "date", "title"]
):
    if not filepath: 
        district = input("Enter district: ")
        filepath = (
            f"../data/fed_speeches/{district}_fed_speeches/{district}_speech_infos.json"
        )
    existed = json_load(filepath)
    unique = set()
    result = {}
    for year, single_year_infos in existed.items():
        for info in single_year_infos:
            tag = tuple(info.get(field) for field in tag_fields)
            if tag not in unique:
                unique.add(tag)
                result.setdefault(year, []).append(info)
    json_dump(result, filepath)

# drop_duplicates_speech_info_app()

In [None]:
def sort_speeches_app(district: str = "dallas"):
    filepath = f"../data/fed_speeches/{district}_fed_speeches/{district}_speeches.json"
    speeches = json_load(filepath)
    speeches = sort_speeches_dict(speeches)
    json_dump(speeches, filepath)
    print(f"{district}_speeches have sorted.")

## 合并FRB speeches和FED Speeches

In [None]:
def transfer_frb_speeches_app(existed_speech_path: str, new_speech_dirs: str):
    # 原有的演讲合并到现在的演讲目录中
    existed = json_load(existed_speech_path)
    for year in range(2006, 2016):
        filename = f"{new_speech_dirs}/philadelphia_speeches_{year}.json"
        original = json_load(filename)
        if not original:
            original = []
        existed = update_dict(existed, {f"{year}": original})

    existed = sort_speeches_dict(existed)
    json_dump(existed, existed_speech_path)

# transfer_frb_speeches_app(
#     "../data/fed_speeches/philadelphia_fed_speeches/philadelphia_speeches.json",
#     "../data/frb_speeches/philadelphia",
# )

## 打包all_speeches

In [None]:
def pack_speeches_only_app(version_path: str, target_path: str):
    import glob

    speech_dirs = glob.glob(version_path + "/*_fed_speeches")
    for dir in speech_dirs:
        target_filepath = glob.glob(dir + "/*_speeches.json")
        target_file = json_load(target_filepath[0])
        # 文件名
        # district = dir.replace("\\", "/").split("/")[-1].replace("_fed_speeches", "")
        # output_filepath = target_path + f"/{district}_speeches.json"
        output_filepath = (
            target_path + "/" + target_filepath[0].replace("\\", "/").split("/")[-1]
        )
        json_dump(target_file, output_filepath)
        print(f"{dir} have processed.")


# pack_speeches_only_app(
#     version_path="../data/fed_speeches",
#     target_path="../data/fed_speeches/0",
# )