In [1]:
# This notebook replicates the paper "Annual report readability, current earnings, and earnings persistence" by Feng Li (2008),
# using data from 2016 to 2022.
# The sample is collected as follows:
# (1) Start with the intersection of CRSP-COMPUSTAT firm-years.
# (2) Match GVKEY (from COMPUSTAT) and PERMNO (from CRSP) with the Central Index Key (CIK) used by the SEC's online EDGAR system. Firms without a matching CIK are dropped.
# (3) Download the 10-K filings from EDGAR for every remaining firm-year. Firm-years that do not have electronic 10-K filings on EDGAR are then excluded.

In [3]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
import re

In [None]:
# 1. Load the data and merge to get CIK-CRSP-COMPUSTAT firm-years.
# 2. Download the 10-K filings from Edgar for every remaining firm-year.

In [3]:
####################
# 1. Load the data and merge to get CIK-CRSP-COMPUSTAT firm-years.
####################

In [2]:
# Read the COMPUSTAT extract, parse 'adate' as a datetime and derive the
# calendar year from it ('year' is appended as a new column).
compustat = pd.read_csv('COMPUSTAT.csv').assign(
    adate=lambda frame: pd.to_datetime(frame['adate']),
    year=lambda frame: frame['adate'].dt.year,
)
# One row per distinct (gvkey, permno, year) combination.
compustat_year = compustat.loc[:, ['gvkey', 'permno', 'year']].drop_duplicates()

In [3]:
# Read the CRSP monthly stock file, parse 'date' as a datetime and derive the
# calendar year from it.
# NOTE(review): the recorded output of this cell shows a warning pointing at the
# read_csv line - presumably a mixed-dtype warning; confirm and consider passing
# an explicit dtype= for the affected column(s).
CRSP = pd.read_csv('CRSP_Monthly_Stock.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    year=lambda frame: frame['date'].dt.year,
)
# One row per distinct (PERMNO, year) combination.
CRSP_year = CRSP.loc[:, ['PERMNO', 'year']].drop_duplicates()

  CRSP = pd.read_csv('CRSP_Monthly_Stock.csv')


In [4]:
# Inner join: keep only the firm-years present in both COMPUSTAT and CRSP.
# The key is spelled 'permno' on the left and 'PERMNO' on the right, so both
# columns survive in the result.
merged = compustat_year.merge(
    CRSP_year,
    how='inner',
    left_on=['permno', 'year'],
    right_on=['PERMNO', 'year'],
)

In [5]:
# Load the GVKEY <-> CIK link table.
cik = pd.read_csv('gvkeycik.csv')
# Distinct (gvkey, cik) pairs; pairs with a missing value on either side are dropped.
cik_gvkey = (
    cik.loc[:, ['gvkey', 'cik']]
    .drop_duplicates()
    .dropna()
)

  cik = pd.read_csv('gvkeycik.csv')


In [7]:
# Attach the CIK to every CRSP-COMPUSTAT firm-year (firms without a CIK match
# drop out of the inner join), then normalise the identifier columns:
#   gvkey  -> string, zero-padded to 6 characters
#   permno -> string, zero-padded to 5 characters
#   cik    -> nullable integer, then string (no padding)
#   year   -> nullable integer
intersection = merged.merge(cik_gvkey, how='inner', on='gvkey').assign(
    gvkey=lambda frame: frame['gvkey'].astype(str).str.zfill(6),
    permno=lambda frame: frame['permno'].astype(str).str.zfill(5),
    cik=lambda frame: frame['cik'].astype('Int64').astype(str),
    year=lambda frame: frame['year'].astype('Int64'),
)

In [8]:
# Preview the first ten matched firm-years as a sanity check on the merge.
intersection.iloc[:10]

Unnamed: 0,gvkey,permno,year,PERMNO,cik
0,1004,54594,2016,54594,1750
1,1004,54594,2017,54594,1750
2,1004,54594,2018,54594,1750
3,1004,54594,2019,54594,1750
4,1004,54594,2020,54594,1750
5,1004,54594,2021,54594,1750
6,1004,54594,2022,54594,1750
7,1045,21020,2016,21020,6201
8,1045,21020,2017,21020,6201
9,1045,21020,2018,21020,6201


In [None]:
####################
# 2. Download the 10-K filings from Edgar for every remaining firm-year.
####################

In [None]:
from edgar import Company,set_identity
# Register the identity string with the edgar package
# (presumably used to identify this client to SEC EDGAR - confirm in the package docs).
set_identity("Wenqian Chen wc664@cornell.edu")

# Distinct CIKs, in order of first appearance, as a flat array.
cik_list = intersection["cik"].drop_duplicates().to_numpy()

In [109]:
# DISABLED CELL: the code below is wrapped in a string literal, so it is not
# executed. When enabled, it loops over cik_list, takes at most the first 10
# entries returned by get_filings(form="10-K") for each company, and writes
# (cik, filing_date, url) rows to "10-k_links.csv". to_csv is called without
# index=False, so the row index is written as the first CSV column.
# The next cell reads "10-k_links.csv", so that file must already exist on disk.
# NOTE(review): presumably the first 10 filings are the most recent ones - confirm
# the ordering returned by the edgar package.
'''
filing_data = []

for cik in cik_list:
    company = Company(cik)
    filings = company.get_filings(form="10-K")

    for j in range(min(10, len(filings))):  
        filing_date = filings[j].filing_date
        filing_url = filings[j].text_url
        
        filing_data.append([cik, filing_date, filing_url])

df_filings = pd.DataFrame(filing_data, columns=["cik", "filing_date", "url"])
df_filings.to_csv("10-k_links.csv")
'''

stamina.retry_scheduled


In [None]:
# Load the cached table of 10-K links (columns: cik, filing_date, url).
# The only writer visible in this notebook calls to_csv() with the default
# index=True, so the first CSV column is the saved row index. index_col=0
# consumes it instead of leaving a stray "Unnamed: 0" column that would
# otherwise be carried into every later merge.
df_filings = pd.read_csv("10-k_links.csv", index_col=0)
# Unparseable dates become NaT (errors='coerce'), which yields a missing year.
df_filings["filing_date"] = pd.to_datetime(df_filings["filing_date"], errors='coerce')
# Calendar year of the filing date.
df_filings["year"] = df_filings["filing_date"].dt.year
# Keep the CIK as a string so it can be compared with string-typed CIK keys.
df_filings['cik'] = df_filings['cik'].astype(str)

In [25]:
# Keep only the firm-years that have a 10-K link for the same CIK and year,
# then discard the upper-case PERMNO column.
# NOTE(review): 'year' on the firm-year side is derived from COMPUSTAT 'adate',
# while on the filings side it is the year of the filing date - confirm that
# both refer to the same year convention before relying on this match.
merged_df = (
    intersection
    .merge(df_filings, how='inner', on=['cik', 'year'])
    .drop(columns='PERMNO')
)

In [None]:
# Below is a code sample that downloads the 10-K filings for 2017.

In [35]:
# Subset of the matched firm-year/filing rows whose year is 2017.
merged_df_2017 = merged_df.loc[merged_df["year"].eq(2017)]

In [36]:
# Default output folder for the downloaded filings; created up front.
dest_dir = "./fillings/2017"
os.makedirs(dest_dir, exist_ok=True)

def process_10k(df, out_dir=None, timeout=60):
    """Download one 10-K filing and save its text content as a ``.txt`` file.

    Meant to be applied row-wise, e.g. ``frame.apply(process_10k, axis=1)``.
    The response body is parsed with BeautifulSoup (``html5lib``) and only the
    extracted text is written, to ``<cik>_<YYYYMMDD>.txt``.

    Parameters
    ----------
    df : row-like (e.g. a pandas Series)
        Must provide ``"url"``, ``"cik"`` and ``"filing_date"``;
        ``filing_date`` must support ``strftime``.
    out_dir : str, optional
        Directory to write into. When omitted, the module-level ``dest_dir``
        is used (looked up at call time). The directory is created if missing.
    timeout : float, optional
        Seconds passed to ``requests.get`` so that a stalled connection fails
        instead of blocking the whole run.

    Returns
    -------
    None
        Failures are reported with ``print`` and never raised, so one bad
        filing does not abort a row-wise ``apply``.
    """
    url = df["url"]
    cik = df["cik"]
    date_filed = df["filing_date"]
    # Resolved at call time so re-assigning the global dest_dir still takes effect.
    target_dir = dest_dir if out_dir is None else out_dir

    try:
        response = requests.get(
            url,
            headers={"User-Agent": "wc664@cornell.edu"},
            timeout=timeout,
        )
        response.raise_for_status()
        # Replace undecodable bytes rather than discarding the entire filing
        # because of a single non-UTF-8 character.
        content = response.content.decode('utf8', errors='replace')
        text = BeautifulSoup(content, "html5lib").text

        file_name = f"{cik}_{date_filed.strftime('%Y%m%d')}.txt"
        os.makedirs(target_dir, exist_ok=True)
        dest_path = os.path.join(target_dir, file_name)
        with open(dest_path, 'w', encoding='utf-8') as fx:
            # Same bytes as writing every line followed by a newline.
            fx.write(text)
            fx.write('\n')

        print(f"Successfully saved {file_name}")

    except requests.exceptions.RequestException as e:
        # Network-level failures, including HTTP error statuses and timeouts.
        print(f"Request error for {cik} ({date_filed}): {e}")

    except Exception as e:
        # Deliberate best-effort: report and continue with the next row.
        print(f"Error processing {cik} ({date_filed}): {e}")

In [None]:
# Uncomment to download the 2017 subset (files are written to the 2017 folder defined above):
#merged_df_2017.apply(process_10k, axis=1)