In [None]:
# Mount Google Drive at /content/drive so the Drive-hosted project files used below are reachable.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install secedgar

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting secedgar
  Using cached secedgar-0.4.0-py3-none-any.whl (61 kB)
Collecting async-timeout<4.0,>=3.0
  Using cached async_timeout-3.0.1-py3-none-any.whl (8.2 kB)
Collecting aiohttp
  Using cached aiohttp-3.8.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
  Using cached aiohttp-3.7.4.post0-cp37-cp37m-manylinux2014_x86_64.whl (1.3 MB)
Installing collected packages: async-timeout, aiohttp, secedgar
  Attempting uninstall: async-timeout
    Found existing installation: async-timeout 4.0.2
    Uninstalling async-timeout-4.0.2:
      Successfully uninstalled async-timeout-4.0.2
  Attempting uninstall: aiohttp
    Found existing installation: aiohttp 3.8.1
    Uninstalling aiohttp-3.8.1:
      Successfully uninstalled aiohttp-3.8.1
Successfully installed aiohttp-3.7.4.post0 async-timeout-3.0.1 secedgar-0.4.0


In [None]:
!pip install nest_asyncio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nest_asyncio
  Downloading nest_asyncio-1.5.5-py3-none-any.whl (5.2 kB)
Installing collected packages: nest-asyncio
Successfully installed nest-asyncio-1.5.5


In [None]:
import calendar
import os
import random
import string
import warnings
from datetime import date

import numpy as np
import pandas as pd
from secedgar import FilingType, filings
from tqdm import tqdm

warnings.filterwarnings("ignore")

In [None]:
# Every project file lives under one Drive folder; derive each path from it so the
# location only has to be changed in one place.
project_dir = "/content/drive/MyDrive/Mini 5/Natural Language Processing/Project 1/"

# Output folders for downloaded filings (trailing slash kept on purpose).
data_path = project_dir + "data/"
data_path_10q = data_path + "10Q/"
data_path_10k = data_path + "10K/"

# Input CSV files.
cik_lookup_filename = project_dir + "CIK_lookup_results_cleaned.csv"
sp500_constituents_path = project_dir + "sp500_constituents.csv"
sp500_id_path = project_dir + "sp500_w_addl_id.csv"

In [None]:
# NOTE(review): presumably required so secedgar's asyncio-based downloads can run inside
# the notebook's already-running event loop — confirm.
import nest_asyncio
nest_asyncio.apply()

#### The input file here is `CIK_lookup_results_cleaned.csv`. It is the same as the `CIK_lookup_results.csv` we queried from WRDS CRSP, except that it has fewer columns, because some of the columns are not useful for us in this section.

In [None]:
# Load the CIK lookup table from the CSV path configured above and preview its first rows.
cik_lookup = pd.read_csv(cik_lookup_filename)
cik_lookup.head()

Unnamed: 0,conm,cik,tic,LPERMNO,LINKDT,LINKENDDT
0,AMERICAN AIRLINES GROUP INC,6201.0,AAL,21020,2013/12/09,E
1,AMERICAN AIRLINES GROUP INC,6201.0,AAL,21020,1962/01/31,2012/01/04
2,AMERICAN AIRLINES GROUP INC,6201.0,AAL,21020,1950/01/01,1962/01/30
3,PINNACLE WEST CAPITAL CORP,764622.0,PNW,27991,1962/01/31,E
4,ABBOTT LABORATORIES,1800.0,ABT,20482,1950/01/01,1962/01/30


In [None]:
cik_lookup.shape

(878, 6)

#### There are 18 null CIK values in this dataframe

In [None]:
cik_lookup[["cik"]].isna().sum()

cik    18
dtype: int64

#### Drop these null values

In [None]:
# Keep only the rows that actually carry a CIK.
has_cik = cik_lookup["cik"].notna()
cik_lookup = cik_lookup.loc[has_cik]

In [None]:
cik_lookup

Unnamed: 0,conm,cik,tic,LPERMNO,LINKDT,LINKENDDT
0,AMERICAN AIRLINES GROUP INC,6201.0,AAL,21020,2013/12/09,E
1,AMERICAN AIRLINES GROUP INC,6201.0,AAL,21020,1962/01/31,2012/01/04
2,AMERICAN AIRLINES GROUP INC,6201.0,AAL,21020,1950/01/01,1962/01/30
3,PINNACLE WEST CAPITAL CORP,764622.0,PNW,27991,1962/01/31,E
4,ABBOTT LABORATORIES,1800.0,ABT,20482,1950/01/01,1962/01/30
...,...,...,...,...,...,...
873,TRIPADVISOR INC,1526520.0,TRIP,13168,2011/12/21,E
874,CBRE GROUP INC,1138118.0,CBRE,90199,2004/06/10,E
875,WELLCARE HEALTH PLANS INC,1279363.0,WCG,90272,2004/07/07,2020/01/31
876,LYONDELLBASELL INDUSTRIES NV,1489393.0,LYB,12345,2010/10/14,E


We found that there are 18 null values of CIK and we dropped those 18 rows. `cik_lookup` is our new dataframe. And we list out all unique CIKs we have right now in the following cells. 

We also notice that some company name, CIK and ticker combinations are duplicated because they correspond to more than one LINKDT or LINKENDDT, so we drop the duplicates. After this preprocessing, we have 726 CIKs left.

In [None]:
# Collapse repeated (company name, CIK, ticker) rows to their first occurrence,
# then renumber the index from 0.
cik_lookup = (
    cik_lookup
    .drop_duplicates(subset=["conm", "cik", "tic"])
    .reset_index(drop=True)
)

In [None]:
cik_lookup

Unnamed: 0,conm,cik,tic,LPERMNO,LINKDT,LINKENDDT
0,AMERICAN AIRLINES GROUP INC,6201.0,AAL,21020,2013/12/09,E
1,PINNACLE WEST CAPITAL CORP,764622.0,PNW,27991,1962/01/31,E
2,ABBOTT LABORATORIES,1800.0,ABT,20482,1950/01/01,1962/01/30
3,ADVANCED MICRO DEVICES,2488.0,AMD,61241,1972/12/14,E
4,AETNA INC,1122304.0,AET,88845,2000/12/14,2018/11/30
...,...,...,...,...,...,...
721,TRIPADVISOR INC,1526520.0,TRIP,13168,2011/12/21,E
722,CBRE GROUP INC,1138118.0,CBRE,90199,2004/06/10,E
723,WELLCARE HEALTH PLANS INC,1279363.0,WCG,90272,2004/07/07,2020/01/31
724,LYONDELLBASELL INDUSTRIES NV,1489393.0,LYB,12345,2010/10/14,E


In [None]:
print(cik_lookup['cik'].nunique())
print(cik_lookup['LPERMNO'].nunique())

726
715


Now we use the dataset provided by the instructor and TAs to get the list of companies included in the S&P 500 for every month from 2011 to 2021. We first assign a label indicating the year and month to every entry. Since the S&P 500 rebalances its membership multiple times during a year, we think that grouping all members monthly makes sense for our analysis. Then we use the dataframe above with CIK and LPERMNO to assign a CIK to each entry in the dataframe below.

In [None]:
# Load the S&P 500 membership table; .iloc[:, 1:] discards the first CSV column
# (presumably a saved row index — TODO confirm against the file).
sp500_w_addl_id = pd.read_csv(sp500_id_path).iloc[:, 1:]

In [None]:
sp500_w_addl_id

Unnamed: 0,date,permno,comnam,ncusip,shrcd,exchcd,hsiccd,ticker,gvkey,iid,start,ending,ret
0,2011-01-31,60986.0,NEWELL RUBBERMAID INC,65122910,11.0,1.0,3089.0,NWL,7875,01,1989-04-27,2022-03-31,0.058856
1,2011-01-31,85914.0,BEST BUY COMPANY INC,08651610,11.0,1.0,5731.0,BBY,2184,01,1999-06-30,2022-03-31,-0.008457
2,2011-01-31,80711.0,APARTMENT INVESTMENT & MGMT CO,03748R10,18.0,1.0,6798.0,AIV,30490,01,2003-03-14,2020-12-18,-0.010836
3,2011-01-31,59176.0,AMERICAN EXPRESS CO,02581610,11.0,1.0,6141.0,AXP,1447,01,1976-07-01,2022-03-31,0.014912
4,2011-01-31,75100.0,TIFFANY & CO NEW,88654710,11.0,1.0,5944.0,TIF,13646,01,2000-06-21,2021-01-06,-0.066485
...,...,...,...,...,...,...,...,...,...,...,...,...,...
66113,2021-12-31,83143.0,IRON MOUNTAIN INC NEW,46284V10,18.0,1.0,4226.0,IRM,62374,1,2009-01-06,2022-03-31,0.165240
66114,2021-12-31,76605.0,AUTOZONE INC,05333210,11.0,1.0,5531.0,AZO,23809,1,1997-01-02,2022-03-31,0.153720
66115,2021-12-31,60097.0,MEDTRONIC PLC,G5960L10,12.0,1.0,3845.0,MDT,7228,1,1986-10-23,2022-03-31,-0.024555
66116,2021-12-31,27828.0,H P INC,40434L10,11.0,1.0,3571.0,HPQ,5606,1,1974-10-17,2022-03-31,0.074830


#### Merge two dataframes to assign CIK to SP500 companies

In [None]:
# Attach a CIK to every S&P 500 row by matching its permno against LPERMNO in cik_lookup.
# permno is parsed as float (e.g. 60986.0); cast it to int before joining on the integer LPERMNO.
sp500_w_addl_id['permno'] = sp500_w_addl_id['permno'].astype(int)

# NOTE(review): cik_lookup reports more unique CIKs (726) than unique LPERMNOs (715), so some
# permnos appear to map to several CIKs and their rows would be repeated by this join — confirm
# whether that fan-out is intended.
sp500_w_addl_id = (
    sp500_w_addl_id
    # Drop columns left behind by a previous run of this cell so that re-running it does not
    # merge a second time and produce cik_x / cik_y; on a first run nothing is dropped.
    .drop(columns=['cik', 'LPERMNO'], errors='ignore')
    .merge(right=cik_lookup[['cik', 'LPERMNO']],
           left_on='permno', right_on='LPERMNO', how="inner")
)

In [None]:
sp500_w_addl_id.head()

Unnamed: 0,date,permno,comnam,ncusip,shrcd,exchcd,hsiccd,ticker,gvkey,iid,start,ending,ret,cik,LPERMNO
0,2011-01-31,60986,NEWELL RUBBERMAID INC,65122910,11.0,1.0,3089.0,NWL,7875,1,1989-04-27,2022-03-31,0.058856,814453.0,60986
1,2011-02-28,60986,NEWELL RUBBERMAID INC,65122910,11.0,1.0,3089.0,NWL,7875,1,1989-04-27,2022-03-31,0.007273,814453.0,60986
2,2011-03-31,60986,NEWELL RUBBERMAID INC,65122910,11.0,1.0,3089.0,NWL,7875,1,1989-04-27,2022-03-31,-0.010858,814453.0,60986
3,2011-04-29,60986,NEWELL RUBBERMAID INC,65122910,11.0,1.0,3089.0,NWL,7875,1,1989-04-27,2022-03-31,-0.003659,814453.0,60986
4,2011-05-31,60986,NEWELL RUBBERMAID INC,65122910,11.0,1.0,3089.0,NWL,7875,1,1989-04-27,2022-03-31,-0.061385,814453.0,60986


In [None]:
print(sp500_w_addl_id['cik'].nunique())
print(sp500_w_addl_id['permno'].nunique())
print(sp500_w_addl_id['ticker'].nunique())

726
715
756


In [None]:
def assign_month_label(date_str: str) -> str:
    """Return ``date_str`` with its last three characters removed.

    For a ``YYYY-MM-DD`` string this strips the ``-DD`` suffix and leaves the
    ``YYYY-MM`` month label.
    """
    suffix_len = 3  # length of the trailing "-DD" part
    month_label = date_str[:-suffix_len]
    return month_label

In [None]:
# Tag every row with the label that assign_month_label derives from its date.
sp500_w_addl_id["LABEL"] = sp500_w_addl_id["date"].apply(assign_month_label)

#### For this part, we create a dictionary of lists, each of which contains all S&P 500 companies in a certain month of a certain year.

In [None]:
# Build a dictionary mapping every "YYYY-MM" label from 2011-01 to 2021-12 to the list of
# unique S&P 500 tickers that carry that label in sp500_w_addl_id.
month_list = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
mon_keys = [str(year) + "-" + month for year in range(2011, 2022) for month in month_list]
print(mon_keys)
print(len(mon_keys))  # this should be 12 * 11 = 132 months in total

# For each month, select its rows and keep the unique tickers in order of first appearance;
# a month with no rows maps to an empty list.
monthly_sp500_dict = {
    key: list(sp500_w_addl_id.loc[sp500_w_addl_id['LABEL'] == key, 'ticker'].unique())
    for key in mon_keys
}

['2011-01', '2011-02', '2011-03', '2011-04', '2011-05', '2011-06', '2011-07', '2011-08', '2011-09', '2011-10', '2011-11', '2011-12', '2012-01', '2012-02', '2012-03', '2012-04', '2012-05', '2012-06', '2012-07', '2012-08', '2012-09', '2012-10', '2012-11', '2012-12', '2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12', '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12', '2015-01', '2015-02', '2015-03', '2015-04', '2015-05', '2015-06', '2015-07', '2015-08', '2015-09', '2015-10', '2015-11', '2015-12', '2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06', '2016-07', '2016-08', '2016-09', '2016-10', '2016-11', '2016-12', '2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06', '2017-07', '2017-08', '2017-09', '2017-10', '2017-11', '2017-12', '2018-01', '2018-02', '2018-03', '2018-04', '2018-05', '2018-06', '2018-07'

In [None]:
# Check how many companies are in each quarter
for key, l in monthly_sp500_dict.items():
    print("{}: {} companies".format(key, len(l)))

2011-01: 494 companies
2011-02: 494 companies
2011-03: 494 companies
2011-04: 494 companies
2011-05: 494 companies
2011-06: 495 companies
2011-07: 495 companies
2011-08: 495 companies
2011-09: 495 companies
2011-10: 495 companies
2011-11: 496 companies
2011-12: 496 companies
2012-01: 496 companies
2012-02: 496 companies
2012-03: 496 companies
2012-04: 496 companies
2012-05: 496 companies
2012-06: 496 companies
2012-07: 496 companies
2012-08: 496 companies
2012-09: 496 companies
2012-10: 496 companies
2012-11: 496 companies
2012-12: 496 companies
2013-01: 496 companies
2013-02: 496 companies
2013-03: 496 companies
2013-04: 496 companies
2013-05: 496 companies
2013-06: 496 companies
2013-07: 496 companies
2013-08: 496 companies
2013-09: 496 companies
2013-10: 496 companies
2013-11: 496 companies
2013-12: 496 companies
2014-01: 496 companies
2014-02: 496 companies
2014-03: 496 companies
2014-04: 496 companies
2014-05: 495 companies
2014-06: 495 companies
2014-07: 495 companies
2014-08: 49

From the printed result above, we can see that number of companies in S&P500 within a certain month is usually around 494-496 from 2011-2021. When we try to count the number of companies in S&P500 within a certain quarter, the number varies from less than 500 to more than 500, which does not make sense. From the result here, we believe that checking S&P500 membership month-by-month would be more reasonable than checking S&P500 membership quarter-by-quarter.

In the next step, we are going to download all available 10-K files of S&P500 companies month-by-month. We start with a certain month, and obtain all tickers of companies which belong to S&P500 in that month, and then we check whether a company would have available 10-K during that month. If it does, we download that file returned; if not, we skip and jump to next ticker. 

In [None]:
# Download the 10-K filings of S&P 500 members one calendar month at a time: for each month
# label, query every ticker in that month's membership list for 10-Ks filed within the month
# and save whatever comes back under data_path_10k/<ticker>.
#
# NOTE(review): the user agent is a random string, as in the original code. SEC EDGAR's access
# guidelines ask clients to declare an identifying User-Agent — consider replacing it with a
# real "Name (email)" value; confirm with the secedgar documentation.
failed_10k_downloads = []  # (label, ticker, error) for every request that raised

for label in tqdm(monthly_sp500_dict.keys()):
    year, month = (int(part) for part in label.split("-"))
    # monthrange returns (weekday of the 1st, number of days in the month) and accounts for
    # leap years, so no hand-maintained month-length table is needed.
    _, end_day = calendar.monthrange(year, month)
    start_date = date(year, month, 1)
    end_date = date(year, month, end_day)

    for ticker in tqdm(monthly_sp500_dict[label]):
        # download 10-K in this month and in the list of S&P 500 firm list
        try:
            N = random.choice(list(range(3, 20)))
            random_str = ''.join(random.choices(string.ascii_letters, k=N))
            file_10k = filings(cik_lookup=ticker,
                               start_date=start_date,
                               end_date=end_date,
                               filing_type=FilingType.FILING_10K,
                               user_agent=random_str)
            file_10k.save(data_path_10k, dir_pattern="{}".format(ticker))
        except Exception as exc:
            # Best-effort download: a failure for one ticker (presumably including "no 10-K
            # filed this month") must not stop the run, so record it and move on. Catching
            # Exception instead of using a bare except lets KeyboardInterrupt stop the loop.
            failed_10k_downloads.append((label, ticker, repr(exc)))
    print("================{}==================".format(label))

#### After downloading, we check that we now have 4353 10-K files. 

In [27]:
# Count the entries stored inside each per-ticker folder of the 10-K directory and total them.
ticker_list = os.listdir(data_path_10k)
num_10k_downloaded = sum(
    len(os.listdir(os.path.join(data_path_10k, ticker_dir)))
    for ticker_dir in ticker_list
)
print(num_10k_downloaded)

4353


#### Similar code for downloading 10-Q files is listed below

In [None]:
# Download the 10-Q filings of S&P 500 members one calendar month at a time: for each month
# label, query every ticker in that month's membership list for 10-Qs filed within the month
# and save whatever comes back under data_path_10q/<ticker>.
#
# NOTE(review): the user agent is a random string, as in the original code. SEC EDGAR's access
# guidelines ask clients to declare an identifying User-Agent — consider replacing it with a
# real "Name (email)" value; confirm with the secedgar documentation.
failed_10q_downloads = []  # (label, ticker, error) for every request that raised

for label in tqdm(monthly_sp500_dict.keys()):
    year, month = (int(part) for part in label.split("-"))
    # monthrange returns (weekday of the 1st, number of days in the month) and accounts for
    # leap years, so no hand-maintained month-length table is needed.
    _, end_day = calendar.monthrange(year, month)
    start_date = date(year, month, 1)
    end_date = date(year, month, end_day)

    for ticker in tqdm(monthly_sp500_dict[label]):
        # download 10-Q in this month and in the list of S&P 500 firm list
        try:
            N = random.choice(list(range(3, 20)))
            random_str = ''.join(random.choices(string.ascii_letters, k=N))
            file_10q = filings(cik_lookup=ticker,
                               start_date=start_date,
                               end_date=end_date,
                               filing_type=FilingType.FILING_10Q,
                               user_agent=random_str)
            file_10q.save(data_path_10q, dir_pattern="{}".format(ticker))
        except Exception as exc:
            # Best-effort download: a failure for one ticker (presumably including "no 10-Q
            # filed this month") must not stop the run, so record it and move on. Catching
            # Exception instead of using a bare except lets KeyboardInterrupt stop the loop.
            failed_10q_downloads.append((label, ticker, repr(exc)))
    print("================{}==================".format(label))