<a href="https://colab.research.google.com/github/thowley1207/capstone_project/blob/main/01_obtain_and_combine_sec_master_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade wrds
!wget https://raw.githubusercontent.com/thowley0824/capstone/main/colab_initialization/initializer.py

import json
import pandas as pd
import pathlib
import numpy as np
import requests
import zipfile

# initializer.py is the helper module fetched by the wget command at the
# top of this cell.
import initializer
# NOTE(review): presumably prepares the Colab runtime for this notebook;
# the helper's source is not visible here -- confirm what it sets up.
initializer.initialize_colab()
# Object returned by the helper's WRDS connection routine. It is not
# referenced by any of the cells shown here.
db = initializer.initialize_wrds_connection()

In [None]:
def convert_and_format_df(records,
                          required_cols = None,
                          float_cols = None,
                          int_cols = None,
                          date_cols = None):
    """Build a DataFrame from records, tidy its column names and cast columns.

    Column labels are lower-cased and spaces are replaced by underscores.
    The names given in ``float_cols``, ``int_cols`` and ``date_cols`` are
    normalised the same way before lookup, so either spelling works there.

    Args:
        records: Input accepted by ``pd.DataFrame.from_records``.
        required_cols: Optional iterable of column labels to keep, spelled
            as they appear in ``records`` (before normalisation). ``None``
            keeps every column.
        float_cols: Optional iterable of columns to cast with
            ``astype(float)``.
        int_cols: Optional iterable of columns to cast with ``astype(int)``.
        date_cols: Optional iterable of columns to parse with
            ``pd.to_datetime``.

    Returns:
        The formatted DataFrame. When both ``year`` and ``quarter`` columns
        are present, an integer ``period`` column is added, built as
        ``<year>0<quarter>`` (e.g. year 2017, quarter 4 -> 201704).
    """
    frame = pd.DataFrame.from_records(records)
    if required_cols is not None:
        frame = frame[list(required_cols)]

    frame.columns = frame.columns.str.lower().str.replace(' ', '_')

    # Casts are applied in a fixed order: floats, then ints, then dates.
    casts = (
        (float_cols, lambda series: series.astype(float)),
        (int_cols, lambda series: series.astype(int)),
        (date_cols, pd.to_datetime),
    )
    for requested, cast in casts:
        if requested is None:
            continue
        for label in requested:
            key = label.lower().replace(' ', '_')
            frame[key] = cast(frame[key])

    if all(name in frame.columns for name in ('year', 'quarter')):
        year_text = frame['year'].astype(str)
        quarter_text = frame['quarter'].astype(str)
        frame['period'] = (year_text + '0' + quarter_text).astype(int)

    return frame

In [None]:
def download_and_write_index_file(
        output_subdir = 'data/sec_edgar/edgar_index_quarterly',
        year = 2017,
        quarter = 4,
        **headers):
    """Download one quarterly EDGAR master index zip and save it to disk.

    The file is fetched from
    ``https://www.sec.gov/Archives/edgar/full-index/<year>/QTR<quarter>/master.zip``
    and written to ``<cwd>/<output_subdir>/<year>_QTR<quarter>_master.zip``.

    Args:
        output_subdir: Directory (relative to the current working
            directory) that receives the zip file; created if missing.
        year: Year of the index to download.
        quarter: Quarter (as used in the ``QTR<quarter>`` URL segment).
        **headers: HTTP request headers, passed to ``requests.get`` as its
            ``headers`` argument when any are given.

    Returns:
        ``pathlib.Path`` of the written zip file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Without this check an error page would be saved as the zip
            file and only fail later when it is opened.
    """
    output_dir = pathlib.Path(output_subdir)
    if not output_dir.exists():
        print('Creating output directory')
        # exist_ok guards against the directory appearing between the
        # check above and this call.
        output_dir.mkdir(parents=True, exist_ok=True)

    output_file_name = f'{year}_QTR{quarter}_master.zip'
    output_loc = pathlib.Path.cwd() / output_subdir / output_file_name

    url_prefix = 'https://www.sec.gov/Archives/edgar/full-index/'
    url_suffix = f'{year}/QTR{quarter}/master.zip'

    # A timeout keeps a stalled connection from hanging the whole run.
    request_args = {'url': f'{url_prefix}{url_suffix}', 'timeout': 60}
    if headers:
        request_args['headers'] = headers

    response = requests.get(**request_args)
    # Fail loudly on 4xx/5xx instead of writing the error body to disk.
    response.raise_for_status()
    output_loc.write_bytes(response.content)

    print(f'Output written: {output_loc}')

    return output_loc

In [None]:
def read_and_process_index_data(zipped_output,
                                form_type,
                                year,
                                quarter):
    """Extract the rows of one form type from a zipped EDGAR master index.

    Reads ``master.idx`` from the zip archive, keeps the pipe-delimited
    rows whose form-type field equals ``form_type`` exactly, and converts
    each kept row to a dictionary.

    Args:
        zipped_output: Path (or file-like object) of the zip archive that
            contains ``master.idx``.
        form_type: Form type to keep, e.g. ``'8-K'``. Matching is exact, so
            ``'8-K'`` does not select ``'8-K/A'`` rows.
        year: Year stored under the ``'year'`` key of every record.
        quarter: Quarter stored under the ``'quarter'`` key of every record.

    Returns:
        List of dictionaries with keys ``year``, ``quarter``, ``CIK``,
        ``Company Name``, ``Form Type``, ``Date Filed`` and ``Filename``.
        All values read from the index are strings.
    """
    period = {'year': year, 'quarter': quarter}
    index_labels = 'CIK|Company Name|Form Type|Date Filed|Filename'.split('|')
    form_type_pos = index_labels.index('Form Type')

    with zipfile.ZipFile(zipped_output, mode="r") as index_data:
        index_text = index_data.read("master.idx").decode(
            encoding="ISO-8859-1")

    # Accept both CRLF and LF line endings. Splitting on "\r\n" alone would
    # treat an LF-only file as a single row and yield one bogus record.
    # str.splitlines() is avoided on purpose: it also breaks on characters
    # such as \x85 that can legitimately occur in ISO-8859-1 text.
    index_rows = index_text.replace("\r\n", "\n").split("\n")

    index_record = []
    for row in index_rows:
        fields = row.split('|')
        # Header, separator and blank lines have too few fields; compare
        # the form-type field itself rather than searching the whole row.
        if len(fields) < len(index_labels):
            continue
        if fields[form_type_pos] != form_type:
            continue
        index_record.append({**period, **dict(zip(index_labels, fields))})

    return index_record

In [None]:
def create_and_write_index_df(index_records,
                              output_file_name,
                              output_dir = 'data/',
                              form_type = '8-K',
                              **df_column_specs):
    """Convert index records to a DataFrame and pickle it.

    The records are passed through ``convert_and_format_df``; the
    ``date_filed`` column is renamed to ``event_date`` and the row index is
    moved into a leading ``event_id`` column before writing.

    Args:
        index_records: Records accepted by ``convert_and_format_df``.
        output_file_name: File name of the pickle (without directory).
        output_dir: Directory that receives the pickle; created if
            missing. A trailing slash is optional.
        form_type: When equal to ``'8-K'`` the file name is prefixed with
            ``'8k_'``; any other value adds no prefix.
        **df_column_specs: Keyword arguments forwarded to
            ``convert_and_format_df`` (``required_cols``, ``int_cols``, ...).

    Returns:
        The DataFrame that was written.
    """
    output_path = pathlib.Path(output_dir)
    if not output_path.exists():
        print('Creating output directory')
        output_path.mkdir(parents=True, exist_ok=True)

    output_file_prefix = '8k_' if form_type == '8-K' else ''

    # Join with pathlib rather than string concatenation so the file lands
    # inside output_dir whether or not the caller ended it with a slash.
    output_file = output_path / f"{output_file_prefix}{output_file_name}"

    df_combined_indices = convert_and_format_df(index_records,
                                                **df_column_specs)
    df_combined_indices = df_combined_indices.rename(
        columns={"date_filed": "event_date"})
    df_combined_indices = df_combined_indices.reset_index(
        names = ['event_id'])

    df_combined_indices.to_pickle(output_file)

    return df_combined_indices

In [None]:
# Build one {'year': ..., 'quarter': ...} dict per reporting period
# (every quarter 1-4 of the years 2005 through 2018) for use in
# iteratively retrieving and processing index data from EDGAR.
reporting_periods = [
    {'year': year, 'quarter': quarter}
    for year in range(2005, 2019)
    for quarter in range(1, 5)
]

# Parameters required to step through the data retrieval and conversion
# process.

# Form type to keep from each index, and where the raw zips are stored.
form = '8-K'
raw_index_data_dir = 'data/sec_edgar/edgar_index_quarterly'

# HTTP headers sent with every index download request.
header_content = {
    'User-Agent': 'Georgia Tech thowley3@georgiatech.edu',
    'Accept-Encoding': 'deflate, gzip',
    'Host': 'www.sec.gov',
}

# Output file name and column handling for the combined index:
# required_cols uses the labels as they appear in the raw records, while
# the int/date entries use the normalised (lower-case, underscored) names.
master_index_all_periods_file_name = 'master_index_all_periods.pkl'
master_index_all_periods_col_specs = dict(
    required_cols=('year', 'quarter', 'CIK', 'Date Filed', 'Filename'),
    int_cols=('year', 'quarter', 'cik'),
    date_cols=('date_filed',),
)

# (1) Create an empty list for holding the index records of all periods
# (2) Iterating through each year-quarter period:
#     (a) Download and write the period's raw index file
#     (b) Read the period's raw index file and convert the rows for the
#         selected form type to a list of record dictionaries
#     (c) Add those records to the combined list
# (3) Convert the combined list of records to a single df
#     - As specified, drop unneeded columns and convert types
# (4) Write the dataframe output to pickle in specified location

record_all_periods = []

for period in reporting_periods:

    output = download_and_write_index_file(
        output_subdir = raw_index_data_dir,
        **period,
        **header_content)

    record_single_period = read_and_process_index_data(
        zipped_output = output,
        form_type = form,
        **period)

    record_all_periods.extend(record_single_period)

master_index_all_periods = create_and_write_index_df(
    record_all_periods,
    master_index_all_periods_file_name,
    output_dir = 'data/edgar_wrds_linking/',
    # Pass the form explicitly so the output file prefix follows `form`
    # instead of silently relying on the function's '8-K' default.
    form_type = form,
    **master_index_all_periods_col_specs)