<a href="https://colab.research.google.com/github/thowley1207/capstone_project/blob/04/04_obtain_event_study_returns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade wrds
!wget https://raw.githubusercontent.com/thowley1207/capstone_project/main/colab_initialization/initializer.py

import json
import pandas as pd
import pathlib
import numpy as np
import requests
import zipfile

# initializer.py is the project helper module downloaded by the wget command
# at the top of this cell.
import initializer
# NOTE(review): presumably prepares the Colab runtime (e.g. storage mount /
# working directory) -- confirm against initializer.py.
initializer.initialize_colab()
# WRDS connection handle; later cells run their SQL through this object
# (see execute_wrds_query, which calls db.raw_sql).
db = initializer.initialize_wrds_connection()

In [None]:
# ---------------------------------------------------------------------------
# Data subdirectories and form-type prefix.
# Where applicable, the prefix is prepended to every file name used below.
# ---------------------------------------------------------------------------
linking_data_subdir = 'data/edgar_wrds_linking/'
event_study_data_subdir = 'data/event_study/returns/'
file_prefix = '8k_'

# ---------------------------------------------------------------------------
# File name carried down from prior work.
# ---------------------------------------------------------------------------
event_subset_file_name = 'event_subset.pkl'

# ---------------------------------------------------------------------------
# New file names written by this notebook: four parts each, numbered 1-4.
# ---------------------------------------------------------------------------
event_study_ret_data_file_names = [
    f'event_study_ret_data_pt_{part}.pkl' for part in range(1, 5)]

est_win_data_file_names = [
    f'est_win_data_pt_{part}.pkl' for part in range(1, 5)]

In [None]:
# Read in the event subset data: the pickle lives in the linking-data
# subdirectory and its file name carries the form-type prefix.
event_subset = pd.read_pickle(
    f"{linking_data_subdir}{file_prefix}{event_subset_file_name}")

### **QUERY HELPER FUNCTION**

In [None]:
def execute_wrds_query(query_content,
                       wrds_session = None,
                       output_directory = 'data/',
                       output_file = None):
    """Run a SQL query through a WRDS session and optionally save the result.

    Args:
        query_content: SQL text handed to the session's ``raw_sql`` method.
        wrds_session: Object exposing ``raw_sql(sql)``. When ``None`` (the
            default) the module-level ``db`` connection is looked up at call
            time, so a re-created ``db`` is picked up automatically.
        output_directory: Directory prefix for the output file. It is
            concatenated directly with ``output_file``, so it should end
            with a path separator.
        output_file: Target file name ending in ``.csv`` or ``.pkl``, or
            ``None`` to skip writing.

    Returns:
        Whatever ``raw_sql`` returns for the query (the code relies on it
        having ``to_csv`` / ``to_pickle`` when ``output_file`` is given).

    Raises:
        ValueError: If ``output_file`` has an unsupported extension. This is
            checked before the query runs so that a bad file name does not
            cost a full round trip to the database.
    """
    if output_file is not None and not output_file.endswith(('.csv', '.pkl')):
        raise ValueError("Invalid File Format Provided For Output")

    # Honour the caller-supplied session; fall back to the notebook-wide one.
    if wrds_session is None:
        wrds_session = db

    query_result = wrds_session.raw_sql(query_content)

    if output_file is None:
        print(f"""Warning: output is not saved to Google Drive.""")
        return query_result

    output_path = f"""{output_directory}{output_file}"""
    print(f"""Writing query result to: {output_path}""")

    if output_file.endswith('.csv'):
        query_result.to_csv(output_path)
    else:
        # Extension was validated above, so this is the '.pkl' case.
        query_result.to_pickle(output_path)
    print(f"""Query result successfully written.""")

    return query_result

 **Step 1:**

* **Get Event Study Return Data For All Dates**
    * For each event, get the return data for all dates beginning with the start of the estimation window and ending with the end of the possible event windows
    * Due to the size of this data, we split this output into four parts
    * We then obtain the data for each part and write the output separately for each part

In [None]:
# Period labels have the form year * 100 + k, with k in 1..4 (presumably the
# quarter index), for every year from 2005 through 2018: 56 labels in total.
_all_period_labels = [year * 100 + k
                      for year in range(2005, 2019)
                      for k in range(1, 5)]

# Split the labels into four consecutive parts of 14 labels each.
_labels_per_part = 14
period_1 = _all_period_labels[0 * _labels_per_part:1 * _labels_per_part]
period_2 = _all_period_labels[1 * _labels_per_part:2 * _labels_per_part]
period_3 = _all_period_labels[2 * _labels_per_part:3 * _labels_per_part]
period_4 = _all_period_labels[3 * _labels_per_part:4 * _labels_per_part]

periods_lst = [period_1, period_2, period_3, period_4]

In [None]:
# Columns of ``event_subset`` that the SQL below declares in
# json_to_recordset. Only these are serialized: any other column would be
# ignored by the query anyway, and sending it only inflates the statement and
# risks breaking the SQL string literal (e.g. free text with apostrophes).
event_info_cols = ['event_id',
                   'period',
                   'permno',
                   'event_date',
                   'est_per_start',
                   'est_per_end',
                   'event_wind_start',
                   'event_wind_end']

# Columns of the query result that are converted to pandas datetimes
# before each part is written out.
date_cols = ['event_date',
             'est_per_start',
             'est_per_end',
             'event_wind_start',
             'event_wind_end',
             'ret_date']

# One output file per part; each part is a list of period labels, and the
# query is issued once per period label.
for i, (periods, event_study_ret_data_file_name) in enumerate(
        zip(periods_lst, event_study_ret_data_file_names)):

    l_event_study_ret_data = []

    for per in periods:

        mod_types_event_subset = event_subset.loc[
            event_subset['period'] == per, event_info_cols].copy()

        # Datetime columns are cast to strings so that to_json emits date
        # text that the ``date`` casts in json_to_recordset can parse.
        # is_datetime64_any_dtype also covers non-nanosecond and tz-aware
        # datetime columns, which a comparison with 'datetime64[ns]' misses.
        for col in mod_types_event_subset.columns:
            if pd.api.types.is_datetime64_any_dtype(
                    mod_types_event_subset[col]):
                mod_types_event_subset[
                    col] = mod_types_event_subset[col].astype(str)

        # The JSON is embedded in a single-quoted SQL literal, so any single
        # quote must be doubled to keep the literal well formed.
        json_event_subset = mod_types_event_subset.to_json(
            orient="records").replace("'", "''")

        # Query outline:
        #   event_info     - the events of this period, parsed from the JSON
        #   crsp_daily     - non-null daily returns from crsp_a_stock.dsf for
        #                    the events' permnos over the overall date range
        #   market_returns - ff_all.factors_daily over the same range, with
        #                    mkt_return = mktrf + rf
        #   crsp_w_market  - security returns joined to market returns by
        #                    date; sec_excess_return = sec_return - mkt_return
        # The final select attaches to each event every return date between
        # its est_per_start and event_wind_end and flags estimation-period,
        # event-window and event-date rows.
        q_event_study = f"""
    with
    event_info as
    (
        select
           e.event_id,
           e.period,
           e.permno,
           e.event_date,
           e.est_per_start,
           e.est_per_end,
           e.event_wind_start,
           e.event_wind_end
       from json_to_recordset('{json_event_subset}') as e(
                event_id int,
                period int,
                permno float,
                event_date date,
                est_per_start date,
                est_per_end date,
                event_wind_start date,
                event_wind_end date)
    ),

    crsp_daily as
    (
        select
            dsf.permno,
            dsf.date as ret_date,
            dsf.ret as sec_return
        from crsp_a_stock.dsf as dsf
        where dsf.ret is not null
        and dsf.date between
            (select min(e.est_per_start) from event_info e) and
            (select max(e.event_wind_end) from event_info e)
        and dsf.permno in (select distinct e.permno from event_info e)
    ),

    market_returns as
    (
        select
            ff.date as mkt_date,
            ff.mktrf as mkt_excess_return,
            ff.rf as risk_free,
            ff.mktrf + ff.rf as mkt_return
        from ff_all.factors_daily as ff
        where date between
            (select min(e.est_per_start) from event_info e) and
            (select max(e.event_wind_end) from event_info e)
    ),

    crsp_w_market as
    (
        select
            crsp.permno,
            crsp.ret_date,
            crsp.sec_return,
            crsp.sec_return - m.mkt_return as sec_excess_return,
            m.mkt_excess_return,
            m.risk_free,
            m.mkt_return
        from crsp_daily crsp
            join market_returns m
                on crsp.ret_date = m.mkt_date
    )

    select
        e.event_id,
        e.period,
        e.permno,
        e.event_date,
        e.est_per_start,
        e.est_per_end,
        e.event_wind_start,
        e.event_wind_end,
        cm.ret_date,
        cm.sec_return,
        cm.sec_excess_return,
        cm.mkt_excess_return,
        cm.risk_free,
        cm.mkt_return,
        case
            when cm.ret_date between e.est_per_start and e.est_per_end
                then 1
            else 0
        end as est_per_flag,
        case
            when cm.ret_date
                between e.event_wind_start and e.event_wind_end then 1
            else 0
        end as event_wind_flag,
        case
            when cm.ret_date = e.event_date then 1
            else 0
            end as event_date_flag
    from event_info e
        join crsp_w_market cm
            on e.permno = cm.permno
            and cm.ret_date between e.est_per_start and e.event_wind_end
    order by e.event_id, cm.ret_date;
    """

        event_study_ret_data_per = execute_wrds_query(q_event_study)
        print(f'''{per} event study data retreived.
        Adding to event study data list.''')

        l_event_study_ret_data.append(event_study_ret_data_per)

        print(f'''{per} event study data added to combined data list.''')

    # Combine the per-period results of this part into one sorted frame.
    event_study_ret_data = pd.concat(l_event_study_ret_data,
                                     ignore_index = True
                                     ).sort_values(by = ['period',
                                                         'event_id',
                                                         'ret_date'])

    for col in date_cols:
        event_study_ret_data[col] = pd.to_datetime(event_study_ret_data[col])

    event_study_ret_data.to_pickle((
        event_study_data_subdir +
        file_prefix +
        event_study_ret_data_file_name
        ))

    print(f'''Event study return data part {i+1} generated.
    Output written as {event_study_ret_data_file_name}''')

 **Step 2:**

* **Create Event Study Return Data For The Estimation Window Only**
    * Although the estimation window data output is smaller (3.6 MB),
   we still keep it partitioned and write it out in pieces
    * This is because the size of the event study return data files makes
     the possibility of failure higher when reading each one
     in a loop, and each operation takes a long time
    * This way, if any individual step fails, we will still
     have the prior parts saved.
    * Additionally, the next step (generating regression parameters) is very time consuming.
    * By keeping the estimation window data split in parts,
     we are able to monitor the progress of the regression
     parameter generation, and if there is an issue, we can
     adjust our approach and save each regression parameter part individually

In [None]:
# For each part: load the full event study return data, keep only the
# estimation-period rows and the three columns carried forward, then write
# the reduced frame out under the matching estimation-window file name.
for i, (ret_data_file, est_win_file) in enumerate(
        zip(event_study_ret_data_file_names, est_win_data_file_names)):
    event_study_ret_data = pd.read_pickle(
        f"{event_study_data_subdir}{file_prefix}{ret_data_file}")

    in_est_window = event_study_ret_data['est_per_flag'] == 1
    # Preserve the frame's own column order among the retained columns.
    retained_cols = [
        name for name in event_study_ret_data.columns
        if name in ['event_id', 'sec_return', 'mkt_return']]
    est_win_data = event_study_ret_data.loc[in_est_window, retained_cols]

    est_win_data.to_pickle(
        f"{event_study_data_subdir}{file_prefix}{est_win_file}")

    print(f'''Estimation window data part {i+1} generated.
    Output written as {est_win_file}''')