<a href="https://colab.research.google.com/github/thowley1207/capstone_project/blob/06/06_generate_event_car_labels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade wrds
!wget https://raw.githubusercontent.com/thowley1207/capstone_project/main/colab_initialization/initializer.py

import json
import pandas as pd
import pathlib
import numpy as np
import requests
import zipfile

# Project-local helper module fetched by the `!wget` line at the top of this cell.
import initializer
# NOTE(review): presumably prepares the Colab runtime so that the relative
# data paths used below resolve -- confirm against initializer.py.
initializer.initialize_colab()

In [None]:
# ---------------------------------------------------------------------------
# Processed-data subdirectories and form-type prefix.
# The prefix is prepended to each data file name read or written below.
# ---------------------------------------------------------------------------
labels_data_subdir = 'data/event_study/labels/'
results_data_subdir = 'data/event_study/results/'
file_prefix = '8k_'

# ---------------------------------------------------------------------------
# File name carried down from prior work.
# ---------------------------------------------------------------------------
event_car_data_file_name = 'event_car_data.pkl'

# ---------------------------------------------------------------------------
# New file names introduced by this notebook.
# ---------------------------------------------------------------------------
event_car_data_2_bins_file_name = 'event_car_data_2_bins.pkl'
event_car_data_3_bins_file_name = 'event_car_data_3_bins.pkl'

In [None]:
# Read in the event CAR data: <results subdir> + <form-type prefix> + <file name>.
event_car_data = pd.read_pickle(
    f'{results_data_subdir}{file_prefix}{event_car_data_file_name}'
)

 **Step 1:**

* **Create Labels Binning CAR / SCAR Data Into Three Bins**
    * For each CAR / SCAR column in the event CAR data dataframe, split the results into quartiles
    * Replace all raw CAR / SCAR values with integer labels as follows:
        1. Label = 0 where event CAR percentile <= .25 (bottom quartile)
        2. Label = 1 where .25 < event CAR percentile <= .75 (middle two quartiles)
        3. Label = 2 where event CAR percentile > .75 (top quartile)
    * This will allow us to try to classify events into three bins using the LLM (negative, neutral, positive)


In [None]:
def _quartile_to_three_bins(values):
    """Label a numeric Series 0 / 1 / 2 by quartile.

    Parameters
    ----------
    values : pd.Series
        Numeric values to bin (one CAR / SCAR column).

    Returns
    -------
    pd.Series
        Same index as ``values``. 0 = bottom quartile, 1 = the two middle
        quartiles, 2 = top quartile. Missing inputs stay missing.

    Raises
    ------
    ValueError
        Propagated from ``pd.qcut`` when the quartile edges are not unique.
    """
    quartile = pd.qcut(values, q=[0, .25, .5, .75, 1], labels=False)
    # Quartile codes run 0-3. Codes 0 and 1 are kept; codes 2 and 3 are
    # shifted down by one, so the two middle quartiles share label 1 and
    # the top quartile becomes label 2.
    return quartile.where(quartile < 2, quartile - 1)


# 'event_id' identifies the event and is carried through unchanged;
# 'resid_std_error' is deliberately left out of the labels output, so it is
# excluded up front instead of being binned and then discarded.
non_label_cols = ['event_id', 'resid_std_error']
car_cols = [car_col for car_col
            in event_car_data.columns
            if car_col not in non_label_cols]

# Start from the identifier column only (tolerates frames without it), then
# write each label column under its original name -- no temporary "_bin"
# suffix that would later have to be stripped back off.
event_car_data_3_bins = event_car_data[
    [id_col for id_col in event_car_data.columns if id_col == 'event_id']
    ].copy()

for col in car_cols:
    event_car_data_3_bins[col] = _quartile_to_three_bins(event_car_data[col])

In [None]:
# Build the output path once so the file written and the file reported match.
# The form-type prefix is part of the file name, hence plain concatenation.
event_car_data_3_bins_path = pathlib.Path(
    labels_data_subdir +
    file_prefix +
    event_car_data_3_bins_file_name
    )

# Create the labels directory if this is the first run; no-op otherwise.
event_car_data_3_bins_path.parent.mkdir(parents=True, exist_ok=True)

event_car_data_3_bins.to_pickle(event_car_data_3_bins_path)

# Report the actual location (directory + prefixed name), not the bare name.
print(f'''Event CAR data 3 bins generated.
Output written as {event_car_data_3_bins_path}.''')

 **Step 2:**

* **Create Labels Binning CAR / SCAR Data Into Two Bins**
    * Using the 3 label CAR dataset created above, create a dataframe binning the CAR data into 2 bins as follows
    * Replace each 3-bin label with an integer label as follows:
        1. Label = 0 where event CAR percentile <= .25 or event CAR percentile > .75
        2. Label = 1 where .25 < event CAR percentile <= .75
    * This will allow us to try to classify events into two bins using the LLM (abnormal, neutral)

In [None]:
# Collapse the 3-bin labels into 2 bins:
#   3-bin label 1 (middle quartiles)      -> 1 (neutral)
#   3-bin label 0 or 2 (either tail)      -> 0 (abnormal)
event_car_data_2_bins = event_car_data_3_bins.copy()

for col in [car_col for car_col
            in event_car_data_2_bins.columns
            if car_col not in ['event_id']]:

    labels_3_bins = event_car_data_2_bins[col]

    # Keep label 1 as-is and keep missing labels missing; every other label
    # becomes 0. Without the isna() guard a missing label would compare
    # unequal to 1 and be silently relabelled 0 ("abnormal").
    event_car_data_2_bins[col] = labels_3_bins.where(
        labels_3_bins.eq(1) | labels_3_bins.isna(),
        0)

In [None]:
# Build the output path once so the file written and the file reported match.
# The form-type prefix is part of the file name, hence plain concatenation.
event_car_data_2_bins_path = pathlib.Path(
    labels_data_subdir +
    file_prefix +
    event_car_data_2_bins_file_name
    )

# Create the labels directory if this is the first run; no-op otherwise.
event_car_data_2_bins_path.parent.mkdir(parents=True, exist_ok=True)

event_car_data_2_bins.to_pickle(event_car_data_2_bins_path)

# Report the actual location (directory + prefixed name), not the bare name.
print(f'''Event CAR data 2 bins generated.
Output written as {event_car_data_2_bins_path}.''')