<a href="https://colab.research.google.com/github/thowley1207/capstone_project/blob/main/08_create_and_push_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget https://raw.githubusercontent.com/thowley1207/capstone_project/main/colab_initialization/initializer.py
!pip install --no-dependencies wrds
!pip install datasets

import numpy as np
import pandas as pd
import numpy as np
import pandas as pd
import pathlib
import os
import re
import spacy
import string
import pickle

from datasets import (Dataset, DatasetDict, ClassLabel,
                      concatenate_datasets, load_dataset)
from transformers import AutoTokenizer

# Project helper module downloaded by the wget command at the top of this
# cell. What initialize_colab() configures is defined in initializer.py,
# which is not shown in this notebook.
import initializer
initializer.initialize_colab()

# Log in to the Hugging Face Hub from the notebook; the push_to_hub calls
# later in this notebook upload the datasets built here.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# ---------------------------------------------------------------------
# Processed-data subdirectories and the form-type prefix.
# When applicable, this form-type prefix is used moving forward.
# ---------------------------------------------------------------------
file_prefix = '8k_'

labels_data_subdir = 'data/event_study/labels/'
sec_edgar_data_subdir = 'data/sec_edgar/'
dataset_inputs_subdir = 'data/dataset_inputs/'

# The per-label-count folders sit directly under the dataset-inputs folder.
dataset_inputs_2_labels_subdir = dataset_inputs_subdir + 'text_w_2_labels/'
dataset_inputs_3_labels_subdir = dataset_inputs_subdir + 'text_w_3_labels/'

# ---------------------------------------------------------------------
# File names carried down from prior work.
# ---------------------------------------------------------------------
text_cleaned_file_name = 'text_cleaned.pkl'
labels_2_bins_file_name = 'event_car_data_2_bins.pkl'
labels_3_bins_file_name = 'event_car_data_3_bins.pkl'

# ---------------------------------------------------------------------
# New file names for use below.
# ---------------------------------------------------------------------
text_w_labels_wide_file_name = 'text_w_labels_wide.pkl'

In [None]:
# Read in the labels data: one pickled frame for the two-bin labeling and
# one for the three-bin labeling. Each path is <subdir><prefix><file name>.
labels_2_bins = pd.read_pickle(
    f'{labels_data_subdir}{file_prefix}{labels_2_bins_file_name}')
labels_3_bins = pd.read_pickle(
    f'{labels_data_subdir}{file_prefix}{labels_3_bins_file_name}')

# Read in the cleaned text data.
text_cleaned = pd.read_pickle(
    f'{sec_edgar_data_subdir}{file_prefix}{text_cleaned_file_name}')

In [None]:
# Load the two tokenizers required to preprocess model input text when
# using SEC BERT SHAPE:
#   * the pretrained sec-bert-shape model tokenizer, and
#   * the spaCy pipeline used to split raw text into tokens.
sec_bert_shape_checkpoint = "nlpaueb/sec-bert-shape"
spacy_model_name = "en_core_web_sm"

tokenizer = AutoTokenizer.from_pretrained(sec_bert_shape_checkpoint)
spacy_tokenizer = spacy.load(spacy_model_name)

**HELPER FUNCTION: for use in preprocessing sentences that will be tokenized using the SEC BERT SHAPE pretrained tokenizer.**

* This is necessary because the SEC BERT SHAPE tokenizer intentionally replaces numbers with pseudo-tokens that represent the number’s shape, so numeric expressions (of known shapes) are no longer fragmented.
    * Example: '53.2' becomes '[XX.X]'

* **NOTE: Code leverages the text preprocessing example function provided on the Hugging Face model document page (https://huggingface.co/nlpaueb/sec-bert-shape)**

        sec_bert_shape_preprocess(dset)

* **Function input parameters are:**
        # A single dataset example (row), as passed in by Dataset.map, holding the text that requires preprocessing under the key 'text_8k_sec_bert_base'
        dset

* **Function returns:**
        # The input dataset with a new column 'text_8k_sec_bert_shape' containing the preprocessed text now included
        dset
        

In [None]:
def sec_bert_shape_preprocess(dset):
    """Add a SEC-BERT-SHAPE-ready copy of one example's text.

    Reads ``dset['text_8k_sec_bert_base']``, splits it into tokens with the
    module-level ``spacy_tokenizer`` and replaces every numeric token with a
    pseudo-token describing its shape (e.g. ``'53.2'`` -> ``'[XX.X]'``).
    Shapes that are not among ``tokenizer.additional_special_tokens`` are
    replaced with ``'[NUM]'``. All other tokens are kept unchanged.

    Args:
        dset: A mapping-like example with the key
            ``'text_8k_sec_bert_base'``; this notebook applies the function
            through ``Dataset.map``.

    Returns:
        The same object, with the key ``'text_8k_sec_bert_shape'`` set to
        the processed tokens joined by single spaces.
    """
    # Only token boundaries are needed. make_doc() applies just the
    # tokenizer, so the remaining pipeline components of the loaded spaCy
    # model are not run over (potentially very long) filings.
    doc = spacy_tokenizer.make_doc(dset['text_8k_sec_bert_base'])

    # Resolve the known shape tokens once per call and use a set, instead of
    # re-reading the tokenizer attribute and scanning a list per token.
    known_shapes = set(tokenizer.additional_special_tokens)

    processed_text = []
    for token in doc:
        token_text = token.text
        if re.fullmatch(r"(\d+[\d,.]*)|([,.]\d+)", token_text):
            # Numeric token: every digit becomes 'X', punctuation is kept.
            shape = '[' + re.sub(r'\d', 'X', token_text) + ']'
            processed_text.append(
                shape if shape in known_shapes else '[NUM]')
        else:
            processed_text.append(token_text)

    dset['text_8k_sec_bert_shape'] = ' '.join(processed_text)

    return dset

**Step 1: Create a dataset containing a wide version of the sec text-event abnormal return label data corresponding to each event.**

* **NOTE: This wide-style dataset is meant to be a visual representation of the data for each event.**
* While it can be used as a resource to understand and visualize the event-level data included, it is not as useful for modeling as alternative formulations.
* For the purposes of later work, the stacked versions of this data created in step two below are the versions used.

*   Previously, in the `06_generate_event_car_labels` script we created two dataframes containing labels corresponding to each event's CAR/SCAR value within a variety of different event window lengths, and subsequently wrote the output to a pickled dataframe:

    * A df containing the results of applying two labels to the data by event_id
    * A df containing the results of applying three labels to the data by event_id

*   These dfs have identical structure, with naming convention for the label columns =
`{car or scar}_{event window relative start}_{event window relative end}`


* Rather than continuing to save this data as dataframes on Google Drive, **we will convert them to Datasets and push the Datasets to Hugging Face**

* We will create two versions of the original data; the following step creates the first, a wide dataset with the following characteristics:

    * num_rows = num unique event_id included in the data
    * columns =
        * event_id of each unique event in the dataset (essentially acts as a primary key)
        * label data columns corresponding to each defined labeling permutation
        * text columns containing two versions of each event's 8K text:
            * the version of the 8K text that was cleaned and filtered in `07_obtain_and_clean_8k_text` immediately following its retrieval
            * the version of the 8K text following post-processing using the function defined above (in order to format the text as acceptable input to sec-bert-shape)

In [None]:
def _prefixed_label_names(frame, n_labels):
    """Map every column of *frame* except 'event_id' to
    '<n_labels>_labels_<column>'."""
    return {col: f'{n_labels}_labels_{col}'
            for col in frame.columns
            if col != 'event_id'}


# Relabel each label column in the 2 label and 3 label dataframes to
# include the prefix '{2/3}_labels_'. The two dataframes have identical
# column names, so the prefix is what lets us tell 2 label data from
# 3 label data once each event is represented in a single wide row.
col_rename_2_bins = _prefixed_label_names(labels_2_bins, 2)
col_rename_3_bins = _prefixed_label_names(labels_3_bins, 3)

labels_2_bins = labels_2_bins.rename(columns=col_rename_2_bins)
labels_3_bins = labels_3_bins.rename(columns=col_rename_3_bins)

# Merge the previously separate dataframes into a single dataframe using
# 'event_id' as the merge key. This is the initial wide view; it does not
# yet contain the text preprocessed with sec_bert_shape_preprocess.
labels = pd.merge(labels_2_bins, labels_3_bins, how='left', on='event_id')

text_w_labels_wide = pd.merge(text_cleaned, labels,
                              how='left', on='event_id')
text_w_labels_wide = text_w_labels_wide.rename(
    columns={'text_8k': 'text_8k_sec_bert_base'})

# Truncate the text strings to be <= 1 million characters in order to avoid
# issues with size limitations for spacy input (only 61 rows).
text_w_labels_wide['text_8k_sec_bert_base'] = (
    text_w_labels_wide['text_8k_sec_bert_base'].str[:999999])

* Now that we have created the wide dataset from our prior data, we make several adjustments before pushing the dataset to Hugging Face

    1. **Convert each label column from its original integer format into ClassLabels**
        * Includes the logic required to set the number of classes and the names of each ClassLabel appropriately, as they differ for two and three level label cols
    2. **Use the dataset map functionality to create the `text_8k_sec_bert_shape` data via the function defined above**
        * This is the most efficient way to create this data
    3. **Reorder the columns so that the order is more intuitive**
        * Uses the select function and a list of the desired order

**Once these final few steps are complete, the wide dataset is pushed to the Hub**



In [None]:
dataset_wide = Dataset.from_pandas(text_w_labels_wide, split='train')

# 1. Convert every label column from integers into a ClassLabel. The class
#    names (and therefore the number of classes) depend on whether the
#    column comes from the two-label or the three-label frame.
new_features = dataset_wide.features.copy()
label_specs = (
    (labels_2_bins, ('non-neutral', 'neutral')),
    (labels_3_bins, ('low', 'neutral', 'high')),
)
for label_frame, class_names in label_specs:
    for col_name in label_frame.columns:
        if col_name == 'event_id':
            continue
        new_features[col_name] = ClassLabel(
            num_classes=len(class_names),
            names=list(class_names))

# 2. Apply the new features, then create 'text_8k_sec_bert_shape' by mapping
#    the preprocessing function over the dataset.
dataset_wide = dataset_wide.cast(new_features).map(
    sec_bert_shape_preprocess, num_proc=8)

# 3. Reorder the columns: key and text first, then for each label count
#    (2, then 3) the car/scar pair of every event window.
event_windows = [(-5, 5), (-4, 4), (-3, 3), (-2, 2), (-1, 1),
                 (-1, 2), (-1, 3), (-1, 4), (-1, 5)]

ordered_col_list = [
    'event_id', 'text_8k_sec_bert_base', 'text_8k_sec_bert_shape']
ordered_col_list += [
    f'{n_labels}_labels_{metric}_{window_start}_{window_end}'
    for n_labels in (2, 3)
    for window_start, window_end in event_windows
    for metric in ('car', 'scar')]

dataset_wide = dataset_wide.select_columns(ordered_col_list)

dataset_wide.push_to_hub('text_w_labels_wide')

**Step 2: Create 2 Datasets (one to hold data for labels with 2 values, the other for the data for labels with 3 values)**

* **NOTE: These are the datasets used in future work**


*   This step essentially entails taking the values in what were components of the label column names in the wide dataset and converting them into descriptor values in new columns named
    * `abnormal_return_metric` (scar or car)
    * `event_window_start`
    * `event_window_end`

* The label could not be handled the same way within a single dataset, and two datasets had to be created, because two different ClassLabels (one with two values, one with three) cannot be stacked in a single column.
    * Although there were certainly workarounds (e.g. two label cols, with blank values in the column that does not correspond to a given row subset's number of labels), they lead to a less intuitive data structure and were not pursued.

* The end result of this process is the creation of two new stacked datasets where in each:

    * num_rows = num unique event_id included in the data X the number of unique abnormal return metric values X the number of unique (event window start, event window end) tuples
    * columns:
        * event_id
        * text_8k_sec_bert_base
        * text_8k_sec_bert_shape
        * abnormal_return_metric
        * event_window_start
        * event_window_end
        * label

* Following this reshaping, the data is pushed to the Hub as two datasets with identical shape:
    * `dataset_stacked_2_labels`
    * `dataset_stacked_3_labels`



In [None]:
# Columns repeated unchanged in every stacked slice.
to_keep = ['event_id','text_8k_sec_bert_base','text_8k_sec_bert_shape']

lst_2_labels_datasets_train = []
lst_2_labels_datasets_test = []
lst_3_labels_datasets_train = []
lst_3_labels_datasets_test = []

# (train, test) accumulators keyed by the number of label classes encoded
# at the start of each wide label column name.
split_lists_by_num_labels = {
    2: (lst_2_labels_datasets_train, lst_2_labels_datasets_test),
    3: (lst_3_labels_datasets_train, lst_3_labels_datasets_test),
}

n_rows = len(dataset_wide)

for col in dataset_wide.column_names:

    if col in to_keep:
        continue

    # Label column names look like '2_labels_car_-5_5':
    # <num labels>_labels_<metric>_<window start>_<window end>.
    col_split = col.split('_')
    num_labels = int(col_split[0])
    abnormal_return_metric = col_split[-3]
    event_window_start = int(col_split[-2])
    event_window_end = int(col_split[-1])

    # Columns with any other label count are skipped.
    if num_labels not in split_lists_by_num_labels:
        continue

    # One slice per label column: the shared columns plus this column,
    # renamed to the common name 'label'.
    current_dataset = dataset_wide.select_columns(to_keep + [col])
    current_dataset = current_dataset.rename_column(col, 'label')

    # The values parsed from the column name become constant descriptor
    # columns so the slices can later be concatenated.
    current_dataset = current_dataset.add_column(
        'abnormal_return_metric', [abnormal_return_metric] * n_rows)
    current_dataset = current_dataset.add_column(
        'event_window_start', [event_window_start] * n_rows)
    current_dataset = current_dataset.add_column(
        'event_window_end', [event_window_end] * n_rows)

    # NOTE(review): no seed is passed, so every label column gets its own
    # random 80/20 split and the splits are not reproducible across runs.
    current_split = current_dataset.train_test_split(
        test_size = .2,
        stratify_by_column = 'label')

    train_list, test_list = split_lists_by_num_labels[num_labels]
    train_list.append(current_split['train'])
    test_list.append(current_split['test'])

In [None]:
# Stack the per-column slices of the two-label data into one train and one
# test dataset, and bundle them as a DatasetDict.
dataset_stacked_2_labels_train, dataset_stacked_2_labels_test = (
    concatenate_datasets(parts)
    for parts in (lst_2_labels_datasets_train, lst_2_labels_datasets_test))

dataset_stacked_2_labels = DatasetDict({
    'train': dataset_stacked_2_labels_train,
    'test': dataset_stacked_2_labels_test,
})

# Same for the three-label data.
dataset_stacked_3_labels_train, dataset_stacked_3_labels_test = (
    concatenate_datasets(parts)
    for parts in (lst_3_labels_datasets_train, lst_3_labels_datasets_test))

dataset_stacked_3_labels = DatasetDict({
    'train': dataset_stacked_3_labels_train,
    'test': dataset_stacked_3_labels_test,
})

# Push both stacked datasets to the Hub, two-label data first.
for repo_name, stacked_dataset in (
        ('dataset_stacked_2_labels', dataset_stacked_2_labels),
        ('dataset_stacked_3_labels', dataset_stacked_3_labels)):
    stacked_dataset.push_to_hub(repo_name)