In [None]:
!pip install datasets
!pip install transformers


import numpy as np
import pandas as pd

from datasets import (Dataset, DatasetDict, ClassLabel,
                      concatenate_datasets, load_dataset)
from transformers import AutoTokenizer

from huggingface_hub import notebook_login
notebook_login()

**Define helper functions for use in tokenizing the cleaned and preprocessed 8-K text entries in the dataset_stacked_{2/3}_labels datasets created in 08_create_and_push_datasets:**

        def shape_tokenize_function(example):

        def base_tokenize_function(examples):

* **Each function's input parameter is:**
         # A batch of rows from the desired stacked dataset containing event_id,
         #   labels, text, and the additional descriptor details added in the
         #   previous script
         example / examples

* **Each function returns:**
        # The output of calling the AutoTokenizer.from_pretrained({relevant_model_loc})
        #     object that the name of the function indicates (sec-bert-shape for
        #     the shape_tokenize_function, sec-bert-base for the
        #     base_tokenize_function) on the matching text column, with
        #     padding="max_length" and truncation=True
        the shape_tokenizer / base_tokenizer output for the given text

**These functions are soon applied to the stacked datasets via the Dataset map functionality to efficiently create the tokenized data for both versions of the text input included as columns.**


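* **NOTE:** the sequence below describes an OLS regression helper (`event_regression_dict`, `estimation_window_df`) rather than the tokenize functions defined in this notebook; it appears to have been carried over from another script.
* **Function steps through the following sequence:**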
    * Creates a dictionary **event_regression_dict** containing a single entry:
            event_regression_dict['event_id'] = id
    * Creates a dataframe **event_est_win_data** from **estimation_window_df** containing only the data where the event_id = id
            estimation_window_df[estimation_window_df['event_id'] == event].copy()
    * Create vector **X** equal to the dataframe's market return column and add a constant
            X = event_est_win_data[['mkt_return']]
            X = sm.add_constant(X)
    * Create vector **y** from the dataframe's security return column
            y = event_est_win_data[['sec_return']]

    * Use the stats_models OLS function to create **mod**, an object equal to the OLS regression model object using X and y
    * Create **est**, an object containing the results of the fit regression model
            mod = sm.OLS(y,X)
            est = mod.fit()
    * Using the attributes of the fitted model object **est**, add the following values to the **event_regression_dict**:

            # The intercept of the fitted OLS model
            event_regression_dict['alpha'] = est.params['const']
            
            # The slope coefficient of the fitted OLS model
            event_regression_dict['beta'] = est.params['mkt_return']

            # The standard error of the fitted model's residuals
            event_regression_dict['resid_std_error'] = np.sqrt(est.mse_resid)



In [None]:
# Hub ids of the two SEC-BERT variants. The resulting tokenizers are used by
# shape_tokenize_function and base_tokenize_function respectively.
SHAPE_CHECKPOINT = "nlpaueb/sec-bert-shape"
BASE_CHECKPOINT = "nlpaueb/sec-bert-base"

shape_tokenizer = AutoTokenizer.from_pretrained(SHAPE_CHECKPOINT)
base_tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)

def shape_tokenize_function(example):
    """Tokenize the ``text_8k_sec_bert_shape`` entry with ``shape_tokenizer``.

    Args:
        example: Mapping that provides a ``"text_8k_sec_bert_shape"`` entry
            (passed in by ``Dataset.map``).

    Returns:
        The value returned by ``shape_tokenizer`` for that text when called
        with ``padding="max_length"`` and ``truncation=True``.
    """
    text = example["text_8k_sec_bert_shape"]
    encoding = shape_tokenizer(text, padding="max_length", truncation=True)
    return encoding


def base_tokenize_function(examples):
    """Tokenize the ``text_8k_sec_bert_base`` entry with ``base_tokenizer``.

    Args:
        examples: Mapping that provides a ``'text_8k_sec_bert_base'`` entry
            (passed in by ``Dataset.map``).

    Returns:
        The value returned by ``base_tokenizer`` for that text when called
        with ``padding='max_length'`` and ``truncation=True``.
    """
    tokenizer_options = {'padding': 'max_length', 'truncation': True}
    return base_tokenizer(examples['text_8k_sec_bert_base'], **tokenizer_options)

In [None]:
# Fetch the stacked 2-label and 3-label datasets from the Hugging Face hub.
dataset_2_labels = load_dataset(
    path="thowley824/dataset_stacked_2_labels",
)
dataset_3_labels = load_dataset(
    path="thowley824/dataset_stacked_3_labels",
)

**Create several datasets from subsets of the overall 2 label and 3 label stacked datasets with different characteristics.**

**NOTE:** for each subset we reference in the descriptions provided below, the subset should be assumed to have been generated for both the 2 label and the 3 label data
* This is assumed because the two cannot be combined and must be considered separately: the class labels are defined differently in each of the respective datasets.


We will fine tune several of the eventual final resultant datasets that are created in this process.

* They will allow us to observe whether there are any obvious patterns or differences between fine tuning performance based on the contrasting characteristics of the datasets (or conversely if the result is inconclusive).

* Datasets with different combinations of the following differing characteristics will be generated in this process and pushed to the hub:

    * **Text tokenized using sec-bert-base vs. text tokenized using sec-bert-shape** (these cannot remain in the same Dataset because the model accepts specifically formatted Datasets as training and evaluation data)
    * Data with **labels representing 2 bins** vs. data with **labels representing 3 bins**
    * Labels based on the **non-standardized CAR values** associated with events vs. labels based on the **standardized CAR values** associated with the events
    * Labels generated from the results of event studies with a **long event window (starting 5 days before the event date, ending 5 days after)** vs. labels generated from the results of event studies with a **short event window (starting 1 day before the event, ending 1 day after)**
        * **NOTE:** in the prior work, we have generated event study results based on many different event windows.
            * However, due to time constraints, rather than testing all possible permutations (of which there are 72), we will only fine-tune the data corresponding to the longest and shortest symmetric windows we utilized


**Step 1: Create long and short window datasets from the overall stacked data**

* To do so, pass a lambda function to the Dataset filter function resulting in the retention of only data with the desired event window start and ends.

In [None]:
def _in_event_window(start, end):
    """Build a row predicate that keeps rows whose event window is (start, end)."""
    def _matches(row):
        return row['event_window_start'] == start and row['event_window_end'] == end
    return _matches


# Long window: event_window_start == -5 and event_window_end == 5.
long_window_2_labels = dataset_2_labels.filter(_in_event_window(-5, 5))
long_window_3_labels = dataset_3_labels.filter(_in_event_window(-5, 5))

# Short window: event_window_start == -1 and event_window_end == 1.
short_window_2_labels = dataset_2_labels.filter(_in_event_window(-1, 1))
short_window_3_labels = dataset_3_labels.filter(_in_event_window(-1, 1))

**Step 2: Create shape tokenized and base tokenized datasets from the long and short window data created in Step 1.**

* To do so, pass the corresponding tokenize function defined above to the Dataset map function.

In [None]:
def _tokenize_with(dataset, tokenize_function):
    """Run ``tokenize_function`` over ``dataset`` through ``map`` in batched mode."""
    return dataset.map(tokenize_function, batched=True)


# Long-window data, tokenized with each tokenizer.
shape_long_window_2_labels = _tokenize_with(long_window_2_labels, shape_tokenize_function)
shape_long_window_3_labels = _tokenize_with(long_window_3_labels, shape_tokenize_function)
base_long_window_2_labels = _tokenize_with(long_window_2_labels, base_tokenize_function)
base_long_window_3_labels = _tokenize_with(long_window_3_labels, base_tokenize_function)

# Short-window data, tokenized with each tokenizer.
shape_short_window_2_labels = _tokenize_with(short_window_2_labels, shape_tokenize_function)
shape_short_window_3_labels = _tokenize_with(short_window_3_labels, shape_tokenize_function)
base_short_window_2_labels = _tokenize_with(short_window_2_labels, base_tokenize_function)
base_short_window_3_labels = _tokenize_with(short_window_3_labels, base_tokenize_function)

**Step 3: Create CAR-label-based and SCAR-label-based datasets from the long and short window / base and shape tokenized data created in Step 2.**

**Step 4: Remove all columns from every dataset created in Step 3 except for those created by the tokenizer and the label column.**

**Step 5: Rename the label column labels to conform with model input requirements.**

* To do so, filter each dataset on the abnormal_return_metric column, drop the columns listed in remove_columns, and rename the label column with the Dataset rename_column function.

In [None]:
# Columns dropped from every final dataset; what remains is whatever the
# tokenizer added plus the label column.
remove_columns = [
    'event_id','text_8k_sec_bert_base','text_8k_sec_bert_shape',
    'event_window_start','event_window_end','abnormal_return_metric']


def _finalize_subset(tokenized, metric):
    """Build one model-ready subset from a tokenized dataset.

    Replaces sixteen copy-pasted filter/map/rename sequences with a single
    definition so the steps cannot drift apart between subsets.

    Args:
        tokenized: A tokenized dataset produced in Step 2.
        metric: Value of ``abnormal_return_metric`` to keep
            (``'car'`` or ``'scar'``).

    Returns:
        The rows whose ``abnormal_return_metric`` equals ``metric``, with the
        columns listed in ``remove_columns`` dropped and ``"label"`` renamed
        to ``"labels"``.
    """
    subset = tokenized.filter(
        lambda row: row['abnormal_return_metric'] == metric)
    # Drop the columns directly rather than running an identity map() whose
    # only purpose was its remove_columns argument.
    subset = subset.remove_columns(remove_columns)
    return subset.rename_column("label", "labels")


# Long window, sec-bert-shape tokenization.
shape_long_window_2_labels_car = _finalize_subset(shape_long_window_2_labels, 'car')
shape_long_window_3_labels_car = _finalize_subset(shape_long_window_3_labels, 'car')
shape_long_window_2_labels_scar = _finalize_subset(shape_long_window_2_labels, 'scar')
shape_long_window_3_labels_scar = _finalize_subset(shape_long_window_3_labels, 'scar')

# Long window, sec-bert-base tokenization.
base_long_window_2_labels_car = _finalize_subset(base_long_window_2_labels, 'car')
base_long_window_2_labels_scar = _finalize_subset(base_long_window_2_labels, 'scar')
base_long_window_3_labels_car = _finalize_subset(base_long_window_3_labels, 'car')
base_long_window_3_labels_scar = _finalize_subset(base_long_window_3_labels, 'scar')

# Short window, sec-bert-shape tokenization.
shape_short_window_2_labels_car = _finalize_subset(shape_short_window_2_labels, 'car')
shape_short_window_2_labels_scar = _finalize_subset(shape_short_window_2_labels, 'scar')
shape_short_window_3_labels_car = _finalize_subset(shape_short_window_3_labels, 'car')
shape_short_window_3_labels_scar = _finalize_subset(shape_short_window_3_labels, 'scar')

# Short window, sec-bert-base tokenization.
base_short_window_2_labels_car = _finalize_subset(base_short_window_2_labels, 'car')
base_short_window_2_labels_scar = _finalize_subset(base_short_window_2_labels, 'scar')
base_short_window_3_labels_car = _finalize_subset(base_short_window_3_labels, 'car')
base_short_window_3_labels_scar = _finalize_subset(base_short_window_3_labels, 'scar')

In [None]:
# Hub repository name -> dataset to upload. Insertion order is the upload
# order: all CAR-labelled subsets first, then the SCAR-labelled ones.
_datasets_to_push = {
    'shape_long_window_2_labels_car': shape_long_window_2_labels_car,
    'shape_long_window_3_labels_car': shape_long_window_3_labels_car,
    'base_long_window_2_labels_car': base_long_window_2_labels_car,
    'base_long_window_3_labels_car': base_long_window_3_labels_car,
    'shape_short_window_2_labels_car': shape_short_window_2_labels_car,
    'shape_short_window_3_labels_car': shape_short_window_3_labels_car,
    'base_short_window_2_labels_car': base_short_window_2_labels_car,
    'base_short_window_3_labels_car': base_short_window_3_labels_car,
    'shape_long_window_2_labels_scar': shape_long_window_2_labels_scar,
    'shape_long_window_3_labels_scar': shape_long_window_3_labels_scar,
    'base_long_window_2_labels_scar': base_long_window_2_labels_scar,
    'base_long_window_3_labels_scar': base_long_window_3_labels_scar,
    'shape_short_window_2_labels_scar': shape_short_window_2_labels_scar,
    'shape_short_window_3_labels_scar': shape_short_window_3_labels_scar,
    'base_short_window_2_labels_scar': base_short_window_2_labels_scar,
    'base_short_window_3_labels_scar': base_short_window_3_labels_scar,
}

for _repo_name, _dataset in _datasets_to_push.items():
    _dataset.push_to_hub(_repo_name)

In [None]:
from torch.utils.data import DataLoader

# `tokenized_datasets` was used here without being assigned anywhere in the
# visible notebook, so this cell raised NameError on a fresh kernel. Bind it
# explicitly to one of the subsets prepared above.
# NOTE(review): the choice of subset is an assumption; swap in whichever
# prepared dataset should be fine-tuned.
tokenized_datasets = shape_long_window_2_labels_car

# Request torch-formatted output from the datasets before batching.
tokenized_datasets.set_format("torch")

# NOTE(review): assumes the dataset exposes "train" and "test" splits; confirm
# against the datasets pushed by the earlier script.
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]

# Only the training loader is shuffled.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(eval_dataset, batch_size=8)

In [None]:
from accelerate import Accelerator
from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification, get_scheduler

# `checkpoint` was referenced without being assigned in the visible notebook.
# NOTE(review): the value below is an assumption; it must name the model that
# matches the tokenizer used to build the data behind `train_dataloader`.
checkpoint = "nlpaueb/sec-bert-shape"

accelerator = Accelerator()

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# transformers.AdamW is deprecated and has been removed from recent
# transformers releases, so the torch optimizer is used instead. eps and
# weight_decay are set explicitly to the values the transformers
# implementation used by default, keeping the optimisation settings unchanged.
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)

# Let accelerate wrap the loaders, model and optimizer; this replaces manual
# `.to(device)` calls on the batches.
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer)

num_epochs = 3
# Computed after prepare() so the step count reflects the prepared loader.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator instead of loss.backward().
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)