In [None]:
%%capture
!wget https://raw.githubusercontent.com/thowley0824/capstone/main/colab_initialization/initializer.py
!pip install --no-dependencies wrds

!pip install datasets
!pip install transformers
!pip install evaluate
!pip install accelerate

import evaluate
import numpy as np
import os
import pandas as pd
import pickle
import torch

from accelerate import (Accelerator,
                        notebook_launcher)
from datasets import (Dataset, DatasetDict, ClassLabel,
                      concatenate_datasets, load_dataset)
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import get_scheduler
from transformers import (AutoModelForSequenceClassification,
                          AutoTokenizer,
                          get_scheduler)

import initializer
from huggingface_hub import notebook_login

**THE CELL BELOW ONLY NEEDS TO BE RUN ONE TIME**
* Google Colab does not come with an `accelerate` config file defined.
* Without a config file, it is impossible to leverage `accelerate` functionality
to speed up model fine-tuning tasks.
* The `write_basic_config` function call generates a basic default config file for non-TPU, non-multi GPU accelerate usage (I am running my fine-tuning on a single A100 GPU, so this is sufficient for the work here)
* After writing this config, the notebook runtime must be exited and restarted so that the config file is picked up.
* After this step, the config file remains in place, so the call does not need to be repeated in future runs.

In [None]:
# The accelerate `write_basic_config` call is intentionally disabled to avoid
# unneeded runs in the future (it only has to be executed once, and it ends by
# killing the kernel).
#
# To regenerate the config, uncomment the lines below, run this cell once, and
# reconnect to the restarted runtime:
#
# from accelerate.utils import write_basic_config
#
# write_basic_config()  # Write a config file
# os._exit(00)  # Restart the notebook

In [None]:
# Run the Colab setup routine from the `initializer.py` helper downloaded in the
# first cell. Its behavior is defined outside this notebook — TODO confirm what
# it configures (e.g. working directory / mounted storage).
initializer.initialize_colab()
# Interactive Hugging Face Hub login; presumably required to access the
# `thowley824/*` datasets loaded in the cells below — confirm.
notebook_login()

In [None]:
# Pickle file that accumulates one metrics dict per fine-tuning run.
fine_tuning_result_loc = 'data/fine_tuning_result/' + 'fine_tuning_result.pkl'

# Resume the history left by an earlier session when the file is present;
# otherwise begin with an empty record.
if not os.path.exists(fine_tuning_result_loc):
    model_results_record = []
else:
    with open(fine_tuning_result_loc, 'rb') as results_file:
        model_results_record = pickle.load(results_file)

# Display the record that later cells will append to.
model_results_record

In [None]:
def training_function(model, dataset_name):
    """Fine-tune ``model`` for three epochs, evaluate it, and persist the metrics.

    The train/eval splits are NOT parameters: they are read from the
    notebook-level globals ``train_dataset`` and ``eval_dataset``, so the
    calling cell must assign both before launching this function.

    Args:
        model: Sequence-classification model to fine-tune.
        dataset_name: Label stored under the ``'dataset_name'`` key of the
            metrics dict so the run can be identified later.

    Returns:
        None. The metrics dict (accuracy plus weighted recall, precision and
        F1) is appended to the global ``model_results_record``, the whole
        record is pickled to ``fine_tuning_result_loc``, and the metrics are
        printed through the accelerator.
    """
    accelerator = Accelerator()

    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
    eval_dataloader = DataLoader(eval_dataset, batch_size=32)

    optimizer = AdamW(model.parameters(), lr=3e-5)

    train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
        train_dataloader, eval_dataloader, model, optimizer)

    num_epochs = 3
    # Computed after prepare() so the count uses the prepared dataloader's length.
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler("linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps)

    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

    # `average` is a compute()-time option of recall/precision/f1, not an
    # add_batch() input, and accuracy does not accept it. The metrics are
    # therefore loaded separately so that weighted averaging is really applied
    # to the three metrics that support it.
    accuracy_metric = evaluate.load("accuracy")
    weighted_metrics = [evaluate.load(metric_name)
                        for metric_name in ("recall", "precision", "f1")]

    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        for metric in (accuracy_metric, *weighted_metrics):
            metric.add_batch(predictions=predictions,
                             references=batch["labels"])

    # Same key order as before: accuracy, recall, precision, f1.
    eval_metric = accuracy_metric.compute()
    for metric in weighted_metrics:
        eval_metric.update(metric.compute(average='weighted'))
    eval_metric['dataset_name'] = dataset_name

    model_results_record.append(eval_metric)

    # Make sure the target directory exists so a finished training run is not
    # lost to a FileNotFoundError at save time.
    results_dir = os.path.dirname(fine_tuning_result_loc)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    with open(fine_tuning_result_loc, 'wb') as f:
        pickle.dump(model_results_record, f)

    accelerator.print(eval_metric)

In [None]:
# Run 1: SEC-BERT-SHAPE checkpoint with a fresh two-class classification head.
model_1 = AutoModelForSequenceClassification.from_pretrained(
    "nlpaueb/sec-bert-shape",
    num_labels=2,
)

# Hub dataset for this run, returned as torch tensors.
curr_dataset_name = 'shape_long_window_2_labels_scar'
curr_dataset = load_dataset("thowley824/" + curr_dataset_name)
curr_dataset.set_format("torch")

# training_function reads these two globals, so they must be set before launch.
train_dataset, eval_dataset = curr_dataset["train"], curr_dataset["test"]

print(f"""FINE-TUNING DATASET AND MODEL SUMMARY:
Model:                  nlpaueb/sec-bert-shape
Number of Labels:       2
Abnormal Return Metric: Standardized CAR
Event Window:           5 days before event through 5 days after event""")

# training_function has no return statement; metrics are recorded via
# model_results_record rather than through eval_result.
eval_result = notebook_launcher(
    training_function,
    args=(model_1, curr_dataset_name),
)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): this import/mount sits mid-notebook, after the first training
# run. If the results path is expected to live on Drive, this cell probably
# belongs near the top of the notebook — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Run 2: SEC-BERT-BASE checkpoint with a fresh two-class classification head.
model_2 = AutoModelForSequenceClassification.from_pretrained(
    "nlpaueb/sec-bert-base",
    num_labels=2,
)

# Hub dataset for this run, returned as torch tensors.
curr_dataset_name = 'base_long_window_2_labels_scar'
curr_dataset = load_dataset("thowley824/" + curr_dataset_name)
curr_dataset.set_format("torch")

# training_function reads these two globals, so they must be set before launch.
train_dataset, eval_dataset = curr_dataset["train"], curr_dataset["test"]

print(f"""FINE-TUNING DATASET AND MODEL SUMMARY:
Model:                  nlpaueb/sec-bert-base
Number of Labels:       2
Abnormal Return Metric: Standardized CAR
Event Window:           5 days before event through 5 days after event""")

notebook_launcher(
    training_function,
    args=(model_2, curr_dataset_name),
)

In [None]:
# Run 3: SEC-BERT-SHAPE checkpoint with a fresh two-class classification head.
model_3 = AutoModelForSequenceClassification.from_pretrained(
    "nlpaueb/sec-bert-shape",
    num_labels=2,
)

# Hub dataset for this run, returned as torch tensors.
curr_dataset_name = 'shape_long_window_2_labels_car'
curr_dataset = load_dataset("thowley824/" + curr_dataset_name)
curr_dataset.set_format("torch")

# training_function reads these two globals, so they must be set before launch.
train_dataset, eval_dataset = curr_dataset["train"], curr_dataset["test"]

print(f"""FINE-TUNING DATASET AND MODEL SUMMARY:
Model:                  nlpaueb/sec-bert-shape
Number of Labels:       2
Abnormal Return Metric: CAR
Event Window:           5 days before event through 5 days after event""")

notebook_launcher(
    training_function,
    args=(model_3, curr_dataset_name),
)

In [None]:
# Run 4: SEC-BERT-BASE checkpoint with a fresh two-class classification head.
model_4 = AutoModelForSequenceClassification.from_pretrained(
        "nlpaueb/sec-bert-base", num_labels=2)

curr_dataset_name = 'base_long_window_2_labels_car'

curr_dataset = load_dataset(f"thowley824/{curr_dataset_name}")
curr_dataset.set_format("torch")

# training_function reads these two globals, so they must be set before launch.
train_dataset = curr_dataset["train"]
eval_dataset = curr_dataset["test"]

print(f"""FINE-TUNING DATASET AND MODEL SUMMARY:
Model:                  nlpaueb/sec-bert-base
Number of Labels:       2
Abnormal Return Metric: CAR
Event Window:           5 days before event through 5 days after event""")

# training_function takes (model, dataset_name); the dataset name was missing
# from the argument tuple, which made the launch fail with a TypeError.
notebook_launcher(training_function, (model_4,
                                      curr_dataset_name))

In [None]:
# Run 5: SEC-BERT-SHAPE checkpoint with a fresh two-class classification head.
model_5 = AutoModelForSequenceClassification.from_pretrained(
    "nlpaueb/sec-bert-shape",
    num_labels=2,
)

# Hub dataset for this run, returned as torch tensors.
curr_dataset_name = 'shape_short_window_2_labels_scar'
curr_dataset = load_dataset("thowley824/" + curr_dataset_name)
curr_dataset.set_format("torch")

# training_function reads these two globals, so they must be set before launch.
train_dataset, eval_dataset = curr_dataset["train"], curr_dataset["test"]

print(f"""FINE-TUNING DATASET AND MODEL SUMMARY:
Model:                  nlpaueb/sec-bert-shape
Number of Labels:       2
Abnormal Return Metric: Standardized CAR
Event Window:           1 day before event through 1 day after event""")

notebook_launcher(
    training_function,
    args=(model_5, curr_dataset_name),
)

In [None]:
# Run 6: SEC-BERT-BASE checkpoint with a fresh two-class classification head.
model_6 = AutoModelForSequenceClassification.from_pretrained(
    "nlpaueb/sec-bert-base",
    num_labels=2,
)

# Hub dataset for this run, returned as torch tensors.
curr_dataset_name = 'base_short_window_2_labels_scar'
curr_dataset = load_dataset("thowley824/" + curr_dataset_name)
curr_dataset.set_format("torch")

# training_function reads these two globals, so they must be set before launch.
train_dataset, eval_dataset = curr_dataset["train"], curr_dataset["test"]

print(f"""FINE-TUNING DATASET AND MODEL SUMMARY:
Model:                  nlpaueb/sec-bert-base
Number of Labels:       2
Abnormal Return Metric: Standardized CAR
Event Window:           1 day before event through 1 day after event""")

notebook_launcher(
    training_function,
    args=(model_6, curr_dataset_name),
)

In [None]:
# Run 7: SEC-BERT-SHAPE checkpoint with a fresh two-class classification head.
model_7 = AutoModelForSequenceClassification.from_pretrained(
    "nlpaueb/sec-bert-shape",
    num_labels=2,
)

# Hub dataset for this run, returned as torch tensors.
curr_dataset_name = 'shape_short_window_2_labels_car'
curr_dataset = load_dataset("thowley824/" + curr_dataset_name)
curr_dataset.set_format("torch")

# training_function reads these two globals, so they must be set before launch.
train_dataset, eval_dataset = curr_dataset["train"], curr_dataset["test"]

print(f"""FINE-TUNING DATASET AND MODEL SUMMARY:
Model:                  nlpaueb/sec-bert-shape
Number of Labels:       2
Abnormal Return Metric: CAR
Event Window:           1 day before event through 1 day after event""")

notebook_launcher(
    training_function,
    args=(model_7, curr_dataset_name),
)

In [None]:
# Run 8: SEC-BERT-BASE checkpoint with a fresh two-class classification head.
model_8 = AutoModelForSequenceClassification.from_pretrained(
    "nlpaueb/sec-bert-base",
    num_labels=2,
)

# Hub dataset for this run, returned as torch tensors.
curr_dataset_name = 'base_short_window_2_labels_car'
curr_dataset = load_dataset("thowley824/" + curr_dataset_name)
curr_dataset.set_format("torch")

# training_function reads these two globals, so they must be set before launch.
train_dataset, eval_dataset = curr_dataset["train"], curr_dataset["test"]

print(f"""FINE-TUNING DATASET AND MODEL SUMMARY:
Model:                  nlpaueb/sec-bert-base
Number of Labels:       2
Abnormal Return Metric: CAR
Event Window:           1 day before event through 1 day after event""")

notebook_launcher(
    training_function,
    args=(model_8, curr_dataset_name),
)