In [1]:
import pandas as pd
import os
!pip install datasets
import torch
import numpy as np
from datasets import *
!pip install transformers==4.28.0
from transformers import BigBirdTokenizer, \
BigBirdForSequenceClassification, Trainer, TrainingArguments, EvalPrediction, AutoTokenizer
!pip install transformers[sentencepiece]
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import random

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collec

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"torch.device: {device}")

torch.device: cuda


In [3]:
from google.colab import drive 
drive.mount('/content/gdrive')
folder_path = "gdrive/MyDrive/Colab Notebooks/AdvML/data/"

Mounted at /content/gdrive


In [4]:
#helper functions

def preprocess_data(examples):
  # take a batch of texts
  text = examples["new_text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

def multi_label_metrics(predictions, labels, threshold=0.6):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

### Data Preprocessing

In [5]:
# Get list of files in folder
file_list = os.listdir(folder_path)

# Create empty list to hold dataframes
df_list = []

# Loop through files in folder
for file in file_list:
    # Check if file is a CSV
    if file.endswith('.csv'):
        # Read CSV file into a pandas dataframe
        df = pd.read_csv(os.path.join(folder_path, file))
        # Append dataframe to list
        df_list.append(df)

# Concatenate all dataframes in list into a single dataframe
df = pd.concat(df_list, axis=0, ignore_index=True)

# Create results dictionary
results = {}

# Clean data
df = df.merge(
    df[['speaker', 'side']].drop_duplicates(ignore_index=True),
    how='left',
    left_on='speaker_addressed',
    right_on='speaker',
    suffixes=('', '_addressed')).drop('speaker_addressed', axis=1)

  df = pd.read_csv(os.path.join(folder_path, file))
  df = pd.read_csv(os.path.join(folder_path, file))
  df = pd.read_csv(os.path.join(folder_path, file))
  df = pd.read_csv(os.path.join(folder_path, file))


In [6]:
# Add in utterance information to utterance

for name_col, type_col in {'speaker': 'speaker_type', 'speaker_replied_to': 'speaker_type_replied_to'}.items():
    df[f'{name_col}_natural'] = df[name_col].apply(lambda name: ' '.join(name.lstrip('j__').split('_')).title())
    speaker_type_translation = {
        'J': 'Justice',
        'A': 'Attorney',
        '<Inaudile>': None
    }

    df[type_col].fillna('na', inplace=True)
    df[f'{type_col}_natural'] = df[type_col].apply(lambda s_type: speaker_type_translation[s_type] if not s_type == 'na' else None)

    df[f'{name_col}_natural'] = df[f'{type_col}_natural'] + ' ' + df[f'{name_col}_natural']
    df[f'{name_col}_natural'].fillna('Unknown', inplace=True)
    df.drop(f'{type_col}_natural', axis=1, inplace=True)

side_translation = {
    0: 'Responding',
    1: 'Petitioning',
    2: '',
    3: ''
}

df['side'].fillna(3, inplace=True)
df[f'side_natural'] = df['side'].apply(lambda side: side_translation[side])

df['side_addressed'].fillna(3, inplace=True)
df[f'side_addressed_natural'] = df['side_addressed'].apply(lambda side: side_translation[side])


df["new_text"] = "<UTTERANCE_START>" + df['side_natural'] + " " + df["speaker_natural"] + " says: '" + df["text"] + "' to " + df['side_addressed_natural'] + " " + df["speaker_replied_to_natural"] + " <UTTERANCE_END>"


j_columns = [col for col in df.columns if col.startswith('votes_side_j_')]

In [7]:
df['new_text'][0]

"<UTTERANCE_START> Justice Samuel A Alito Jr says: 'Well, if the text of this is so clear, how is it that Mr. Gould and Mr. Abbott proposed different interpretations of this provision?' to Petitioning Attorney David L Horan <UTTERANCE_END>"

### Modeling

#### Top case justices

Big Bird
https://jesusleal.io/2021/05/14/Big-Bird-text-classification-tutorial/ 

In [8]:
count = 0
justices_list = []
justices_dict = {}

for justice in j_columns:
  df_j = df[['case_id', 'new_text'] + [justice]]
  grouped_df = df_j.groupby('case_id')['new_text'].apply(lambda x: ','.join(x)).reset_index()
  justices = df[["case_id"] + [justice]].drop_duplicates(keep='first')

  df1 = pd.merge(grouped_df, justices, left_on='case_id', right_on='case_id', how='left').dropna(axis='rows', how='any')
  df1 = df1.drop(df1[~df1[justice].isin([0, 1])].index)

  justices_dict[justice] = len(df1)


In [9]:
# Sorted justices by case load
sorted_justices_by_case = sorted(justices_dict.items(), key=lambda x:x[1], reverse=True)
sorted_justices_by_case

[('votes_side_j__ruth_bader_ginsburg', 1390),
 ('votes_side_j__clarence_thomas', 1387),
 ('votes_side_j__stephen_g_breyer', 1376),
 ('votes_side_j__anthony_m_kennedy', 1264),
 ('votes_side_j__antonin_scalia', 1090),
 ('votes_side_j__john_g_roberts_jr', 1003),
 ('votes_side_j__samuel_a_alito_jr', 969),
 ('votes_side_j__john_paul_stevens', 741),
 ('votes_side_j__sonia_sotomayor', 710),
 ('votes_side_j__david_h_souter', 665),
 ('votes_side_j__elena_kagan', 605),
 ('votes_side_j__sandra_day_oconnor', 388),
 ('votes_side_j__william_h_rehnquist', 365),
 ('votes_side_j__neil_gorsuch', 195),
 ('votes_side_j__brett_m_kavanaugh', 115),
 ('votes_side_j__abe_fortas', 0),
 ('votes_side_j__arthur_j_goldberg', 0),
 ('votes_side_j__byron_r_white', 0),
 ('votes_side_j__charles_e_whittaker', 0),
 ('votes_side_j__earl_warren', 0),
 ('votes_side_j__felix_frankfurter', 0),
 ('votes_side_j__harold_burton', 0),
 ('votes_side_j__harry_a_blackmun', 0),
 ('votes_side_j__hugo_l_black', 0),
 ('votes_side_j__john_

In [10]:
top_justices = [justice for justice, votes in sorted_justices_by_case[:15]]
print(top_justices)

['votes_side_j__ruth_bader_ginsburg', 'votes_side_j__clarence_thomas', 'votes_side_j__stephen_g_breyer', 'votes_side_j__anthony_m_kennedy', 'votes_side_j__antonin_scalia', 'votes_side_j__john_g_roberts_jr', 'votes_side_j__samuel_a_alito_jr', 'votes_side_j__john_paul_stevens', 'votes_side_j__sonia_sotomayor', 'votes_side_j__david_h_souter', 'votes_side_j__elena_kagan', 'votes_side_j__sandra_day_oconnor', 'votes_side_j__william_h_rehnquist', 'votes_side_j__neil_gorsuch', 'votes_side_j__brett_m_kavanaugh']


#### Run model for top justices

In [11]:
def preprocess_for_justice(justice, test_valid_amt = 0.2, valid_portion_test_valid_amt = 0.5):
    '''
    Takes a justice votes_side column and gets data ready for model

    Inputs:
        justice (str): Justice vote side column
        test_valid_amt (float): Percentage of data allocated to testing / validation
        valid_portion_test_valid_amt (float): Percentage of data allocated to validation
    '''
    # Concatenate utterances per case
    df_j = df[['case_id', 'new_text'] + [justice]]
    grouped_df = df_j.groupby('case_id')['new_text'].apply(lambda x: ','.join(x)).reset_index()

    # Get justice and combine with utterances
    justices = df[["case_id"] + [justice]].drop_duplicates(keep='first')
    df1 = pd.merge(grouped_df,
                   justices,
                   left_on='case_id',
                   right_on='case_id',
                   how='left').dropna(axis='rows', how='any')

    # Ensure binary label
    df1 = df1.drop(df1[~df1[justice].isin([0, 1])].index)
    print(df1[justice].value_counts(normalize=True))
    pct_overturn = df1[justice].value_counts(normalize=True)[1]

    print(f'Number of cases: {len(df1)}')
    print(f'Number of training cases: {len(df1) * (1-test_valid_amt)}')
    print(f'Number of validation cases: {len(df1) * (test_valid_amt * (valid_portion_test_valid_amt))}')
    print(f'Number of test cases: {len(df1) * (test_valid_amt * (1-valid_portion_test_valid_amt))}')

    # Create dataset
    dataset = Dataset.from_pandas(df1.drop('case_id', axis=1), preserve_index = False)

    # Train/Validation/Test split
    dataset = dataset.train_test_split(test_size=test_valid_amt, shuffle=True)
    dataset_test_valid = dataset['test'].train_test_split(test_size=(1-valid_portion_test_valid_amt), shuffle=True)
    dataset = DatasetDict({
        'train': dataset['train'],
        'test': dataset_test_valid['test'],
        'validation': dataset_test_valid['train']})

    # Keep case_id per observation
    labels = [label for label in dataset['train'].features.keys() if label not in ['case_id', 'new_text']]
    id2label = {idx:label for idx, label in enumerate(labels)}
    label2id = {label:idx for idx, label in enumerate(labels)}

    return dataset, labels, id2label, label2id, pct_overturn

In [12]:
# Params for each model
tokenizer = AutoTokenizer.from_pretrained('google/bigbird-roberta-base', 
                                          max_length = 2048,
                                          cache_dir='gdrive/MyDrive/Colab Notebooks/AdvML/data/',)
lr = 2e-5
epochs = 25
output_directory = 'gdrive/MyDrive/Colab Notebooks/AdvML/results/'
cache_directory = 'gdrive/MyDrive/Colab Notebooks/AdvML/data/'
train_batch_size = 8
eval_batch_size = 8
save_and_eval_strat = "epoch"
warm_steps = 160
wt_decay = 0.01
dataloader_num_workers_input = 2
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
validation_and_testing_percentage = 0.2
split_of_validation_and_testing = 0.5 # This is percentage of above going to validation

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/846k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

Save model
https://discuss.huggingface.co/t/how-to-save-my-model-to-use-it-later/20568/2

In [16]:
overall_results = {}
baselines = {}
for justice in top_justices[6:]:
    # Get data
    dataset, labels, id2label, label2id, pct_overturn = preprocess_for_justice(justice,
                                                                               validation_and_testing_percentage,
                                                                               split_of_validation_and_testing)
    justice_name = justice.split("j__")[1]

    baselines[justice_name] = pct_overturn

    # Load model and tokenizer and define length of the text sequence
    model = BigBirdForSequenceClassification.from_pretrained('google/bigbird-roberta-base',
                                                             gradient_checkpointing=False,
                                                             num_labels = len(labels),
                                                             id2label=id2label,
                                                             label2id=label2id,
                                                             cache_dir=cache_directory,
                                                             return_dict=True)
    # Tokenize dataset
    encoded_dataset = dataset.map(preprocess_data,
                                  batched=True,
                                  remove_columns=dataset['train'].column_names)

    # get args with appropriate model name
    args = TrainingArguments(
        output_dir = output_directory,
        num_train_epochs = epochs,
        per_device_train_batch_size = train_batch_size,
        # gradient_accumulation_steps = 32,    
        per_device_eval_batch_size= eval_batch_size,
        evaluation_strategy = save_and_eval_strat,
        save_strategy = save_and_eval_strat,
        save_total_limit = 2,
        disable_tqdm = False, 
        load_best_model_at_end=True,
        warmup_steps=warm_steps,
        weight_decay=wt_decay,
        # logging_steps = 4,
        learning_rate = lr,
        # logging_dir='gdrive/MyDrive/Colab Notebooks/AdvML/logs/',
        dataloader_num_workers = dataloader_num_workers_input,
        run_name = f'bigbird_classification_{justice_name}_{lr}'
        )
    # Run trainer and save output

    trainer = Trainer(
        model,
        args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
        )
    
    print(f'Starting training for {justice_name}')
    trainer.train()

    results = trainer.evaluate()

    # trainer.save_model(f'gdrive/MyDrive/Colab Notebooks/AdvML/models/bigbird_{justice_name}_{lr}_v01')
    
    # from transformers import AutoModel
    # trained_model = AutoModel.from_pretrained(f'gdrive/MyDrive/Colab Notebooks/AdvML/models/bigbird_{justice_name}_{lr}_v01')
    
    # Initialize a new trainer instance with the trained model and test data
    trainer_test = Trainer(
        model=model,  
        args=args,
        eval_dataset=encoded_dataset["test"], 
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
        )

    # Evaluate the model on the test data
    test_results = trainer_test.evaluate()

    # Print results
    print(f'For justice {justice_name}, results are below:')
    print(test_results)

    # Add results to output
    overall_results[justice_name] = test_results
    

1.0    0.601651
0.0    0.398349
Name: votes_side_j__samuel_a_alito_jr, dtype: float64
Number of cases: 969
Number of training cases: 775.2
Number of validation cases: 96.9
Number of test cases: 96.9


Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

Map:   0%|          | 0/775 [00:00<?, ? examples/s]

Map:   0%|          | 0/97 [00:00<?, ? examples/s]

Map:   0%|          | 0/97 [00:00<?, ? examples/s]

Starting training for samuel_a_alito_jr


Attention type 'block_sparse' is not possible if sequence_length: 128 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.281321,0.494845,0.453247,0.494845
2,No log,0.328037,0.56701,0.5,0.56701
3,No log,0.266904,0.56701,0.5,0.56701
4,No log,0.272625,0.56701,0.5,0.56701
5,No log,0.285129,0.56701,0.5,0.56701
6,0.269700,0.251168,0.546392,0.501515,0.546392
7,0.269700,0.297793,0.57732,0.556926,0.57732
8,0.269700,0.321038,0.597938,0.558225,0.597938
9,0.269700,0.367917,0.536082,0.500866,0.536082
10,0.269700,0.404734,0.536082,0.509307,0.536082


For justice samuel_a_alito_jr, results are below:
{'eval_loss': 0.25563308596611023, 'eval_f1': 0.6082474226804123, 'eval_roc_auc': 0.6180851063829788, 'eval_accuracy': 0.6082474226804123, 'eval_runtime': 0.6069, 'eval_samples_per_second': 159.823, 'eval_steps_per_second': 21.42}
1.0    0.5722
0.0    0.4278
Name: votes_side_j__john_paul_stevens, dtype: float64
Number of cases: 741
Number of training cases: 592.8000000000001
Number of validation cases: 74.10000000000001
Number of test cases: 74.10000000000001


Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

Map:   0%|          | 0/592 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Map:   0%|          | 0/74 [00:00<?, ? examples/s]

Starting training for john_paul_stevens


Attention type 'block_sparse' is not possible if sequence_length: 128 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.311113,0.418919,0.5,0.418919
2,No log,0.276693,0.418919,0.5,0.418919
3,No log,0.245706,0.472973,0.546512,0.472973
4,No log,0.339937,0.418919,0.5,0.418919
5,No log,0.267077,0.486486,0.513128,0.486486
6,No log,0.285363,0.486486,0.526632,0.486486
7,0.263500,0.301202,0.554054,0.58027,0.554054
8,0.263500,0.476613,0.445946,0.505251,0.445946
9,0.263500,0.493842,0.472973,0.510503,0.472973
10,0.263500,0.386282,0.527027,0.525506,0.527027


For justice john_paul_stevens, results are below:
{'eval_loss': 0.24554148316383362, 'eval_f1': 0.5466666666666666, 'eval_roc_auc': 0.5050215208034433, 'eval_accuracy': 0.5466666666666666, 'eval_runtime': 0.5989, 'eval_samples_per_second': 125.232, 'eval_steps_per_second': 16.698}
1.0    0.590141
0.0    0.409859
Name: votes_side_j__sonia_sotomayor, dtype: float64
Number of cases: 710
Number of training cases: 568.0
Number of validation cases: 71.0
Number of test cases: 71.0


Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

Map:   0%|          | 0/568 [00:00<?, ? examples/s]

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Starting training for sonia_sotomayor


Attention type 'block_sparse' is not possible if sequence_length: 128 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.23255,0.676056,0.5,0.676056
2,No log,0.241622,0.676056,0.5,0.676056
3,No log,0.233398,0.676056,0.5,0.676056
4,No log,0.229541,0.676056,0.5,0.676056
5,No log,0.230175,0.676056,0.5,0.676056
6,No log,0.260218,0.676056,0.5,0.676056
7,No log,0.21907,0.676056,0.5,0.676056
8,0.285000,0.223441,0.676056,0.5,0.676056
9,0.285000,0.233503,0.676056,0.5,0.676056
10,0.285000,0.379655,0.521127,0.532609,0.521127


For justice sonia_sotomayor, results are below:
{'eval_loss': 0.24052877724170685, 'eval_f1': 0.5915492957746479, 'eval_roc_auc': 0.5, 'eval_accuracy': 0.5915492957746479, 'eval_runtime': 0.5955, 'eval_samples_per_second': 119.219, 'eval_steps_per_second': 15.112}
1.0    0.604511
0.0    0.395489
Name: votes_side_j__david_h_souter, dtype: float64
Number of cases: 665
Number of training cases: 532.0
Number of validation cases: 66.5
Number of test cases: 66.5


Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

Map:   0%|          | 0/532 [00:00<?, ? examples/s]

Map:   0%|          | 0/67 [00:00<?, ? examples/s]

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

Starting training for david_h_souter


Attention type 'block_sparse' is not possible if sequence_length: 128 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.252047,0.606061,0.47619,0.606061
2,No log,0.230137,0.636364,0.5,0.636364
3,No log,0.252278,0.636364,0.5,0.636364
4,No log,0.257474,0.621212,0.488095,0.621212
5,No log,0.267234,0.545455,0.4375,0.545455
6,No log,0.294766,0.469697,0.485119,0.469697
7,No log,0.259784,0.606061,0.502976,0.606061
8,0.277200,0.305363,0.545455,0.455357,0.545455
9,0.277200,0.36981,0.5,0.428571,0.5
10,0.277200,0.340885,0.575758,0.52381,0.575758


For justice david_h_souter, results are below:
{'eval_loss': 0.2268718034029007, 'eval_f1': 0.6567164179104478, 'eval_roc_auc': 0.5, 'eval_accuracy': 0.6567164179104478, 'eval_runtime': 0.6008, 'eval_samples_per_second': 111.512, 'eval_steps_per_second': 14.979}
1.0    0.606612
0.0    0.393388
Name: votes_side_j__elena_kagan, dtype: float64
Number of cases: 605
Number of training cases: 484.0
Number of validation cases: 60.5
Number of test cases: 60.5


Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Starting training for elena_kagan


Attention type 'block_sparse' is not possible if sequence_length: 128 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.230049,0.666667,0.5,0.666667
2,No log,0.23172,0.666667,0.5,0.666667
3,No log,0.246213,0.666667,0.5,0.666667
4,No log,0.245564,0.666667,0.5,0.666667
5,No log,0.240491,0.666667,0.5125,0.666667
6,No log,0.285154,0.516667,0.525,0.516667
7,No log,0.243996,0.666667,0.6,0.666667
8,No log,0.319991,0.566667,0.5625,0.566667
9,0.245600,0.228069,0.716667,0.625,0.716667
10,0.245600,0.268733,0.65,0.5875,0.65


For justice elena_kagan, results are below:
{'eval_loss': 0.25925949215888977, 'eval_f1': 0.6229508196721312, 'eval_roc_auc': 0.5202380952380953, 'eval_accuracy': 0.6229508196721312, 'eval_runtime': 2.1303, 'eval_samples_per_second': 28.634, 'eval_steps_per_second': 3.755}
1.0    0.701031
0.0    0.298969
Name: votes_side_j__sandra_day_oconnor, dtype: float64
Number of cases: 388
Number of training cases: 310.40000000000003
Number of validation cases: 38.800000000000004
Number of test cases: 38.800000000000004


Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

Map:   0%|          | 0/310 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

Starting training for sandra_day_oconnor


Attention type 'block_sparse' is not possible if sequence_length: 128 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.223171,0.692308,0.5,0.692308
2,No log,0.210024,0.692308,0.5,0.692308
3,No log,0.265414,0.692308,0.5,0.692308
4,No log,0.230575,0.692308,0.5,0.692308
5,No log,0.240683,0.692308,0.5,0.692308
6,No log,0.25857,0.692308,0.5,0.692308
7,No log,0.254617,0.615385,0.444444,0.615385
8,No log,0.280655,0.538462,0.481481,0.538462
9,No log,0.302087,0.717949,0.587963,0.717949
10,No log,0.285221,0.717949,0.541667,0.717949


For justice sandra_day_oconnor, results are below:
{'eval_loss': 0.20715820789337158, 'eval_f1': 0.717948717948718, 'eval_roc_auc': 0.5, 'eval_accuracy': 0.717948717948718, 'eval_runtime': 0.4929, 'eval_samples_per_second': 79.116, 'eval_steps_per_second': 10.143}
1.0    0.665753
0.0    0.334247
Name: votes_side_j__william_h_rehnquist, dtype: float64
Number of cases: 365
Number of training cases: 292.0
Number of validation cases: 36.5
Number of test cases: 36.5


Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

Map:   0%|          | 0/292 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Starting training for william_h_rehnquist


Attention type 'block_sparse' is not possible if sequence_length: 128 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.263781,0.527778,0.4375,0.527778
2,No log,0.243056,0.666667,0.5,0.666667
3,No log,0.272211,0.666667,0.5,0.666667
4,No log,0.233342,0.666667,0.5,0.666667
5,No log,0.214336,0.666667,0.5,0.666667
6,No log,0.212004,0.666667,0.5,0.666667
7,No log,0.213972,0.666667,0.5,0.666667
8,No log,0.346552,0.527778,0.583333,0.527778
9,No log,0.230292,0.666667,0.5,0.666667
10,No log,0.231965,0.722222,0.604167,0.722222


For justice william_h_rehnquist, results are below:
{'eval_loss': 0.23358500003814697, 'eval_f1': 0.7027027027027027, 'eval_roc_auc': 0.5, 'eval_accuracy': 0.7027027027027027, 'eval_runtime': 0.5022, 'eval_samples_per_second': 73.675, 'eval_steps_per_second': 9.956}
1.0    0.610256
0.0    0.389744
Name: votes_side_j__neil_gorsuch, dtype: float64
Number of cases: 195
Number of training cases: 156.0
Number of validation cases: 19.5
Number of test cases: 19.5


Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Starting training for neil_gorsuch


Attention type 'block_sparse' is not possible if sequence_length: 128 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.551511,0.421053,0.5,0.421053
2,No log,0.185686,0.631579,0.5625,0.631579
3,No log,0.310393,0.578947,0.5,0.578947
4,No log,0.282507,0.578947,0.5,0.578947
5,No log,0.296146,0.578947,0.5,0.578947
6,No log,0.352199,0.473684,0.409091,0.473684
7,No log,0.28382,0.578947,0.5,0.578947
8,No log,0.362889,0.578947,0.5,0.578947
9,No log,0.323824,0.578947,0.5,0.578947
10,No log,0.347201,0.526316,0.471591,0.526316


For justice neil_gorsuch, results are below:
{'eval_loss': 0.20227554440498352, 'eval_f1': 0.75, 'eval_roc_auc': 0.5833333333333333, 'eval_accuracy': 0.75, 'eval_runtime': 0.4773, 'eval_samples_per_second': 41.902, 'eval_steps_per_second': 6.285}
1.0    0.626087
0.0    0.373913
Name: votes_side_j__brett_m_kavanaugh, dtype: float64
Number of cases: 115
Number of training cases: 92.0
Number of validation cases: 11.5
Number of test cases: 11.5


Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

Map:   0%|          | 0/92 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Starting training for brett_m_kavanaugh


Attention type 'block_sparse' is not possible if sequence_length: 128 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.837844,0.272727,0.5,0.272727
2,No log,0.649556,0.272727,0.5,0.272727
3,No log,0.286691,0.363636,0.354167,0.363636
4,No log,0.226017,0.727273,0.5,0.727273
5,No log,0.228119,0.727273,0.5,0.727273
6,No log,0.224404,0.727273,0.5,0.727273
7,No log,0.246948,0.727273,0.5,0.727273
8,No log,0.315331,0.545455,0.375,0.545455
9,No log,0.264422,0.727273,0.5,0.727273
10,No log,0.296919,0.727273,0.5,0.727273


For justice brett_m_kavanaugh, results are below:
{'eval_loss': 0.3099692761898041, 'eval_f1': 0.5, 'eval_roc_auc': 0.5, 'eval_accuracy': 0.5, 'eval_runtime': 0.3956, 'eval_samples_per_second': 30.333, 'eval_steps_per_second': 5.055}


In [18]:
first_6 = overall_results.copy()
for justice, test_results in first_6.items():
    overall_results[justice] = test_results

In [19]:
for justice, test_results in overall_results.items():
    print(justice)
    print(f'Baseline petitioner winrate: {baselines[justice]:.2%}')
    print(f'loss: {test_results["eval_loss"]:.2%}')
    print(f'f1: {test_results["eval_f1"]:.2%}')
    print(f'roc_auc: {test_results["eval_roc_auc"]:.2%}')
    print(f'accuracy: {test_results["eval_accuracy"]:.2%}')
    print('\n--------------------------------------\n')

samuel_a_alito_jr
Baseline petitioner winrate: 60.17%
loss: 25.56%
f1: 60.82%
roc_auc: 61.81%
accuracy: 60.82%

--------------------------------------

john_paul_stevens
Baseline petitioner winrate: 57.22%
loss: 24.55%
f1: 54.67%
roc_auc: 50.50%
accuracy: 54.67%

--------------------------------------

sonia_sotomayor
Baseline petitioner winrate: 59.01%
loss: 24.05%
f1: 59.15%
roc_auc: 50.00%
accuracy: 59.15%

--------------------------------------

david_h_souter
Baseline petitioner winrate: 60.45%
loss: 22.69%
f1: 65.67%
roc_auc: 50.00%
accuracy: 65.67%

--------------------------------------

elena_kagan
Baseline petitioner winrate: 60.66%
loss: 25.93%
f1: 62.30%
roc_auc: 52.02%
accuracy: 62.30%

--------------------------------------

sandra_day_oconnor
Baseline petitioner winrate: 70.10%
loss: 20.72%
f1: 71.79%
roc_auc: 50.00%
accuracy: 71.79%

--------------------------------------

william_h_rehnquist
Baseline petitioner winrate: 66.58%
loss: 23.36%
f1: 70.27%
roc_auc: 50.00%
ac

### Case Outcome

In [21]:
def preprocess_all_cases(test_valid_amt = 0.2, valid_portion_test_valid_amt = 0.5):
    '''
    Takes a win_side column and gets data ready for model

    Inputs:
        test_valid_amt (float): Percentage of data allocated to testing / validation
        valid_portion_test_valid_amt (float): Percentage of data allocated to validation
    '''

    # Concatenate utterances per case
    df_j = df[['case_id', 'new_text'] + ["win_side"]]
    grouped_df = df_j.groupby('case_id')['new_text'].apply(lambda x: ','.join(x)).reset_index()

    # Get justice and combine with utterances
    justices = df[["case_id"] + ["win_side"]].drop_duplicates(keep='first')
    df1 = pd.merge(grouped_df, justices, left_on='case_id', right_on='case_id', how='left').dropna(axis='rows', how='any')

    # Ensure binary label
    df1 = df1.drop(df1[~df1["win_side"].isin([0, 1])].index)
    print(df1["win_side"].value_counts(normalize=True))

    pct_overturn = df1["win_side"].value_counts(normalize=True)[1]

    print(f'Number of cases: {len(df1)}')
    print(f'Number of training cases: {len(df1) * (1-test_valid_amt)}')
    print(f'Number of validation cases: {len(df1) * (test_valid_amt * (valid_portion_test_valid_amt))}')
    print(f'Number of test cases: {len(df1) * (test_valid_amt * (1-valid_portion_test_valid_amt))}')

    # Create dataset
    dataset = Dataset.from_pandas(df1.drop('case_id', axis=1), preserve_index = False)

    # Train/Validation/Test split
    dataset = dataset.train_test_split(test_size=test_valid_amt, shuffle=True)
    dataset_test_valid = dataset['test'].train_test_split(test_size=(1-valid_portion_test_valid_amt),
                                                        shuffle=True)
    dataset = DatasetDict({
        'train': dataset['train'],
        'test': dataset_test_valid['test'],
        'validation': dataset_test_valid['train']})

    # Keep case_id per observation
    labels = [label for label in dataset['train'].features.keys() if label not in ['case_id', 'new_text']]
    id2label = {idx:label for idx, label in enumerate(labels)}
    label2id = {label:idx for idx, label in enumerate(labels)}

    return dataset, labels, id2label, label2id, pct_overturn

In [22]:
# Get data
dataset, labels, id2label, label2id, pct_overturn = preprocess_all_cases(validation_and_testing_percentage,
                                                                         split_of_validation_and_testing)

baseline = pct_overturn

# Load model and tokenizer and define length of the text sequence
model = BigBirdForSequenceClassification.from_pretrained('google/bigbird-roberta-base',
                                                            gradient_checkpointing=False,
                                                            num_labels = len(labels),
                                                            id2label=id2label,
                                                            label2id=label2id,
                                                            cache_dir=cache_directory,
                                                            return_dict=True)
# Tokenize dataset
encoded_dataset = dataset.map(preprocess_data,
                                batched=True,
                                remove_columns=dataset['train'].column_names)

# get args with appropriate model name
args = TrainingArguments(
    output_dir = output_directory,
    num_train_epochs = epochs,
    per_device_train_batch_size = train_batch_size,
    # gradient_accumulation_steps = 32,    
    per_device_eval_batch_size= eval_batch_size,
    evaluation_strategy = save_and_eval_strat,
    save_strategy = save_and_eval_strat,
    save_total_limit = 2,
    disable_tqdm = False, 
    load_best_model_at_end=True,
    warmup_steps=warm_steps,
    weight_decay=wt_decay,
    # logging_steps = 4,
    learning_rate = lr,
    # logging_dir='gdrive/MyDrive/Colab Notebooks/AdvML/logs/',
    dataloader_num_workers = dataloader_num_workers_input,
    run_name = f'bigbird_classification_all_cases_{lr}'
    )
# Run trainer and save output

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    )
trainer.train()

results = trainer.evaluate()

# trainer.save_model(f'gdrive/MyDrive/Colab Notebooks/AdvML/models/bigbird_all_cases_{lr}_v01')

# from transformers import AutoModel
# trained_model = AutoModel.from_pretrained(f'gdrive/MyDrive/Colab Notebooks/AdvML/models/bigbird_all_cases_{lr}_v01')

# Initialize a new trainer instance with the trained model and test data
trainer_test = Trainer(
    model=model,  
    args=args,
    eval_dataset=encoded_dataset["test"], 
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    )

# Evaluate the model on the test data
test_results = trainer_test.evaluate()

# Print results
print(f'For all_cases, results are below:')
print(test_results)

# Add results to output
overall_results = test_results


1.0    0.657143
0.0    0.342857
Name: win_side, dtype: float64
Number of cases: 1400
Number of training cases: 1120.0
Number of validation cases: 140.0
Number of test cases: 140.0


Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

Map:   0%|          | 0/1120 [00:00<?, ? examples/s]

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

Attention type 'block_sparse' is not possible if sequence_length: 128 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.210644,0.721429,0.5,0.721429
2,No log,0.204482,0.721429,0.5,0.721429
3,No log,0.228262,0.721429,0.5,0.721429
4,0.282100,0.211226,0.721429,0.5,0.721429
5,0.282100,0.202469,0.721429,0.5,0.721429
6,0.282100,0.213941,0.721429,0.5,0.721429
7,0.282100,0.214826,0.721429,0.5,0.721429
8,0.233900,0.20497,0.721429,0.5,0.721429
9,0.233900,0.205138,0.721429,0.5,0.721429
10,0.233900,0.218629,0.721429,0.5,0.721429


For all_cases, results are below:
{'eval_loss': 0.2409883737564087, 'eval_f1': 0.6571428571428571, 'eval_roc_auc': 0.5, 'eval_accuracy': 0.6571428571428571, 'eval_runtime': 0.7298, 'eval_samples_per_second': 191.831, 'eval_steps_per_second': 24.664}


In [23]:
print('Win side results')
print(f'Baseline petitioner winrate: {baseline:.2%}')
print(f'loss: {overall_results["eval_loss"]:.2%}')
print(f'f1: {overall_results["eval_f1"]:.2%}')
print(f'roc_auc: {overall_results["eval_roc_auc"]:.2%}')
print(f'accuracy: {overall_results["eval_accuracy"]:.2%}')

Win side results
Baseline petitioner winrate: 65.71%
loss: 24.10%
f1: 65.71%
roc_auc: 50.00%
accuracy: 65.71%
