In [1]:
# Install packages
!pip install datasets
!pip install transformers==4.28.0

# Clear Colab memory
import os
import shutil
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
try:
    shutil.rmtree('../content/roberta-case')
except FileNotFoundError:
    print('No previous models to remove')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collec

In [None]:
# Import libraries
import torch
import random
import numpy as np
import pandas as pd
from datasets import *
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EvalPrediction, AutoTokenizer

In [None]:
def preprocess_data(examples):
    """
    Takes a batch of texts and encodes them as input_ids such that BERT
    can function
    
    Input:
        - examples: Raw data in text form
    
    Output:
        - encoding: Encoded dataset
    """
    # Take a batch of texts and encode them
    text = examples["new_text"]
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
    
    # Add labels
    labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
    
    # Fill numpy array with the bach of labels
    labels_matrix = np.zeros((len(text), len(labels)))
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]
    encoding["labels"] = labels_matrix.tolist()
    
    return encoding



def multi_label_metrics(predictions, labels, threshold=0.5):
    """
    Computes the f1, roc, and accuracy while training.

    Inputs:
        - predictions: Predictions made by BERT (to be transformed)
        - labels: Labels against which to check the predictions
        - threshold (float): theshold to use for the prediction
    
    Output:
        - metrics (dict): f1, roc_auc, and accuracy
    """
    # Compute predictions from the model
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1

    # Compute f1, roc_auc, and accuracy
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)

    # Build dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics



def compute_metrics(p: EvalPrediction):
    """
    Wrapper to compute the metrics

    Input:
        - p (EvalPrediction): model output
    
    Output:
        - result (dict): f1, roc_auc, and accuracy
    """
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

def select_random_chars(string, max_length=3000):
    """
    BERT cannot process more than 512 tokens so we narrow down
    the text per case to a random window of 3000 characters

    Inputs:
        - string (str): the text of an utterance
        - max_length (int): the maximal length of the new string
    
    Output:
        - selected_chars (str): new, truncated text of an utterance
    """
    string_length = len(string)

    # Check if the string is long enough to select 3000 characters
    if string_length <= max_length:
        return string

    # Generate a random starting index
    start_index = random.randint(0, string_length - max_length)

    # Select the 3000 characters from the string
    selected_chars = string[start_index:start_index + max_length]
    
    return selected_chars

In [None]:
# Need the CPUs for what comes next
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"torch.device: {device}")

# Get list of files in folder
folder_path = "gdrive/MyDrive/data/"
file_list = os.listdir(folder_path)


# Concatenate the data
df_list = []
for file in file_list:
    # Check if file is a CSV and contains the data we want
    if file.endswith('.csv') and file.startswith('utterances_clean'):
        df = pd.read_csv(os.path.join(folder_path, file))
        df_list.append(df)
df = pd.concat(df_list, axis=0, ignore_index=True)

# For each utterance, add columns specifying who is addressed
df = df.merge(
    df[['speaker', 'side']].drop_duplicates(ignore_index=True),
    how='left',
    left_on='speaker_addressed',
    right_on='speaker',
    suffixes=('', '_addressed')).drop('speaker_addressed', axis=1)

# Translate into natural text (e.g., 'J' --> 'Justice', 'j__clarence_thomas' --> 'Clarence Thomas')
for name_col, type_col in {'speaker': 'speaker_type', 'speaker_replied_to': 'speaker_type_replied_to'}.items():
    df[f'{name_col}_natural'] = df[name_col].apply(lambda name: ' '.join(name.lstrip('j__').split('_')).title())
    speaker_type_translation = {
        'J': 'Justice',
        'A': 'Attorney',
        '<Inaudile>': None
    }

    df[type_col].fillna('na', inplace=True)
    df[f'{type_col}_natural'] = df[type_col].apply(lambda s_type: speaker_type_translation[s_type] if not s_type == 'na' else None)

    df[f'{name_col}_natural'] = df[f'{type_col}_natural'] + ' ' + df[f'{name_col}_natural']
    df[f'{name_col}_natural'].fillna('Unknown', inplace=True)
    df.drop(f'{type_col}_natural', axis=1, inplace=True)

# Also translate the side into natural text (e.g., '1' --> 'Petitioning' (attorney))
side_translation = {
    0: 'Responding',
    1: 'Petitioning',
    2: '',
    3: ''
}

df['side'].fillna(3, inplace=True)
df[f'side_natural'] = df['side'].apply(lambda side: side_translation[side])

df['side_addressed'].fillna(3, inplace=True)
df[f'side_addressed_natural'] = df['side_addressed'].apply(lambda side: side_translation[side])

# Enrich the text of an utterance with some context information
df["new_text"] = "<UTTERANCE_START>" + df['side_natural'] + " " + df["speaker_natural"] + " says: '" + df["text"] + "' to " + df['side_addressed_natural'] + " " + df["speaker_replied_to_natural"] + " <UTTERANCE_END>"



  df = pd.read_csv(os.path.join(folder_path, file))
  df = pd.read_csv(os.path.join(folder_path, file))
  df = pd.read_csv(os.path.join(folder_path, file))
  df = pd.read_csv(os.path.join(folder_path, file))


In [None]:
# Choose justices
j_columns = ['votes_side_j__ruth_bader_ginsburg', 'votes_side_j__clarence_thomas', 'votes_side_j__stephen_g_breyer', 'votes_side_j__anthony_m_kennedy', 'votes_side_j__antonin_scalia', 'votes_side_j__john_g_roberts_jr', 'votes_side_j__samuel_a_alito_jr', 'votes_side_j__john_paul_stevens', 'votes_side_j__sonia_sotomayor', 'votes_side_j__david_h_souter', 'votes_side_j__elena_kagan', 'votes_side_j__sandra_day_oconnor', 'votes_side_j__william_h_rehnquist', 'votes_side_j__neil_gorsuch', 'votes_side_j__brett_m_kavanaugh', 'win_side']


In [None]:

results = pd.DataFrame()

for justice in j_columns:

  # Data Wrangling for each justice
  df_j = df[['case_id', 'new_text'] + [justice]]
  grouped_df = df_j.groupby('case_id')['new_text'].apply(lambda x: ','.join(x)).reset_index()
  justices = df[["case_id"] + [justice]].drop_duplicates(keep='first')

  df1 = pd.merge(grouped_df, justices, left_on='case_id', right_on='case_id', how='left').dropna(axis='rows', how='any')
  df1 = df1.drop(df1[~df1[justice].isin([0, 1])].index)

  df1['new_text'] = df1['new_text'].apply(select_random_chars)

  dataset = Dataset.from_pandas(df1.drop('case_id', axis=1), preserve_index = False)

  dataset = dataset.train_test_split(test_size=0.3, shuffle=True)

  dataset_test_valid = dataset['test'].train_test_split(test_size=0.5, shuffle=True)

  dataset = DatasetDict({
      'train': dataset['train'],
      'test': dataset_test_valid['test'],
      'validation': dataset_test_valid['train']})

  labels = [label for label in dataset['train'].features.keys() if label not in ['case_id', 'new_text']]
  id2label = {idx:label for idx, label in enumerate(labels)}
  label2id = {label:idx for idx, label in enumerate(labels)}
  
  # Start creating model
  tokenizer = AutoTokenizer.from_pretrained('roberta-base', return_overflowing_tokens=True)

  encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

  model = AutoModelForSequenceClassification.from_pretrained('roberta-base', 
                                                            problem_type="multi_label_classification", 
                                                            num_labels=len(labels),
                                                            id2label=id2label,
                                                            label2id=label2id)

  batch_size = 8
  metric_name = "f1"

  args = TrainingArguments(
      "test-trainer",
      evaluation_strategy = "epoch",
      save_total_limit = 2,
      save_strategy = "epoch",
      learning_rate=2e-5,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      num_train_epochs=15,
      weight_decay=0.01,
      load_best_model_at_end=True,
      metric_for_best_model=metric_name,
      #push_to_hub=True,
  )

  trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
  )
  trainer.train()

  val_metrics = trainer.evaluate()
  val_metrics = dict((key.replace('eval', 'val'), [value]) for (key, value) in val_metrics.items())

  # Initialize a new trainer instance with the trained model and test data
  trainer_test = Trainer(
      model=model,  
      args=args,
      eval_dataset=encoded_dataset["test"], 
      tokenizer=tokenizer,
      compute_metrics=compute_metrics
      )

  # Evaluate the model on the test data
  test_metrics = trainer_test.evaluate()
  test_metrics = dict((key.replace('eval', 'test'), [value]) for (key, value) in test_metrics.items())

  metrics = val_metrics
  metrics.update(test_metrics)
  metrics['justice'] = [justice]

  results = pd.concat([results, pd.DataFrame(metrics)], ignore_index=True)

results = results[['justice', 'epoch'] +[col for col in results.columns if col.startswith('val_')] +
    [col for col in results.columns if col.startswith('test_')]]



1.0    0.584173
0.0    0.415827
Name: votes_side_j__ruth_bader_ginsburg, dtype: float64


Map:   0%|          | 0/973 [00:00<?, ? examples/s]

Map:   0%|          | 0/209 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.673158,0.600962,0.5,0.600962
2,No log,0.675778,0.600962,0.5,0.600962
3,No log,0.674743,0.600962,0.5,0.600962
4,No log,0.673566,0.600962,0.5,0.600962
5,0.684600,0.67349,0.600962,0.5,0.600962
6,0.684600,0.69042,0.600962,0.5,0.600962
7,0.684600,0.714008,0.5,0.476723,0.5
8,0.684600,0.984292,0.514423,0.500867,0.514423
9,0.612700,1.146168,0.5,0.498988,0.5
10,0.612700,1.843978,0.528846,0.504771,0.528846


1.0    0.586878
0.0    0.413122
Name: votes_side_j__clarence_thomas, dtype: float64


Map:   0%|          | 0/970 [00:00<?, ? examples/s]

Map:   0%|          | 0/209 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.683534,0.576923,0.5,0.576923
2,No log,0.681871,0.576923,0.5,0.576923
3,No log,0.682335,0.576923,0.5,0.576923
4,No log,0.683364,0.576923,0.5,0.576923
5,0.681700,0.688987,0.576923,0.5,0.576923
6,0.681700,0.734528,0.5,0.519697,0.5
7,0.681700,0.922864,0.451923,0.488636,0.451923
8,0.681700,1.333007,0.490385,0.488636,0.490385
9,0.512300,2.511125,0.451923,0.438636,0.451923
10,0.512300,2.839279,0.504808,0.498106,0.504808


1.0    0.626453
0.0    0.373547
Name: votes_side_j__stephen_g_breyer, dtype: float64


Map:   0%|          | 0/963 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Map:   0%|          | 0/206 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.632885,0.684466,0.5,0.684466
2,No log,0.626701,0.684466,0.5,0.684466
3,No log,0.64736,0.684466,0.5,0.684466
4,No log,0.628742,0.684466,0.5,0.684466
5,0.678300,0.631108,0.684466,0.5,0.684466
6,0.678300,0.638079,0.684466,0.5,0.684466
7,0.678300,0.628349,0.684466,0.5,0.684466
8,0.678300,0.631074,0.684466,0.5,0.684466
9,0.672000,0.631505,0.684466,0.5,0.684466
10,0.672000,0.635825,0.684466,0.5,0.684466


1.0    0.664557
0.0    0.335443
Name: votes_side_j__anthony_m_kennedy, dtype: float64


Map:   0%|          | 0/884 [00:00<?, ? examples/s]

Map:   0%|          | 0/190 [00:00<?, ? examples/s]

Map:   0%|          | 0/190 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.651891,0.657895,0.5,0.657895
2,No log,0.655533,0.657895,0.5,0.657895
3,No log,0.6472,0.657895,0.5,0.657895
4,No log,0.772644,0.652632,0.496,0.652632
5,0.632000,0.783698,0.557895,0.472,0.557895
6,0.632000,1.404155,0.605263,0.515385,0.605263
7,0.632000,1.694525,0.621053,0.486769,0.621053
8,0.632000,2.060787,0.584211,0.495692,0.584211
9,0.632000,2.540893,0.568421,0.487385,0.568421
10,0.226200,2.465825,0.605263,0.508,0.605263


1.0    0.630275
0.0    0.369725
Name: votes_side_j__antonin_scalia, dtype: float64


Map:   0%|          | 0/763 [00:00<?, ? examples/s]

Map:   0%|          | 0/164 [00:00<?, ? examples/s]

Map:   0%|          | 0/163 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.68041,0.588957,0.5,0.588957
2,No log,0.678329,0.588957,0.5,0.588957
3,No log,0.680302,0.588957,0.5,0.588957
4,No log,0.677637,0.588957,0.5,0.588957
5,No log,0.684961,0.588957,0.5,0.588957
6,0.664100,0.704998,0.588957,0.5,0.588957
7,0.664100,0.856244,0.546012,0.483831,0.546012
8,0.664100,1.542685,0.453988,0.462065,0.453988
9,0.664100,2.463481,0.447853,0.474891,0.447853
10,0.664100,2.533551,0.552147,0.502565,0.552147


1.0    0.653041
0.0    0.346959
Name: votes_side_j__john_g_roberts_jr, dtype: float64


Map:   0%|          | 0/702 [00:00<?, ? examples/s]

Map:   0%|          | 0/151 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.62376,0.686667,0.5,0.686667
2,No log,0.627843,0.686667,0.5,0.686667
3,No log,0.625796,0.686667,0.5,0.686667
4,No log,0.652413,0.686667,0.5,0.686667
5,No log,0.780027,0.68,0.495146,0.68
6,0.617700,1.184652,0.513333,0.425842,0.513333
7,0.617700,1.601647,0.566667,0.429973,0.566667
8,0.617700,2.294566,0.546667,0.421194,0.546667
9,0.617700,2.437323,0.54,0.479963,0.54
10,0.617700,2.761765,0.553333,0.437616,0.553333


1.0    0.601651
0.0    0.398349
Name: votes_side_j__samuel_a_alito_jr, dtype: float64


Map:   0%|          | 0/678 [00:00<?, ? examples/s]

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

Map:   0%|          | 0/145 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.687771,0.57931,0.5,0.57931
2,No log,0.683524,0.57931,0.5,0.57931
3,No log,0.678531,0.57931,0.5,0.57931
4,No log,0.701114,0.537931,0.479996,0.537931
5,No log,0.849144,0.524138,0.5242,0.524138
6,0.628200,0.994936,0.531034,0.474044,0.531034
7,0.628200,2.050019,0.524138,0.45687,0.524138
8,0.628200,2.452073,0.517241,0.462139,0.517241
9,0.628200,2.624908,0.524138,0.474824,0.524138
10,0.628200,2.82533,0.517241,0.468872,0.517241


1.0    0.5722
0.0    0.4278
Name: votes_side_j__john_paul_stevens, dtype: float64


Map:   0%|          | 0/518 [00:00<?, ? examples/s]

Map:   0%|          | 0/112 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.700339,0.495495,0.5,0.495495
2,No log,0.716379,0.495495,0.5,0.495495
3,No log,0.704804,0.495495,0.5,0.495495
4,No log,0.705233,0.495495,0.5,0.495495
5,No log,0.733289,0.540541,0.543019,0.540541
6,No log,0.835364,0.558559,0.562013,0.558559
7,No log,0.96124,0.54955,0.549675,0.54955
8,0.596400,1.306536,0.531532,0.532955,0.531532
9,0.596400,2.15051,0.495495,0.498701,0.495495
10,0.596400,2.675558,0.513514,0.515747,0.513514


1.0    0.590141
0.0    0.409859
Name: votes_side_j__sonia_sotomayor, dtype: float64


Map:   0%|          | 0/497 [00:00<?, ? examples/s]

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.664657,0.650943,0.5,0.650943
2,No log,0.702765,0.330189,0.466706,0.330189
3,No log,0.658673,0.650943,0.5,0.650943
4,No log,0.672924,0.650943,0.5,0.650943
5,No log,0.681248,0.575472,0.485899,0.575472
6,No log,0.668439,0.622642,0.515864,0.622642
7,No log,1.119565,0.40566,0.462005,0.40566
8,0.642600,1.040764,0.433962,0.446142,0.433962
9,0.642600,1.628731,0.471698,0.456326,0.471698
10,0.642600,2.365328,0.443396,0.453388,0.443396


1.0    0.604511
0.0    0.395489
Name: votes_side_j__david_h_souter, dtype: float64


Map:   0%|          | 0/465 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.683548,0.58,0.5,0.58
2,No log,0.680173,0.58,0.5,0.58
3,No log,0.679653,0.58,0.5,0.58
4,No log,0.684945,0.58,0.5,0.58
5,No log,0.705127,0.57,0.494663,0.57
6,No log,0.740806,0.46,0.468801,0.46
7,No log,0.822673,0.57,0.543924,0.57
8,No log,0.993156,0.51,0.521757,0.51
9,0.614900,1.158034,0.48,0.482759,0.48
10,0.614900,1.318878,0.54,0.527915,0.54


1.0    0.606612
0.0    0.393388
Name: votes_side_j__elena_kagan, dtype: float64


Map:   0%|          | 0/423 [00:00<?, ? examples/s]

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.66005,0.615385,0.5,0.615385
2,No log,0.681424,0.615385,0.5,0.615385
3,No log,0.687891,0.615385,0.5,0.615385
4,No log,0.656845,0.648352,0.553571,0.648352
5,No log,0.71126,0.604396,0.496429,0.604396
6,No log,0.82336,0.527473,0.508929,0.527473
7,No log,1.410404,0.615385,0.516071,0.615385
8,No log,1.919462,0.582418,0.526786,0.582418
9,No log,2.252269,0.582418,0.521429,0.582418
10,0.461800,2.358747,0.582418,0.553571,0.582418


1.0    0.701031
0.0    0.298969
Name: votes_side_j__sandra_day_oconnor, dtype: float64


Map:   0%|          | 0/271 [00:00<?, ? examples/s]

Map:   0%|          | 0/59 [00:00<?, ? examples/s]

Map:   0%|          | 0/58 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.565987,0.758621,0.5,0.758621
2,No log,0.549653,0.758621,0.5,0.758621
3,No log,0.546462,0.758621,0.5,0.758621
4,No log,0.593031,0.758621,0.5,0.758621
5,No log,0.663674,0.672414,0.516234,0.672414
6,No log,0.922784,0.534483,0.474026,0.534483
7,No log,1.691523,0.482759,0.464286,0.482759
8,No log,1.949574,0.534483,0.400974,0.534483
9,No log,2.157994,0.603448,0.446429,0.603448
10,No log,2.340673,0.586207,0.435065,0.586207


1.0    0.665753
0.0    0.334247
Name: votes_side_j__william_h_rehnquist, dtype: float64


Map:   0%|          | 0/255 [00:00<?, ? examples/s]

Map:   0%|          | 0/55 [00:00<?, ? examples/s]

Map:   0%|          | 0/55 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.692611,0.6,0.5,0.6
2,No log,0.683082,0.6,0.5,0.6
3,No log,0.672776,0.6,0.5,0.6
4,No log,0.750089,0.6,0.5,0.6
5,No log,0.8633,0.6,0.5,0.6
6,No log,0.651971,0.6,0.590909,0.6
7,No log,1.113413,0.6,0.522727,0.6
8,No log,1.511576,0.618182,0.55303,0.618182
9,No log,1.860005,0.6,0.537879,0.6
10,No log,2.241761,0.581818,0.522727,0.581818


1.0    0.610256
0.0    0.389744
Name: votes_side_j__neil_gorsuch, dtype: float64


Map:   0%|          | 0/136 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.673151,0.62069,0.5,0.62069
2,No log,0.66572,0.62069,0.5,0.62069
3,No log,0.661112,0.62069,0.5,0.62069
4,No log,0.660667,0.62069,0.5,0.62069
5,No log,0.663759,0.62069,0.5,0.62069
6,No log,0.750001,0.62069,0.5,0.62069
7,No log,0.919402,0.62069,0.5,0.62069
8,No log,0.922087,0.655172,0.545455,0.655172
9,No log,1.09183,0.586207,0.472222,0.586207
10,No log,1.3315,0.62069,0.535354,0.62069


1.0    0.626087
0.0    0.373913
Name: votes_side_j__brett_m_kavanaugh, dtype: float64


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/17 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.663697,0.647059,0.5,0.647059
2,No log,0.646605,0.647059,0.5,0.647059
3,No log,0.651071,0.647059,0.5,0.647059
4,No log,0.645448,0.647059,0.5,0.647059
5,No log,0.644954,0.647059,0.5,0.647059
6,No log,0.653207,0.647059,0.5,0.647059
7,No log,0.748025,0.647059,0.5,0.647059
8,No log,0.579056,0.647059,0.537879,0.647059
9,No log,0.895366,0.647059,0.5,0.647059
10,No log,0.563002,0.705882,0.69697,0.705882


1.0    0.657143
0.0    0.342857
Name: win_side, dtype: float64


Map:   0%|          | 0/980 [00:00<?, ? examples/s]

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.625372,0.704762,0.5,0.704762
2,No log,0.625446,0.704762,0.5,0.704762
3,No log,0.613686,0.704762,0.5,0.704762
4,No log,0.630181,0.704762,0.5,0.704762
5,0.660400,0.730334,0.561905,0.581408,0.561905
6,0.660400,0.775058,0.633333,0.524303,0.633333
7,0.660400,1.385256,0.647619,0.539124,0.647619
8,0.660400,1.591407,0.652381,0.519071,0.652381
9,0.409100,2.081824,0.619048,0.570401,0.619048
10,0.409100,2.142974,0.633333,0.528989,0.633333


In [None]:
results


Unnamed: 0,justice,epoch,val_loss,val_f1,val_roc_auc,val_accuracy,val_runtime,val_samples_per_second,val_steps_per_second,test_loss,test_f1,test_roc_auc,test_accuracy,test_runtime,test_samples_per_second,test_steps_per_second
0,votes_side_j__ruth_bader_ginsburg,15.0,0.673158,0.600962,0.5,0.600962,1.6294,127.654,15.957,0.679074,0.583732,0.5,0.583732,1.5806,132.226,17.082
1,votes_side_j__clarence_thomas,15.0,0.683534,0.576923,0.5,0.576923,1.6136,128.902,16.113,0.668339,0.617225,0.5,0.617225,1.5732,132.849,17.162
2,votes_side_j__stephen_g_breyer,15.0,0.632885,0.684466,0.5,0.684466,1.6245,126.81,16.005,0.645785,0.657005,0.5,0.657005,1.5372,134.663,16.914
3,votes_side_j__anthony_m_kennedy,15.0,0.651891,0.657895,0.5,0.657895,1.4871,127.764,16.139,0.638538,0.689474,0.5,0.689474,1.4126,134.507,16.99
4,votes_side_j__antonin_scalia,15.0,0.68041,0.588957,0.5,0.588957,1.2805,127.292,16.4,0.645863,0.658537,0.5,0.658537,1.2528,130.909,16.763
5,votes_side_j__john_g_roberts_jr,15.0,0.62376,0.686667,0.5,0.686667,1.1629,128.988,16.338,0.691134,0.582781,0.5,0.582781,1.1383,132.654,16.692
6,votes_side_j__samuel_a_alito_jr,15.0,0.687771,0.57931,0.5,0.57931,1.1363,127.605,16.721,0.680201,0.643836,0.5,0.643836,1.105,132.128,17.195
7,votes_side_j__john_paul_stevens,15.0,0.835364,0.558559,0.562013,0.558559,0.9049,122.665,15.471,0.905748,0.517857,0.533844,0.517857,0.8515,131.532,16.442
8,votes_side_j__sonia_sotomayor,15.0,0.664657,0.650943,0.5,0.650943,0.842,125.896,16.628,0.665082,0.654206,0.5,0.654206,0.806,132.759,17.37
9,votes_side_j__david_h_souter,15.0,0.683548,0.58,0.5,0.58,0.7915,126.342,16.425,0.668417,0.68,0.5,0.68,0.7476,133.759,17.389


In [None]:
print(df1)