In [None]:
# Install packages (IPython shell escapes; only `transformers` is version-pinned).
# NOTE(review): `datasets` is unpinned, so a fresh run may resolve a different
# version than the one in the recorded output — consider pinning it too.
!pip install datasets
!pip install transformers==4.28.0

# Mount Google Drive and delete a model directory left over from a previous run.
import os
import shutil
from google.colab import drive
# force_remount=True re-mounts the drive even if it is already mounted.
drive.mount('/content/gdrive', force_remount=True)
try:
    # Path is relative to the current working directory.
    # NOTE(review): the training cell writes to "test-trainer", not
    # "roberta-case" — confirm this is still the directory that needs clearing.
    shutil.rmtree('../content/roberta-case')
except FileNotFoundError:
    # Nothing to clean up: the directory does not exist.
    print('No previous models to remove')


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collec

In [None]:
# Import libraries
import torch
import random
import numpy as np
import pandas as pd
from datasets import *
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EvalPrediction, AutoTokenizer

In [None]:
def preprocess_data(examples):
    """
    Tokenizes a batch of texts and attaches the label matrix, so the batch
    can be fed to the sequence-classification model.

    Relies on two module-level names set by the training cell: `tokenizer`
    and `labels` (the list of label column names).

    Input:
        - examples: batch of raw rows; must contain "new_text" and every
          column named in `labels`

    Output:
        - encoding: tokenizer output with an added "labels" entry holding
          one row per text and one float column per label
    """
    # Tokenize the batch, padding/truncating every text to 128 tokens
    texts = examples["new_text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True, max_length=128)

    # Keep only the columns of the batch that are label columns
    label_columns = {name: examples[name] for name in examples.keys() if name in labels}

    # Fill a (n_texts, n_labels) array column by column with the batch of labels
    label_array = np.zeros((len(texts), len(labels)))
    for column, name in enumerate(labels):
        label_array[:, column] = label_columns[name]

    encoded["labels"] = label_array.tolist()
    return encoded



def multi_label_metrics(predictions, labels, threshold=0.5):
    """
    Computes the f1, roc, and accuracy while training.

    Inputs:
        - predictions: raw model outputs (logits); a sigmoid is applied here
          (presumably shaped (n_samples, n_labels) — confirm against caller)
        - labels: ground-truth labels against which to check the predictions
        - threshold (float): probability cut-off above which (inclusive) a
          label counts as predicted; used for f1 and accuracy only

    Output:
        - metrics (dict): micro-averaged 'f1', micro-averaged 'roc_auc'
          (computed from the probabilities), and 'accuracy'
    """
    # Turn the logits into probabilities, then into hard 0/1 predictions
    probs = torch.sigmoid(torch.Tensor(predictions)).numpy()
    y_pred = (probs >= threshold).astype(float)

    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric and must be fed scores, not thresholded
    # decisions: with 0/1 inputs it degenerates (e.g. exactly 0.5 whenever the
    # model predicts the same class for every sample).
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)

    # Build dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics



def compute_metrics(p: EvalPrediction):
    """
    Adapter between the Trainer and `multi_label_metrics`.

    Input:
        - p (EvalPrediction): model output; `p.predictions` may be a tuple,
          in which case only its first element is used

    Output:
        - result (dict): f1, roc_auc, and accuracy
    """
    raw_output = p.predictions
    if isinstance(raw_output, tuple):
        raw_output = raw_output[0]
    return multi_label_metrics(predictions=raw_output, labels=p.label_ids)

def select_random_chars(string, max_length=3000):
    """
    BERT cannot process more than 512 tokens, so long texts are cut down to
    a randomly positioned window of at most `max_length` characters.

    Inputs:
        - string (str): the text of an utterance
        - max_length (int): the maximal length of the new string

    Output:
        - selected_chars (str): the unchanged text if it already fits,
          otherwise a contiguous slice of exactly `max_length` characters
    """
    total = len(string)

    if total > max_length:
        # Any start offset that leaves room for a full window is allowed
        # (randint is inclusive on both ends).
        offset = random.randint(0, total - max_length)
        return string[offset:offset + max_length]

    # Short enough already: nothing to cut
    return string

In [None]:
# Report whether a GPU is available (`device` is only printed here; it is not
# referenced again in the visible cells)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"torch.device: {device}")

# Get list of files in folder. os.listdir returns entries in arbitrary order,
# so sort them: the row order of the concatenated frame (and everything that
# depends on it downstream) is then the same on every run.
folder_path = "gdrive/MyDrive/data/"
file_list = sorted(os.listdir(folder_path))


# Concatenate the data
df_list = []
for file in file_list:
    # Check if file is a CSV and contains the data we want
    if file.endswith('.csv') and file.startswith('utterances_clean'):
        # NOTE(review): the recorded output shows pandas warnings on this line
        # (likely mixed-dtype columns) — consider passing explicit dtypes.
        df = pd.read_csv(os.path.join(folder_path, file))
        df_list.append(df)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when nothing matched.
if not df_list:
    raise FileNotFoundError(
        f"No 'utterances_clean*.csv' files found in {folder_path!r}")
df = pd.concat(df_list, axis=0, ignore_index=True)

# For each utterance, add columns specifying who is addressed:
# left-join the utterances against the distinct (speaker, side) pairs, matching
# `speaker_addressed` on the left to `speaker` on the right. Right-hand columns
# whose names clash with the left frame get the `_addressed` suffix (this is
# where `side_addressed` comes from); `speaker_addressed` is dropped afterwards.
# NOTE(review): a speaker that occurs with more than one `side` would match
# several right-hand rows and duplicate the utterance — confirm the pairs are
# unique per speaker.
df = df.merge(
    df[['speaker', 'side']].drop_duplicates(ignore_index=True),
    how='left',
    left_on='speaker_addressed',
    right_on='speaker',
    suffixes=('', '_addressed')).drop('speaker_addressed', axis=1)

# Translate into natural text (e.g., 'J' --> 'Justice', 'j__clarence_thomas' --> 'Clarence Thomas')
# Loop-invariant lookup from speaker-type code to its natural-language title.
speaker_type_translation = {
    'J': 'Justice',
    'A': 'Attorney',
    '<Inaudile>': None
}


def _natural_name(name):
    """Turn an identifier such as 'j__clarence_thomas' into 'Clarence Thomas'."""
    # Remove only the literal 'j__' prefix. str.lstrip('j__') strips a *set* of
    # characters, so it also ate the first letter of names starting with 'j'
    # ('j__john_g_roberts_jr' became 'Ohn G Roberts Jr').
    if name.startswith('j__'):
        name = name[len('j__'):]
    return ' '.join(name.split('_')).title()


for name_col, type_col in {'speaker': 'speaker_type', 'speaker_replied_to': 'speaker_type_replied_to'}.items():
    df[f'{name_col}_natural'] = df[name_col].apply(_natural_name)

    # Missing speaker types are stored as the sentinel 'na' in the type column
    df[type_col] = df[type_col].fillna('na')
    # Natural-language title per row; None for 'na' (unknown codes raise KeyError)
    type_natural = df[type_col].apply(
        lambda s_type: speaker_type_translation[s_type] if not s_type == 'na' else None)

    # Prefix the name with the title; rows without a title end up missing and
    # are labelled 'Unknown'
    df[f'{name_col}_natural'] = (type_natural + ' ' + df[f'{name_col}_natural']).fillna('Unknown')

# Also translate the side into natural text (e.g., '1' --> 'Petitioning' (attorney))
side_translation = {0: 'Responding', 1: 'Petitioning', 2: '', 3: ''}

# Missing sides are coded as 3, which (like 2) translates to an empty string.
for side_col in ('side', 'side_addressed'):
    df[side_col] = df[side_col].fillna(3)
    df[f'{side_col}_natural'] = df[side_col].apply(lambda side: side_translation[side])

# Enrich the text of an utterance with some context information:
# who speaks (side + name), what is said, and who is addressed (side + name).
speaker_part = df['side_natural'] + " " + df["speaker_natural"]
addressee_part = df['side_addressed_natural'] + " " + df["speaker_replied_to_natural"]
df["new_text"] = (
    "<UTTERANCE_START>" + speaker_part
    + " says: '" + df["text"] + "' to "
    + addressee_part + " <UTTERANCE_END>"
)




  df = pd.read_csv(os.path.join(folder_path, file))
  df = pd.read_csv(os.path.join(folder_path, file))
  df = pd.read_csv(os.path.join(folder_path, file))
  df = pd.read_csv(os.path.join(folder_path, file))


In [None]:
# Choose justices: one vote column per justice, followed by the overall
# outcome column 'win_side'. Each entry is used as a prediction target.
j_columns = [
    f'votes_side_{justice_id}'
    for justice_id in (
        'j__ruth_bader_ginsburg',
        'j__clarence_thomas',
        'j__stephen_g_breyer',
        'j__anthony_m_kennedy',
        'j__antonin_scalia',
        'j__john_g_roberts_jr',
        'j__samuel_a_alito_jr',
        'j__john_paul_stevens',
        'j__sonia_sotomayor',
        'j__david_h_souter',
        'j__elena_kagan',
        'j__sandra_day_oconnor',
        'j__william_h_rehnquist',
        'j__neil_gorsuch',
        'j__brett_m_kavanaugh',
    )
] + ['win_side']


In [None]:
# Train and evaluate one RoBERTa classifier per target column in `j_columns`
# and collect the validation/test metrics of each run in `results`.

# Fixed seed for the dataset splits so train/validation/test membership is
# the same on every run (the original splits were unseeded).
SPLIT_SEED = 42

# The tokenizer does not depend on the justice, so load it once instead of
# once per iteration. `preprocess_data` reads it as a module-level name.
tokenizer = AutoTokenizer.from_pretrained('roberta-base', return_overflowing_tokens=True)

# One metrics dict per justice; converted to a DataFrame once after the loop
# instead of growing a DataFrame with pd.concat on every iteration.
metrics_rows = []

for justice in j_columns:

    # Data wrangling for each justice: one row per case, holding all of the
    # case's utterances joined with ',' plus this justice's vote.
    df_j = df[['case_id', 'new_text'] + [justice]]
    grouped_df = df_j.groupby('case_id')['new_text'].apply(lambda x: ','.join(x)).reset_index()
    justices = df[["case_id"] + [justice]].drop_duplicates(keep='first')

    df1 = pd.merge(grouped_df, justices, on='case_id', how='left').dropna(axis='rows', how='any')
    # Keep only cases with a binary vote (0 or 1)
    df1 = df1[df1[justice].isin([0, 1])]

    dataset = Dataset.from_pandas(df1.drop('case_id', axis=1), preserve_index=False)

    # 70% train, then the remaining 30% split in half into validation and test
    dataset = dataset.train_test_split(test_size=0.3, shuffle=True, seed=SPLIT_SEED)
    dataset_test_valid = dataset['test'].train_test_split(test_size=0.5, shuffle=True, seed=SPLIT_SEED)

    dataset = DatasetDict({
        'train': dataset['train'],
        'test': dataset_test_valid['test'],
        'validation': dataset_test_valid['train']})

    # `labels` is read by `preprocess_data` as a module-level name; here it is
    # every column except the text (i.e. the current justice's vote column).
    labels = [label for label in dataset['train'].features.keys() if label not in ['case_id', 'new_text']]
    id2label = {idx: label for idx, label in enumerate(labels)}
    label2id = {label: idx for idx, label in enumerate(labels)}

    # Start creating model
    encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

    model = AutoModelForSequenceClassification.from_pretrained('roberta-base',
                                                               problem_type="multi_label_classification",
                                                               num_labels=len(labels),
                                                               id2label=id2label,
                                                               label2id=label2id)

    batch_size = 8
    metric_name = "f1"

    # NOTE(review): every justice writes checkpoints to the same "test-trainer"
    # directory, so checkpoints of earlier justices are still present when
    # save_total_limit rotation runs — confirm this is intended.
    args = TrainingArguments(
        "test-trainer",
        evaluation_strategy="epoch",
        save_total_limit=2,
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=15,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model=metric_name,
        #push_to_hub=True,
    )

    trainer = Trainer(
        model,
        args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    trainer.train()

    # Validation metrics: rename the 'eval' prefix to 'val'
    val_metrics = {key.replace('eval', 'val'): value
                   for key, value in trainer.evaluate().items()}

    # Initialize a new trainer instance with the trained model and test data
    trainer_test = Trainer(
        model=model,
        args=args,
        eval_dataset=encoded_dataset["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Evaluate the model on the test data; rename the 'eval' prefix to 'test'
    test_metrics = {key.replace('eval', 'test'): value
                    for key, value in trainer_test.evaluate().items()}

    # Same key order as before: validation metrics, test metrics, justice
    metrics_rows.append({**val_metrics, **test_metrics, 'justice': justice})

results = pd.DataFrame(metrics_rows)

# Put the identifying columns first, then validation metrics, then test metrics
results = results[['justice', 'epoch'] + [col for col in results.columns if col.startswith('val_')] +
    [col for col in results.columns if col.startswith('test_')]]



1.0    0.584173
0.0    0.415827
Name: votes_side_j__ruth_bader_ginsburg, dtype: float64


Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/973 [00:00<?, ? examples/s]

Map:   0%|          | 0/209 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.671905,0.610577,0.5,0.610577
2,No log,0.659721,0.610577,0.5,0.610577
3,No log,0.65882,0.610577,0.5,0.610577
4,No log,0.646003,0.682692,0.619423,0.682692
5,0.668700,0.713422,0.567308,0.556236,0.567308
6,0.668700,0.877919,0.591346,0.569214,0.591346
7,0.668700,1.152449,0.586538,0.560805,0.586538
8,0.668700,1.620341,0.639423,0.615291,0.639423
9,0.342400,2.012795,0.644231,0.594634,0.644231
10,0.342400,2.321181,0.581731,0.565811,0.581731


1.0    0.586878
0.0    0.413122
Name: votes_side_j__clarence_thomas, dtype: float64


Map:   0%|          | 0/970 [00:00<?, ? examples/s]

Map:   0%|          | 0/209 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.675864,0.576923,0.5,0.576923
2,No log,0.683827,0.576923,0.5,0.576923
3,No log,0.686864,0.586538,0.514394,0.586538
4,No log,0.768672,0.552885,0.580682,0.552885
5,0.659100,1.063563,0.524038,0.478409,0.524038
6,0.659100,1.517676,0.548077,0.520455,0.548077
7,0.659100,2.115229,0.557692,0.54697,0.557692
8,0.659100,2.482127,0.548077,0.550758,0.548077
9,0.213300,2.834065,0.572115,0.55947,0.572115
10,0.213300,2.994012,0.567308,0.546212,0.567308


1.0    0.626453
0.0    0.373547
Name: votes_side_j__stephen_g_breyer, dtype: float64


Map:   0%|          | 0/963 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Map:   0%|          | 0/206 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.637633,0.684466,0.5,0.684466
2,No log,0.628665,0.684466,0.5,0.684466
3,No log,0.62544,0.684466,0.5,0.684466
4,No log,0.627019,0.684466,0.5,0.684466
5,0.678200,0.636699,0.684466,0.5,0.684466
6,0.678200,0.643492,0.684466,0.5,0.684466
7,0.678200,0.629042,0.684466,0.5,0.684466
8,0.678200,0.635017,0.684466,0.5,0.684466
9,0.672800,0.632845,0.684466,0.5,0.684466
10,0.672800,0.634099,0.684466,0.5,0.684466


1.0    0.664557
0.0    0.335443
Name: votes_side_j__anthony_m_kennedy, dtype: float64


Map:   0%|          | 0/884 [00:00<?, ? examples/s]

Map:   0%|          | 0/190 [00:00<?, ? examples/s]

Map:   0%|          | 0/190 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.646378,0.657895,0.5,0.657895
2,No log,0.65234,0.657895,0.5,0.657895
3,No log,0.708909,0.652632,0.503385,0.652632
4,No log,0.764037,0.657895,0.507385,0.657895
5,0.595500,0.933707,0.578947,0.524923,0.578947
6,0.595500,1.510197,0.636842,0.565231,0.636842
7,0.595500,1.940523,0.636842,0.520923,0.636842
8,0.595500,2.126994,0.642105,0.521231,0.642105
9,0.595500,2.21225,0.663158,0.566769,0.663158
10,0.162700,2.292598,0.668421,0.567077,0.668421


1.0    0.630275
0.0    0.369725
Name: votes_side_j__antonin_scalia, dtype: float64


Map:   0%|          | 0/763 [00:00<?, ? examples/s]

Map:   0%|          | 0/164 [00:00<?, ? examples/s]

Map:   0%|          | 0/163 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.675824,0.588957,0.5,0.588957
2,No log,0.683559,0.588957,0.5,0.588957
3,No log,0.664162,0.588957,0.5,0.588957
4,No log,0.697507,0.595092,0.532261,0.595092
5,No log,0.805713,0.607362,0.538169,0.607362
6,0.623600,1.010399,0.564417,0.549052,0.564417
7,0.623600,2.288757,0.496933,0.500777,0.496933
8,0.623600,2.548727,0.570552,0.540734,0.570552
9,0.623600,3.005416,0.533742,0.513993,0.533742
10,0.623600,3.233258,0.490798,0.479789,0.490798


1.0    0.653041
0.0    0.346959
Name: votes_side_j__john_g_roberts_jr, dtype: float64


Map:   0%|          | 0/702 [00:00<?, ? examples/s]

Map:   0%|          | 0/151 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.620671,0.686667,0.5,0.686667
2,No log,0.620636,0.686667,0.5,0.686667
3,No log,0.616642,0.686667,0.5,0.686667
4,No log,0.673439,0.686667,0.5,0.686667
5,No log,0.938386,0.626667,0.519934,0.626667
6,0.572700,1.458448,0.586667,0.525511,0.586667
7,0.572700,1.929913,0.58,0.543793,0.58
8,0.572700,2.416914,0.546667,0.531089,0.546667
9,0.572700,2.33623,0.606667,0.522723,0.606667
10,0.572700,2.226928,0.633333,0.519004,0.633333


1.0    0.601651
0.0    0.398349
Name: votes_side_j__samuel_a_alito_jr, dtype: float64


Map:   0%|          | 0/678 [00:00<?, ? examples/s]

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

Map:   0%|          | 0/145 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.689984,0.572414,0.496292,0.572414
2,No log,0.689488,0.57931,0.5,0.57931
3,No log,0.705081,0.475862,0.498244,0.475862
4,No log,0.901597,0.468966,0.44516,0.468966
5,No log,1.159243,0.524138,0.510734,0.524138
6,0.572900,1.928128,0.531034,0.50322,0.531034
7,0.572900,2.410432,0.565517,0.50605,0.565517
8,0.572900,2.753297,0.537931,0.509173,0.537931
9,0.572900,3.07638,0.510345,0.510051,0.510345
10,0.572900,2.99096,0.531034,0.507709,0.531034


1.0    0.5722
0.0    0.4278
Name: votes_side_j__john_paul_stevens, dtype: float64


Map:   0%|          | 0/518 [00:00<?, ? examples/s]

Map:   0%|          | 0/112 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.697926,0.495495,0.5,0.495495
2,No log,0.729451,0.495495,0.5,0.495495
3,No log,0.710591,0.495495,0.5,0.495495
4,No log,0.703093,0.495495,0.5,0.495495
5,No log,0.703912,0.522523,0.526786,0.522523
6,No log,0.773703,0.495495,0.49724,0.495495
7,No log,0.931839,0.495495,0.495779,0.495495
8,0.602900,1.282188,0.504505,0.504221,0.504505
9,0.602900,1.864843,0.495495,0.495942,0.495495
10,0.602900,2.52605,0.495495,0.495617,0.495495


1.0    0.590141
0.0    0.409859
Name: votes_side_j__sonia_sotomayor, dtype: float64


Map:   0%|          | 0/497 [00:00<?, ? examples/s]

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.669269,0.650943,0.5,0.650943
2,No log,0.675845,0.650943,0.5,0.650943
3,No log,0.654631,0.650943,0.5,0.650943
4,No log,0.674599,0.603774,0.551508,0.603774
5,No log,0.787997,0.396226,0.486095,0.396226
6,No log,0.867349,0.556604,0.546612,0.556604
7,No log,1.597128,0.443396,0.503525,0.443396
8,0.537000,1.985503,0.537736,0.507051,0.537736
9,0.537000,2.606256,0.509434,0.51038,0.509434
10,0.537000,2.494169,0.54717,0.495495,0.54717


1.0    0.604511
0.0    0.395489
Name: votes_side_j__david_h_souter, dtype: float64


Map:   0%|          | 0/465 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.683499,0.58,0.5,0.58
2,No log,0.681205,0.58,0.5,0.58
3,No log,0.680509,0.58,0.5,0.58
4,No log,0.680592,0.58,0.5,0.58
5,No log,0.682378,0.58,0.5,0.58
6,No log,0.681198,0.58,0.5,0.58
7,No log,0.701117,0.58,0.5,0.58
8,No log,0.751469,0.58,0.503284,0.58
9,0.676500,0.774543,0.51,0.488916,0.51
10,0.676500,0.839375,0.52,0.527094,0.52


1.0    0.606612
0.0    0.393388
Name: votes_side_j__elena_kagan, dtype: float64


Map:   0%|          | 0/423 [00:00<?, ? examples/s]

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.669444,0.615385,0.5,0.615385
2,No log,0.664137,0.615385,0.5,0.615385
3,No log,0.676595,0.615385,0.5,0.615385
4,No log,0.667145,0.615385,0.5,0.615385
5,No log,0.7041,0.615385,0.5,0.615385
6,No log,0.791689,0.615385,0.526786,0.615385
7,No log,0.925564,0.615385,0.6125,0.615385
8,No log,1.133108,0.604396,0.614286,0.604396
9,No log,1.434131,0.626374,0.557143,0.626374
10,0.554100,1.750237,0.56044,0.525,0.56044


1.0    0.701031
0.0    0.298969
Name: votes_side_j__sandra_day_oconnor, dtype: float64


Map:   0%|          | 0/271 [00:00<?, ? examples/s]

Map:   0%|          | 0/59 [00:00<?, ? examples/s]

Map:   0%|          | 0/58 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.561208,0.758621,0.5,0.758621
2,No log,0.552349,0.758621,0.5,0.758621
3,No log,0.561112,0.758621,0.5,0.758621
4,No log,0.650826,0.758621,0.5,0.758621
5,No log,0.533192,0.758621,0.5,0.758621
6,No log,0.629697,0.724138,0.525974,0.724138
7,No log,0.938574,0.637931,0.493506,0.637931
8,No log,1.310893,0.62069,0.530844,0.62069
9,No log,1.742321,0.568966,0.521104,0.568966
10,No log,1.352668,0.706897,0.538961,0.706897


1.0    0.665753
0.0    0.334247
Name: votes_side_j__william_h_rehnquist, dtype: float64


Map:   0%|          | 0/255 [00:00<?, ? examples/s]

Map:   0%|          | 0/55 [00:00<?, ? examples/s]

Map:   0%|          | 0/55 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.681808,0.6,0.5,0.6
2,No log,0.701772,0.6,0.5,0.6
3,No log,0.656885,0.6,0.5,0.6
4,No log,0.740182,0.6,0.5,0.6
5,No log,0.76708,0.6,0.5,0.6
6,No log,0.659133,0.6,0.5,0.6
7,No log,0.691314,0.581818,0.515152,0.581818
8,No log,0.794657,0.581818,0.55303,0.581818
9,No log,1.220098,0.636364,0.560606,0.636364
10,No log,1.183291,0.636364,0.575758,0.636364


1.0    0.610256
0.0    0.389744
Name: votes_side_j__neil_gorsuch, dtype: float64


Map:   0%|          | 0/136 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.659504,0.62069,0.5,0.62069
2,No log,0.666447,0.62069,0.5,0.62069
3,No log,0.669024,0.62069,0.5,0.62069
4,No log,0.692894,0.62069,0.5,0.62069
5,No log,0.724625,0.62069,0.5,0.62069
6,No log,0.767558,0.586207,0.489899,0.586207
7,No log,0.883431,0.517241,0.416667,0.517241
8,No log,0.945043,0.482759,0.406566,0.482759
9,No log,1.111684,0.413793,0.386364,0.413793
10,No log,1.341197,0.310345,0.30303,0.310345


1.0    0.626087
0.0    0.373913
Name: votes_side_j__brett_m_kavanaugh, dtype: float64


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/17 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.662675,0.647059,0.5,0.647059
2,No log,0.648953,0.647059,0.5,0.647059
3,No log,0.640855,0.647059,0.5,0.647059
4,No log,0.646576,0.647059,0.5,0.647059
5,No log,0.68007,0.647059,0.5,0.647059
6,No log,0.665422,0.647059,0.5,0.647059
7,No log,0.763739,0.647059,0.537879,0.647059
8,No log,0.869545,0.705882,0.621212,0.705882
9,No log,0.914622,0.705882,0.621212,0.705882
10,No log,1.06378,0.470588,0.477273,0.470588


1.0    0.657143
0.0    0.342857
Name: win_side, dtype: float64


Map:   0%|          | 0/980 [00:00<?, ? examples/s]

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.627675,0.704762,0.5,0.704762
2,No log,0.627056,0.704762,0.5,0.704762
3,No log,0.610971,0.704762,0.5,0.704762
4,No log,0.632534,0.704762,0.5,0.704762
5,0.657700,0.694975,0.652381,0.486269,0.652381
6,0.657700,0.822281,0.647619,0.54381,0.647619
7,0.657700,1.379119,0.652381,0.537816,0.652381
8,0.657700,1.810491,0.638095,0.537053,0.638095
9,0.340700,2.359294,0.580952,0.519943,0.580952
10,0.340700,2.26972,0.642857,0.540432,0.642857


In [None]:
# Show the collected metrics table (`results` is built in an earlier cell, not shown here).
# Per the rendered output: one row per target column ("justice"), with val_* and test_*
# loss / F1 / ROC AUC / accuracy plus runtime stats.
# NOTE(review): the "Unnamed: 0" column in the output suggests the frame was read back from a
# CSV that was written with its index — confirm against the cell that creates `results`.
results


Unnamed: 0,justice,epoch,val_loss,val_f1,val_roc_auc,val_accuracy,val_runtime,val_samples_per_second,val_steps_per_second,test_loss,test_f1,test_roc_auc,test_accuracy,test_runtime,test_samples_per_second,test_steps_per_second
0,votes_side_j__ruth_bader_ginsburg,15.0,0.646003,0.682692,0.619423,0.682692,1.5332,135.666,16.958,0.764797,0.526316,0.496594,0.526316,1.4995,139.382,18.006
1,votes_side_j__clarence_thomas,15.0,0.686864,0.586538,0.514394,0.586538,1.5249,136.4,17.05,0.669994,0.607656,0.49937,0.607656,1.471,142.076,18.354
2,votes_side_j__stephen_g_breyer,15.0,0.637633,0.684466,0.5,0.684466,1.5301,134.634,16.993,0.65324,0.657005,0.5,0.657005,1.4897,138.956,17.453
3,votes_side_j__anthony_m_kennedy,15.0,2.292598,0.668421,0.567077,0.668421,1.4125,134.515,16.991,2.501234,0.626316,0.510092,0.626316,1.3573,139.981,17.682
4,votes_side_j__antonin_scalia,15.0,0.805713,0.607362,0.538169,0.607362,1.2219,133.401,17.187,0.712927,0.670732,0.556548,0.670732,1.1969,137.021,17.545
5,votes_side_j__john_g_roberts_jr,15.0,0.620671,0.686667,0.5,0.686667,1.145,131.004,16.594,0.697469,0.582781,0.5,0.582781,1.0765,140.27,17.65
6,votes_side_j__samuel_a_alito_jr,15.0,0.689488,0.57931,0.5,0.57931,1.0883,133.235,17.458,0.661288,0.643836,0.5,0.643836,1.0521,138.775,18.06
7,votes_side_j__john_paul_stevens,15.0,0.703912,0.522523,0.526786,0.522523,0.8788,126.307,15.931,0.695927,0.482143,0.499361,0.482143,0.8079,138.636,17.33
8,votes_side_j__sonia_sotomayor,15.0,0.669269,0.650943,0.5,0.650943,0.8199,129.276,17.074,0.668212,0.654206,0.5,0.654206,0.7712,138.742,18.153
9,votes_side_j__david_h_souter,15.0,0.683499,0.58,0.5,0.58,0.7571,132.076,17.17,0.665199,0.68,0.5,0.68,0.7207,138.757,18.038
