In [1]:
import pandas as pd
import os
!pip install datasets
import torch
import numpy as np
from datasets import *
!pip install transformers==4.28.0
from transformers import AutoTokenizer, LongformerForSequenceClassification, \
  Trainer, TrainingArguments, EvalPrediction
!pip install transformers[sentencepiece]
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import random

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
Collec

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"torch.device: {device}")

torch.device: cuda


In [3]:
from google.colab import drive 
drive.mount('/content/gdrive')
folder_path = "gdrive/MyDrive/Colab Notebooks/AdvML/data/"

Mounted at /content/gdrive


In [4]:
#helper functions

def preprocess_data(examples):
  # take a batch of texts
  text = examples["new_text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

def multi_label_metrics(predictions, labels, threshold=0.6):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

### Data Preprocessing

In [5]:
# Get list of files in folder
file_list = os.listdir(folder_path)

# Create empty list to hold dataframes
df_list = []

# Loop through files in folder
for file in file_list:
    # Check if file is a CSV
    if file.endswith('.csv'):
        # Read CSV file into a pandas dataframe
        df = pd.read_csv(os.path.join(folder_path, file))
        # Append dataframe to list
        df_list.append(df)

# Concatenate all dataframes in list into a single dataframe
df = pd.concat(df_list, axis=0, ignore_index=True)

# Create results dictionary
results = {}

# Clean data
df = df.merge(
    df[['speaker', 'side']].drop_duplicates(ignore_index=True),
    how='left',
    left_on='speaker_addressed',
    right_on='speaker',
    suffixes=('', '_addressed')).drop('speaker_addressed', axis=1)

  df = pd.read_csv(os.path.join(folder_path, file))
  df = pd.read_csv(os.path.join(folder_path, file))
  df = pd.read_csv(os.path.join(folder_path, file))
  df = pd.read_csv(os.path.join(folder_path, file))


In [6]:
# Add in utterance information to utterance

for name_col, type_col in {'speaker': 'speaker_type', 'speaker_replied_to': 'speaker_type_replied_to'}.items():
    df[f'{name_col}_natural'] = df[name_col].apply(lambda name: ' '.join(name.lstrip('j__').split('_')).title())
    speaker_type_translation = {
        'J': 'Justice',
        'A': 'Attorney',
        '<Inaudile>': None
    }

    df[type_col].fillna('na', inplace=True)
    df[f'{type_col}_natural'] = df[type_col].apply(lambda s_type: speaker_type_translation[s_type] if not s_type == 'na' else None)

    df[f'{name_col}_natural'] = df[f'{type_col}_natural'] + ' ' + df[f'{name_col}_natural']
    df[f'{name_col}_natural'].fillna('Unknown', inplace=True)
    df.drop(f'{type_col}_natural', axis=1, inplace=True)

side_translation = {
    0: 'Responding',
    1: 'Petitioning',
    2: '',
    3: ''
}

df['side'].fillna(3, inplace=True)
df[f'side_natural'] = df['side'].apply(lambda side: side_translation[side])

df['side_addressed'].fillna(3, inplace=True)
df[f'side_addressed_natural'] = df['side_addressed'].apply(lambda side: side_translation[side])


df["new_text"] = "<UTTERANCE_START>" + df['side_natural'] + " " + df["speaker_natural"] + " says: '" + df["text"] + "' to " + df['side_addressed_natural'] + " " + df["speaker_replied_to_natural"] + " <UTTERANCE_END>"


j_columns = [col for col in df.columns if col.startswith('votes_side_j_')]

In [7]:
df['new_text'][0]

"<UTTERANCE_START> Justice Samuel A Alito Jr says: 'Well, if the text of this is so clear, how is it that Mr. Gould and Mr. Abbott proposed different interpretations of this provision?' to Petitioning Attorney David L Horan <UTTERANCE_END>"

### Modeling

#### Top case justices

LongformerForSequenceClassification
https://huggingface.co/docs/transformers/v4.29.1/en/model_doc/longformer#transformers.LongformerForSequenceClassification.forward.example

In [8]:
count = 0
justices_list = []
justices_dict = {}

for justice in j_columns:
  df_j = df[['case_id', 'new_text'] + [justice]]
  grouped_df = df_j.groupby('case_id')['new_text'].apply(lambda x: ','.join(x)).reset_index()
  justices = df[["case_id"] + [justice]].drop_duplicates(keep='first')

  df1 = pd.merge(grouped_df, justices, left_on='case_id', right_on='case_id', how='left').dropna(axis='rows', how='any')
  df1 = df1.drop(df1[~df1[justice].isin([0, 1])].index)

  justices_dict[justice] = len(df1)


In [9]:
# Sorted justices by case load
sorted_justices_by_case = sorted(justices_dict.items(), key=lambda x:x[1], reverse=True)
sorted_justices_by_case

[('votes_side_j__ruth_bader_ginsburg', 1390),
 ('votes_side_j__clarence_thomas', 1387),
 ('votes_side_j__stephen_g_breyer', 1376),
 ('votes_side_j__anthony_m_kennedy', 1264),
 ('votes_side_j__antonin_scalia', 1090),
 ('votes_side_j__john_g_roberts_jr', 1003),
 ('votes_side_j__samuel_a_alito_jr', 969),
 ('votes_side_j__john_paul_stevens', 741),
 ('votes_side_j__sonia_sotomayor', 710),
 ('votes_side_j__david_h_souter', 665),
 ('votes_side_j__elena_kagan', 605),
 ('votes_side_j__sandra_day_oconnor', 388),
 ('votes_side_j__william_h_rehnquist', 365),
 ('votes_side_j__neil_gorsuch', 195),
 ('votes_side_j__brett_m_kavanaugh', 115),
 ('votes_side_j__abe_fortas', 0),
 ('votes_side_j__arthur_j_goldberg', 0),
 ('votes_side_j__byron_r_white', 0),
 ('votes_side_j__charles_e_whittaker', 0),
 ('votes_side_j__earl_warren', 0),
 ('votes_side_j__felix_frankfurter', 0),
 ('votes_side_j__harold_burton', 0),
 ('votes_side_j__harry_a_blackmun', 0),
 ('votes_side_j__hugo_l_black', 0),
 ('votes_side_j__john_

In [10]:
top_justices = [justice for justice, votes in sorted_justices_by_case[:15]]
print(top_justices)

['votes_side_j__ruth_bader_ginsburg', 'votes_side_j__clarence_thomas', 'votes_side_j__stephen_g_breyer', 'votes_side_j__anthony_m_kennedy', 'votes_side_j__antonin_scalia', 'votes_side_j__john_g_roberts_jr', 'votes_side_j__samuel_a_alito_jr', 'votes_side_j__john_paul_stevens', 'votes_side_j__sonia_sotomayor', 'votes_side_j__david_h_souter', 'votes_side_j__elena_kagan', 'votes_side_j__sandra_day_oconnor', 'votes_side_j__william_h_rehnquist', 'votes_side_j__neil_gorsuch', 'votes_side_j__brett_m_kavanaugh']


#### Run model for top justices

In [11]:
def preprocess_for_justice(justice, test_valid_amt = 0.2, valid_portion_test_valid_amt = 0.5):
    '''
    Takes a justice votes_side column and gets data ready for model

    Inputs:
        justice (str): Justice vote side column
        test_valid_amt (float): Percentage of data allocated to testing / validation
        valid_portion_test_valid_amt (float): Percentage of data allocated to validation
    '''
    # Concatenate utterances per case
    df_j = df[['case_id', 'new_text'] + [justice]]
    grouped_df = df_j.groupby('case_id')['new_text'].apply(lambda x: ','.join(x)).reset_index()

    # Get justice and combine with utterances
    justices = df[["case_id"] + [justice]].drop_duplicates(keep='first')
    df1 = pd.merge(grouped_df,
                   justices,
                   left_on='case_id',
                   right_on='case_id',
                   how='left').dropna(axis='rows', how='any')

    # Ensure binary label
    df1 = df1.drop(df1[~df1[justice].isin([0, 1])].index)
    print(df1[justice].value_counts(normalize=True))
    pct_overturn = df1[justice].value_counts(normalize=True)[1]

    print(f'Number of cases: {len(df1)}')
    print(f'Number of training cases: {len(df1) * (1-test_valid_amt)}')
    print(f'Number of validation cases: {len(df1) * (test_valid_amt * (valid_portion_test_valid_amt))}')
    print(f'Number of test cases: {len(df1) * (test_valid_amt * (1-valid_portion_test_valid_amt))}')

    # Create dataset
    dataset = Dataset.from_pandas(df1.drop('case_id', axis=1), preserve_index = False)

    # Train/Validation/Test split
    dataset = dataset.train_test_split(test_size=test_valid_amt, shuffle=True)
    dataset_test_valid = dataset['test'].train_test_split(test_size=(1-valid_portion_test_valid_amt), shuffle=True)
    dataset = DatasetDict({
        'train': dataset['train'],
        'test': dataset_test_valid['test'],
        'validation': dataset_test_valid['train']})

    # Keep case_id per observation
    labels = [label for label in dataset['train'].features.keys() if label not in ['case_id', 'new_text']]
    id2label = {idx:label for idx, label in enumerate(labels)}
    label2id = {label:idx for idx, label in enumerate(labels)}

    return dataset, labels, id2label, label2id, pct_overturn

In [12]:
# Params for each model
tokenizer = AutoTokenizer.from_pretrained("jpwahle/longformer-base-plagiarism-detection",
                                          cache_dir='gdrive/MyDrive/Colab Notebooks/AdvML/data/',)
lr = 2e-5
epochs = 25
output_directory = 'gdrive/MyDrive/Colab Notebooks/AdvML/results/'
cache_directory = 'gdrive/MyDrive/Colab Notebooks/AdvML/data/'
train_batch_size = 8
eval_batch_size = 8
save_and_eval_strat = "epoch"
warm_steps = 160
wt_decay = 0.01
dataloader_num_workers_input = 2
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
validation_and_testing_percentage = 0.2
split_of_validation_and_testing = 0.5 # This is percentage of above going to validation

Save model
https://discuss.huggingface.co/t/how-to-save-my-model-to-use-it-later/20568/2

In [13]:
overall_results = {}
baselines = {}
for justice in top_justices[9:]:
    # Get data
    dataset, labels, id2label, label2id, pct_overturn = preprocess_for_justice(justice,
                                                                               validation_and_testing_percentage,
                                                                               split_of_validation_and_testing)
    justice_name = justice.split("j__")[1]

    baselines[justice_name] = pct_overturn

    # Load model and tokenizer and define length of the text sequence
    model = LongformerForSequenceClassification.from_pretrained('jpwahle/longformer-base-plagiarism-detection',
                                                                gradient_checkpointing=False,
                                                                ignore_mismatched_sizes=True,
                                                                num_labels = len(labels),
                                                                id2label=id2label,
                                                                label2id=label2id,
                                                                cache_dir=cache_directory,
                                                                return_dict=True)
    # Tokenize dataset
    encoded_dataset = dataset.map(preprocess_data,
                                  batched=True,
                                  remove_columns=dataset['train'].column_names)

    # get args with appropriate model name
    args = TrainingArguments(
        output_dir = output_directory,
        num_train_epochs = epochs,
        per_device_train_batch_size = train_batch_size,
        # gradient_accumulation_steps = 32,    
        per_device_eval_batch_size= eval_batch_size,
        evaluation_strategy = save_and_eval_strat,
        save_strategy = save_and_eval_strat,
        save_total_limit = 2,
        disable_tqdm = False, 
        load_best_model_at_end=True,
        warmup_steps=warm_steps,
        weight_decay=wt_decay,
        # logging_steps = 4,
        learning_rate = lr,
        # logging_dir='gdrive/MyDrive/Colab Notebooks/AdvML/logs/',
        dataloader_num_workers = dataloader_num_workers_input,
        run_name = f'longformer_classification_{justice_name}_{lr}'
        )
    # Run trainer and save output

    trainer = Trainer(
        model,
        args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
        )
    
    print(f'Starting training for {justice_name}')
    trainer.train()

    results = trainer.evaluate()

    # trainer.save_model(f'gdrive/MyDrive/Colab Notebooks/AdvML/models/longformer_{justice_name}_{lr}_v01')
    
    # from transformers import AutoModel
    # trained_model = AutoModel.from_pretrained(f'gdrive/MyDrive/Colab Notebooks/AdvML/models/longformer_{justice_name}_{lr}_v01')
    
    # Initialize a new trainer instance with the trained model and test data
    trainer_test = Trainer(
        model=model,  
        args=args,
        eval_dataset=encoded_dataset["test"], 
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
        )

    # Evaluate the model on the test data
    test_results = trainer_test.evaluate()

    # Print results
    print(f'For justice {justice_name}, results are below:')
    print(test_results)

    # Add results to output
    overall_results[justice_name] = test_results
    

1.0    0.604511
0.0    0.395489
Name: votes_side_j__david_h_souter, dtype: float64
Number of cases: 665
Number of training cases: 532.0
Number of validation cases: 66.5
Number of test cases: 66.5


Some weights of the model checkpoint at jpwahle/longformer-base-plagiarism-detection were not used when initializing LongformerForSequenceClassification: ['longformer.embeddings.position_ids']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at jpwahle/longformer-base-plagiarism-detection and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size

Map:   0%|          | 0/532 [00:00<?, ? examples/s]

Map:   0%|          | 0/67 [00:00<?, ? examples/s]

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

Starting training for david_h_souter




Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.239291,0.636364,0.5,0.636364
2,No log,0.235496,0.636364,0.5,0.636364
3,No log,0.231333,0.636364,0.5,0.636364
4,No log,0.244429,0.636364,0.5,0.636364
5,No log,0.237084,0.621212,0.497024,0.621212
6,No log,0.300011,0.590909,0.571429,0.590909
7,No log,0.324353,0.666667,0.595238,0.666667
8,0.199800,0.404533,0.621212,0.532738,0.621212
9,0.199800,0.326811,0.666667,0.595238,0.666667
10,0.199800,0.313252,0.666667,0.58631,0.666667


For justice david_h_souter, results are below:
{'eval_loss': 0.24038895964622498, 'eval_f1': 0.6268656716417911, 'eval_roc_auc': 0.5, 'eval_accuracy': 0.6268656716417911, 'eval_runtime': 1.2418, 'eval_samples_per_second': 53.953, 'eval_steps_per_second': 7.247}
1.0    0.606612
0.0    0.393388
Name: votes_side_j__elena_kagan, dtype: float64
Number of cases: 605
Number of training cases: 484.0
Number of validation cases: 60.5
Number of test cases: 60.5


Some weights of the model checkpoint at jpwahle/longformer-base-plagiarism-detection were not used when initializing LongformerForSequenceClassification: ['longformer.embeddings.position_ids']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at jpwahle/longformer-base-plagiarism-detection and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Starting training for elena_kagan




Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.232484,0.666667,0.5,0.666667
2,No log,0.239057,0.666667,0.5,0.666667
3,No log,0.244892,0.65,0.4875,0.65
4,No log,0.233466,0.666667,0.525,0.666667
5,No log,0.321346,0.633333,0.4875,0.633333
6,No log,0.240374,0.683333,0.575,0.683333
7,No log,0.291441,0.65,0.5375,0.65
8,No log,0.25809,0.633333,0.55,0.633333
9,0.198200,0.278619,0.683333,0.6,0.683333
10,0.198200,0.262263,0.666667,0.575,0.666667


For justice elena_kagan, results are below:
{'eval_loss': 0.2537681758403778, 'eval_f1': 0.6557377049180327, 'eval_roc_auc': 0.5, 'eval_accuracy': 0.6557377049180327, 'eval_runtime': 1.1814, 'eval_samples_per_second': 51.635, 'eval_steps_per_second': 6.772}
1.0    0.701031
0.0    0.298969
Name: votes_side_j__sandra_day_oconnor, dtype: float64
Number of cases: 388
Number of training cases: 310.40000000000003
Number of validation cases: 38.800000000000004
Number of test cases: 38.800000000000004


Some weights of the model checkpoint at jpwahle/longformer-base-plagiarism-detection were not used when initializing LongformerForSequenceClassification: ['longformer.embeddings.position_ids']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at jpwahle/longformer-base-plagiarism-detection and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size

Map:   0%|          | 0/310 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

Starting training for sandra_day_oconnor




Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.231115,0.692308,0.5,0.692308
2,No log,0.264247,0.692308,0.5,0.692308
3,No log,0.225069,0.692308,0.5,0.692308
4,No log,0.23621,0.692308,0.5,0.692308
5,No log,0.284186,0.692308,0.5,0.692308
6,No log,0.241578,0.692308,0.5,0.692308
7,No log,0.250779,0.692308,0.523148,0.692308
8,No log,0.2248,0.692308,0.5,0.692308
9,No log,0.262943,0.717949,0.564815,0.717949
10,No log,0.217754,0.717949,0.541667,0.717949


For justice sandra_day_oconnor, results are below:
{'eval_loss': 0.23543673753738403, 'eval_f1': 0.6923076923076923, 'eval_roc_auc': 0.48214285714285715, 'eval_accuracy': 0.6923076923076923, 'eval_runtime': 0.8488, 'eval_samples_per_second': 45.948, 'eval_steps_per_second': 5.891}
1.0    0.665753
0.0    0.334247
Name: votes_side_j__william_h_rehnquist, dtype: float64
Number of cases: 365
Number of training cases: 292.0
Number of validation cases: 36.5
Number of test cases: 36.5


Some weights of the model checkpoint at jpwahle/longformer-base-plagiarism-detection were not used when initializing LongformerForSequenceClassification: ['longformer.embeddings.position_ids']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at jpwahle/longformer-base-plagiarism-detection and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size

Map:   0%|          | 0/292 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Starting training for william_h_rehnquist




Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.226773,0.722222,0.583333,0.722222
2,No log,0.228789,0.666667,0.5,0.666667
3,No log,0.23515,0.666667,0.5,0.666667
4,No log,0.216108,0.694444,0.541667,0.694444
5,No log,0.211508,0.666667,0.5,0.666667
6,No log,0.229105,0.722222,0.666667,0.722222
7,No log,0.259569,0.722222,0.583333,0.722222
8,No log,0.504704,0.555556,0.583333,0.555556
9,No log,0.320151,0.722222,0.666667,0.722222
10,No log,0.289915,0.694444,0.645833,0.694444


For justice william_h_rehnquist, results are below:
{'eval_loss': 0.2121744453907013, 'eval_f1': 0.7027027027027027, 'eval_roc_auc': 0.5, 'eval_accuracy': 0.7027027027027027, 'eval_runtime': 0.8342, 'eval_samples_per_second': 44.352, 'eval_steps_per_second': 5.993}
1.0    0.610256
0.0    0.389744
Name: votes_side_j__neil_gorsuch, dtype: float64
Number of cases: 195
Number of training cases: 156.0
Number of validation cases: 19.5
Number of test cases: 19.5


Some weights of the model checkpoint at jpwahle/longformer-base-plagiarism-detection were not used when initializing LongformerForSequenceClassification: ['longformer.embeddings.position_ids']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at jpwahle/longformer-base-plagiarism-detection and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Starting training for neil_gorsuch




Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.415188,0.421053,0.5,0.421053
2,No log,0.25338,0.578947,0.5,0.578947
3,No log,0.252734,0.578947,0.5,0.578947
4,No log,0.288249,0.684211,0.625,0.684211
5,No log,0.246927,0.631579,0.5625,0.631579
6,No log,0.394594,0.421053,0.465909,0.421053
7,No log,0.36555,0.578947,0.551136,0.578947
8,No log,0.467901,0.473684,0.477273,0.473684
9,No log,0.543177,0.315789,0.340909,0.315789
10,No log,0.692526,0.315789,0.375,0.315789


For justice neil_gorsuch, results are below:
{'eval_loss': 0.3063710331916809, 'eval_f1': 0.65, 'eval_roc_auc': 0.5119047619047619, 'eval_accuracy': 0.65, 'eval_runtime': 0.6077, 'eval_samples_per_second': 32.911, 'eval_steps_per_second': 4.937}
1.0    0.626087
0.0    0.373913
Name: votes_side_j__brett_m_kavanaugh, dtype: float64
Number of cases: 115
Number of training cases: 92.0
Number of validation cases: 11.5
Number of test cases: 11.5


Some weights of the model checkpoint at jpwahle/longformer-base-plagiarism-detection were not used when initializing LongformerForSequenceClassification: ['longformer.embeddings.position_ids']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at jpwahle/longformer-base-plagiarism-detection and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size

Map:   0%|          | 0/92 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Starting training for brett_m_kavanaugh




Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.636709,0.272727,0.5,0.272727
2,No log,0.397225,0.272727,0.5,0.272727
3,No log,0.23826,0.727273,0.5,0.727273
4,No log,0.216433,0.727273,0.5,0.727273
5,No log,0.217438,0.727273,0.5,0.727273
6,No log,0.23192,0.727273,0.5,0.727273
7,No log,0.243604,0.727273,0.5,0.727273
8,No log,0.278516,0.636364,0.4375,0.636364
9,No log,0.296621,0.636364,0.4375,0.636364
10,No log,0.305978,0.545455,0.479167,0.545455


For justice brett_m_kavanaugh, results are below:
{'eval_loss': 0.28401896357536316, 'eval_f1': 0.5, 'eval_roc_auc': 0.5, 'eval_accuracy': 0.5, 'eval_runtime': 0.4642, 'eval_samples_per_second': 25.851, 'eval_steps_per_second': 4.309}


In [14]:
for justice, test_results in overall_results.items():
    print(justice)
    print(f'Baseline petitioner winrate: {baselines[justice]:.2%}')
    print(f'loss: {test_results["eval_loss"]:.2%}')
    print(f'f1: {test_results["eval_f1"]:.2%}')
    print(f'roc_auc: {test_results["eval_roc_auc"]:.2%}')
    print(f'accuracy: {test_results["eval_accuracy"]:.2%}')
    print('\n--------------------------------------\n')

david_h_souter
Baseline petitioner winrate: 60.45%
loss: 24.04%
f1: 62.69%
roc_auc: 50.00%
accuracy: 62.69%

--------------------------------------

elena_kagan
Baseline petitioner winrate: 60.66%
loss: 25.38%
f1: 65.57%
roc_auc: 50.00%
accuracy: 65.57%

--------------------------------------

sandra_day_oconnor
Baseline petitioner winrate: 70.10%
loss: 23.54%
f1: 69.23%
roc_auc: 48.21%
accuracy: 69.23%

--------------------------------------

william_h_rehnquist
Baseline petitioner winrate: 66.58%
loss: 21.22%
f1: 70.27%
roc_auc: 50.00%
accuracy: 70.27%

--------------------------------------

neil_gorsuch
Baseline petitioner winrate: 61.03%
loss: 30.64%
f1: 65.00%
roc_auc: 51.19%
accuracy: 65.00%

--------------------------------------

brett_m_kavanaugh
Baseline petitioner winrate: 62.61%
loss: 28.40%
f1: 50.00%
roc_auc: 50.00%
accuracy: 50.00%

--------------------------------------



### Case Outcome

In [None]:
def preprocess_all_cases(test_valid_amt = 0.2, valid_portion_test_valid_amt = 0.5):
    '''
    Takes a win_side column and gets data ready for model

    Inputs:
        test_valid_amt (float): Percentage of data allocated to testing / validation
        valid_portion_test_valid_amt (float): Percentage of data allocated to validation
    '''

    # Concatenate utterances per case
    df_j = df[['case_id', 'new_text'] + ["win_side"]]
    grouped_df = df_j.groupby('case_id')['new_text'].apply(lambda x: ','.join(x)).reset_index()

    # Get justice and combine with utterances
    justices = df[["case_id"] + ["win_side"]].drop_duplicates(keep='first')
    df1 = pd.merge(grouped_df, justices, left_on='case_id', right_on='case_id', how='left').dropna(axis='rows', how='any')

    # Ensure binary label
    df1 = df1.drop(df1[~df1["win_side"].isin([0, 1])].index)
    print(df1["win_side"].value_counts(normalize=True))

    pct_overturn = df1["win_side"].value_counts(normalize=True)[1]

    print(f'Number of cases: {len(df1)}')
    print(f'Number of training cases: {len(df1) * (1-test_valid_amt)}')
    print(f'Number of validation cases: {len(df1) * (test_valid_amt * (valid_portion_test_valid_amt))}')
    print(f'Number of test cases: {len(df1) * (test_valid_amt * (1-valid_portion_test_valid_amt))}')

    # Create dataset
    dataset = Dataset.from_pandas(df1.drop('case_id', axis=1), preserve_index = False)

    # Train/Validation/Test split
    dataset = dataset.train_test_split(test_size=test_valid_amt, shuffle=True)
    dataset_test_valid = dataset['test'].train_test_split(test_size=(1-valid_portion_test_valid_amt),
                                                        shuffle=True)
    dataset = DatasetDict({
        'train': dataset['train'],
        'test': dataset_test_valid['test'],
        'validation': dataset_test_valid['train']})

    # Keep case_id per observation
    labels = [label for label in dataset['train'].features.keys() if label not in ['case_id', 'new_text']]
    id2label = {idx:label for idx, label in enumerate(labels)}
    label2id = {label:idx for idx, label in enumerate(labels)}

    return dataset, labels, id2label, label2id, pct_overturn

In [None]:
# Get data
dataset, labels, id2label, label2id, pct_overturn = preprocess_all_cases(validation_and_testing_percentage,
                                                                         split_of_validation_and_testing)

baseline = pct_overturn

# Load model and tokenizer and define length of the text sequence
model = LongformerForSequenceClassification.from_pretrained("jpwahle/longformer-base-plagiarism-detection",
                                                            gradient_checkpointing=False,
                                                            num_labels = len(labels),
                                                            ignore_mismatched_sizes=True,
                                                            id2label=id2label,
                                                            label2id=label2id,
                                                            cache_dir=cache_directory,
                                                            return_dict=True)
# Tokenize dataset
encoded_dataset = dataset.map(preprocess_data,
                                batched=True,
                                remove_columns=dataset['train'].column_names)

# get args with appropriate model name
args = TrainingArguments(
    output_dir = output_directory,
    num_train_epochs = epochs,
    per_device_train_batch_size = train_batch_size,
    # gradient_accumulation_steps = 32,    
    per_device_eval_batch_size= eval_batch_size,
    evaluation_strategy = save_and_eval_strat,
    save_strategy = save_and_eval_strat,
    save_total_limit = 2,
    disable_tqdm = False, 
    load_best_model_at_end=True,
    warmup_steps=warm_steps,
    weight_decay=wt_decay,
    # logging_steps = 4,
    learning_rate = lr,
    # logging_dir='gdrive/MyDrive/Colab Notebooks/AdvML/logs/',
    dataloader_num_workers = dataloader_num_workers_input,
    run_name = f'longformer_classification_all_cases_{lr}'
    )
# Run trainer and save output

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    )
trainer.train()

results = trainer.evaluate()

# trainer.save_model(f'gdrive/MyDrive/Colab Notebooks/AdvML/models/longformer_all_cases_{lr}_v01')

# from transformers import AutoModel
# trained_model = AutoModel.from_pretrained(f'gdrive/MyDrive/Colab Notebooks/AdvML/models/longformer_all_cases_{lr}_v01')

# Initialize a new trainer instance with the trained model and test data
trainer_test = Trainer(
    model=model,  
    args=args,
    eval_dataset=encoded_dataset["test"], 
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    )

# Evaluate the model on the test data
test_results = trainer_test.evaluate()

# Print results
print(f'For all_cases, results are below:')
print(test_results)

# Add results to output
overall_results = test_results


1.0    0.657143
0.0    0.342857
Name: win_side, dtype: float64
Number of cases: 1400
Number of training cases: 1120.0
Number of validation cases: 140.0
Number of test cases: 140.0


Some weights of the model checkpoint at jpwahle/longformer-base-plagiarism-detection were not used when initializing LongformerForSequenceClassification: ['longformer.embeddings.position_ids']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at jpwahle/longformer-base-plagiarism-detection and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size

Map:   0%|          | 0/1120 [00:00<?, ? examples/s]

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

Map:   0%|          | 0/140 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.228983,0.657143,0.5,0.657143
2,No log,0.241878,0.657143,0.5,0.657143
3,No log,0.223332,0.657143,0.5,0.657143
4,0.257900,0.242263,0.657143,0.5,0.657143
5,0.257900,0.371983,0.478571,0.508605,0.478571
6,0.257900,0.41624,0.514286,0.51087,0.514286
7,0.257900,0.356413,0.571429,0.53442,0.571429
8,0.132000,0.345721,0.557143,0.518569,0.557143
9,0.132000,0.38924,0.55,0.543025,0.55
10,0.132000,0.424789,0.535714,0.527174,0.535714


For all_cases, results are below:
{'eval_loss': 0.22670027613639832, 'eval_f1': 0.6642857142857143, 'eval_roc_auc': 0.5, 'eval_accuracy': 0.6642857142857143, 'eval_runtime': 2.3852, 'eval_samples_per_second': 58.694, 'eval_steps_per_second': 7.546}


In [None]:
print('Win side results')
print(f'Baseline petitioner winrate: {baseline:.2%}')
print(f'loss: {overall_results["eval_loss"]:.2%}')
print(f'f1: {overall_results["eval_f1"]:.2%}')
print(f'roc_auc: {overall_results["eval_roc_auc"]:.2%}')
print(f'accuracy: {overall_results["eval_accuracy"]:.2%}')

Win side results
Baseline petitioner winrate: 65.71%
loss: 22.67%
f1: 66.43%
roc_auc: 50.00%
accuracy: 66.43%
