# Bag of Words Model

#### Library Imports

In [None]:
import os
import time
import torch
import subprocess as sp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
!pip install datasets
from datasets import Dataset
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, random_split

%matplotlib inline

In [None]:
# Silence every warning for the rest of the session so cell output stays readable.
import warnings
warnings.simplefilter('ignore')

In [None]:
# Move to the repository root so relative paths (e.g. "data/") and the
# project package `src` resolve no matter where the notebook was launched.
# `check=True` makes a failing git call raise CalledProcessError instead of
# passing git's error text on to os.chdir as if it were a directory, and the
# argument list avoids going through a shell.
base_path = sp.run(
    ['git', 'rev-parse', '--show-toplevel'],
    check=True,
    capture_output=True,
    text=True,
).stdout.strip()
os.chdir(base_path)

from src import embed, pred_models, model_helpers

### Data Preparation

#### Data Loading

In [None]:
# Folder holding the input CSV files
folder_path = "data/"
# Sort the listing: os.listdir returns entries in arbitrary order, and the
# concatenation order fixes the row order of `df` (and therefore the order of
# the unique case_ids that the seeded train/val/test splits are drawn from).
file_list = sorted(os.listdir(folder_path))

# Read every CSV file in the folder into its own dataframe
df_list = [
    pd.read_csv(os.path.join(folder_path, file))
    for file in file_list
    if file.endswith('.csv')
]

# Fail with an explicit message rather than pandas' generic
# "No objects to concatenate" when the folder holds no CSV files
if not df_list:
    raise ValueError(f"No CSV files found in {folder_path!r}")

# Concatenate all dataframes into a single dataframe with a fresh index
df = pd.concat(df_list, axis=0, ignore_index=True)

df.head()

#### Select Top Justices

In [None]:
# Get judges columns
j_columns = [col for col in df.columns if col.startswith('votes_side_j_')]

# Maps each justice column to the number of distinct (case_id, vote) pairs
# in which that justice cast a 0/1 vote.
justices_dict = {}

for justice in j_columns:
    # One row per distinct (case, vote) combination for this justice
    votes = df[['case_id', justice]].drop_duplicates(keep='first')
    # Count only rows with a known case and a binary vote. This yields the same
    # number as joining the per-case concatenated text onto the votes and
    # dropping incomplete rows, without re-joining every utterance's text for
    # each justice column (and without failing on a missing `text` value).
    has_binary_vote = votes['case_id'].notna() & votes[justice].isin([0, 1])
    justices_dict[justice] = int(has_binary_vote.sum())



In [None]:
# Rank justices from the heaviest to the lightest case load,
# as a list of (justice column, number of cases) pairs
sorted_justices_by_case = [
    (name, justices_dict[name])
    for name in sorted(justices_dict, key=justices_dict.get, reverse=True)
]
sorted_justices_by_case

In [None]:
# Keep the column names of the 15 justices with the most cases
busiest = sorted_justices_by_case[:15]
top_justices = [name for name, _ in busiest]
print(top_justices)

### Case Outcome Predictions

#### Datasets & Tokenizer

In [None]:
# Have to do by utterance as grouping by case_id gives too few examples to learn from.
# Keep only the needed columns, and only cases with outcomes 0 or 1.
df_all = df.loc[df['win_side'].isin([0, 1]), ['win_side', 'text', 'case_id']]

# Preview as the cell's last expression so the notebook actually renders it,
# and so it shows the filtered frame rather than the unfiltered one
df_all.head()

In [None]:
# Split on case_id rather than on rows, so that all utterances of a case
# end up in the same partition
unique_case_ids = df_all['case_id'].unique()

# 80% of the cases go to training; the remaining 20% are halved into
# validation and test
train_case_ids, val_test_case_ids = train_test_split(
    unique_case_ids, test_size=0.2, random_state=123
)
val_case_ids, test_case_ids = train_test_split(
    val_test_case_ids, test_size=0.5, random_state=123
)

# Select the rows belonging to each partition's cases
train_df, val_df, test_df = (
    df_all[df_all['case_id'].isin(case_ids)]
    for case_ids in (train_case_ids, val_case_ids, test_case_ids)
)

In [None]:
# Bag of Words
# The vocabulary is built from the training split only (validation and test
# rows are not passed in). min_freq=100 is forwarded to embed.get_vocab —
# presumably a minimum token-frequency cutoff; confirm in src/embed.
vocab = embed.get_vocab(train_df, min_freq=100)
vocab_size = len(vocab)
# Display the vocabulary size as the cell output
vocab_size

In [None]:
# Data Loaders
BATCH_SIZE = 128
# The classifier's input width has to match the bag-of-words vectors, so it is
# the vocabulary size, not a GloVe embedding dimension.
# NOTE(review): assumes embed.collate_into_bow emits vectors of length
# len(vocab) — confirm in src/embed.
vocab_size = len(vocab)

# The notebook-level name `Dataset` ends up bound to torch.utils.data.Dataset
# (it is imported after the Hugging Face one), and that class has no
# `from_pandas`. Import the Hugging Face class under an unambiguous alias.
from datasets import Dataset as HFDataset


def _case_bow_loader(frame, shuffle):
    """Build a DataLoader over `frame` whose batches are collated into BoW form.

    The `case_id` column is dropped before the frame is converted, and the
    dataframe index is not carried over as a column.
    """
    return DataLoader(
        HFDataset.from_pandas(frame.drop(columns=['case_id']), preserve_index=False),
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        collate_fn=lambda batch: embed.collate_into_bow(batch, vocab),
    )


# Only the training loader is shuffled; validation and test keep row order
train_dataloader = _case_bow_loader(train_df, shuffle=True)
valid_dataloader = _case_bow_loader(val_df, shuffle=False)
test_dataloader = _case_bow_loader(test_df, shuffle=False)

#### Training

In [None]:
# BoW NN Classifier
# Single output unit: the training cell pairs this model with BCELoss,
# i.e. one binary target per example.
hidden_dim = 1000
model = pred_models.BoWNNClassifier(vocab_size=vocab_size, hidden_dim=hidden_dim, output_dim=1)

In [None]:
EPOCHS = 15
loss_function = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Track the weights with the lowest validation loss seen so far
best_model = None
best_val_loss = float('inf')
val_losses = []
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    model_helpers.train_an_epoch(model, train_dataloader, optimizer, loss_function, print_val=True)

    # Validation log loss after this epoch
    val_y_true, val_y_pred = model_helpers.make_predictions(model, valid_dataloader)
    val_loss = log_loss(val_y_true.astype(np.float64), val_y_pred.astype(np.float64))
    val_losses.append(val_loss)

    if val_loss < best_val_loss:
        # Snapshot into a fresh instance so that later epochs, which keep
        # updating `model`, cannot change the saved weights
        best_model = type(model)(model.vocab_size, model.hidden_dim, model.output_dim)
        best_model.load_state_dict(model.state_dict())
        best_val_loss = val_loss

    # Report the measured wall-clock time (training plus validation) for the epoch
    time_taken = time.time() - epoch_start_time
    print(f'After epoch {epoch} the validation loss is {val_loss:.3f} (epoch took {time_taken:.1f}s).')

# Validation loss per epoch
plt.plot(range(1, EPOCHS + 1), val_losses)

In [None]:
# Get validation predictions to select best threshold.
# Use best_model (lowest validation loss), not the last-epoch `model`: the test
# evaluation runs on best_model, so the threshold must be tuned on the same weights.
val_labels, val_probs = model_helpers.make_predictions(best_model, valid_dataloader)
# Get best threshold from validation data
threshold = model_helpers.select_threshold(val_labels, val_probs)

#### Evaluation

In [None]:
# Case identifiers and true outcomes for the test rows
test_labels_df = test_df[['case_id', 'win_side']]
# Dataframe with the best model's predictions and the real values
test_results_df = model_helpers.get_test_results_df(best_model, test_dataloader, test_labels_df)
test_results_df.head()

In [None]:
def _label_from_prob(frame):
    """Return 1 where frame['prob'] is strictly above `threshold`, else 0."""
    # Vectorised comparison; a missing probability compares False and maps to 0
    return (frame['prob'] > threshold).astype('int64')


# Results dataframe per utterance
per_utterance_df = test_results_df.assign(pred=_label_from_prob)

# Results dataframe per case: average every column over the utterances of a
# case, then threshold the mean probability
per_case_df = (
    test_results_df
    .groupby('case_id')
    .mean()
    .assign(pred=_label_from_prob)
)

In [None]:
# Evaluation metrics per utterance: true outcome vs. thresholded prediction
# for every test utterance
model_helpers.get_evaluation_matrix(per_utterance_df['win_side'], per_utterance_df['pred'])

In [None]:
# Evaluation metrics per case: true outcome vs. prediction obtained by
# thresholding the case's mean utterance probability
model_helpers.get_evaluation_matrix(per_case_df['win_side'], per_case_df['pred'])

In [None]:
# Confusion matrix by case
# NOTE(review): target_names presumably map label 0 -> 'respondent' and
# label 1 -> 'petitioner' — confirm against model_helpers.results_heatmap.
model_helpers.results_heatmap(per_case_df['win_side'],
                per_case_df['pred'],
                'Confusion Matrix by Case',
                target_names = ['respondent', 'petitioner'])

In [None]:
# Confusion matrix by utterance
# NOTE(review): target_names presumably map label 0 -> 'respondent' and
# label 1 -> 'petitioner' — confirm against model_helpers.results_heatmap.
model_helpers.results_heatmap(per_utterance_df['win_side'],
                per_utterance_df['pred'],
                'Confusion Matrix by Utterance',
                target_names = ['respondent', 'petitioner'])

### Justice Outcome Predictions

In [None]:
# Display the vote columns of the selected justices; one model is trained per entry below
top_justices

In [None]:
# Set parameters and the loss function shared by every per-justice model
BATCH_SIZE = 64
EPOCHS = 15
hidden_dim = 500
loss_function = torch.nn.BCELoss()

# The notebook-level name `Dataset` ends up bound to torch.utils.data.Dataset
# (it is imported after the Hugging Face one), and that class has no
# `from_pandas`. Import the Hugging Face class under an unambiguous alias.
from datasets import Dataset as HFDataset


def _justice_bow_loader(frame, label_col, shuffle):
    """Build a DataLoader over `frame` whose batches are collated into BoW form.

    The `case_id` column is dropped before conversion. `label_col` is passed
    to embed.collate_into_bow as its third argument; taking it as a parameter
    binds the column name per loader instead of reading the loop variable
    whenever a batch happens to be collated.
    """
    return DataLoader(
        HFDataset.from_pandas(frame.drop(columns=['case_id']), preserve_index=False),
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        collate_fn=lambda batch: embed.collate_into_bow(batch, vocab, label_col),
    )


for justice in top_justices:

    print("Running model for: ", justice)

    # Have to do by utterance as grouping by case_id gives too few examples to learn from
    df_j = df[[justice, 'text', 'case_id']]

    # Keep only cases with outcomes 0 or 1
    df_j = df_j[df_j[justice].isin([0, 1])]

    # Get unique case_ids
    unique_case_ids = df_j['case_id'].unique()
    # Split the unique case_ids into training (80%), validation (10%) and test (10%) sets
    train_case_ids, val_test_case_ids = train_test_split(unique_case_ids, test_size=0.2, random_state=123)
    val_case_ids, test_case_ids = train_test_split(val_test_case_ids, test_size=0.5, random_state=123)

    # Filter the justice dataframe into the three partitions by case_id
    train_df = df_j[df_j['case_id'].isin(train_case_ids)]
    val_df = df_j[df_j['case_id'].isin(val_case_ids)]
    test_df = df_j[df_j['case_id'].isin(test_case_ids)]

    # Data loaders (only the training loader is shuffled)
    train_dataloader = _justice_bow_loader(train_df, justice, shuffle=True)
    valid_dataloader = _justice_bow_loader(val_df, justice, shuffle=False)
    test_dataloader = _justice_bow_loader(test_df, justice, shuffle=False)

    # BoW NN Classifier. The input width is taken from the vocabulary itself
    # rather than from a global that another cell may have overwritten.
    # NOTE(review): assumes embed.collate_into_bow emits vectors of length
    # len(vocab) — confirm in src/embed.
    model = pred_models.BoWNNClassifier(vocab_size=len(vocab), hidden_dim=hidden_dim, output_dim=1)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # Train model, tracking the weights with the lowest validation loss
    print("Start training...")
    best_model = None
    best_val_loss = float('inf')
    val_losses = []
    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        model_helpers.train_an_epoch(model, train_dataloader, optimizer, loss_function)
        val_y_true, val_y_pred = model_helpers.make_predictions(model, valid_dataloader)
        val_loss = log_loss(val_y_true.astype(np.float64), val_y_pred.astype(np.float64))
        val_losses.append(val_loss)
        if val_loss < best_val_loss:
            # Snapshot into a fresh instance so later epochs cannot change it
            best_model = type(model)(model.vocab_size, model.hidden_dim, model.output_dim)
            best_model.load_state_dict(model.state_dict())
            best_val_loss = val_loss
        # Report the measured wall-clock time (training plus validation) for the epoch
        time_taken = time.time() - epoch_start_time
        print(f'After epoch {epoch} the validation loss is {val_loss:.3f} (epoch took {time_taken:.1f}s).')

    # Get validation predictions to select best threshold. Use best_model, the
    # model that is evaluated on the test set below, not the last-epoch weights.
    val_labels, val_probs = model_helpers.make_predictions(best_model, valid_dataloader)
    # Get best threshold from validation data
    threshold = model_helpers.select_threshold(val_labels, val_probs)

    # Model evaluation
    print("Start test evaluation...")
    # Get dataframe with predictions and real values
    test_results_df = model_helpers.get_test_results_df(best_model, test_dataloader, test_df[['case_id', justice]])

    # Results dataframe per utterance: 1 where the probability exceeds the threshold
    per_utterance_df = test_results_df.assign(
        pred=lambda x: (x['prob'] > threshold).astype('int64')
    )

    # Results dataframe per case: threshold the mean probability over the
    # utterances of each case
    per_case_df = (
        test_results_df
        .groupby('case_id')
        .mean()
        .assign(pred=lambda x: (x['prob'] > threshold).astype('int64'))
    )

    # Evaluation metrics per utterance
    eval_metrics = model_helpers.get_evaluation_matrix(per_utterance_df[justice], per_utterance_df['pred'])
    print("Per utterance: ", eval_metrics)

    # Evaluation metrics per case
    eval_metrics_case = model_helpers.get_evaluation_matrix(per_case_df[justice], per_case_df['pred'])
    print("Per case: ", eval_metrics_case)

