In [1]:
import os
import time
import torch
import subprocess as sp
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, random_split

%matplotlib inline

In [2]:
base_path = sp.getoutput('git rev-parse --show-toplevel')
os.chdir(base_path)

from src import embed, pred_models, model_helpers

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nuriaadellraventos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Data Preparation

In [3]:
# Load data
data_dir = 'data'
file = 'utterances_clean2014-2018.csv'
df_raw = pd.read_csv(os.path.join(base_path, data_dir, file))

  df_raw = pd.read_csv(os.path.join(base_path, data_dir, file))


In [4]:
# OPTION 1
# In the context of pandas' groupby and agg methods, 
# 'first' is an aggregation function that returns the 
# first non-null value in each group of values.
df = (df_raw.groupby('case_id')
        .agg({'text': ' '.join, 'win_side': 'first'})
        .reset_index()
        .drop(['case_id'], axis=1))

df.head()

Unnamed: 0,text,win_side
0,We will hear argument first this morning in Ca...,1.0
1,"We'll hear argument next in Case 12-1497, Kell...",1.0
2,"We will hear argument next in Case 131010, M&G...",1.0
3,We'll hear argument first this morning in Case...,1.0
4,We will hear argument first this morning in Ca...,1.0


In [5]:
# OPTION 2
df = df_raw[['text', 'win_side']]
df.head()

Unnamed: 0,text,win_side
0,"We'll hear argument next in Case No. 13-553, t...",1.0
1,"Thank you, Mr. Chief Justice, and may it pleas...",1.0
2,"Well, is said that -- it said that in -- in (b...",1.0
3,Right. I -- but I think--,1.0
4,"Another tax that discriminates is all it says,...",1.0


In [6]:
# TO DISCUSS
df.dropna(subset=['win_side'], inplace=True)
df = df[df.win_side != 2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=['win_side'], inplace=True)


In [7]:
# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)

### BoW Embedding

In [8]:
# Bag of Words
vocab = embed.get_vocab(train_df, min_freq=10)
vocab_size = len(vocab)
vocab_size

8155

### Create training, validation, and testing dataloaders

In [9]:
BATCH_SIZE = 64

train_valid_data = list(train_df.values)
num_train = int(len(train_valid_data) * 0.95)
num_valid = len(train_valid_data) - num_train
train_data, valid_data = random_split(
    train_valid_data, [num_train, num_valid])
test_data, _ = random_split(list(test_df.values), [len(test_df), 0])

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE,
                              shuffle=True, 
                              collate_fn=lambda batch: embed.collate_into_bow(batch, vocab)) # pass vocab to collate function
valid_dataloader = DataLoader(valid_data, batch_size=BATCH_SIZE,
                              shuffle=False, 
                              collate_fn=lambda batch: embed.collate_into_bow(batch, vocab))
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE,
                             shuffle=False, 
                             collate_fn=lambda batch: embed.collate_into_bow(batch, vocab))

### Training

In [10]:
# BoW NN Classifier
model = pred_models.BoWNNClassifier(vocab_size=vocab_size, hidden_dim=300, output_dim=1)

In [12]:
EPOCHS = 15
loss_function = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

accuracies=[]
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    model_helpers.train_an_epoch(model, train_dataloader, optimizer, loss_function)
    accuracy = model_helpers.get_accuracy(model, valid_dataloader, 0.5)
    accuracies.append(accuracy)
    time_taken = time.time() - epoch_start_time
    print(f'After epoch {epoch} the validation accuracy is {accuracy:.3f}.')
    
plt.plot(range(1, EPOCHS+1), accuracies)

At iteration 200 the loss is 0.534.
At iteration 400 the loss is 0.556.
At iteration 600 the loss is 0.471.
At iteration 800 the loss is 0.493.
After epoch 1 the validation accuracy is 0.740.
At iteration 200 the loss is 0.371.
At iteration 400 the loss is 0.386.
At iteration 600 the loss is 0.394.
At iteration 800 the loss is 0.386.
After epoch 2 the validation accuracy is 0.689.
At iteration 200 the loss is 0.555.
At iteration 400 the loss is 0.375.
At iteration 600 the loss is 0.373.
At iteration 800 the loss is 0.394.
After epoch 3 the validation accuracy is 0.723.
At iteration 200 the loss is 0.354.
At iteration 400 the loss is 0.260.


In [None]:
model_helpers.get_accuracy(test_dataloader)

0.7275838466803559