# Supplementary:
## Absractive Seq2Seq GRU Model with Attention 

In [1]:
# libraries
import json
import lzma
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from IPython.core.display import display, HTML
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords  
import nltk
nltk.download('stopwords')
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sns.set()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/40982191/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Model Overview 
This supplementary notebook includes an implementation of the abastractive Seq2Seq model with attention using GRU. Different from the one included in the main notebook, this model is implemented in PyTorch, which is more often used for natural language processing and more efficient than Keras. This model uses encoders and attention decoders with two layers of GRU to genereate summary.

### Data Cleanning

In [2]:
# defining a fucnction to remove \n and HTML tags
# function adapted from https://www.analyticsvidhya.com/blog/2019/06/comprehensive-guide-text-summarization-using-deep-learning-python/
stop_words = set(stopwords.words('english')) 
def text_cleaner(text):
    text_divided = text.splitlines()
    text_divided_clean = " ".join(text_divided)
    text_divided_clean = text_divided_clean.lower()
    text_divided_clean = re.sub('"','', text_divided_clean) # remove '"'
    text_divided_clean = re.sub(r"'s\b","",text_divided_clean) # remove ''s'
    text_divided_clean = re.sub("[^a-zA-Z]", " ", text_divided_clean) # removes all strings that contains a non-letter
    return text_divided_clean

# setting up tokenizer
tokenizer = RegexpTokenizer('\s+', gaps=True)

In [3]:
def get_data(state):
    # reading json files
    cases = []
    with lzma.open(state + '/data/data.jsonl.xz', 'r') as jsonl_file:
        for case in jsonl_file:
            cases.append(json.loads(str(case, 'utf-8')))

    df = pd.DataFrame(cases).sort_values('decision_date').reset_index(drop=True)
    df['decision_date'] = pd.to_datetime(df['decision_date'])

    # parsing data
    storage = []
    for i in range(df.shape[0]):
        casebody_idx = df.columns.get_loc("casebody")
        judges = df.iloc[i,casebody_idx]['data']['judges']
        attorneys = df.iloc[i,casebody_idx]['data']['attorneys']
        headnotes = df.iloc[i,casebody_idx]['data']['head_matter']
        if df.iloc[i,casebody_idx]['data']['opinions'] != []:
            opinions = df.iloc[i,casebody_idx]['data']['opinions'][0]['text']

        headnotes_clean = text_cleaner(headnotes)
        opinions_clean = text_cleaner(opinions)

        storage.append({'judges': judges,
                        'attorneys': attorneys,
                        'headnote': headnotes_clean,
                        'opinion_text': opinions_clean})
    df_parsed = pd.DataFrame(storage)
    df = df_parsed.merge(df, left_index=True, right_index=True)

    # tokenizing headnotes and opinions
    df['headnotes_token'] = df['headnote'].apply(lambda x: tokenizer.tokenize(x))
    df['opinions_token'] = df['opinion_text'].apply(lambda x: tokenizer.tokenize(x))
    df['headnotes_num_tokens'] = [len(notes) for notes in df['headnotes_token']]
    df['opinions_num_tokens'] = [len(opinions) for opinions in df['opinions_token']]

    return df

In [4]:
# get North Carolina data 
df_nc = get_data('North Carolina')

### Model Creation 
The implementation of a Seq2Seq model with GRU below is adapted from https://www.kaggle.com/rahuldshetty/text-summarization-in-pytorch with the following changes to accomodate the North Carolina dataset: 
1. Adapt the evaluate function to test trained model on validation data;
2. Adjust the max length requirement on the sentence level;
3. Add one more layer of GRU in encoder, decoder, and attention-decoder; 
 

In [5]:
# The implementation below is borrowed from 
# https://www.kaggle.com/rahuldshetty/text-summarization-in-pytorch
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import time
import math

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

SOS_token = 0
EOS_token = 1

MAX_LENGTH = 1000 # Max length of the summary 
teacher_forcing_ratio = 0.5 # percentage of training to use teacher forcing 

# language class to store words and embeddings 
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"} # beginning and end of sentence 
        self.n_words = 2  

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

# read into the language class 
def readLangs(text, summary, reverse=False):
    print("Reading lines...")
    
    pairs = [[text[i],summary[i]] for i in range(len(text))]

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(summary)
        output_lang = Lang(text)
    else:
        input_lang = Lang(text)
        output_lang = Lang(summary)

    return input_lang, output_lang, pairs

# read all texts and summaries into the language class 
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    return input_lang, output_lang, pairs

# define an RNN encoder class with 2 layers of GRU 
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)# get embeddings 
        output = embedded
        # two layers of GRU 
        output1, hidden1 = self.gru(output, hidden)
        output, hidden = self.gru(output1, hidden1)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

# RNN decoder class with 2 layers of GRU 
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output1, hidden1 = self.gru(output, hidden)
        output, hidden = self.gru(output1, hidden1)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

# decoder with attention 
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1) # get embeddings 
        embedded = self.dropout(embedded)
        
        # update weights for attention scores 
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
        # concatenate attnetion scores to output
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        
        # two layers of GRU
        output1, hidden1 = self.gru(output, hidden)
        output, hidden = self.gru(output1, hidden1)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
# prepare words from sentence     
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]
# create tensor from words of each sentence
def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)
# create tensors from all texts and summaries 
def tensorsFromPair(input_lang,output_lang,pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)


# a function to train encoder and decoder 
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()
    
    # initialize optimizer 
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0 # initialize loss 
    
    # decide proper input length 
    if input_length > max_length:
        length = max_length
    else:
        length = input_length
        
    # run encoder on each tensor 
    for ei in range(length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden
    
    # decide if use teacher forcing to improve efficiency 
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # get hidden state with attention scores 
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs) # use actual target as input 
            # update loss function 
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  

    else:
        for di in range(target_length):
            # get hidden state with attention scores 
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)# get the most recent prediction 
            decoder_input = topi.squeeze().detach()  

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()
    
    # update optimizer 
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length


# train iteratively 
def trainIters(encoder, decoder, n_iters, print_every=1000, learning_rate=0.01):
    print("Training....")
    print_loss_total = 0  
    
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # generate tranining pairs from the number of iterations 
    training_pairs = [tensorsFromPair(x_train_lang,y_train_lang,random.choice(train_pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()
    
    # iterate through 
    for iter in range(1, n_iters + 1):
        if iter% 1000 == 0:
            print(iter,"/",n_iters + 1)
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        
        # apply train function in each iteration and updates loss 
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('(%d %d%%) %.4f' % (iter, iter / n_iters * 100, print_loss_avg))
def evaluate(encoder, decoder, in_lang,out_lang,sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        # get tensor from input 
        input_tensor = tensorFromSentence(in_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()
        
        # initialize output 
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        
        # check for max length requirement 
        if input_length > max_length:
            length = max_length
        else:
            length = input_length
        
        # generate each token from encoder 
        for ei in range(length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]
    
        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden
        
        decoded_words = [] # initialize decoded words 
        decoder_attentions = torch.zeros(max_length, max_length) # initialize attention 
        # generate each token from attention decoder 
        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs) # get decoder output 
            decoder_attentions[di] = decoder_attention.data # get attention scores 
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token: 
                decoded_words.append('<EOS>') # break when reaching end of sentence 
                break
            else:
                decoded_words.append(out_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]


### Model training 

In [10]:
train_size = 5000
val_size = 100

train_df = df_nc.sample(n=train_size,replace=False, random_state=1)
val_df = df_nc.sample(n=val_size,replace=False, random_state=1)

x_train,y_train = train_df.opinion_text.tolist(),train_df.headnote.tolist()
x_val,y_val = val_df.opinion_text.tolist(),val_df.headnote.tolist()

x_train_lang, y_train_lang, train_pairs = prepareData( x_train, y_train , False)
x_val_lang, y_val_lang, val_pairs = prepareData( x_val, y_val , False)

Reading lines...
Reading lines...


In [7]:
# see original training iteration scope at
# https://www.kaggle.com/rahuldshetty/text-summarization-in-pytorch
hidden_size = 150
encoder1 = EncoderRNN(x_train_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, y_train_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 100, print_every=1)

Training....
(1 1%) 10.4913
(2 2%) 9.7428
(3 3%) 41.4239
(4 4%) 167.3436
(5 5%) 51.5854
(6 6%) 40.2086
(7 7%) 38.4778
(8 8%) 118.7396
(9 9%) 130.1008
(10 10%) 594.8017
(11 11%) 336.5368
(12 12%) 788.0958
(13 13%) 269.5146
(14 14%) 117.3960
(15 15%) 298.8359
(16 16%) 42.6356
(17 17%) 157.3596
(18 18%) 963.7615
(19 19%) 812.1887
(20 20%) 907.2435
(21 21%) 1541.8153
(22 22%) 827.2287
(23 23%) 1141.2617
(24 24%) 715.3318
(25 25%) 599.7248
(26 26%) 1374.8220
(27 27%) 521.2188
(28 28%) 541.5331
(29 28%) 494.0919
(30 30%) 771.2525
(31 31%) 479.9329
(32 32%) 476.3551
(33 33%) 379.4948
(34 34%) 278.6584
(35 35%) 237.9760
(36 36%) 305.7043
(37 37%) 104.3085
(38 38%) 59.4516
(39 39%) 46.9063
(40 40%) 198.6581
(41 41%) 279.5665
(42 42%) 344.4051
(43 43%) 306.1651
(44 44%) 125.0189
(45 45%) 104.2100
(46 46%) 162.1568
(47 47%) 112.2507
(48 48%) 101.6270
(49 49%) 217.0972
(50 50%) 67.2271
(51 51%) 250.2664
(52 52%) 298.4876
(53 53%) 212.7325
(54 54%) 32.1909
(55 55%) 247.3997
(56 56%) 438.2434
(57 56

In [13]:
# save weights of the trained model 
torch.save(encoder1.state_dict(), './supp_seq2seq_encoder.w')
torch.save(attn_decoder1.state_dict(), './supp_seq2seq_attention_decoder.w')

### Evaluate and report ROUGE scores

In [9]:
# set max recurssion for ROUGE 
#https://github.com/pltrdy/rouge/issues/19
sys.setrecursionlimit(train_size * MAX_LENGTH + 10)

In [14]:
import rouge #https://pypi.org/project/rouge/
rouge = rouge.Rouge()
# test on 100 validation samples 
scores_r1 = np.zeros(val_size) # ROUGE-1
scores_r2 = np.zeros(val_size) # ROUGE-2
for i in range(val_size):
    pair = val_pairs[i]
    # get generated words 
    out_words, _ = evaluate(encoder1, attn_decoder1, x_val_lang,y_val_lang, pair[0])
    # concatenate the genrated words into full summary 
    out_sentence = ' '.join(out_words)
    sc = rouge.get_scores(out_sentence, pair[1])
    scores_r1[i] = sc[0]['rouge-1']['f']
    scores_r2[i] = sc[0]['rouge-2']['f']

In [15]:
print('The average Rouge 1 F-scores on 1000 random cases is')
print(np.mean(scores_r1))
print('The average Rouge 2 F-scores on 1000 random cases is')
print(np.mean(scores_r2))

The average Rouge 1 F-scores on 1000 random cases is
0.007588122918098881
The average Rouge 2 F-scores on 1000 random cases is
5.2490542906115484e-05


### Summary
We decided against including this model due to its poor performance and its similarity to the Abastract Seq2Seq model in the main notebook. Nevertheless, we believe this model can further prove that abstractive models are not suitable with legal texts, potentially due to the thematic structures and citations of the legal texts, which are discussed in the main notebook. At the same time, the model also requires deep training. As seen in the original code, the model was intended to train for iterations in the hundred-thousand scale, which is not achieveable given our computing resources. Therefore, we cannot rule out the possibility that with deeper training, this model might perform well. Still, extractive models are more sensible choices give the scope of this project. 