In [1]:
# libraries
import json
import lzma
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from IPython.core.display import display, HTML
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords  
import nltk
nltk.download('stopwords')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

# pd.options.display.max_columns = 999
# pd.options.display.max_rows = 999

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/qilongxin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# defining a fucnction to remove \n and HTML tags
# function adapted from https://www.analyticsvidhya.com/blog/2019/06/comprehensive-guide-text-summarization-using-deep-learning-python/
stop_words = set(stopwords.words('english')) 
def text_cleaner(text):
    text_divided = text.splitlines()
    text_divided_clean = " ".join(text_divided)
    text_divided_clean = text_divided_clean.lower()
    text_divided_clean = re.sub('"','', text_divided_clean) # remove '"'
    text_divided_clean = re.sub(r"'s\b","",text_divided_clean) # remove ''s'
    text_divided_clean = re.sub("[^a-zA-Z]", " ", text_divided_clean) # removes all strings that contains a non-letter
    return text_divided_clean


# setting up tokenizer
tokenizer = RegexpTokenizer('\s+', gaps=True)

In [3]:
def get_data(state):
    # reading json files
    cases = []
    with lzma.open(state + '/data/data.jsonl.xz', 'r') as jsonl_file:
        for case in jsonl_file:
            cases.append(json.loads(str(case, 'utf-8')))

    df = pd.DataFrame(cases).sort_values('decision_date').reset_index(drop=True)
    df['decision_date'] = pd.to_datetime(df['decision_date'])

    # parsing data
    storage = []
    for i in range(df.shape[0]):
        casebody_idx = df.columns.get_loc("casebody")
        judges = df.iloc[i,casebody_idx]['data']['judges']
        attorneys = df.iloc[i,casebody_idx]['data']['attorneys']
        headnotes = df.iloc[i,casebody_idx]['data']['head_matter']
        if df.iloc[i,casebody_idx]['data']['opinions'] != []:
            opinions = df.iloc[i,casebody_idx]['data']['opinions'][0]['text']

        headnotes_clean = text_cleaner(headnotes)
        opinions_clean = text_cleaner(opinions)

        storage.append({'judges': judges,
                        'attorneys': attorneys,
                        'headnotes': headnotes_clean,
                        'opinions': opinions_clean})
    df_parsed = pd.DataFrame(storage)
    df = df_parsed.merge(df, left_index=True, right_index=True)

    # tokenizing headnotes and opinions
#     df['headnotes'] = df['headnotes'].apply(lambda x: tokenizer.tokenize(x))
#     df['opinions'] = df['opinions'].apply(lambda x: tokenizer.tokenize(x))
    df['headnotes_num_tokens'] = [len(notes) for notes in df['headnotes']]
    df['opinions_num_tokens'] = [len(opinions) for opinions in df['opinions']]

    return df

In [84]:
df_ar = get_data('Arkansas')

In [85]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [86]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [87]:
def readLangs(text, summary, reverse=False):
    print("Reading lines...")
    
    # Split every line into pairs and normalize
    pairs = [[text[i],summary[i]] for i in range(len(text))]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(summary)
        output_lang = Lang(text)
    else:
        input_lang = Lang(text)
        output_lang = Lang(summary)

    return input_lang, output_lang, pairs

In [88]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [133]:
train_size = 1000 
val_size = 200

train = df_ar.sample(n=train_size,replace=False, random_state=1)
val = df_ar.sample(n=val_size,replace=False, random_state=1)

x_train,y_train = train.opinions.tolist(),train.headnotes.tolist()
x_val,y_val = val['opinions'],val['headnotes']

In [134]:
input_lang, output_lang, pairs = prepareData( x, y , False)
print(random.choice(pairs))

Reading lines...
Read 1000 sentence pairs
Counting words...
Counted words:
0      the court overruled the motion  and said that ...
1      this was an indictment for rape committed on t...
2      opinion of the court  it is clear that the cou...
3      opinion oe the court    the court below dismis...
4      opinion of the court  the court below dismisse...
5      johnson  j   delivered the opinion of the cour...
6      opinion op the court   it is clear that the co...
7      johnson  j  william russell sued out from two ...
8      on motion of the prosecuting attorney  the cou...
9      opinion oe the court    in this case it would ...
10     opinion oe the court    in this case it would ...
11     per curiam  after the commencement of a suit a...
12     opinion op the court   daniel plott recovered ...
13     opinion of the court  this is an action on the...
14     opinion of the court  the only question we dee...
15     opinion of the court  this is a case brought h...
16     opinio

In [135]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [136]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [137]:
MAX_LENGTH = 90

In [138]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [139]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [140]:
teacher_forcing_ratio = 0.5
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0
    if input_length > max_length:
        length = max_length
    else:
        length = input_length
        
    for ei in range(length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [141]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [142]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [143]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    print("Training....")
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        if iter% 1000 == 0:
            print(iter,"/",n_iters + 1)
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [144]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        
        if input_length > max_length:
            length = max_length
        else:
            length = input_length
        
        for ei in range(length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [145]:
def evaluateRandomly(encoder, decoder, n=1):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [146]:
hidden_size = 300
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 10, print_every=1)

Training....
0m 0s (- 0m 6s) (1 10%) 9.9147
1m 30s (- 6m 0s) (2 20%) 9.8303
1m 59s (- 4m 38s) (3 30%) 68.3030
2m 2s (- 3m 3s) (4 40%) 257.8905
2m 20s (- 2m 20s) (5 50%) 114.6407
2m 27s (- 1m 38s) (6 60%) 74.2404
2m 48s (- 1m 12s) (7 70%) 96.5427
3m 0s (- 0m 45s) (8 80%) 208.3252
3m 5s (- 0m 20s) (9 90%) 301.8702
3m 31s (- 0m 0s) (10 100%) 209.3794


In [147]:
torch.save(encoder1.state_dict(), './enc.w')
torch.save(attn_decoder1.state_dict(), './att.w')

In [154]:
evaluateRandomly(encoder1, attn_decoder1)

> dickinson j   delivered the opinion of the court  section    of the rev  stat  ark  under the title  execution  p       authorizes the sheriff to permit the defendant in execution  to retain possession of the property levied upon  by giving bond in favor of the plaintiff with sufficient security  in double the value of such property  conditioned for the delivery of the property to the officer at the time and place of sale to be named in such condition   and by the provisions of the   th section of the same act   if the condition of the bond be broken  and the execution be returned unsatisfied  the defendants and his securities shall be  deemed to have notice of the facts  and the plaintiff  without further notice  may on the relurri day of the execution  or on any subsequent day of the term at which such execution is returned  move the court for judgment on the bond against the defendant and his sureties  or any of them  or the plaintiff may at his option bring an ordinary suit on th

In [150]:
import rouge

rouge = rouge.Rouge()

def getRougeScore(encoder, decoder, n=10):
    scores = np.zeros(n)
    for i in range(n):
        pair = random.choice(pairs)
        output_words, _ = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        sc = rouge.get_scores(output_sentence, pair[1])
        scores[i] = sc[0]['rouge-1']['f']
    return scores 

In [155]:
getRougeScore(encoder1,attn_decoder1,100)

array([0.03571428, 0.03179916, 0.04423748, 0.0530303 , 0.        ,
       0.03789127, 0.02886598, 0.02443281, 0.04910714, 0.01204819,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

0.0