## Lexrank Implementation

In [211]:
import math
import numpy

def lex_rank(sentences, n, t):
    cosine_matrix = numpy.zeros((n, n))
    degrees = numpy.zeros((n,))
    l = []

    for i in range(n):
        for j in range(n):
            cosine_matrix[i][j] = idf_modified_cosine(sentences, sentences[i], sentences[j])
            if cosine_matrix[i][j] > t:
                cosine_matrix[i][j] = 1
                degrees[i] += 1
            else:
                cosine_matrix[i][j] = 0

    for i in range(n):
        for j in range(n):
            cosine_matrix[i][j] = cosine_matrix[i][j] / degrees[i]

    ratings = power_method(cosine_matrix, n, t)

    return zip(sentences, ratings)

In [200]:
def idf_modified_cosine(sentences, sentence1, sentence2):
    tf1 = compute_tf(sentence1)
    tf2 = compute_tf(sentence2)
    idf_metrics = compute_idf(sentences)
    return cosine_similarity(sentence1, sentence2, tf1, tf2, idf_metrics)

In [201]:
from collections import Counter

def compute_tf(sentence):
    tf_values = Counter(sentence)
    tf_metrics = {}

    max_tf = find_tf_max(tf_values)

    for term, tf in tf_values.items():
        tf_metrics[term] = tf / max_tf

    return tf_metrics


def find_tf_max(terms):
    return max(terms.values()) if terms else 1


def compute_idf(sentences):
    idf_metrics = {}
    sentences_count = len(sentences)

    for sentence in sentences:
        for term in sentence:
            if term not in idf_metrics:
                n_j = sum(1 for s in sentences if term in s)
                idf_metrics[term] = math.log(sentences_count / (1 + n_j))

    return idf_metrics


def cosine_similarity(sentence1, sentence2, tf1, tf2, idf_metrics):
    unique_words1 = set(sentence1)
    unique_words2 = set(sentence2)
    common_words = unique_words1 & unique_words2

    numerator = sum((tf1[t] * tf2[t] * idf_metrics[t] ** 2) for t in common_words)
    denominator1 = sum((tf1[t] * idf_metrics[t]) ** 2 for t in unique_words1)
    denominator2 = sum((tf2[t] * idf_metrics[t]) ** 2 for t in unique_words2)

    if denominator1 > 0 and denominator2 > 0:
        return numerator / (math.sqrt(denominator1) * math.sqrt(denominator2))
    else:
        return 0.0    

In [202]:
def power_method(cosine_matrix, n, e):
    transposed_matrix = cosine_matrix.T
    sentences_count = n

    p_vector = numpy.array([1.0 / sentences_count] * sentences_count)

    lambda_val = 1.0

    while lambda_val > e:
        next_p = numpy.dot(transposed_matrix, p_vector)
        lambda_val = numpy.linalg.norm(numpy.subtract(next_p, p_vector))
        p_vector = next_p

    return p_vector

In [203]:
def text_cleaner(text):
    text_divided = text.splitlines()
    text_divided_clean = " ".join(text_divided)
    return text_divided_clean

## Summarizing legal text

In [138]:
# libraries
import json
import lzma
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from IPython.core.display import display, HTML
import re
from nltk.tokenize import RegexpTokenizer
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [139]:
# defining a fucnction to remove \n and HTML tags
def text_cleaner(text):
    text_divided = text.splitlines()
    text_divided_clean = " ".join(text_divided)
    return text_divided_clean

In [140]:
# The file size for some states are too large to open into memory
# This function loads individual cases into memory, parses headnotes and 
# opinions, cleans the text, tokenizes the text, and returns counts of tokens
# for each case.

tokenizer = RegexpTokenizer('\s+', gaps=True)

def get_counts(state):
    cases = []
    with lzma.open("../" + state + '-text/data/data.jsonl.xz', 'r') as jsonl_file:
        for case in jsonl_file:
            c = json.loads(str(case, 'utf-8'))

            date = c['decision_date']
            
            headnotes = text_cleaner(c['casebody']['data']['head_matter'])
            headnotes_tokenized = tokenizer.tokenize(headnotes)
            num_headnotes = len(headnotes_tokenized)

            opinions = c['casebody']['data']['opinions']
            if opinions == []:
                num_opinions = 0
            else:
                opinions = text_cleaner(opinions[0]['text'])
                opinions_tokenized = tokenizer.tokenize(opinions)
                num_opinions = len(opinions_tokenized)
            cases.append({'date':date, 'num_headnotes':num_headnotes, 'headnotes': headnotes, 'num_opinions':num_opinions, 'opinions':opinions})
        return pd.DataFrame(cases)

In [141]:
%%time

states = ['Arkansas']
counts_ar = get_counts(states[0])
# counts_il = get_counts(states[1])
# counts_nm = get_counts(states[2])
# counts_nc = get_counts(states[3])

CPU times: user 46 s, sys: 689 ms, total: 46.7 s
Wall time: 46.8 s


In [332]:
counts_ar.head(20)

Unnamed: 0,date,num_headnotes,headnotes,num_opinions,opinions
0,1829-11,29,"Case No. 4,822a. FISHER v. REIDER. [Hempst. 82...",230,OPINION OF THE COIÍRT. This is an action of de...
1,1828-05,28,"Case No. 4,785a. FIKES v. BENTLEY. [Hempst. 61...",62,OPINION OP THE COURT. This is an appeal from t...
2,1836-02,27,"Case No. 4,863a. FLETCHER v. ELLIS. [Hempst. 3...",616,"CROSS, Judge. The record in this case shows th..."
3,1999-07-15,46,Michael NORRIS v. STATE of Arkansas CR 98-1429...,3936,"W. H.“Dub” Arnold, Chief Justice. This is a ca..."
4,1999-10-07,39,Roger Allen HAMMON v. STATE of Arkansas CR 98-...,1788,"Ray Thornton, Justice. Appellant brings this a..."
5,1999-10-07,49,Joe Louis DANSBY v. STATE of Arkansas CR 97-14...,8076,"Annabelle Clinton Imber, Justice. Mr. Joe Loui..."
6,1999-06-10,38,David McGREW v. STATE of Arkansas CR 98-426 99...,848,"W. H.“Dub” Arnold, Chief Justice. The appellan..."
7,1999-07-01,64,ST. PAUL FIRE & MARINE INSURANCE COMPANY v. GR...,3050,"Robert L. Brown, Justice. Appellant St. Paul F..."
8,1999-10-14,36,Sylvester RICHARDS v. STATE of Arkansas CR 99-...,909,"Lavenski R. Smith, Justice. Appellant, Sylvest..."
9,1999-10-14,35,Patricia OSBURN v. Bryan BUSBEE d/b/a Busbee C...,303,"Per Curiam. Appellee Bryan Busbee, d/b/a Busbe..."


## Example 1

In [316]:
test_opinions = counts_ar.iloc[2,4]

In [317]:
test_opinions

"CROSS, Judge. The record in this case shows that the plaintiff in error [Frederick Fletcher] brought an action of trespass on the case against the. defendant [William Ellis], in the Conway circuit court, and in his declaration alleged “that the said plaintiff and one Alexander Rogers, were indebted to Daniel Gilmore in a large sum of money, namely,' in the amount of fifty-five dollars, upon which said Gilmore had brought suit and obtained judgment, and sued out execution against the plaintiff and the said Rogers, and the plaintiff avers that he and Rogers had, in the county of Conway, sufficient goods and chattels to have satisfied the execution, and the plaintiff avers that the defendant being an evil disposed person, fond of encouraging litigation and fomenting strife, and wishing to harass, impoverish, and distress the plaintiff, did, on the first day of October, 1834, at the county of Conway, and within the jurisdiction of this court, maliciously persuade and procure the said Dani

In [318]:
# Reference: https://nlpforhackers.io/splitting-text-into-sentences/

from pprint import pprint
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
 
trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(test_opinions)
 
tokenizer = PunktSentenceTokenizer(trainer.get_params())

In [319]:
test_opinions_list = tokenizer.tokenize(test_opinions)

In [320]:
def text_summarizer(text, n, t=1):
    """
    n: number of sentences
    t: error tolerance
    """
    words_list = []
    for i in range(len(text)):
        words = text[i].split()
        words_list.append(words)
    zipped = lex_rank(words_list, len(text), t)
    unzipped = list(zip(*zipped))
    scores = np.array(unzipped[1])
    highest_index = scores.argsort()[-n:][::-1]
    summarized = []
    high_scores = []
    for i in range(len(highest_index)):
        sentence = text[i]
        score = scores[i]
        high_scores.append(score)
        summarized.append(sentence)     
    print("\nOriginally", len(text), "sentences\n")
    print("Summarized in", n, "sentences\n")
    print("Summarized:  ", summarized,"\n")
    print("Score for each sentence:  ", high_scores)

In [330]:
%%time

# 616 words
text_summarizer(test_opinions_list, 4, 0.1)


Originally 14 sentences

Summarized in 4 sentences

Summarized:   ['CROSS, Judge.', 'The record in this case shows that the plaintiff in error [Frederick Fletcher] brought an action of trespass on the case against the.', "defendant [William Ellis], in the Conway circuit court, and in his declaration alleged “that the said plaintiff and one Alexander Rogers, were indebted to Daniel Gilmore in a large sum of money, namely,' in the amount of fifty-five dollars, upon which said Gilmore had brought suit and obtained judgment, and sued out execution against the plaintiff and the said Rogers, and the plaintiff avers that he and Rogers had, in the county of Conway, sufficient goods and chattels to have satisfied the execution, and the plaintiff avers that the defendant being an evil disposed person, fond of encouraging litigation and fomenting strife, and wishing to harass, impoverish, and distress the plaintiff, did, on the first day of October, 1834, at the county of Conway, and within the 

## Example 2

In [322]:
test2_opinions = counts_ar.iloc[6,4]
test2_opinions_list = tokenizer.tokenize(test2_opinions)

In [340]:
%%time

# 848 words
text_summarizer(test2_opinions_list, 5, 0.1)


Originally 64 sentences

Summarized in 5 sentences

Summarized:   ['W. H.“Dub” Arnold, Chief Justice.', 'The appellant, David McGrew, was found guilty of misdemeanor sexual misconduct involving a minor and was sentenced to ninety days in the Mississippi County jad and fined $500.00.', 'On appeal, McGrew challenges (1) the denial of his motion to dismiss, based upon his prosecution for a misdemeanor offense more than one year after the commission of the offense, (2) the admission of rebuttal testimony, and (3) the sufficiency of the evidence.', 'We accepted certification of this case from the Court of Appeals in order to resolve an issue of first impression, specifically, the application of Ark.', 'Code Ann.'] 

Score for each sentence:   [0.015625, 0.012431795634920636, 0.023004150669591845, 0.022475658022533023, 0.015625]
CPU times: user 16.6 s, sys: 20 ms, total: 16.6 s
Wall time: 16.7 s


## Example 3

In [326]:
test3_opinions = counts_ar.iloc[4,4]
test3_opinions_list = tokenizer.tokenize(test3_opinions)

In [331]:
%%time

# 1788 words
text_summarizer(test3_opinions_list, 3, 0.1)


Originally 107 sentences

Summarized in 3 sentences

Summarized:   ['Ray Thornton, Justice.', 'Appellant brings this appeal of his conviction for capital murder in the shooting death of Roger Cousins on May 29, 1998, urging that the trial court erred in admitting the victim’s dying declaration naming appellant as his assailant.', 'We find no error and affirm appellant’s conviction and sentence of life imprisonment.'] 

Score for each sentence:   [0.009345794392523364, 0.014806203508787346, 0.006972311177918654]
CPU times: user 2min 42s, sys: 71.2 ms, total: 2min 42s
Wall time: 2min 42s


## Example 4

In [333]:
test4_opinions = counts_ar.iloc[3,4]
test4_opinions_list = tokenizer.tokenize(test4_opinions)

In [334]:
%%time

# 3936 words
text_summarizer(test4_opinions_list, 5, 0.1)


Originally 208 sentences

Summarized in 5 sentences

Summarized:   ['W. H.“Dub” Arnold, Chief Justice.', 'This is a case involving warrantless activity surrounding entry into the appellant’s home and his eventual arrest for the offense of driving while intoxicated, first offense.', 'On December 21, 1997, at approximately 1:00 p.m., appellant was allegedly seen driving erratically by another driver.', 'The citizen followed appellant to his home and called the police.', 'Based on the citizen’s information, the officer approached appellant’s home, where he was admitted into the house by appellant’s visiting mother-in-law, Ms.'] 

Score for each sentence:   [0.004807692307692308, 0.007964789137740697, 0.0027243589743589742, 0.001945240043644299, 0.005007451839366661]
CPU times: user 37min 8s, sys: 3.62 s, total: 37min 11s
Wall time: 37min 16s


## Filtering

In [337]:
counts_ar.iloc[3,2]

'Michael NORRIS v. STATE of Arkansas CR 98-1429 993 S.W.2d 918 Supreme Court of Arkansas Opinion delivered July 15, 1999 [Supplemental opinion on grant of rehearing issued September 16, 1999.] Doug Norwood, for appellant. Mark Pryor, Att’y Gen., by: Vada Berger, Ass’t Att’y Gen., for appellee.'