In [1]:
# libraries
import json
import lzma
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from IPython.core.display import display, HTML
import re
from nltk.tokenize import RegexpTokenizer
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
def text_cleaner(text):
    """Flatten a multi-line string onto one line.

    The text is split at line boundaries and the pieces are re-joined with a
    single space, so every line break becomes one space (blank lines therefore
    leave consecutive spaces behind).
    """
    return " ".join(text.splitlines())

In [3]:
# The files for some states are too large to load into memory at once, so
# get_counts() streams the archive one case (one JSON line) at a time. For each
# case it flattens the head matter and the first opinion onto a single line,
# splits them on whitespace and records the token counts.

# Whitespace tokenizer: with gaps=True the pattern describes the separators
# between tokens rather than the tokens themselves. It has its own name so that
# rebinding `tokenizer` in a later cell cannot change how get_counts() counts.
word_tokenizer = RegexpTokenizer(r'\s+', gaps=True)
# Backward-compatible alias for code that expects `tokenizer` after this cell.
tokenizer = word_tokenizer

def get_counts(state):
    """Stream one state's case archive and count whitespace tokens per case.

    Parameters
    ----------
    state : str
        State name used to build the archive path
        ``../<state>-text/data/data.jsonl.xz``.

    Returns
    -------
    pandas.DataFrame
        One row per case, built from dicts with the keys ``date``,
        ``num_headnotes``, ``headnotes``, ``num_opinions`` and ``opinions``.
        Only the first opinion of a case is used; a case without opinions
        gets ``num_opinions == 0`` and an empty string in ``opinions`` so the
        column always holds text.
    """
    path = "../" + state + '-text/data/data.jsonl.xz'
    cases = []
    with lzma.open(path, 'r') as jsonl_file:
        for raw_line in jsonl_file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not raw_line.strip():
                continue
            case = json.loads(raw_line.decode('utf-8'))
            body = case['casebody']['data']

            headnotes = text_cleaner(body['head_matter'])
            num_headnotes = len(word_tokenizer.tokenize(headnotes))

            opinions = body['opinions']
            # Keep the column type uniform: text when an opinion exists,
            # an empty string (zero tokens) otherwise.
            opinions_text = text_cleaner(opinions[0]['text']) if opinions else ""
            num_opinions = len(word_tokenizer.tokenize(opinions_text))

            cases.append({
                'date': case['decision_date'],
                'num_headnotes': num_headnotes,
                'headnotes': headnotes,
                'num_opinions': num_opinions,
                'opinions': opinions_text,
            })
    return pd.DataFrame(cases)

In [4]:
%%time

# Only Arkansas is processed here; `counts_ar` holds its per-case token counts.
states = ['Arkansas']
counts_ar = get_counts(states[0])
# The calls below are disabled. `states` currently has a single entry, so
# indices 1-3 do not exist: add the corresponding state names to `states`
# before re-enabling any of them.
# counts_il = get_counts(states[1])
# counts_nm = get_counts(states[2])
# counts_nc = get_counts(states[3])

CPU times: user 46.9 s, sys: 700 ms, total: 47.6 s
Wall time: 47.8 s


In [5]:
# Show the first rows (up to 20) to eyeball the parsed columns.
counts_ar.head(20)

Unnamed: 0,date,num_headnotes,headnotes,num_opinions,opinions
0,1829-11,29,"Case No. 4,822a. FISHER v. REIDER. [Hempst. 82...",230,OPINION OF THE COIÍRT. This is an action of de...
1,1828-05,28,"Case No. 4,785a. FIKES v. BENTLEY. [Hempst. 61...",62,OPINION OP THE COURT. This is an appeal from t...
2,1836-02,27,"Case No. 4,863a. FLETCHER v. ELLIS. [Hempst. 3...",616,"CROSS, Judge. The record in this case shows th..."
3,1999-07-15,46,Michael NORRIS v. STATE of Arkansas CR 98-1429...,3936,"W. H.“Dub” Arnold, Chief Justice. This is a ca..."
4,1999-10-07,39,Roger Allen HAMMON v. STATE of Arkansas CR 98-...,1788,"Ray Thornton, Justice. Appellant brings this a..."
5,1999-10-07,49,Joe Louis DANSBY v. STATE of Arkansas CR 97-14...,8076,"Annabelle Clinton Imber, Justice. Mr. Joe Loui..."
6,1999-06-10,38,David McGREW v. STATE of Arkansas CR 98-426 99...,848,"W. H.“Dub” Arnold, Chief Justice. The appellan..."
7,1999-07-01,64,ST. PAUL FIRE & MARINE INSURANCE COMPANY v. GR...,3050,"Robert L. Brown, Justice. Appellant St. Paul F..."
8,1999-10-14,36,Sylvester RICHARDS v. STATE of Arkansas CR 99-...,909,"Lavenski R. Smith, Justice. Appellant, Sylvest..."
9,1999-10-14,35,Patricia OSBURN v. Bryan BUSBEE d/b/a Busbee C...,303,"Per Curiam. Appellee Bryan Busbee, d/b/a Busbe..."


### Do our legal texts really have the linguistic markers?

In [97]:
# Reference: https://www.aclweb.org/anthology/W04-1006.pdf
# One list of marker strings per group; the entries are plain strings.

markars_introduction = [
    'application for judicial review',
    'application to review a decision',
    'motion filed by',
    'Statement of Claim',
]

markars_context = [
    "advise",
    "indicate",
    "concern",
    "request",
]

markars_juridical_analysis = [
    'this court',
    'In reviewing',
    'Pursuant to section',
    'As I have stated',
    'In the present case',
]

# Note: 'indicate' also appears in markars_context.
markars_conclusion = [
    'note',
    'accept',
    'summarise',
    'scrutinize',
    'think',
    'say',
    'satisfy',
    'discuss',
    'conclude',
    'find',
    'believe',
    'reach',
    'persuade',
    'agree',
    'indicate',
    'review',
]

In [129]:
markars = [markars_introduction, markars_context, markars_juridical_analysis, markars_conclusion]

# Label printed for each group in `markars`; the two lists must stay in the
# same order.
markar_types = ["introduction", "context", "juridical analysis", "conclusion"]

def markar_detector(text_list):
    """Print every linguistic-marker hit found in a list of sentences.

    Each sentence is checked against every marker of every group in
    ``markars``. Matching is a plain, case-sensitive substring test, so a
    marker such as 'note' also matches 'noted'. A marker that belongs to two
    groups is reported once per group.

    Parameters
    ----------
    text_list : list of str
        Sentences to scan, in document order.

    Returns
    -------
    None
        Results are printed, one line per hit. The sentence number in the
        message is the zero-based position within ``text_list``.
    """
    total = str(len(text_list))
    for i, sentence in enumerate(text_list):
        # Pair each group with its own label so that introduction markers are
        # no longer reported as "conclusion".
        for type_markar, markar in zip(markar_types, markars):
            for markar_word in markar:
                if markar_word in sentence:
                    print("Linguistic markar '"+markar_word+"' detected! Sentence #", i, "of "+total+". This is",type_markar,"markar.")

### Example 1

In [130]:
# Opinion text of the case at row position 4, selected by column name rather
# than by positional column index.
test_text = counts_ar['opinions'].iloc[4]

In [131]:
# Reference: https://nlpforhackers.io/splitting-text-into-sentences/
# Trains an unsupervised Punkt sentence-boundary model on the example opinion
# itself and builds a sentence tokenizer from the learned parameters.

# NOTE(review): pprint is imported but not used in the cells shown here.
from pprint import pprint
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
 
trainer = PunktTrainer()
# Must be set before train() so it takes effect during training.
# NOTE(review): per the NLTK docs this considers every word pair whose first
# word ends in a period as a potential collocation - confirm this is wanted.
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(test_text)
 
# From here on `tokenizer` is a sentence tokenizer; the example cells below
# rely on this name.
tokenizer = PunktSentenceTokenizer(trainer.get_params())

In [132]:
# Split the example opinion into a list of sentences.
test_text_list = tokenizer.tokenize(test_text)

In [133]:
# Print one line per marker hit found in the example's sentences.
markar_detector(test_text_list)

Linguistic markar 'find' detected! Sentence # 2 of 70. This is conclusion markar.
Linguistic markar 'request' detected! Sentence # 6 of 70. This is context markar.
Linguistic markar 'say' detected! Sentence # 7 of 70. This is conclusion markar.
Linguistic markar 'indicate' detected! Sentence # 15 of 70. This is context markar.
Linguistic markar 'indicate' detected! Sentence # 15 of 70. This is conclusion markar.
Linguistic markar 'say' detected! Sentence # 20 of 70. This is conclusion markar.
Linguistic markar 'say' detected! Sentence # 21 of 70. This is conclusion markar.
Linguistic markar 'concern' detected! Sentence # 22 of 70. This is context markar.
Linguistic markar 'believe' detected! Sentence # 22 of 70. This is conclusion markar.
Linguistic markar 'this court' detected! Sentence # 24 of 70. This is juridical analysis markar.
Linguistic markar 'agree' detected! Sentence # 33 of 70. This is conclusion markar.
Linguistic markar 'concern' detected! Sentence # 34 of 70. This is con

### Example 2

In [134]:
# Opinion text of the case at row position 5, selected by column name rather
# than by positional column index, then split into sentences.
test_text2 = counts_ar['opinions'].iloc[5]
test_text_list2 = tokenizer.tokenize(test_text2)

markar_detector(test_text_list2)

Linguistic markar 'find' detected! Sentence # 6 of 561. This is conclusion markar.
Linguistic markar 'indicate' detected! Sentence # 25 of 561. This is context markar.
Linguistic markar 'indicate' detected! Sentence # 25 of 561. This is conclusion markar.
Linguistic markar 'indicate' detected! Sentence # 61 of 561. This is context markar.
Linguistic markar 'indicate' detected! Sentence # 61 of 561. This is conclusion markar.
Linguistic markar 'conclude' detected! Sentence # 89 of 561. This is conclusion markar.
Linguistic markar 'request' detected! Sentence # 94 of 561. This is context markar.
Linguistic markar 'request' detected! Sentence # 96 of 561. This is context markar.
Linguistic markar 'indicate' detected! Sentence # 108 of 561. This is context markar.
Linguistic markar 'indicate' detected! Sentence # 108 of 561. This is conclusion markar.
Linguistic markar 'agree' detected! Sentence # 138 of 561. This is conclusion markar.
Linguistic markar 'conclude' detected! Sentence # 142 

### Example 3

In [135]:
# Opinion text of the case at row position 19, selected by column name rather
# than by positional column index, then split into sentences.
test_text3 = counts_ar['opinions'].iloc[19]
test_text_list3 = tokenizer.tokenize(test_text3)

markar_detector(test_text_list3)

Linguistic markar 'agree' detected! Sentence # 6 of 222. This is conclusion markar.
Linguistic markar 'find' detected! Sentence # 14 of 222. This is conclusion markar.
Linguistic markar 'advise' detected! Sentence # 38 of 222. This is context markar.
Linguistic markar 'note' detected! Sentence # 48 of 222. This is conclusion markar.
Linguistic markar 'say' detected! Sentence # 73 of 222. This is conclusion markar.
Linguistic markar 'say' detected! Sentence # 76 of 222. This is conclusion markar.
Linguistic markar 'say' detected! Sentence # 77 of 222. This is conclusion markar.
Linguistic markar 'say' detected! Sentence # 82 of 222. This is conclusion markar.
Linguistic markar 'note' detected! Sentence # 90 of 222. This is conclusion markar.
Linguistic markar 'note' detected! Sentence # 91 of 222. This is conclusion markar.
Linguistic markar 'In reviewing' detected! Sentence # 95 of 222. This is juridical analysis markar.
Linguistic markar 'review' detected! Sentence # 95 of 222. This i