### Filter Citations

In [1]:
# libraries
import json
import lzma
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from IPython.core.display import display, HTML
import re
from nltk.tokenize import RegexpTokenizer
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
def text_cleaner(text):
    """Flatten a multi-line string onto a single line.

    The input is split at line boundaries with ``str.splitlines`` and the
    pieces are glued back together with one space between each pair, so an
    empty line in the input yields two consecutive spaces in the result.
    """
    return " ".join(line for line in text.splitlines())

In [3]:
# libraries
import json
import lzma
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from IPython.core.display import display, HTML
import re
from nltk.tokenize import RegexpTokenizer
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [4]:
# The data files for some states are too large to load into memory at once,
# so get_counts streams the compressed JSONL file one case (line) at a time.
# For each case it cleans the head matter and the first opinion, tokenizes
# them on whitespace, and records the token counts alongside the text.

# Whitespace word tokenizer. It is bound to its own private name because a
# later cell rebinds the notebook-level name `tokenizer` to a sentence
# tokenizer; get_counts must keep counting whitespace-separated tokens no
# matter which cells have been run.
_WORD_TOKENIZER = RegexpTokenizer(r'\s+', gaps=True)
# Kept so that any cell referring to `tokenizer` at this point still works.
tokenizer = _WORD_TOKENIZER


def get_counts(state):
    """Stream one state's cases and return their text with token counts.

    Parameters
    ----------
    state : str
        State name; the file read is ``../<state>-text/data/data.jsonl.xz``.

    Returns
    -------
    pandas.DataFrame
        One row per case with the columns ``date`` (the case's
        ``decision_date``), ``num_headnotes``, ``headnotes``,
        ``num_opinions`` and ``opinions``. Only the first opinion of a case
        is used; a case without opinions gets ``num_opinions == 0`` and an
        empty string in ``opinions`` so the column holds strings throughout.
    """
    cases = []
    with lzma.open("../" + state + '-text/data/data.jsonl.xz', 'r') as jsonl_file:
        for raw_line in jsonl_file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not raw_line.strip():
                continue
            case = json.loads(raw_line.decode('utf-8'))
            body = case['casebody']['data']

            headnotes = text_cleaner(body['head_matter'])
            num_headnotes = len(_WORD_TOKENIZER.tokenize(headnotes))

            opinion_list = body['opinions']
            if opinion_list:
                opinions = text_cleaner(opinion_list[0]['text'])
                num_opinions = len(_WORD_TOKENIZER.tokenize(opinions))
            else:
                # No opinion text: store '' rather than the empty list so
                # downstream string processing does not hit a list.
                opinions = ''
                num_opinions = 0

            cases.append({
                'date': case['decision_date'],
                'num_headnotes': num_headnotes,
                'headnotes': headnotes,
                'num_opinions': num_opinions,
                'opinions': opinions,
            })
    return pd.DataFrame(cases)

In [5]:
%%time

# Only Arkansas is loaded in this run; get_counts builds one row per case.
states = ['Arkansas']
counts_ar = get_counts(states[0])
# Disabled loads for further states. NOTE(review): `states` currently has a
# single entry, so states[1]..states[3] would raise IndexError if these lines
# were re-enabled without extending the list first.
# counts_il = get_counts(states[1])
# counts_nm = get_counts(states[2])
# counts_nc = get_counts(states[3])

CPU times: user 44.5 s, sys: 671 ms, total: 45.2 s
Wall time: 45.3 s


In [6]:
# Preview the first 20 parsed cases (date, token counts and cleaned text).
counts_ar.head(20)

Unnamed: 0,date,num_headnotes,headnotes,num_opinions,opinions
0,1829-11,29,"Case No. 4,822a. FISHER v. REIDER. [Hempst. 82...",230,OPINION OF THE COIÍRT. This is an action of de...
1,1828-05,28,"Case No. 4,785a. FIKES v. BENTLEY. [Hempst. 61...",62,OPINION OP THE COURT. This is an appeal from t...
2,1836-02,27,"Case No. 4,863a. FLETCHER v. ELLIS. [Hempst. 3...",616,"CROSS, Judge. The record in this case shows th..."
3,1999-07-15,46,Michael NORRIS v. STATE of Arkansas CR 98-1429...,3936,"W. H.“Dub” Arnold, Chief Justice. This is a ca..."
4,1999-10-07,39,Roger Allen HAMMON v. STATE of Arkansas CR 98-...,1788,"Ray Thornton, Justice. Appellant brings this a..."
5,1999-10-07,49,Joe Louis DANSBY v. STATE of Arkansas CR 97-14...,8076,"Annabelle Clinton Imber, Justice. Mr. Joe Loui..."
6,1999-06-10,38,David McGREW v. STATE of Arkansas CR 98-426 99...,848,"W. H.“Dub” Arnold, Chief Justice. The appellan..."
7,1999-07-01,64,ST. PAUL FIRE & MARINE INSURANCE COMPANY v. GR...,3050,"Robert L. Brown, Justice. Appellant St. Paul F..."
8,1999-10-14,36,Sylvester RICHARDS v. STATE of Arkansas CR 99-...,909,"Lavenski R. Smith, Justice. Appellant, Sylvest..."
9,1999-10-14,35,Patricia OSBURN v. Bryan BUSBEE d/b/a Busbee C...,303,"Per Curiam. Appellee Bryan Busbee, d/b/a Busbe..."


In [63]:
# Opinion text of the case at row position 4, used as the working example.
# The column is selected by name instead of by position (it was column 4) so
# the lookup keeps pointing at the opinion text if columns are reordered.
test_text = counts_ar['opinions'].iloc[4]

In [64]:
# Reference: https://nlpforhackers.io/splitting-text-into-sentences/

from pprint import pprint
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
 
# Train a Punkt sentence-boundary model on the single example opinion held in
# `test_text`; no other case text is seen by the trainer.
trainer = PunktTrainer()
# NOTE(review): per the NLTK Punkt documentation this flag should make the
# trainer treat every word pair whose first word ends in a period as a
# collocation candidate — confirm against the installed NLTK version.
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(test_text)
 
# Rebinds the notebook-level name `tokenizer` (until here the whitespace word
# tokenizer) to the sentence tokenizer that the cells below use.
tokenizer = PunktSentenceTokenizer(trainer.get_params())

In [65]:
# Split the example opinion into a list of sentence strings.
test_text_list = tokenizer.tokenize(test_text)

In [100]:
import re

# Reference: https://library.csustan.edu/apalegal

# Substrings (as regular expressions) that mark a sentence as containing a
# legal citation. Raw strings keep the backslashes as regex escapes instead
# of relying on Python passing unknown string escapes through unchanged.
_CITATION_MARKERS = (
    r'\(\d\d\d\d\)',  # four digits in parentheses, e.g. "(1998)"
    r'v\.',           # "v." as in "Smith v. Jones"
    r'§',             # section symbol
    r'R\.',           # presumably rule abbreviations such as "R. Evid." — confirm
    r'Rule',
    r'Ark\.',         # Ark. = Arkansas
)
# One alternation is equivalent to testing each marker in turn with re.search.
_CITATION_RE = re.compile('|'.join(_CITATION_MARKERS))


def citation_filter(text_list):
    """Return the sentences that contain none of the citation markers.

    Parameters
    ----------
    text_list : iterable of str
        Sentences to screen. Any iterable is accepted, not only lists.

    Returns
    -------
    list of str
        The sentences, in their original order, in which no marker from
        ``_CITATION_MARKERS`` is found anywhere in the string.
    """
    return [sentence for sentence in text_list
            if _CITATION_RE.search(sentence) is None]

### Example 1

In [101]:
# Example 1: number of sentences before filtering, followed by the sentences.
print("Length of text:  ", len(test_text_list),'\n\n')
test_text_list

Length of text:   70 




['Ray Thornton, Justice.',
 'Appellant brings this appeal of his conviction for capital murder in the shooting death of Roger Cousins on May 29, 1998, urging that the trial court erred in admitting the victim’s dying declaration naming appellant as his assailant.',
 'We find no error and affirm appellant’s conviction and sentence of life imprisonment.',
 'Roger Cousins was found on the edge of a highway outside Judsonia by Tommy Cole.',
 'Cousins had been shot four times in the back and called out to Cole as he drove by, “Please help me.',
 'I’ve been shot.',
 'I’m dying.” Cole stopped his car and called 911 and requested assistance.',
 'Cole then heard Cousins say that a “Roger Hammon” had “shot me all to pieces.” Upon the arrival of the police and being told that his wounds were life-threatening, Cousins replied, “I know,” and said twice more that “Roger Hammon shot me.” According to testimony, Cousins and Hammon were together earlier in the evening in a car driven by Shirley Estes, 

In [102]:
# Run the filter once and reuse the result for both the count and the display
# instead of filtering the same sentences twice.
filtered_text_list = citation_filter(test_text_list)
print("Length of text after filtering:  ", len(filtered_text_list),'\n\n')

filtered_text_list

Length of text after filtering:   49 




['Ray Thornton, Justice.',
 'Appellant brings this appeal of his conviction for capital murder in the shooting death of Roger Cousins on May 29, 1998, urging that the trial court erred in admitting the victim’s dying declaration naming appellant as his assailant.',
 'We find no error and affirm appellant’s conviction and sentence of life imprisonment.',
 'Roger Cousins was found on the edge of a highway outside Judsonia by Tommy Cole.',
 'Cousins had been shot four times in the back and called out to Cole as he drove by, “Please help me.',
 'I’ve been shot.',
 'I’m dying.” Cole stopped his car and called 911 and requested assistance.',
 'Cole then heard Cousins say that a “Roger Hammon” had “shot me all to pieces.” Upon the arrival of the police and being told that his wounds were life-threatening, Cousins replied, “I know,” and said twice more that “Roger Hammon shot me.” According to testimony, Cousins and Hammon were together earlier in the evening in a car driven by Shirley Estes, 

### Example 2

In [103]:
# Example 2: opinion text of the case at row position 5. The column is chosen
# by name rather than by position (it was column 4). Note that the sentence
# tokenizer used here was trained only on the Example 1 opinion.
test_text2 = counts_ar['opinions'].iloc[5]
test_text_list2 = tokenizer.tokenize(test_text2)

print("Length of text:  ", len(test_text_list2),'\n\n')

print("Length of text after filtering:  ", len(citation_filter(test_text_list2)),'\n\n')

Length of text:   561 


Length of text after filtering:   510 




### Example 3

In [104]:
# Example 3: opinion text of the case at row position 19. The column is chosen
# by name rather than by position (it was column 4). Note that the sentence
# tokenizer used here was trained only on the Example 1 opinion.
test_text3 = counts_ar['opinions'].iloc[19]
test_text_list3 = tokenizer.tokenize(test_text3)

print("Length of text:  ", len(test_text_list3),'\n\n')

print("Length of text after filtering:  ", len(citation_filter(test_text_list3)),'\n\n')

Length of text:   222 


Length of text after filtering:   192 


