# Cleaning and organizing data

In [2]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option("display.width", 500)
pd.set_option("display.max_columns", 100)
pd.set_option("display.notebook_repr_html", True)
import seaborn as sns
sns.set_style("darkgrid")
sns.set_context("poster")

## Import data
We import the sample dataset. This is a subsample of all the cases. 

In [3]:
# Load the case subsample from "sample_cases.csv" (path is relative to the working directory).
sample_df = pd.read_csv("sample_cases.csv")

In [4]:
# Preview the first rows to check that the columns loaded as expected.
sample_df.head()

Unnamed: 0,full_cite,text,url,us_cite,year,case,case_id,caseId,caseOriginState,dateArgument,decisionDirection,decisionType,docket,docketId,issueArea,jurisdiction,lawType,majOpinWriter,majVotes,minVotes,usCite
0,United States v. Jimenez Recio 537 U.S. 270 (2...,"OCTOBER TERM, 2002 Syllabus UNITED STATES v. J...",https://supreme.justia.com/cases/federal/us/53...,537 U.S. 270,2003,United States v. Jimenez Recio,10400,2002-016,,11/12/2002,1,1,1/1/1184,2002-016-01,1,1,,110.0,8,1,537 U.S. 270
1,United States v. Jones 345 U.S. 377 (1953),United States v. Jones No. 556 Decided April 1...,https://supreme.justia.com/cases/federal/us/34...,345 U.S. 377,1953,United States v. Jones,927,1952-078,12.0,,1,2,556,1952-078-01,9,2,6.0,,9,0,345 U.S. 377
2,"Joy Oil Co., Ltd. v. State Tax Commission 337 ...","Joy Oil Co., Ltd. v. State Tax Commission No. ...",https://supreme.justia.com/cases/federal/us/33...,337 U.S. 286,1949,"Joy Oil Co., Ltd. v. State Tax Commission",461,1948-090,27.0,1/6/1949,2,1,223,1948-090-01,8,1,1.0,80.0,6,3,337 U.S. 286
3,Witte v. United States 515 U.S. 389 (1995),"OCTOBER TERM, 1994 Syllabus WITTE v. UNITED ST...",https://supreme.justia.com/cases/federal/us/51...,515 U.S. 389,1995,Witte v. United States,9663,1994-076,,4/17/1995,1,1,94-6187,1994-076-01,1,1,2.0,104.0,8,1,515 U.S. 389
4,Warth v. Seldin 422 U.S. 490 (1975),"Warth v. Seldin No. 73-2024 Argued March 17, 1...",https://supreme.justia.com/cases/federal/us/42...,422 U.S. 490,1975,Warth v. Seldin,5850,1974-140,,3/17/1975,1,1,73-2024,1974-140-01,9,1,,101.0,5,4,422 U.S. 490


## Cleaning text

We use a single regular expression, which matches any single character enclosed in parentheses, for example: (a). It is used to strip enumeration markers such as (a), (b), (c) from the opinion text.

In [5]:
import re 
# Matches "(", then exactly one character of any kind (not only letters; "." does not
# match a newline under the default flags), then ")". Example match: "(a)".
regex1 = r"\(.\)" 

The function below takes an opinion (a string) and returns, sentence by sentence, the verbs, nouns, adjectives and foreign words it contains, together with a list of the court precedents cited in the opinion. 

In [6]:
from pattern.en import parse
from pattern.en import pprint
from pattern.en import conjugate, lemma, lexeme
from pattern.vector import stem, PORTER, LEMMA
from sklearn.feature_extraction import text
import string

#stopwords and punctuation
# English stop-word collection exposed by sklearn.feature_extraction.text.
stopwords=text.ENGLISH_STOP_WORDS
# One-character punctuation tokens.
# NOTE(review): the `''` in the middle of the literal below is two adjacent string literals
# being concatenated, so the apostrophe character is NOT part of this list -- confirm
# whether it was meant to be included.
punctuation = list('.,;:!?()[]{}`''\"@#$^&*+-|=~_')

def get_parts(opinion):
    """Clean one opinion and split it into per-sentence word lists plus cited precedents.

    The text is lower-cased, stripped of non-ASCII bytes, digits, justice
    abbreviations and parenthesised one-character markers, and cut at the
    Justia disclaimer if present. Precedents are detected as the three-word
    window around every standalone 'v.' token and removed from the text.
    The remaining text is tagged with pattern's `parse`, and the verbs,
    nouns, adjectives and foreign words of each sentence are collected.

    Parameters
    ----------
    opinion : str
        Raw opinion text (a Python 2 byte string; it is decoded with
        `unicode(..., 'ascii', 'ignore')`).

    Returns
    -------
    tuple
        `(verbs, nouns, adjectives, foreign, precedents)`. The first four are
        lists with one inner list of words per kept sentence (a sentence is
        kept only if it has at least one verb and at least one noun), and they
        are aligned index by index. `precedents` is a flat list of strings
        such as 'brown v. board'.
    """
    oplow = opinion.lower()
    #REMOVING CHARACTERS: we have ugly text, and remove unnecessary characters.
    oplow = unicode(oplow, 'ascii', 'ignore') #drop bytes that cannot be decoded as ASCII
    oplow = str(oplow).translate(string.maketrans("\n\t\r", "   ")) #turn newlines/tabs into spaces
    #justices (eg, Justice Breyer) are referred to as J. (eg, Breyer, J.); we remove the J., also JJ. for plural.
    #'jj.' has to be removed BEFORE 'j.': otherwise 'j.' eats the tail of 'jj.' and leaves a stray 'j'.
    oplow = oplow.replace('jj.','')
    oplow = oplow.replace('j.','')
    oplow = oplow.replace('c.','') #remove C. for chief justice
    oplow = oplow.replace('pp.','') #page numbers
    oplow = ''.join([ch for ch in oplow if not ch.isdigit()]) #remove digits
    oplow = re.sub(regex1, ' ', oplow)
    #Collapse every run of whitespace into ONE space. (Replacing double spaces with the
    #empty string would glue neighbouring words together.)
    oplow = ' '.join(oplow.split())
    #Remove the Justia disclaimer at the end of the case, if it appears in the string
    justiadisclaimer = "disclaimer: official"
    optouse = oplow.split(justiadisclaimer)[0] #split() returns the whole string when the marker is absent

    #GET A LIST OF PRECEDENTS CITED IN THE OPINION
    wordslist = optouse.split()
    last_index = len(wordslist) - 1
    #find precedents based on string 'v.' (eg, 'brown v. board'); a 'v.' at the very start or
    #end of the text has no party on one side, so it is skipped instead of wrapping around
    #to the last word or raising an IndexError
    precedents = [' '.join(wordslist[i-1:i+2])
                  for i, word in enumerate(wordslist)
                  if word == 'v.' and 0 < i < last_index]

    #remove precedents, as we have already accounted for these
    for precedent in precedents:
        optouse = optouse.replace(precedent,'')

    #PARSE INTO LIST OF LISTS --> GET WORDS
    #each parsed sentence is a list of tokens; token[0] is the word, token[1] its part-of-speech tag
    parsed = parse(optouse,tokenize=True,chunks=False,lemmata=True).split()
    verb_tags = ['VB','VBZ','VBP','VBD','VBN','VBG']
    noun_tags = ['NN','NNS','NNP','NNPS']
    adjective_tags = ['JJ','JJR','JJS']
    foreign_tags = ['FW']

    verbs2 = []
    nouns2 = []
    adjectives2 = []
    foreign2 = []
    for sentence in parsed: #for each sentence
        sent_verbs = []
        sent_nouns = []
        sent_adjectives = []
        sent_foreign = []
        for token in sentence: #for each word in the sentence
            if token[0] in punctuation or token[0] in stopwords or len(token[0])<=2:
                continue
            wordtouse = token[0]
            for x in punctuation:
                wordtouse = wordtouse.replace(x,' ') #if punctuation in word, take it out
            tag = token[1]
            if tag in verb_tags:
                #we relemmatize because lemmata in parse does not seem to always work
                sent_verbs.append(lemma(wordtouse))
            elif tag in noun_tags:
                sent_nouns.append(lemma(wordtouse))
            elif tag in adjective_tags:
                #append to THIS sentence's list so all four outputs stay aligned per sentence
                sent_adjectives.append(lemma(wordtouse))
            elif tag in foreign_tags:
                sent_foreign.append(wordtouse)
        #if the sentence has at least one verb and one noun, keep it. Otherwise, drop it.
        if sent_verbs and sent_nouns:
            verbs2.append(sent_verbs)
            nouns2.append(sent_nouns)
            adjectives2.append(sent_adjectives)
            foreign2.append(sent_foreign)
    return(verbs2,nouns2,adjectives2,foreign2,precedents)

### Lists of words, vocabularies
In the next cell, we run get_parts on all the opinions.

In [14]:
%%time 
# One entry per opinion, in the same order as sample_df.text.
verbwords, nounwords, adjwords, forwords, precedents_all = [], [], [], [], []
for op in sample_df.text:
    # get_parts returns the verbs, nouns, adjectives, foreign words and precedents of one opinion
    verbs, nouns, adjectives, foreign, precedents = get_parts(op)
    verbwords += [verbs]
    nounwords += [nouns]
    adjwords += [adjectives]
    forwords += [foreign]
    precedents_all += [precedents]

CPU times: user 1min 54s, sys: 693 ms, total: 1min 55s
Wall time: 1min 55s


We create a list of issue areas (our y variable), which is in the same order as the cases in the sample_df (and thus matches the order of cases in verbwords, nounwords, etc.)

In [29]:
# Labels (y): the issueArea column as a plain Python list, in row order.
issue_areas = sample_df["issueArea"].tolist()

We next create vocabularies, and also create maps between word id's and words (and vice versa).

In [15]:
def _sentence_vocab(words_by_opinion):
    """Return the distinct words found in an opinion -> sentence -> word nesting."""
    return list(set([word
                     for opinion_sentences in words_by_opinion
                     for sentence_words in opinion_sentences
                     for word in sentence_words]))

# Precedents are stored one level shallower (opinion -> precedent), so flatten them directly.
precedents_vocab = list(set([cited
                             for opinion_precedents in precedents_all
                             for cited in opinion_precedents]))
# Vocabularies for each word type.
verbvocab = _sentence_vocab(verbwords)
nounvocab = _sentence_vocab(nounwords)
adjvocab = _sentence_vocab(adjwords)
forvocab = _sentence_vocab(forwords)

In [16]:
# Forward maps: integer id -> word, where the id is the word's position in its vocabulary list.
id2prec = {idx: term for idx, term in enumerate(precedents_vocab)}
id2verb = {idx: term for idx, term in enumerate(verbvocab)}
id2noun = {idx: term for idx, term in enumerate(nounvocab)}
id2adj = {idx: term for idx, term in enumerate(adjvocab)}
id2for = {idx: term for idx, term in enumerate(forvocab)}
# Reverse maps: word -> integer id, obtained by inverting the forward maps.
prec2id = {term: idx for idx, term in id2prec.items()}
verb2id = {term: idx for idx, term in id2verb.items()}
noun2id = {term: idx for idx, term in id2noun.items()}
adj2id = {term: idx for idx, term in id2adj.items()}
for2id = {term: idx for idx, term in id2for.items()}

Finally, we create corpuses (one for each word type and precedents). Each corpus is a list of lists. Each inner list corresponds to an opinion, and has as its elements tuples of the form `(wordid, count)`, where `count` refers to the number of times the word appears in the opinion. A sample corpus may look like the following (this sample corpus is taken from problem set 5): 

```
[[(5912, 1), (3809, 1), (14131, 1), (3876, 1)],
 [(3266, 1), (3652, 1), (11644, 1), (2296, 1), (27516, 1), (8382, 1)],
 [(17217, 1), (22979, 1), (11210, 1), (18736, 1), (3893, 1), (21307, 1)],
 ...,
 [(23980, 1), (24730, 1), (22979, 1), (20012, 1), (11206, 2)]]
```



In [18]:
def counter(x):
    """Return the distinct (item, number of occurrences in x) pairs of the list x, as a list."""
    pairs = set()
    for item in x:
        pairs.add((item, x.count(item)))
    return list(pairs)

#corpus_creator takes a list of lists of lists like verbwords, or a list of lists like precedents_all.
#It also takes a word2id dictionary.
def corpus_creator(sentence_word_list,word2id):
    """Turn per-opinion word lists into a bag-of-words corpus.

    Parameters
    ----------
    sentence_word_list : list
        Either opinion -> sentence -> word (a list of lists of lists) or
        opinion -> word (a list of lists).
    word2id : dict
        Maps every word that occurs in `sentence_word_list` to its id.

    Returns
    -------
    list
        One list per opinion, holding a `(word_id, count)` tuple for each
        distinct word of that opinion. The order of the tuples inside an
        opinion is not significant.

    Raises
    ------
    KeyError
        If a word is missing from `word2id`.
    """
    from collections import Counter

    #Decide the nesting depth from the first NON-EMPTY opinion: looking only at
    #sentence_word_list[0][0] fails when the first opinion is empty (eg, an opinion
    #that cites no precedent) or when there are no opinions at all.
    first_nonempty = next((opinion for opinion in sentence_word_list if len(opinion) > 0), None)
    is_nested = first_nonempty is not None and isinstance(first_nonempty[0], list)

    if is_nested: #list of lists of lists: flatten the sentences of each opinion
        op_word_list = [[word for sentence in opinion for word in sentence]
                        for opinion in sentence_word_list]
    else: #list of lists: already one flat word list per opinion
        op_word_list = sentence_word_list

    #Counter tallies each opinion in a single pass instead of calling list.count per word
    corpus = [[(word2id[word], count) for word, count in Counter(words).items()]
              for words in op_word_list]
    return(corpus)

### Split into training and test sets

In [19]:
# Example: build the bag-of-words corpus for nouns.
nouncorpus = corpus_creator(sentence_word_list=nounwords, word2id=noun2id)

In [33]:
#function takes a corpus (a list with one entry per document, each entry being that document's
# list of (wordid, count) tuples) and a list of issue areas (ie, the issue area corresponding to
# each document, such as issue_areas above) as arguments.
#For example, you might run: train_test_split(nouncorpus,issue_areas)
def train_test_split(corpus,issue_areas,test_frac=0.3,seed=None):
    """Randomly assign each document (and its label) to a training or a test set.

    Every document is assigned independently, so the realised split sizes
    only approximate `test_frac`.

    Parameters
    ----------
    corpus : list
        One entry per document.
    issue_areas : list
        Label of each document, in the same order as `corpus`.
    test_frac : float, optional
        Probability that a document goes to the test set. The default of 0.3
        gives the 30/70 test/train probabilities.
    seed : int or None, optional
        If given, the split is drawn from a dedicated
        `np.random.RandomState(seed)` and is reproducible. If None, numpy's
        global random state is used.

    Returns
    -------
    tuple
        Four lists: `(train_corpus, train_issue_areas, test_corpus, test_issue_areas)`.

    Raises
    ------
    ValueError
        If `corpus` and `issue_areas` differ in length.
    """
    n_docs = len(corpus)
    if len(issue_areas) != n_docs:
        raise ValueError("corpus has %d documents but issue_areas has %d labels"
                         % (n_docs, len(issue_areas)))
    #use the global numpy generator unless the caller asked for a reproducible split
    rng = np.random if seed is None else np.random.RandomState(seed)
    #Run train-test split: draw 0 (test) or 1 (train) for every document
    traintestarray = rng.choice([0, 1], size=n_docs, p=[test_frac, 1 - test_frac])
    train_corpus = []
    train_issue_areas = []
    test_corpus = []
    test_issue_areas = []
    #walk documents and labels together so the two stay aligned
    for document, issue_area, is_train in zip(corpus, issue_areas, traintestarray):
        if is_train == 1:
            train_corpus.append(document)
            train_issue_areas.append(issue_area)
        else:
            test_corpus.append(document)
            test_issue_areas.append(issue_area)
    return(train_corpus,train_issue_areas,test_corpus,test_issue_areas)
#The function returns four lists: train_corpus, train_issue_areas,test_corpus,test_issue_areas

In [34]:
# Split the noun corpus and its issue-area labels into training and test sets.
(train_corpus,
 train_issue_areas,
 test_corpus,
 test_issue_areas) = train_test_split(corpus=nouncorpus, issue_areas=issue_areas)