In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option("display.width", 500)
pd.set_option("display.max_columns", 100)
pd.set_option("display.notebook_repr_html", True)
import seaborn as sns
sns.set_style("darkgrid")
sns.set_context("poster")

#SVM

##Import data
We now import all data from all cases (not just a subsample).

In [2]:
all_data = pd.read_csv('all_cases.csv')

In [3]:
all_data.head()

Unnamed: 0,full_cite,text,url,us_cite,case,case_id,caseId,docketId,usCite,docket,dateArgument,caseOriginState,jurisdiction,issueArea,decisionDirection,decisionType,lawType,majOpinWriter,majVotes,minVotes,year
0,Eagles v. Samuels 329 U.S. 304 (1946),"Eagles v. Samuels No. 59 Argued November 21, 1...",https://supreme.justia.com/cases/federal/us/32...,329 U.S. 304,Eagles v. Samuels,0,1946-020,1946-020-01,329 U.S. 304,59,11/21/1946,35.0,1,3,1,1,3,81,9,0,1946
1,Eagles v. Horowitz 329 U.S. 317 (1946),"Eagles v. Horowitz No. 58 Argued November 21, ...",https://supreme.justia.com/cases/federal/us/32...,329 U.S. 317,Eagles v. Horowitz,1,1946-021,1946-021-01,329 U.S. 317,58,11/21/1946,35.0,1,3,1,1,3,81,9,0,1946
2,Labor Board v. A. J. Tower Co. 329 U.S. 324 (1...,Labor Board v. A. J. Tower Co. No. 60 Argued N...,https://supreme.justia.com/cases/federal/us/32...,329 U.S. 324,Labor Board v. A. J. Tower Co.,2,1946-022,1946-022-01,329 U.S. 324,60,11/21/1946,,1,7,2,1,3,82,8,1,1946
3,Gibson v. United States 329 U.S. 338 (1946),Gibson v. United States No. 23 Argued January ...,https://supreme.justia.com/cases/federal/us/32...,329 U.S. 338,Gibson v. United States,3,1946-023,1946-023-01,329 U.S. 338,23,1/2/1946,49.0,1,3,2,1,3,85,9,0,1946
4,Illinois v. Campbell 329 U.S. 362 (1946),Illinois ex rel. Gordon v. Campbell No. 35 Arg...,https://supreme.justia.com/cases/federal/us/32...,329 U.S. 362,Illinois v. Campbell,4,1946-024,1946-024-01,329 U.S. 362,35,3/28/1946,17.0,1,12,2,1,3,85,7,2,1946


##Train-test split
We split the cases in the entire dataset into training and test sets.

In [4]:
#select training and test cases (70% training, 30% test) and add a train-test column to the dataframe
#add train-test column to dataframe 
all_data['training'] = np.random.choice([0, 1], size=(len(all_data)), p=[.3, .7])

##Cleaning text

Just as in SVM.ipynb, the `get_parts` function below takes an opinion (a string), and returns the verbs and nouns in the opinion. 

In [5]:
import re 
regex1 = r"\(.\)"

In [10]:
from pattern.en import parse
from pattern.en import pprint
from pattern.en import conjugate, lemma, lexeme
from pattern.vector import stem, PORTER, LEMMA
from sklearn.feature_extraction import text
import string

#stopwords and punctuation
stopwords=text.ENGLISH_STOP_WORDS
punctuation = list('.,;:!?()[]{}`''\"@#$^&*+-|=~_')

def get_parts(opinion):
    oplow = opinion.lower()
    #REMOVING CHARACTERS: we have ugly text, and remove unnecssary characters.
    oplow = unicode(oplow, 'ascii', 'ignore') #remove non-unicode characters 
    oplow = str(oplow).translate(string.maketrans("\n\t\r", "   ")) #remove characters like \n 
    #justices (eg, Justice Breyer) are referred to as J. (eg,Breyer, J.); we remove the J., also JJ. for plural
    oplow = oplow.replace('j.','')
    oplow = oplow.replace('jj.','')
    oplow = oplow.replace('c.','') #remove C. for chief justice 
    oplow = oplow.replace('pp.','') #page numbers
    oplow = oplow.replace('  ','') #multiple spaces
    oplow = ''.join([i for i in oplow if not i.isdigit()]) #remove digits 
    oplow=re.sub(regex1, ' ', oplow)
    #Remove the Justia disclaimer at the end of the case, if it appears in the string
    justiadisclaimer = "disclaimer: official"
    if justiadisclaimer in oplow: 
        optouse = oplow.split(justiadisclaimer)[0]
    else:
        optouse = oplow
    
    #GET A LIST OF PRECEDENTS CITED IN THE OPINION 
    wordslist = optouse.split()
    #find precedents based on string 'v.' (eg, 'Brown v. Board')
    indices = [i for i in range(len(wordslist)) if wordslist[i]=='v.']
    precedents = [wordslist[i-1]+ ' ' + wordslist[i]+ ' ' + wordslist[i+1] for i in indices]
    
    #remove precedents, as we have already accounted for these
    for precedent in precedents:
        optouse = optouse.replace(precedent,'')
    
    #PARSE INTO LIST OF LISTS --> GET WORDS
    parsed = parse(optouse,tokenize=True,chunks=False,lemmata=True).split()
    verbsnouns = [] 
    i=0
    #Create lists of lists of verbs and nouns in each sentence.
    for sentence in parsed: #for each sentence 
        verbsnouns.append([])
        for token in sentence: #for each word in the sentence 
            if token[0] in punctuation or token[0] in stopwords or len(token[0])<=2:
                continue
            wordtouse = token[0]
            for x in punctuation:
                wordtouse = wordtouse.replace(x,' ') #if punctuation in word, take it out
            if token[1] in ['VB','VBZ','VBP','VBD','VBN','VBG','NN','NNS','NNP','NNPS']:
                verbsnouns[i].append(lemma(wordtouse)) #append the lemmatized word (we relemmatize because lemmata in parse does not seem to always work)
        i+=1  
    #Zip together lists so each tuple is a sentence. 
    #out=zip(verbs,nouns)
    #verbs2 = []
    #nouns2 = []
    #for sentence in out: 
    #    if sentence[0]!=[] and sentence[1]!=0: #if the sentence has at least one verb and noun, keep it. Otherwise, drop it.
    #        if type(sentence[0])==list: 
    #            verbs2.append(sentence[0])
    #        else: 
    #            verbs2.append([sentence[0]]) #if verb is a string rather than a list, put string in list
    #        if type(sentence[1])==list:
    #            nouns2.append(sentence[1])
    #        else:
    #            nouns2.append([sentence[1]])
    return(verbsnouns)

In [6]:
from pattern.en import parse
from pattern.en import pprint
from pattern.en import conjugate, lemma, lexeme
from pattern.vector import stem, PORTER, LEMMA
from sklearn.feature_extraction import text
import string

#stopwords and punctuation
stopwords=text.ENGLISH_STOP_WORDS
punctuation = list('.,;:!?()[]{}`''\"@#$^&*+-|=~_')

def get_parts(opinion):
    oplow = opinion.lower()
    #REMOVING CHARACTERS: we have ugly text, and remove unnecssary characters.
    oplow = unicode(oplow, 'ascii', 'ignore') #remove non-unicode characters 
    oplow = str(oplow).translate(string.maketrans("\n\t\r", "   ")) #remove characters like \n 
    #justices (eg, Justice Breyer) are referred to as J. (eg,Breyer, J.); we remove the J., also JJ. for plural
    oplow = oplow.replace('j.','')
    oplow = oplow.replace('jj.','')
    oplow = oplow.replace('c.','') #remove C. for chief justice 
    oplow = oplow.replace('pp.','') #page numbers
    oplow = oplow.replace('  ','') #multiple spaces
    oplow = ''.join([i for i in oplow if not i.isdigit()]) #remove digits 
    oplow=re.sub(regex1, ' ', oplow)
    #Remove the Justia disclaimer at the end of the case, if it appears in the string
    justiadisclaimer = "disclaimer: official"
    if justiadisclaimer in oplow: 
        optouse = oplow.split(justiadisclaimer)[0]
    else:
        optouse = oplow
    
    #GET A LIST OF PRECEDENTS CITED IN THE OPINION 
    wordslist = optouse.split()
    #find precedents based on string 'v.' (eg, 'Brown v. Board')
    indices = [i for i in range(len(wordslist)) if wordslist[i]=='v.']
    precedents = [wordslist[i-1]+ ' ' + wordslist[i]+ ' ' + wordslist[i+1] for i in indices]
    
    #remove precedents, as we have already accounted for these
    for precedent in precedents:
        optouse = optouse.replace(precedent,'')
    
    #PARSE INTO LIST OF LISTS --> GET WORDS
    parsed = parse(optouse,tokenize=True,chunks=False,lemmata=True).split()
    verbs = [] 
    nouns = [] 
    i=0
    #Create lists of lists of verbs and nouns in each sentence.
    for sentence in parsed: #for each sentence 
        verbs.append([])
        nouns.append([])
        for token in sentence: #for each word in the sentence 
            if token[0] in punctuation or token[0] in stopwords or len(token[0])<=2:
                continue
            wordtouse = token[0]
            for x in punctuation:
                wordtouse = wordtouse.replace(x,' ') #if punctuation in word, take it out
            if token[1] in ['VB','VBZ','VBP','VBD','VBN','VBG']:
                verbs[i].append(lemma(wordtouse)) #append the lemmatized verb (we relemmatize because lemmata in parse does not seem to always work)
            if token[1] in ['NN','NNS','NNP','NNPS']:
                nouns[i].append(lemma(wordtouse))
        i+=1  
    #Zip together lists so each tuple is a sentence. 
    out=zip(verbs,nouns)
    verbs2 = []
    nouns2 = []
    for sentence in out: 
        if sentence[0]!=[] and sentence[1]!=0: #if the sentence has at least one verb and noun, keep it. Otherwise, drop it.
            if type(sentence[0])==list: 
                verbs2.append(sentence[0])
            else: 
                verbs2.append([sentence[0]]) #if verb is a string rather than a list, put string in list
            if type(sentence[1])==list:
                nouns2.append(sentence[1])
            else:
                nouns2.append([sentence[1]])
    return(verbs2,nouns2)

###Lists of words, vocabularies

First, the dataframe does seem to have one case that does not have a string type for the text, for some reason. This cell removes all cases whose text is not a string.

In [8]:
str_types = np.array(map(type, all_data.text)) == str
all_data = all_data[str_types]
all_data = all_data.reset_index(drop=True)
# should return true, if the text column is only strings
(np.array(map(type, all_data.text)) == str).all()

True

In the next cell, we run get_parts on all the opinions to get lists of verbs and nouns.

In [12]:
%%time 
words = []
for op in all_data.text:
    words.append(get_parts(op))

Wall time: 10min 37s


We create a list of issue areas (our y variable), which is in the same order as the cases in all_data (and thus matches the order of cases in verbwords, nounwords, etc.)

In [14]:
issue_areas = all_data.issueArea.tolist()

We next create vocabularies, and also create maps between word id's and words (and vice versa).

In [21]:
#create vocabs
vocab = list(set([word for sublist in words for subsublist in sublist for word in subsublist]))

In [22]:
#dictionaries: id --> word
id2word = dict(enumerate(vocab))
#dictionaries: word --> id
word2id = dict(zip(id2word.values(),id2word.keys()))

Finally, we create corpuses (one for each word type). Each corpus is a list of lists: each inner list corresponds to an opinion, and has as its elements tuples of the form `(wordid, count)`, where `count` refers to the number of times the word appears in the opinion. This is described in further detail in SVM.ipynb.

In [23]:
#this function takes a list of words, and outputs a list of tuples 
counter = lambda x:list(set([(i,x.count(i)) for i in x]))

#corpus_creator takes a list of lists of lists like verbwords, or a list of lists like precedents_all. 
#It also takes a word2id dictionary.
def corpus_creator(sentence_word_list,word2id):
    counter = lambda x:list(set([(word2id[i],x.count(i)) for i in x]))
    op_word_list = []
    if type(sentence_word_list[0][0])==list: #if list of lists of lists 
        for opinion in sentence_word_list: 
            #for each list (which corresponds to an opinion) in sentence_word_list, get a list of the words
            op_word_list.append([word for sublist in opinion for word in sublist])
    else: #if list of lists 
        op_word_list = sentence_word_list
    corpus = []
    for element in op_word_list: 
        corpus.append(counter(element))
    return(corpus)

This function takes a corpus and a number of words, and returns a matrix in which the element at row i and column j is the number of occurrences of word j in document i.

In [24]:
# takes a corpus and a number of words, and returns a matrix in which the element at row i and column j is the number of
# occurrences of word j in document i.
def corpus_to_mat(corpus, num_words):
    n = len(corpus)
    M = np.zeros((n, num_words))
    for i,doc in enumerate(corpus):
        for word,count in doc:
            M[i][word] = count
    return M

In [26]:
%%time
corpus = corpus_creator(words, word2id)

Wall time: 20.9 s


In [29]:
%%time
train_corpus = [corpus[i] for i in range(len(corpus)) if all_data['training'][i]==1]
train_mat = corpus_to_mat(train_corpus, len(vocab))

Wall time: 835 ms


In [30]:
%%time
test_corpus = [corpus[i] for i in range(len(corpus)) if all_data['training'][i]==0]
test_mat = corpus_to_mat(test_corpus, len(vocab))

Wall time: 379 ms


In [35]:
from sklearn.feature_extraction.text import TfidfTransformer
#this function takes a training matrix of size n_documents_training*vocab_size and a test matrix
#of size n_documents_test*vocab_size. The function outputs the corresponding tfidf matrices.
#Note that we fit on the training data, and then apply that fit to the test data.
def tfidf_mat_creator(trainmatrix,testmatrix):
    tf_idf_transformer=TfidfTransformer()
    tfidf_fit = tf_idf_transformer.fit(trainmatrix)
    tfidf_train = tfidf_fit.transform(trainmatrix).toarray()
    tfidf_test = tfidf_fit.transform(testmatrix).toarray()
    return(tfidf_train,tfidf_test)

In [36]:
train_issue_areas = [issue_areas[i] for i in range(len(issue_areas)) if all_data['training'][i]==1]
test_issue_areas = [issue_areas[i] for i in range(len(issue_areas)) if all_data['training'][i]==0]

In [42]:
%%time
train_mat,test_mat = tfidf_mat_creator(train_mat,test_mat)

Wall time: 9.91 s


In [43]:
"""
Function
--------
confusion_mat_creator

Inputs
------
predictions: a list of length n_documents. Each value in the list is the predicted
             issue area of the corresponding document. ("Predicted" as predicted by the SVM.)
actuals: a list of length n_documents. Each value is the actual issue area of the corresponding document.

Returns
-------
A 14*14 confusion matrix. Cell i,j is the number of cases with actual issue area j 
that were predicted as issue area i. Thus, the diagonal represents correct predictions.

Notes
-----
see do_classify below for an example of how this is used
"""

def confusion_mat_creator(predictions,actuals): 
    confusion_mat = np.zeros((14,14))
    for i in range(len(predictions)):
        #get predicted and actual issue ares; subtract by 1 since matrix is 0-indexed
        p_val = predictions[i]-1
        a_val = actuals[i]-1
        confusion_mat[p_val,a_val]+=1 #Matrix is thus predicted values*actual values 
    return(confusion_mat)

In [44]:
"""
Function
--------
cv_optimize

Inputs
------
clf : an instance of a scikit-learn classifier
parameters: a parameter grid dictionary thats passed to GridSearchCV
X: a document-word matrix (e.g., noun_train_tfidf). Should be training data.
y: the response vector (train_issue_areas)
n_folds: the number of cross-validation folds (default 5)
score_func: a score function we might want to pass (default python None)
   
Returns
-------
Two things: (1) The best estimator from the GridSearchCV, after the GridSearchCV has been used to
fit the model; and (2) the best parameter. 
     
Notes
-----
see do_classify below for an example of how this is used
"""
from sklearn.grid_search import GridSearchCV
#note: this code comes directly from lab 6
def cv_optimize(clf, parameters, X, y, n_folds=5, score_func=None):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, scoring=score_func)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds)
    gs.fit(X, y)
    best_estimator = gs.best_estimator_
    best_param = gs.best_params_
    return (best_estimator,best_param)

In [45]:
"""
Function
--------
do_classify

Inputs
------
clf : an instance of a scikit-learn classifier
parameters: a parameter grid dictionary thats passed to GridSearchCV
Xtrain: a training matrix document-word matrix (e.g., noun_train_tfidf)
ytrain: the corresponding training response vector (train_issue_areas)
Xtest: a test matrix document-word matrix (e.g., noun_test_tfidf)
ytest: the corresponding test resonse vector (e.g., noun)
n_folds: the number of cross-validation folds (default 5)
score_func: a score function we might want to pass (default python None)
   
Returns
-------
4 things, in the following order: (1) an array of predicted y values (ie, topic areas) for the test data; 
                                  (2) the accuracy score; (3) the confusion matrix of the test data predictions;
                                  (4) the best parameter from the gridsearch.

"""

##from sklearn.metrics import confusion_matrix
def do_classify(clf, parameters, Xtrain, ytrain, Xtest, ytest, score_func=None, n_folds=5):
    if parameters:
        clf,best_param = cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=n_folds, score_func=score_func)
    clf=clf.fit(Xtrain, ytrain)
    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    print "############# based on standard predict ################"
    print "Accuracy on training data: %0.2f" % (training_accuracy)
    print "Accuracy on test data:     %0.2f" % (test_accuracy)
    print "Best parameter: ", best_param
    pred = clf.predict(Xtest)
    print "########################################################"
    print confusion_mat_creator(pred, ytest)
    return(pred,test_accuracy,confusion_mat_creator(pred,ytest),best_param,clf)

Before we finally run the optimized SVM on the TF-IDF matrices, the issue areas contain some NaN's that we need to remove manually.

In [63]:
train_valid_indices = ~np.isnan(train_issue_areas)
test_valid_indices = ~np.isnan(test_issue_areas)

In [64]:
train_mat = train_mat[train_valid_indices]
train_issue_areas = np.array(train_issue_areas)[train_valid_indices]
print train_mat.shape
print train_issue_areas.shape

(5662L, 49101L)
(5662L,)


In [67]:
test_mat = test_mat[test_valid_indices]
test_issue_areas = np.array(test_issue_areas)[test_valid_indices]
print test_mat.shape
print np.array(test_issue_areas).shape

(2497L, 49101L)
(2497L,)


Now, we can run optimized SVM on the tfidf matrices.

In [68]:
%%time
from sklearn.svm import LinearSVC
best_predictions, best_accuracy, best_con_mat, best_param, best_clf=do_classify(LinearSVC(loss='hinge'), {'C':[1.0]},
                                                                                train_mat, train_issue_areas,
                                                                                test_mat, test_issue_areas)

############# based on standard predict ################
Accuracy on training data: 0.94
Accuracy on test data:     0.76
Best parameter:  {'C': 1.0}
########################################################
[[ 507.   28.   14.   21.    2.    2.    1.    8.   28.    1.    1.    3.
     1.    0.]
 [   5.  295.   11.   13.    2.    2.    3.   13.   27.   12.    0.    0.
     1.    0.]
 [   4.    4.  162.    1.    0.    0.    0.    6.    8.    1.    0.    3.
     0.    0.]
 [   7.    2.    1.   41.    0.    1.    0.    0.    0.    1.    0.    0.
     0.    0.]
 [   1.    2.    0.    0.   19.    0.    0.    0.    2.    0.    0.    0.
     1.    0.]
 [   1.    0.    1.    0.    0.   17.    0.    5.    1.    0.    0.    0.
     0.    0.]
 [   0.    3.    1.    2.    0.    2.   93.    2.    5.   11.    0.    0.
     0.    0.]
 [  18.   15.    9.    9.    1.    4.   10.  411.   32.   40.    2.   17.
     1.    0.]
 [  17.   33.    8.    8.    0.    0.    6.   33.  218.   12.    1.    4.
     3. 



#Naive Bayes

In [None]:
# turn issue areas into dummy column
issue_areas = ["criminal procedure","civil rights","first amendment","due process","privacy","attorneys",
              "unions","economic activity","judicial power","federalism","interstate  amendment",
              "federal taxation","miscellaneous","private action"]

for issue, num in zip(issue_areas,range(1,15)):
    all_data[issue] = all_data.issueArea.apply(lambda x: 1 if x == num else 0)

In [None]:
# turn decision directions into dummy column (conservative, liberal, neutral)
decision_areas = ["conservative","liberal","neutral"]

for decision, num in zip(decision_areas,range(1,4)):
    all_data[decision] = all_data.decisionDirection.apply(lambda x: 1 if x == num else 0)

### Vectorizing Text

We use the function **make_xy** to convert the syllabi (a collection of text documents) to numerical data (a matrix of token counts). The default vectorizer we use for this task is CountVectorizer, which produces a sparse representation of the counts.

In [None]:
# function (from lab 9) to vectorize text - adapted to accomodate topics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix

def make_xy(df, issue, vectorizer=None):   
    if vectorizer is None:
        vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df.text)
    X = X.tocsc()
    y = (df[issue] == 1).values.astype(np.int)
    return X, y

### Cross-Validation

We use the function **cv_score** to estimate the cross-validated value of a scoring function, given a classifier and data. In k-fold cross-validation, the test set is no longer needed; rather, the training set is split into *k* smaller sets. The following procedure is followed for each of the k “folds”: (1) the model is trained using k-1 of the folds as training data, (b) the model is validated on the remaining part of the data, (3) the performance of the model is 
calculated as the average of the values computed in the loop. Because we are working with a relatively small data sample (about 1,600 cases), approaches that are efficient in terms of data usage are preferable.

In [None]:
from sklearn.cross_validation import KFold

# function to return cross-validation score (lab 9)
def cv_score(clf, X, y, scorefunc):
    result = 0.
    nfold = 5
    for train, test in KFold(y.size, nfold): # split data into train/test groups, 5 times
        clf.fit(X[train], y[train]) # fit
        result += scorefunc(clf, X[test], y[test]) # evaluate score function on held-out data
    return result / nfold # average

### Log Probabilities

We define a function to return an array of log probabilities of the samples for each class in the model.

In [13]:
def log_likelihood(clf, x, y):
    prob = clf.predict_log_proba(x)
    not_topic = y == 0
    topic = ~not_topic
    return prob[not_topic, 0].sum() + prob[topic, 1].sum()

### Running the Model

Using the supporting functions above, we can now classify each issue area and store the output. One difference now is that we generally do not need to reuse cross-validation; since we already used cross-validatoin to determine optimal parameters for each topic area within the subsample data (in the Naive Bayes notebook), we can just use those optimal parameters here by loading them from a csv.

In [None]:
nb_sample_results = pd.read_csv('naive_bayes_sample_model_results.csv')