In [1]:
from urllib.request import urlopen
import re
from collections import defaultdict
import numpy as np
import chardet
import pandas as pd
import csv
import json
from bs4 import BeautifulSoup

In [2]:
def get_variable_names():
    #contains all variables corresponding to a string
    varNames_dict = {}

    # Grab the page from the web
    f = urlopen("http://scdb.wustl.edu/documentation.php?var=chief")
    html = str(f.read())


    reviews = html.split('<li class="toc">')[1:]
    for variable in reviews:
        str_1 = variable.split("</a>")[0].split('class="toc')
        if (re.split(".var=",str_1[0])[1].replace('" ','') == 'intro'):
            continue
        if (re.split(".var=",str_1[0])[1].replace('" ','') == 'cite'):
            continue
        varName_dict[str_1[1].replace('">','')] = re.split(".var=",str_1[0])[1].replace('" ','')
    
    return varName_dict

In [3]:
def get_all_var_values(varName_dict):
    #contains all values possible for a var
    varValues_dict = {}
    
    for var in varName_dict.keys():
        #print("---------------------------")
        #print(str(var)+":"+str(varName_dict[var]))
        f = urlopen("http://scdb.wustl.edu/documentation.php?var="+str(varName_dict[var]))
        html = str(f.read())

        #for variables which are not encoded and stright values are used
        if ('<strong>Values:</strong><br />' not in html):
            continue

        #print(html)
        # Extract the (30) review elements
        reviews = html.split('<strong>Values:</strong><br />')[1]
        reviews = reviews.split('<div class="toc" onClick="toggleBlock')[0]
        #print(reviews)

        soup = BeautifulSoup(reviews)
        #print(soup.text)

        final_str = soup.text.replace("\\t","#").replace("\\n","#")
        #print(final_str)
        list_val = re.split("#+",final_str)
        list_val = [x for x in list_val if x not in [""," "]]

        assert(len(list_val)%2 == 0 or len(list_val)%3 == 0)

        #choose which format it is
        pattern = re.compile("\d+")
        
        #print(len(list_val))
        if (len(list_val)>4):
            if (pattern.match(list_val[0]) and pattern.match(list_val[2]) and pattern.match(list_val[4])):
                div = 2
            elif (pattern.match(list_val[0]) and pattern.match(list_val[3]) and pattern.match(list_val[6])):
                div = 3
            else:
                print("diff pattern found")
        elif (len(list_val)<4):
            if (len(list_val)%2==0):
                div = 2
            elif (len(list_val)%2==0):
                div = 3
        #print(div)
        
        varValues_dict[str(varName_dict[var])] = {}
        
        if (div == 2):
            for x in np.arange(0,len(list_val)//2):
                #print("varValues_dict["+str(varName_dict[var])+"]["+str(list_val[x*2])+"] = "+str(list_val[(x*2)+1]))
                varValues_dict[str(varName_dict[var])][list_val[x*2]] = str(list_val[(x*2)+1])
        elif (div == 3):
            for x in np.arange(0,len(list_val)//3):
                #print("varValues_dict["+str(varName_dict[var])+"]["+str(list_val[x*3])+"] = "+str(list_val[(x*3)+1])+ str(list_val[(x*3)+2]))
                varValues_dict[str(varName_dict[var])][list_val[x*3]] = str(list_val[(x*3)+1])#+";"+str(list_val[(x*3)+2])
    return varValues_dict

In [4]:
#fill dicts
varName_dict = {}
varName_dict = get_variable_names()

varValues_dict = {}
varValues_dict = get_all_var_values(varName_dict)

In [5]:
def fill_csv_with_var_values(varValues_dict,fileName,readLinesCount=None):
    #entireListDict has values instead of keys
    entireListDict = []
    
    #csv file to read
    csvfile = open(fileName, 'r')

    #fieldnames = ("caseId","sctCite")
    reader = csv.DictReader( csvfile)
    
    count = 0
    for row in reader:
        entireListDict.append(row)
        for column in row:
            if column in varValues_dict.keys():
                if (row[column] in varValues_dict[column].keys()):
                    entireListDict[count][column] = str(varValues_dict[column][row[column]])
                else:
                    entireListDict[count][column] = str(row[column])
            else:
                entireListDict[count][column] = str(row[column])
        count += 1
    
        #how many lines to read
        if readLinesCount is not None:
            if count >= 1:
                break
    return entireListDict

In [6]:
def write_csv_with_values(entireListDict,outputFileName):
    #write the output replacing keys with values
    with open(outputFileName,'w') as f:
        wr = csv.writer(f) 
        for rowNum in np.arange(len(entireListDict)):
            if rowNum == 0:
                wr.writerow([key for key,val in entireListDict[rowNum].items()])
            wr.writerow([val for key,val in entireListDict[rowNum].items()])
        print("File "+str(outputFileName)+" is generated!")

In [7]:
##for reading csv as dict format
# with open('sample.csv') as csv_file:
#     csv_reader = csv.reader(csv_file, delimiter=',')
#     line_count = 0
#     for row in csv_reader:
#         if line_count == 0:
#             print(f'Column names are {", ".join(row)}')
#             line_count += 1
#             #break
#         else:
#             print(row)
#             line_count += 1
#     print(f'Processed {line_count} lines.')

In [111]:
#test dicts
for var in varName_dict.keys():
    print(str(var)+":"+str(varName_dict[var]))

x = 'caseOrigin'
for y in varValues_dict[x]:
    print(str(y)+"\t"+str(varValues_dict[x][y]))
    
#get list of dicts aka csv file
entireListDict = fill_csv_with_var_values(varValues_dict,"JusticeData.csv")
#print(len(entireListDict))
#write_csv_with_values(entireListDict,"sample_output_2.csv")

SCDB Case ID:caseId
SCDB Docket ID:docketId
SCDB Issues ID:caseIssuesId
SCDB Vote ID:voteId
U.S. Reporter Citation:usCite
Supreme Court Citation:sctCite
Lawyers Edition Citation:ledCite
LEXIS Citation:lexisCite
Docket Number:docket
Case Name:caseName
Petitioner:petitioner
Petitioner State:petitionerState
Respondent:respondent
Respondent State:respondentState
Manner in which the Court takes Jurisdiction:jurisdiction
Administrative Action Preceeding Litigation:adminAction
Administrative Action Preceeding Litigation State:adminActionState
Three-Judge District Court:threeJudgeFdc
Origin of Case:caseOrigin
Origin of Case State:caseOriginState
Source of Case:caseSource
Source of Case State:caseSourceState
Lower Court Disagreement:lcDisagreement
Reason for Granting Cert:certReason
Lower Court Disposition:lcDisposition
Lower Court Disposition Direction:lcDispositionDirection
Date of Decision:dateDecision
Term of Court:term
Natural Court:naturalCourt
OnChief Justice:chief
Date of Oral Argument:

In [84]:
df = pd.DataFrame(entireListDict)

In [116]:
df['partyWinning'].value_counts()

petitioning party received a favorable disposition         70822
no favorable disposition for petitioning party apparent    42286
favorable disposition for petitioning party unclear           18
                                                               9
Name: partyWinning, dtype: int64

# Feature Engineering

In [252]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Categorize different kind of features

VarType ={'SCDB Case ID': 'identifier',
 'SCDB Docket ID': 'identifier',
 'SCDB Issues ID': 'identifier',
 'SCDB Vote ID': 'identifier',
 'Case Name': 'identifier',
 'Petitioner': 'predictive',
 'Respondent': 'predictive',
 'Manner in which the Court takes Jurisdiction': 'predictive',
 'Origin of Case': 'predictive',
 'Source of Case': 'predictive',
 'Lower Court Disagreement': 'predictive',
 'Reason for Granting Cert': 'predictive',
 'Lower Court Disposition': 'predictive',
 'Lower Court Disposition Direction': 'predictive',
 'OnChief Justice': 'predictive',                               #Error in Name
 'Date of Decision': 'extra',
 'Term of Court': 'predictive',
 'Winning Party': 'outcome',                                  #Key Prediction
 'Majority Votes': 'outcome',
 'Minority Votes': 'outcome',
 'Justice Name': 'predictive',
 'The Vote in the Case': 'outcome',
 "Direction of the Individual Justice's Votes": 'outcome',     #Key Prediction
 'Majority and Minority Voting by Justice': 'outcome'}

predictive = [varName_dict[var] for var in VarType if VarType[var]=='predictive']

In [253]:
# Remove rows with non-relevant outcomes

df = df[df['direction'].isin(["liberal","conservative"])]
df['direction'].value_counts()
#df['partyWinning']
#df = df[df['partyWinning'].isin(["petitioning party received a favorable disposition",
#                                 "no favorable disposition for petitioning party apparent"])]
#df['partyWinning'].value_counts()

liberal         59532
conservative    53576
Name: direction, dtype: int64

In [254]:
# One hot encoding for categorical Values
dfX = pd.get_dummies(df[predictive], prefix_sep='=', drop_first=True)

# Turn y values to integer labels
le = LabelEncoder()
yEncoded = le.fit_transform(df['direction'])
list(enumerate(le.classes_))

#Create Training, Validation and Test Datasets

[(0, 'conservative'), (1, 'liberal')]

# Trivial Model

In [255]:
clf_dummy = DummyClassifier(strategy='most_frequent').fit(dfX, yEncoded)
clf_dummy.score(dfX,yEncoded)

0.5263288184743785

# Logistic Regression

In [None]:
data_final_vars=dfX.columns.values.tolist()
y=['y']
X=[i for i in data_final_vars if i not in y]
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='liblinear')
rfe = RFE(logreg, 20)
rfe = rfe.fit(dfX, yEncoded)
print(rfe.support_)
print(rfe.ranking_)

In [233]:
dfXreducedCols = [x for _,x in sorted(zip(rfe.ranking_,data_final_vars))]
dfXreducedCols[:50]

['caseOrigin=154',
 'caseOrigin=158',
 'caseOrigin=U.S. Court of Military Review',
 'caseOrigin=U.S. Customs Court',
 'caseOrigin=U.S. District Court for the Canal Zone',
 'caseSource=Hawaii U.S. District Court',
 'caseSource=Michigan Western U.S. District Court',
 'caseSource=Mississippi Northern U.S. District Court',
 'caseSource=Tennessee Eastern U.S. District Court',
 'jurisdiction=docketing fee',
 'jurisdiction=writ of habeas corpus',
 'petitioner=Social Security Administration or Commissioner',
 'petitioner=brewery, distillery',
 'petitioner=medical or Medicaid claimant',
 'respondent=Federal Maritime Commission',
 'respondent=Federal Reserve Board of Governors',
 'respondent=Federal Savings and Loan Insurance Corporation',
 'respondent=Occupational Safety and Health Review Commission',
 'respondent=Pension Benefit Guaranty Corporation',
 'respondent=shopping center, mall',
 'respondent=Selective Service System',
 'petitioner=Pension Benefit Guaranty Corporation',
 'caseSource=Vi

In [232]:
dfXreducedCols = [col for idx,col in enumerate(data_final_vars) if rfe.support_[idx]==True]
dfXreducedCols

['petitioner=Social Security Administration or Commissioner',
 'petitioner=brewery, distillery',
 'petitioner=medical or Medicaid claimant',
 'respondent=Federal Maritime Commission',
 'respondent=Federal Reserve Board of Governors',
 'respondent=Federal Savings and Loan Insurance Corporation',
 'respondent=Occupational Safety and Health Review Commission',
 'respondent=Pension Benefit Guaranty Corporation',
 'respondent=shopping center, mall',
 'jurisdiction=docketing fee',
 'jurisdiction=writ of habeas corpus',
 'caseOrigin=154',
 'caseOrigin=158',
 'caseOrigin=U.S. Court of Military Review',
 'caseOrigin=U.S. Customs Court',
 'caseOrigin=U.S. District Court for the Canal Zone',
 'caseSource=Hawaii U.S. District Court',
 'caseSource=Michigan Western U.S. District Court',
 'caseSource=Mississippi Northern U.S. District Court',
 'caseSource=Tennessee Eastern U.S. District Court']

In [251]:
lamValues = [0,0.01,0.1,1,10,100]
scores = []
for lam in lamValues:
    clf = LogisticRegression(C=lam, solver='liblinear', max_iter=500)  #.fit(dfX, yEncoded)
    crossScores = cross_val_score(clf, dfX['justiceName'], yEncoded, cv=5, scoring='accuracy')
    print(lam,crossScores)
    scores.append(crossScores.mean())

KeyError: 'justiceName'

In [235]:
bestModelIdx = scores.index(max(scores))
print("Lamda and Corresponding scores: ", list(zip(lamValues, scores)))
print("Best lamda is: ", lamValues[bestModelIdx])

Lamda and Corresponding scores:  [(0.01, 0.5285391500500867), (0.1, 0.5329067589856086), (1, 0.5324558505377063), (10, 0.5324558505377063), (100, 0.5324558505377063)]
Best lamda is:  0.1


In [237]:
sorted(list(zip(np.round(clf.coef_[0],3),([col.split("=") for col in dfX.columns]))))

AttributeError: 'LogisticRegression' object has no attribute 'coef_'

# Random Forest

In [244]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, max_depth=50,random_state=1)
clf.fit(dfX[dfXreducedCols[:50]],yEncoded)
print(clf.score(dfX[dfXreducedCols[:50]],yEncoded))
score = cross_val_score(clf, dfX[dfXreducedCols[:50]], yEncoded, cv=5, scoring='accuracy')
print(score.mean())

0.5365845033065743
0.5324558505377063


In [246]:
dfX[dfXreducedCols[:50]]

Unnamed: 0,caseOrigin=154,caseOrigin=158,caseOrigin=U.S. Court of Military Review,caseOrigin=U.S. Customs Court,caseOrigin=U.S. District Court for the Canal Zone,caseSource=Hawaii U.S. District Court,caseSource=Michigan Western U.S. District Court,caseSource=Mississippi Northern U.S. District Court,caseSource=Tennessee Eastern U.S. District Court,jurisdiction=docketing fee,...,respondent=U.S. House of Representatives,petitioner=Alien Property Custodian,petitioner=National Endowment for the Arts,caseSource=Louisiana Western U.S. District Court,petitioner=Federal Savings and Loan Insurance Corporation,petitioner=Federal Energy Administration,caseSource=Louisiana Middle U.S. District Court,respondent=Federal Aviation Agency or Administration,petitioner=realtor,petitioner=wholesale trade
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
