# Vectorizing and cleaning text data

In [103]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
# Notebook-wide display configuration for pandas output.
pd.set_option("display.width", 500)  # allow up to 500 characters per printed line before wrapping
pd.set_option("display.max_columns", 100)  # show up to 100 columns before pandas truncates the view
pd.set_option("display.notebook_repr_html", True)  # render DataFrames as HTML tables in the notebook
import seaborn as sns
# Plot styling applied to every figure drawn after this cell.
sns.set_style("darkgrid")
sns.set_context("poster")

In [93]:
#Load a capped sample of court opinions (first 45000 rows) into a DataFrame.
#NOTE(review): read_table splits on tabs by default, so despite the .csv extension
#this file is presumably tab-delimited -- confirm against the raw file.
opinion_df = pd.read_table(
    "raw_clistener_data.csv",
    nrows=45000,
)

#dftouse=pd.read_csv("dftouse.csv")
#dftouse.head()

In [107]:
#Preview the first rows to check which columns were parsed
opinion_df.head()

Unnamed: 0.1,Unnamed: 0,blocked,date,id,name,text,url
0,0,False,1922-05-29,100000,Morrisdale Coal Co. v. United States,259 U.S. 188 (1922)MORRISDALE COAL COMPANYv.UN...,
1,1,False,1922-05-29,100001,Pine Hill Coal Co. v. United States,"259 U.S. 191 (1922)PINE HILL COAL COMPANY, INC...",
2,2,False,1922-05-29,100002,Santa Fe Pacific R. Co. v. Fall,259 U.S. 197 (1922)SANTA FE PACIFIC RAILROAD C...,
3,3,False,1922-05-29,100003,"Federal Baseball Club of Baltimore, Inc. v. Na...",259 U.S. 200 (1922)FEDERAL BASEBALL CLUB OF BA...,
4,4,False,1922-05-29,100004,Mutual Life Ins. Co. of NY v. Liebing,259 U.S. 209 (1922)MUTUAL LIFE INSURANCE COMPA...,


### Code for parsing, cleaning, vectorizing
Below, I take three cases as a small sample on which to write code that parses, cleans and vectorizes the text.

The python pattern module is useful: http://www.clips.ua.ac.be/pattern

In [263]:
#Pull three sample opinions (rows 1, 2 and 3) and store them in a dict keyed by a short label.
#Each value is a one-element list holding the raw opinion text.
op1, op2, op3 = (opinion_df.iloc[row]['text'] for row in (1, 2, 3))
opsdict = dict(op1=[op1], op2=[op2], op3=[op3])

In [264]:
#Regular expressions kept for possible later text cleaning; the cells visible in this
#part of the notebook do not use them yet.
import re
regex1=re.compile(r"\.{2,}") #two or more consecutive periods, e.g. an ellipsis "..."
regex2=re.compile(r"\-{2,}") #two or more consecutive hyphens, e.g. "--"

In [287]:
#We only consider the actual opinion. This is what comes after the phrase "delivered the opinion of the court."
#After this cell each opsdict[key] is a list of [original opinion, optouse, [precedents]].
#The list is rebuilt from value[0] instead of being appended to, so re-running the cell
#does not keep growing the entries.
deliverstring="delivered the opinion of the court."
for key,value in list(opsdict.items()): #items() exists on Python 2 and 3; list() snapshots it before values are reassigned
    oplow = value[0].lower() #make text lowercase
    if deliverstring in oplow:
        #maxsplit=1 keeps everything after the FIRST occurrence, even when the phrase is repeated later in the text
        optouse = oplow.split(deliverstring,1)[1]
    else:
        #no marker phrase: there is no opinion body to analyse, but the entry keeps the same
        #three-part shape so later code can index it without an IndexError
        optouse = ''

    #GET A LIST OF PRECEDENTS CITED IN THE OPINION
    wordslist = optouse.split()
    #find precedents based on string 'v.' (eg, 'Brown v. Board')
    #a 'v.' at the very start or very end has no word on one side, so it is skipped
    indices = [i for i in range(1,len(wordslist)-1) if wordslist[i]=='v.']
    precedents = [wordslist[i-1]+ ' ' + wordslist[i]+ ' ' + wordslist[i+1] for i in indices]
    opsdict[key] = [value[0],optouse,precedents]

#CLEAN DATA: REMOVE STOPWORDS
#We want to get a list of all the nouns and all the adjectives used in each case (see the next cell).

The cell below will ultimately be turned into a function like `get_parts` in HW5. It should probably be combined with the cell above so that it also returns the precedents.

In [300]:
from pattern.en import parse
from pattern.en import pprint
from pattern.en import conjugate, lemma, lexeme
from pattern.vector import stem, PORTER, LEMMA
from sklearn.feature_extraction import text

#Prototype of the per-opinion cleaning step. It parses one opinion into sentences and
#collects, for every sentence, its lemmatized verbs, nouns and adjectives plus any foreign words.
#Results: verbs2, nouns2, adjectives2, foreign2 -- parallel lists holding one inner list per
#kept sentence (a sentence is kept only if it has at least one verb and one noun).

#using this text for writing code
testop = opsdict['op2'][1]

#get stopwords, punctuation
stopwords=text.ENGLISH_STOP_WORDS
#The apostrophe is escaped on purpose: a bare '' inside the literal splits it into two
#adjacent strings and silently drops the quote character from the list.
punctuation = list('.,;:!?()[]{}`\'\"@#$^&*+-|=~_')

#part-of-speech tags (second field of each parsed token) that select each word class
verbtags = ('VB','VBZ','VBP','VBD','VBN','VBG')
nountags = ('NN','NNS','NNP','NNPS')
adjectivetags = ('JJ','JJR','JJS')
foreigntags = ('FW',)

#remove precedents, as we have already accounted for these
for precedent in opsdict['op2'][2]:
    testop = testop.replace(precedent,'')

#parse into list of lists: one list per sentence, one token per word, token[0]=word, token[1]=tag
parsed = parse(testop,tokenize=True,chunks=False,lemmata=True).split()

#Create lists of lists of verbs, nouns, adjectives and foreign words in each sentence.
#Every sentence contributes exactly one inner list to each outer list, so the four
#outer lists stay aligned by sentence position.
verbs = []
nouns = []
adjectives = []
foreign = []
for sentence in parsed: #for each sentence
    sentverbs = []
    sentnouns = []
    sentadjectives = []
    sentforeign = []
    for token in sentence: #for each word in the sentence
        word = token[0]
        tag = token[1]
        #skip punctuation, stopwords and any single-character token
        if word in punctuation or word in stopwords or len(word)==1:
            continue
        #NOTE(review): lemma() is applied to nouns and adjectives as well as verbs --
        #confirm it returns sensible base forms for non-verbs.
        if tag in verbtags:
            sentverbs.append(lemma(word)) #append the lemmatized verb (we relemmatize because lemmata in parse does not seem to always work)
        elif tag in nountags:
            sentnouns.append(lemma(word))
        elif tag in adjectivetags:
            sentadjectives.append(lemma(word))
        elif tag in foreigntags:
            sentforeign.append(word) #foreign words are kept as written
    verbs.append(sentverbs)
    nouns.append(sentnouns)
    adjectives.append(sentadjectives)
    foreign.append(sentforeign)

#Zip together lists so each tuple is a sentence.
out=list(zip(verbs,nouns,adjectives,foreign))
verbs2 = []
nouns2 = []
adjectives2 = []
foreign2 = []
for sentence in out:
    #if the sentence has at least one verb and noun, keep it. Otherwise, drop it.
    #Plain truthiness is used: an empty list is falsy, and it avoids the precedence trap
    #of combining != comparisons with the bitwise & operator.
    if sentence[0] and sentence[1]:
        verbs2.append(sentence[0])
        nouns2.append(sentence[1])
        adjectives2.append(sentence[2])
        foreign2.append(sentence[3])
## return(verbs2,nouns2,adjectives2,foreign2)

**To do next**:
- Create vocabulary lists
- Create the corpus in bag-of-words form

**Also to consider**:
More data cleaning: (1) regex-based cleanup; (2) n-grams; (3) looking for legal terms; (4) a more rigorous method for finding precedents. In particular, don't just define the precedent as the word before and the word after "v."