In [2]:
# Import Scientific Packages into Python Kernel
from bs4 import BeautifulSoup 
import requests 
import re 

import pandas as pd
import numpy as np

# Set a user-agent in your header so you aren't flagged by the browser when making an HTTP request
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) \
           Chrome/39.0.2171.95 Safari/537.36'}

In [4]:
# Link url from caselaw which has a repository of Supreme Court ruling opinions and assign to variable
root_url = "http://caselaw.findlaw.com/court/us-supreme-court/years/"

In [5]:
# Assign a variable to house an array of supreme court documents listed within the years you'd like to explore
years = [root_url + str(year) for year in range(1760,2018)]

# Define a method that executes your url request and returns the data (HTML or XML) as an Object 
def Beautiful_soup_grabber(link):
    
    response = requests.get(link, headers = headers) #optional add timeout (seconds) keeps requests from running indefinitely 
    
    return BeautifulSoup(response.text, "lxml") #Returns BeautifulSoup object, which represents the document as a nested data structure


# Define a method which calls the above method for each year within the range you've requested and convert result object into table
def year_getter(years):
    
    y = {}
    for year in years:
        soup = Beautiful_soup_grabber(year)
        souplist = soup.findAll("a")
        
        #use regular expressions to 
        for i in souplist:
            if re.search("us-supreme-court", str(i)) and not re.search("years", str(i)) and not re.search("/court/", str(i)):
                b = i["href"]
                y[b] = [re.sub("[^0-9]", "", b.split("/")[-1])]
    
    
    return pd.DataFrame(y).transpose().reset_index() #converts results to data frame table using pandas

In [6]:
df = year_getter(years) #call the above function and assign it to a shorthand varible (this will take several minutes to execute)

df.columns = ["case_url", "docket"] #assign column names

In [7]:
df.head(5) #Return the first n rows (default n=5) to check table values and header names aligned correctly 

Unnamed: 0,case_url,docket
0,http://caselaw.findlaw.com/us-supreme-court/05...,51101
1,http://caselaw.findlaw.com/us-supreme-court/06...,611951
2,http://caselaw.findlaw.com/us-supreme-court/06...,6263
3,http://caselaw.findlaw.com/us-supreme-court/06...,65590
4,http://caselaw.findlaw.com/us-supreme-court/07...,71390


In [8]:
df.ix[0, "case_url"] #select a specific row in the dataframe to check value

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


'http://caselaw.findlaw.com/us-supreme-court/05-1101.html'

In [9]:
df.to_pickle("supcourt_yearlist.pickle") #Python Object serialization - “Pickling” is the process whereby a Python object hierarchy is converted into a byte stream

In [11]:
df.shape #The shape attribute for numpy arrays returns the dimensions of the array

#this will return the number of cases in our table

(23393, 2)

In [12]:
# Import Scientific Packages into Python Kernel
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [13]:
supcourt = pd.read_pickle("supcourt_yearlist.pickle") #Read in output from previous step (Year and case title df from previous notebook)

In [14]:
#split dataframe into three temporary dfs (caselaw can detect too may requests from a scraper and block your ip)
test_df = supcourt.iloc[5000:15000]
test3_df = supcourt.iloc[0:5000]
test2_df = supcourt.iloc[15000:23268]

In [15]:
# Define a function which collects the description of each case and appends to data table
def supcourtdescr(link):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    allitems = []
    response = requests.get(link, headers =  headers)
    page = response.text
    soup = BeautifulSoup(page, "lxml")
    
    pagesoup = soup.find_all(class_="caselawcontent searchable-content") 
    
    for item in pagesoup:
        txtt = item.get_text()
        allitems.append(txtt)
    return ' '.join(allitems)

In [None]:
test_df.loc[:,("case")] = test_df.case_url.apply(supcourtdescr)
test_df.to_pickle("temp2.pickle")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [None]:
test2_df.loc[:,("case")] = test2_df.case_url.apply(supcourtdescr)
test2_df.to_pickle("temp1.pickle")

In [None]:
test3_df.loc[:,("case")] = test3_df.case_url.apply(supcourtdescr)
test3_df.to_pickle("temp3.pickle")

In [None]:
full_project = pd.concat([test2_df, test3_df, test_df]) #putting it all together

In [None]:
full_project

In [None]:
full_project.to_pickle("full_proj_preproc.pickle")

In [None]:
import pandas as pd
import re
import spacy
import en_core_web_sm
import nltk
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords as stopwords
from nltk.corpus import names as names
nlp = en_core_web_sm.load()

In [None]:
male_names = names.words('male.txt')
female_names = names.words('female.txt')
male_names = [w.lower() for w in male_names]
male_names_plur = [(w.lower() + "s") for w in male_names]
female_names_plur = [(w.lower() + "s") for w in female_names]
female_names = [w.lower() for w in female_names]
casenames = list(pd.read_csv("casetitles.csv",encoding = 'iso-8859-1'))
statenames = list(pd.read_csv("statenames.csv"))

In [None]:
homespun_words = ['join', 'seek', 'ginnane', 'kestenbaum', 'hummel', 'loevinger', 'note', 'curiam', 'mosk', 'pd', \
                'paxton', 'rhino', 'buchsbaum', 'hirshowitz', 'misc', 'assistant', 'whereon', 'dismiss', 'sod', \
                'vote', 'present', 'entire', 'frankfurter', 'ante', 'leave', 'concur', 'entire', 'mootness', \
                'track', 'constitution', 'jj', 'blackmun', 'rehnquist', 'amici,sup', 'rep', 'stat', 'messes', \
                'like', 'rev', 'trans', 'bra', 'teller', 'vii', 'erisa', 'usca', 'annas', 'lead', 'cf', 'cca', \
                'fsupp', 'afdc', 'amicus', 'ante', 'orrick', 'kansa', 'pd', 'foth', 'stucky', 'aver',"united", \
                "may", "argued", "argue", "decide", "rptr", "nervine", "pp","fd" ,"june", "july", \
                "august", "september", "october", "november", "states", "ca", "joyce", "certiorari", "december",\
                "january", "february", "march", "april", "writ", "supreme court", "court", "dissent", \
                "opinion", "footnote","brief", "decision", "member", "curiam", "dismiss", "note", "affirm", \
                "question", "usc", "file"]

STOPLIST = set(stopwords.words('english') + list(homespun_words) + list(ENGLISH_STOP_WORDS) \
               + list(statenames) + list(casenames) + list(female_names) + list(male_names) + \ 
               list(female_names_plur) + list(male_names_plur))

In [None]:
STOPLIST = set(list(stopwords.words('english')) + list(sub_list) + list(ENGLISH_STOP_WORDS))

def tokenizeText(sample):
    separators = ["\xa0\xa0\xa0\xa0", "\r", "\n", "\t", "n't", "'m", "'ll", '[^a-z ]']
    for i in separators:
        sample = re.sub(i, " ", sample.lower())
        
    ## get the tokens using spaCy - this makes it possible to lemmatize the words
    tokens = nlp(sample)
    tokens = [tok.lemma_.strip() for tok in tokens]

    ## apply our stoplist
    return [tok for tok in tokens if len(tok) != 1 and tok not in STOPLIST]

In [None]:
doc_list["lem"] = doc_list.case.apply(text_processing)
doc_list.to_pickle("full_proj_lemmatized.pickle") ## to be used in model selection

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from textblob import TextBlob
from sklearn.preprocessing import Normalizer

In [None]:
doc_list.read_pickle("full_proj_lemmatized3.pickle") #always save your work!

In [None]:
doc_list.shape #checking to make sure we have the info we expected to have

In [None]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()
    
    
def modeler(corp, n_topics, n_top_words, clf, vect):
    df = .80
    str_vect = str(vect).split("(")[0]
    str_clf = str(clf).split("(")[0]

    print("Extracting {} features for {}...".format(str_vect, str_clf))
    vect_trans = vect.fit_transform(corp)


    # Fit the model
    print("Fitting the {} model with {} features, "
          "n_topics= {}, n_topic_words= {}, n_features= {}..."
          .format(str_clf, str_vect, n_topics, n_top_words, n_features))

    clf = clf.fit(vect_trans)
    if str_clf == "TruncatedSVD":
        print("\nExplained variance ratio", clf.explained_variance_ratio_)
        
    print("\nTopics in {} model:".format(str_clf))
    feature_names = vect.get_feature_names()
    return print_top_words(clf, feature_names, n_top_words)

In [None]:
modeler(doc_list.lem, 30, 30, LatentDirichletAllocation(n_topics=30, max_iter=5, learning_method='online', \
        learning_offset=50.,random_state=0), CountVectorizer(max_df=.80, min_df=2, 
                                                             stop_words='english'))

In [None]:
LDA_mod(doc_list.lem, .95, 2, 2000,10) #df is a way to extract 'meaningful text' in this case

In [None]:
modeler(doc_list.lem, 100, 30, TruncatedSVD(2, algorithm = 'arpack'), TfidfVectorizer(max_df=.8, min_df=2,stop_words='english'))

In [None]:
modeler(doc_list.lem, 30, 30, NMF(n_components=30, random_state=1, alpha=.1, l1_ratio=.5), \ 
        TfidfVectorizer(max_df=.98, min_df=2,stop_words='english'))

In [None]:
import pandas as pd
import re

In [None]:
##########################################  modeling imports  #######################################################
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
#from sklearn.preprocessing import Normalizer

In [None]:
df = pd.read_pickle("full_proj_lemmatized3.pickle")

In [None]:
df.head(5)

In [None]:
df.ix[15000, "case_url"]
#'http://caselaw.findlaw.com/us-supreme-court/382/12.html'

In [None]:
def nmf_mod(corp ):
    df = .80
    n_topics = 30
    n_features = 2000
    n_top_words = 40
    
    # Use tf-idf features for NMF.
    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer(max_df=df, min_df=5, # ngram_range=(1,2), #max_features=n_features,
                                       stop_words='english')

    tfidf = tfidf_vectorizer.fit_transform(corp)


    # Fit the NMF model
    print("Fitting the NMF model with tf-idf features, "
          "n_topics= %d, n_topic_words= %d, n_features= %d..."
          % (n_topics, n_top_words, n_features))

    nmf = NMF(n_components=n_topics, random_state=2, alpha=.1, l1_ratio=.5).fit(tfidf)
    
    print("\nTopics in NMF model:")
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    #return print_top_words(nmf, tfidf_feature_names, n_top_words) 
    return tfidf,nmf

In [None]:
tfidf, nmf_mod_test = nmf_mod(df.lem)

In [None]:
out =nmf_mod_test.transform(tfidf)
out[49] #verified that each of these is different

In [None]:
import operator
topics = []
for item in out:
    max_index, max_value = max(enumerate(item), key=operator.itemgetter(1))
    topics.append(max_index) 
    
df["topicnumber"] = pd.Series(topics, index=df.index)

In [None]:
topics_likelihood = []
for item in out:
    max_index, max_value = max(enumerate(item), key=operator.itemgetter(1))
    topics_likelihood.append(max_value)
    
df["strengthoftopic"] = pd.Series(topics_likelihood, index=df.index)

In [None]:
df.topicnumber.value_counts() #let's make sure this is a good model...

In [None]:
def nmf_topics_dict(corp, n_topics):
    df = .80
    n_top_words = 40
    
    tfidf_vectorizer = TfidfVectorizer(max_df=df, min_df=5,# ngram_range=(1,2), #max_features=n_features,
                                       stop_words='english')

    tfidf = tfidf_vectorizer.fit_transform(corp)
    nmf = NMF(n_components=n_topics, random_state=2, alpha=.1, l1_ratio=.5).fit(tfidf)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
      
    topic_dict = {}
    for topic_idx, topic in enumerate(nmf.components_):
        topic_dict[topic_idx] = ", ".join([tfidf_feature_names[i] \
                                    for i in topic.argsort()[:-n_top_words - 1:-1]])
    return topic_dict

In [None]:
# After testing different topic distributions, 30 was optimal
nmf_words_30 = nmf_topics_dict(df.lem, 30) #dict object

In [None]:
nmf_words_30

In [None]:
import json
with open('finaliteration_topics.json', 'w') as fp:
    json.dump(nmf_words_30, fp)

In [None]:
def word_lookup(num):
    return nmf_words_30.get(num)

In [None]:
df["words"] = df.topicnumber.apply(word_lookup)

In [None]:
df.ix[15017,"words"] # This cell and the one below verifies that it worked

In [None]:
df.ix[14972,"lem"]

In [None]:
df.ix[15017,"case_url"]
# 'http://caselaw.findlaw.com/us-supreme-court/380/145.html'

In [None]:
df.to_pickle("full_project_modelled_final.pickle")

In [None]:
df = pd.read_pickle("full_project_modelled_final.pickle")

In [None]:
# some topics were extremely similar and at the suggestion of my instructors,
# for the sake of the visualization, I have condensed the topics to 20

def topic_condenser(topicnum):
    if topicnum == 20:
        return 24
    if topicnum == 25:
        return 1
    if topicnum == 2:
        return 12
    if topicnum == 27:
        return 26
    if topicnum == 18 or topicnum == 5:
        return 29
    if topicnum == 8 or topicnum == 22:
        return 7
    if topicnum == 15:
        return 16
    if topicnum == 9:
        return 14
    if topicnum == 19:
        return 3
    else: 
        return topicnum
df["condensedtopics"] = df.topicnumber.apply(topic_condenser)

In [None]:
# doing some research on the not so obvious topics
df = df[df["topicnumber"] != 2]
#df_16.ix[15065, "caseurl"]
df_16

In [None]:
df_details = pd.read_csv("detailsford3.csv", encoding = 'iso-8859-1')
df_details.columns = ["condensedtopics", "topicname", "title", "exampleURL", "leadpp", "topicwords"]
df_details

In [None]:
df_with_details = pd.merge(df, df_details, how = "inner", on = "condensedtopics")

In [None]:
#temp_df = df_with_details[['years', 'condensedtopics', "topicname", "title", "exampleURL", "leadpp", "topicwords"]]
#temp_df.to_csv("temp.csv")
temp_df = df_with_details[['years', 'condensedtopics']]
temp_df.condensedtopics.value_counts()

In [None]:
#dummy value for each existing topic. Pay no attention to this error.
temp_df["count"] = 1
temp_df

In [None]:
#this condenses each point for the same year into n number of points 
temp_df = temp_df.groupby(["years", "condensedtopics"]).count().reset_index()
temp_df

In [None]:
data_fillna = temp_df.pivot_table("count", "years", "condensedtopics").fillna(0).unstack().reset_index()

In [None]:
#we lose the count label column in the previous steps, so we're just renaming it here, and reordering columns based on 
#how they are arranged in the viz csv
data_fillna.columns = ["condensedtopics", "years", "count"]
data_fillna = data_fillna[["years", "condensedtopics", "count"]]

In [None]:
#merge data
final_data = pd.merge(data_fillna, df_details, how = "inner", on = "condensedtopics")
final_data

In [None]:
#sort by year
final_data.sort_values("years", inplace = True, ascending = True)
final_data

In [None]:
#backup file
final_data.to_csv("topicsbyyear.csv", index = False)
final_data.to_csv("year_topic_data2.csv", index = False)

In [None]:
'''the best part of this viz is the brushing side to side effect. For that, we need total cases for every year
and need no other information'''

data_fillna.groupby("years")["count"].sum().reset_index().to_csv("year_data.csv", index = False)