In [4]:
# pip install -U spacy
# pip install scispacy
# pip install "tensorflow_hub>=0.6.0"
# pip install "tensorflow>=2.0.0"

import numpy as np
import torch
import tensorflow
import pandas as pd
import os
import json
import time
import glob
import re
import sys
import collections
from itertools import chain
import random
import joblib
from bs4 import BeautifulSoup

# import dask
# from dask import delayed,compute
# import dask.dataframe as dd
# from dask.multiprocessing import get

from nltk import flatten

import scispacy
import spacy
import en_core_sci_lg
import spacy
from spacy import displacy
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from spacy.symbols import ORTH
from spacy.util import minibatch, compounding

import tensorflow as tf
import tensorflow_hub as hub

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from tqdm._tqdm_notebook import tqdm_notebook
from tqdm import tqdm
tqdm_notebook.pandas()


Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook
  from pandas import Panel


## Load the SpaCy and USE model

In [5]:
nlp = spacy.load("en_core_web_sm")

In [6]:
module_url = "https://tfhub.dev/google/nnlm-en-dim128/2"
embed = hub.KerasLayer(module_url)

## Load the dataset

In [17]:
df = pd.read_csv('action_state_text.csv', index_col=0)
df

Unnamed: 0,action,state
0,hear argument morning case number brnovich ver...,mr chief justice may please court think key co...
1,mr carvin understand test articulate reduce an...,entirely mr chief justice reason involve diffe...
2,talk concern analysis would drive racial propo...,well mean neutral system must change order max...
3,really maximize participation equalize word co...,well example would eliminate valuable antifrau...
4,thank mr chief justice mr carvin understand ra...,well justice thomas think speak precisely term...
...,...,...
11515,let's put aside unwilling second allege employ...,think would look know publicly disclose financ...
11516,allege dispute pleading stage envision kind pr...,well mean think allege sufficient particularit...
11517,question subject matter jurisdiction article i...,exactly right honor that's actually happen dis...
11518,think that's right separate factual proceeding...,exactly that's actually happen plan overfunded...


In [19]:
action = nltk.flatten(df["action"].to_list())
state = nltk.flatten(df["state"].to_list())

In [20]:
action_embeddings = embed(action)
action_embeddings.shape

TensorShape([11520, 128])

In [34]:
state_embeddings = embed(state)
state_embeddings.shape

TensorShape([11520, 128])

# Query functions

In [21]:
#https://towardsdatascience.com/nlp-based-information-retrieval-system-answer-key-questions-from-the-scientific-literature-b8e5c3aa5a3e

def cosine_similarity_func(embeddings,embeddings_query):
    '''
    Input:
         embeddings: array or tensor of all sentence embeddings (nX128 for n sentences)
         embeddings_query: array or tensor of query embedding (1X128)
    Output:
         cosine_similarity: cosine similarity of query with each sentence (nX1) 
    '''
    # x.y
    dot_product = np.sum(np.multiply(np.array(embeddings),np.array(embeddings_query)),axis=1)
    
    #||x||.||y||
    prod_sqrt_magnitude = np.multiply(np.sum(np.array(embeddings)**2,axis=1)**0.5, np.sum(np.array(embeddings_query)**2,axis=1)**0.5)
    
    #x.y/(||x||.||y||)
    cosine_similarity  = dot_product/prod_sqrt_magnitude
    return cosine_similarity

# function for recommend text based upon query
def recommended_text(query,embeddings,sent,threshold_min=.95,threshold_max = 1):
    '''
    Input:
         query: list of queries
         embeddings: embeddings of all sentences
         sent:list all sentences
         threshold_min: lower limit of threshold for which sentence is supposed to be similar with query
         threshold_max: upper limit of threshold for which sentence is supposed to be similar with query
         
    Output:
          recommend_text: list of similar sentences with query
    '''
    recommend_text = []
    embeddings_query = embed(query) #create embedding for query
    
    cosine_similarity = cosine_similarity_func(embeddings,embeddings_query) # get cosine similarity with all sentences
    
    # standardize cosine similarity output, Range(0,1)
    standardize_cosine_simi  = (cosine_similarity-min(cosine_similarity))/(max(cosine_similarity)-min(cosine_similarity))
    
    #sort sent based upon cosine similarity score
    sent_prob = list(map(lambda x, y:(x,y), standardize_cosine_simi, sent)) 
    sent_prob.sort(key=lambda tup: tup[0], reverse=True)

    # select sentences by using upper and lower threshold
    for i,j in sent_prob:
        if (i >threshold_min) and (i<=threshold_max):
            recommend_text.append(j)
    return recommend_text

## Example for one query result

In [59]:
query = [df['state'][0]]
query

["mr chief justice may please court think key conceptual point understand arizona deny anyone vote opportunity kind there's like literacy test deny right vote like vote dilution white bloc voting deny minority equal opportunity elect everyone eligible register vote utilize myriad opportunities arizona's offer day vote mail free person since there's denial opportunity disparate impact claim would even cognizable context title vii disparate impact relate denial employment opportunity job promotion get involve process one's ever bring title vii claim say can't require people send application minority less access transportation mail analogous claim make respondent try move disparate impact entirely different context since there's denial vote opportunity context circumstance time place manner rule violate section extraordinarily limit occur state organize time place manner rule stack way minority less opportunity non-minorities cast vote come directly plain language section also course prac

In [60]:
result = recommended_text(query, action_embeddings, action, threshold_min=.975)

In [61]:
len(result)

6

In [58]:
result

["that's one take point like sometimes grammar give way meaning clear meaning clear can't imagine eminent domain use maintain land kind like that's impossible argument brief know mean domestic airline drive think point mr clement might make meaning impossible know there's little bit awkwardness say number generator store number actually explain look kind dial device exist time device generate number immediate dialing device store number later dial meaning possible go mean combine fact grammatically proper way read sentence",
 'sorry mr clement statutory phrase interpret case structure fairly common list two activity store produce telephone number follow modifying phrase use random sequential number generator people make statement like time know hear read understand mean without look treatise grammar syntax usage interpretation way ask make sense give lot example little time question ask sense get arcane stuff question jump make sense speak store list telephone number use random sequent

In [54]:
df[df['action'].isin(result)].index.tolist()

[50, 71, 4080, 4980, 9505, 10009]

## Full sentences as queries

In [63]:
response = []
for i in tqdm(range(df.shape[0])):
    query = [df['state'][i]]
    result = recommended_text(query, action_embeddings, action, threshold_min=.975)
    action_idx = df[df['action'].isin(result)].index.tolist()
    response.append(action_idx)

100%|██████████| 11520/11520 [06:47<00:00, 28.25it/s]


In [81]:
response_full_sent_search = list(zip(*[list(range(df.shape[0])), 
                                       [', '.join(str(e) for e in sub_response) for sub_response in response]]))
full_sent_search_result = pd.DataFrame(response_full_sent_search, columns = ['state_query', 'action_result'])
full_sent_search_result.to_csv('full_sent_search_result.csv')

## Key words as queries

Only use NOUN and ADJ in the state (attorney's statement) as the search query.

In [93]:
response_key = []
for i in tqdm(range(df.shape[0])):
    sent_query = df['state'][i]
    doc = nlp(sent_query)
    query = [' '.join([token.text for token in doc if token.pos_ in(['NOUN', 'ADJ'])])]
    result = recommended_text(query, action_embeddings, action, threshold_min=.975)
    action_idx = df[df['action'].isin(result)].index.tolist()
    response_key.append(action_idx)

100%|██████████| 11520/11520 [09:13<00:00, 20.81it/s]


In [95]:
response_key_word_search = list(zip(*[list(range(df.shape[0])), 
                                       [', '.join(str(e) for e in sub_response) for sub_response in response_key]]))
key_word_search_result = pd.DataFrame(response_key_word_search, columns = ['state_query', 'action_result'])
key_word_search_result.to_csv('key_word_search_result.csv')

## Add other info to the search database

facts, questions, conclusions

In [128]:
years = ['2017', '2018', '2019', '2020']
additional_info = []

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

for y in tqdm(years):
    meta = joblib.load('data/cases_metadata/' + y + '.pkl')
    for i in tqdm(range(len(meta))):
        try:
            soup = BeautifulSoup(cleanhtml(meta[i][2]['facts_of_the_case']), 'html.parser')
            facts = re.split('\.|;|\n|\?|\xa0', soup.text.replace('D.C.', 'DC').replace('U.S.', 'US'))

            soup = BeautifulSoup(cleanhtml(meta[i][2]['question']), 'html.parser')
            questions = re.split('\.|;|\n|\?|\xa0', soup.text.replace('D.C.', 'DC').replace('U.S.', 'US'))

            soup = BeautifulSoup(cleanhtml(meta[i][2]['conclusion']), 'html.parser')
            conclusions = re.split('\.|;|\n|\?|\xa0', soup.text.replace('D.C.', 'DC').replace('U.S.', 'US'))

            additional_info.extend(facts + questions + conclusions)
        except Exception as e:
            print(str(e))
            continue

  0%|          | 0/4 [00:00<?, ?it/s]
100%|██████████| 79/79 [00:00<00:00, 3681.09it/s]
 25%|██▌       | 1/4 [00:00<00:00,  3.94it/s]

expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object



100%|██████████| 76/76 [00:00<00:00, 3182.20it/s]
 50%|█████     | 2/4 [00:00<00:00,  3.88it/s]
  0%|          | 0/61 [00:00<?, ?it/s]

expected string or bytes-like object


100%|██████████| 61/61 [00:00<00:00, 3414.10it/s]
 75%|███████▌  | 3/4 [00:00<00:00,  4.15it/s]
100%|██████████| 64/64 [00:00<00:00, 3336.30it/s]
100%|██████████| 4/4 [00:00<00:00,  4.91it/s]

expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
e




In [131]:
additional_info = [e for e in additional_info if (e != '') and (len(e.split(' '))>5)]
additional_info

['Individual voters in Texas, along with organizations representing Latinos and African Americans, filed a number of lawsuits in 2011, challenging the Texas legislature’s congressional and state house redistricting plans',
 ' The actions were consolidated and proceed in the US District Court for the Western District of Texas (“Texas District Court”)',
 ' The plaintiffs alleged racial gerrymandering in violation of § 2 of the Voting Rights Act (VRA) as well as the 14th and 15th Amendments to the United States Constitution',
 ' At that time Texas was bound by the preclearance requirements under § 5 of the VRA, and therefore the State simultaneously filed an action in the US District Court for the District of Columbia (“DC District Court”) seeking preclearance of the redistricting plans',
 'While trial proceedings were pending in both district courts, the 2012 primary elections were approaching',
 ' As a result, the Texas District Court assumed the task of implementing interim redistricti

In [132]:
additional_info_embeddings = embed(additional_info)
additional_info_embeddings.shape

TensorShape([5123, 128])

In [133]:
response_add_info = []
for i in tqdm(range(df.shape[0])):
    sent_query = df['state'][i]
    doc = nlp(sent_query)
    query = [' '.join([token.text for token in doc if token.pos_ in(['NOUN', 'ADJ'])])]
    result = recommended_text(query, additional_info_embeddings, additional_info, threshold_min=.975)
    response_add_info.append(result)

100%|██████████| 11520/11520 [04:39<00:00, 41.29it/s]


In [135]:
response_key_word_add_info_search = list(zip(*[list(range(df.shape[0])), 
                                       ['\n'.join(str(e) for e in sub_response) for sub_response in response_add_info]]))
key_word_add_info_search_result = pd.DataFrame(response_key_word_add_info_search, columns = ['state_query', 'action_result'])
key_word_add_info_search_result.to_csv('key_word_add_info_search_result.csv')

In [136]:
key_word_add_info_search_result

Unnamed: 0,state_query,action_result
0,0,Because subject matter jurisdiction cannot be...
1,1,The Chief Justice noted that the Secretary co...
2,2,The appeals court further held that other fac...
3,3,Federal district courts are split as to wheth...
4,4,Federal district courts are split as to wheth...
...,...,...
11515,11515,because the scheme did not aim to obtain money...
11516,11516,Juror testimony was also deemed inadmissible ...
11517,11517,Because subject matter jurisdiction cannot be...
11518,11518,ERISA pre-empts state laws that relate to a c...


In [141]:
pd.set_option('display.max_colwidth', None)
pd.concat([df, key_word_add_info_search_result], axis=1).iloc[:5,:]

Unnamed: 0,action,state,state_query,action_result
0,hear argument morning case number brnovich versus democratic national committee consolidated case mr carvin,mr chief justice may please court think key conceptual point understand arizona deny anyone vote opportunity kind there's like literacy test deny right vote like vote dilution white bloc voting deny minority equal opportunity elect everyone eligible register vote utilize myriad opportunities arizona's offer day vote mail free person since there's denial opportunity disparate impact claim would even cognizable context title vii disparate impact relate denial employment opportunity job promotion get involve process one's ever bring title vii claim say can't require people send application minority less access transportation mail analogous claim make respondent try move disparate impact entirely different context since there's denial vote opportunity context circumstance time place manner rule violate section extraordinarily limit occur state organize time place manner rule stack way minority less opportunity non-minorities cast vote come directly plain language section also course practical matter circumstance state erect kind cognizable barrier minority voting respondent alternative view war text section section say vote practice cannot provide less opportunity say voting practice provide opportunity nonetheless unlawful external socioeconomic factor somehow contribute disproportionate utilization language nowhere text never even mention legislative history clear,0,"Because subject matter jurisdiction cannot be waived by failure to challenge it, the district court dismissed Davis’s religious discrimination claim with prejudice\n The economy of litigation does however favor delaying individual claims until class certification has been denied"
1,mr carvin understand test articulate reduce anything deal time place manner intent test rather result test that's provide section word long time place manner restriction there's difference minority voter white voter problem true,entirely mr chief justice reason involve differential system unequal access regardless whether unequal access racially motivate would prove intent behind differential access provide minority suppress hinder minority vote that's key distinction mobile versus bolden,1,"The Chief Justice noted that the Secretary conducted an analysis weighing the value of obtaining more complete and accurate citizenship data against the uncertain risk that reinstating a citizenship question would result in a lower response rate\n Third, absent a need for protection, the public interest in fair and effective law enforcement weighs in favor of comprehensive access to evidence\n The appeals court further held that other factors significant under an international comity balancing test derived from 3rd and 9th Circuit case law affirmed that abstention was appropriate in the instant matter"
2,talk concern analysis would drive racial proportionality respondent approach understand concern talk districting bad thing talk electoral procedure,well mean neutral system must change order maximize minority voting strength regardless strong justification thing provide unfairness minority must rejigger every aspect time place manner registration election day early voting order maximize minority participation bad kind race-conscious activity subordinate,2,The appeals court further held that other factors significant under an international comity balancing test derived from 3rd and 9th Circuit case law affirmed that abstention was appropriate in the instant matter\n At issue is whether the plan systematically dilutes the voting strength of Democratic voters statewide\n The Court’s precedents establish that neutral government action is not unconstitutional solely because it fails to benefit religious exercise
3,really maximize participation equalize word come disparate result disparate result avoid,well example would eliminate valuable antifraud concern implicate ban ballot harvesting would substitute federal court state legislatures make rule question what's wrong question system impose unfairness group nonetheless change simply find different method vote convenient,3,"Federal district courts are split as to whether state courts have subject matter jurisdiction over covered class actions that allege only 1933 Act claims\n if federal law addresses the issue, state law is inapplicable\nPlaintiffs in this case challenge the plan as an unconstitutional partisan gerrymander"
4,thank mr chief justice mr carvin understand race neutrality argument normally see come context non-discrimination statute fourteenth amendment really require equal treatment race neutrality approach fit within language voting right act though speak term,well justice thomas think speak precisely term say voting practice cannot result minority less opportunity non-minorities say system need equally open say long everyone opportunity system equally open section condemn respondent however would say even minority give precisely opportunity unless utilize proportionally somehow come within constraint section there's nothing text section say need expand time place manner restriction enhance proportionality maximization indeed rule virtually every time place manner restriction country would illegal overnight severe disproportionate utilization socioeconomic disparity ubiquitous surely congress intend kind sea change would give hint legislative history rule contrary text section formulation congress intend,4,Federal district courts are split as to whether state courts have subject matter jurisdiction over covered class actions that allege only 1933 Act claims\nstate courts have subject matter jurisdiction over covered class actions that allege only Securities Act of 1933 claims
