# Searchlight

## Imports

In [18]:
import pandas as pd
import re
import numpy as np
from datascience import *
import urllib
from selenium import webdriver
from time import sleep
import requests
from bs4 import BeautifulSoup as Soup

## Initialize DataScience Tables

In [2]:
# Speech table: created empty, with one empty array per column label.
speeches = Table().with_columns(*(
    item
    for label in (
        "speech_id",
        "speaker_id",
        "proceeding_id",
        "topic_id",
        "word_count",
        "speech_text",
        'file_name',
    )
    for item in (label, make_array())
))

In [3]:
speeches

speech_id,speaker_id,proceeding_id,topic_id,word_count,speech_text,file_name


In [4]:
# Speaker table: created empty, with one empty array per column label.
speakers = Table().with_columns(*(
    item
    for label in (
        "speaker_id",
        "first_name",
        "last_name",
        "type",
        "party",
        "state",
        "district",
        "bio_guide_id",
        "congress_id",
    )
    for item in (label, make_array())
))

In [5]:
speakers

speaker_id,first_name,last_name,type,party,state,district,bio_guide_id,congress_id


In [6]:
# Topic table: created empty, with one empty array per column label.
topics = Table().with_columns(*(
    item
    for label in ("topic_id", "topic_name")
    for item in (label, make_array())
))

In [7]:
topics

topic_id,topic_name


In [8]:
# Proceedings table: created empty, with one empty array per column label.
proceedings = Table().with_columns(*(
    item
    for label in ("proceeding_id", "date", "title")
    for item in (label, make_array())
))

In [9]:
proceedings

proceeding_id,date,title


## Initializing Parsing

### Text Parsing

In [10]:
def sep_speech(string):
    """Split a text file into alternating speaker names and speech texts.

    The file at path ``string`` is read, its newlines are removed, and the
    text is split at every literal "Mr. ", "Ms. " or "Mrs. ". The text before
    the first honorific is discarded. A remaining piece is kept only when it
    starts with a word followed by ". "; that word is recorded as the
    speaker's last name.

    Returns:
        A flat numpy array ``[name_0, speech_0, name_1, speech_1, ...]``.
        The array is empty when no piece starts with a name.
    """
    with open(string) as file:
        parse_file = file.read()
    parse_file = parse_file.replace('\n', '')
    # Collapse the form of address so the honorific split below does not
    # treat "Mr. President" as the start of a new speaker.
    parse_file = parse_file.replace('Mr. President', 'MrPresident')

    # The dots are escaped: unescaped, "Mr. " also matched text such as
    # "Mrs " or "Mr, " and split speeches in the wrong place.
    pieces = re.split(r'Mr\. |Ms\. |Mrs\. ', parse_file)[1:]

    name_and_speech = []
    for piece in pieces:
        name_match = re.match(r'\w*\. ', piece)
        if name_match is None:
            # The piece does not start with "<word>. ", so there is no
            # speaker name to attach it to; skip it.
            continue
        # Drop the trailing ". " from the matched name.
        name_and_speech.append(name_match.group(0)[:-2])
        # NOTE(review): this removes EVERY capitalised word followed by ". "
        # in the piece, not only the leading speaker name, so sentence-ending
        # words such as "America. " are deleted too. Kept as-is; confirm intent.
        name_and_speech.append(re.sub(r'[A-Z]\w*\. ', '', piece))

    # One append onto an empty seed array instead of one array copy per item;
    # an input without speakers still yields an empty array.
    return np.append(np.array([]), name_and_speech)

In [11]:
def sep_date_from_file(file):
    """Return the first ``dddd-dd-dd`` digit group in ``file`` split on '-'.

    ``file`` is a string (the demo call below passes a file name). The result
    is a list of three digit strings. An IndexError is raised when the string
    contains no such group.
    """
    dashed_groups = re.findall('[0-9]{4}-[0-9]{2}-[0-9]{2}', file)
    first_group = dashed_groups[0]
    return first_group.split('-')
sep_date_from_file('CREC-2018-01-04-pt1-PgS47.xml')[0]

'2018'

In [12]:
def find_title(string):
    """Return the first title-looking line of the text file at path ``string``.

    A title line is a run of capital letters, spaces, apostrophes and hyphens,
    optionally followed by capitals, digits, hyphens, commas, periods and
    spaces, optionally followed by the literal word "Continued", and ending at
    a newline. The match is returned with surrounding whitespace stripped.

    Raises:
        IndexError: if the file contains no such line.
    """
    with open(string) as file:
        parse_file = file.read()
    # Same preprocessing as sep_speech applies to the text.
    parse_file = parse_file.replace('Mr. President', 'MrPresident')
    # "(?:Continued)?" matches the literal word. The previous "[Continued]*"
    # was a character class, so any tail made of the letters C/o/n/t/i/u/e/d
    # matched and a line ending in " United" was returned as the title.
    # Raw string: the old pattern relied on the invalid escape "\.".
    titles = re.findall(r"[A-Z '\-]+[A-Z0-9\-,. ]*(?:Continued)?\n", parse_file)
    if not titles:
        # Same exception type as indexing the empty list, with a usable message.
        raise IndexError('no title line found in {}'.format(string))
    return titles[0].strip()

### Mods Parsing

In [19]:
def getAllExtensions(file):
    """Parse the file at path ``file`` with the "lxml" parser and return the
    result of ``find_all('extension')`` on the parsed document."""
    # The context manager closes the handle as soon as the text is read;
    # the previous open(file).read() left closing to the garbage collector.
    with open(file) as handler:
        soup = Soup(handler.read(), "lxml")
    return soup.find_all('extension')

In [20]:
# Parse mastermods.xml once when this cell runs; getCongMemberInfoFromMaster
# reads this module-level name on every lookup.
master_extensions = getAllExtensions("mastermods.xml")

In [21]:
def getCongMemberExtension(extensions, last_name):
    """Return the first item of ``extensions`` whose ``str()`` form contains
    ``last_name`` as a substring, or None when no item does."""
    matching = (candidate for candidate in extensions if last_name in str(candidate))
    return next(matching, None)

In [22]:
def getCongMemberExtensionFromFile(last_name, filename):
    """Return the first ``extension`` tag in the file at ``filename`` whose
    string form contains ``last_name``, or None when there is none.

    The file is parsed with the "lxml" parser.
    """
    # Close the handle deterministically; open(filename).read() leaked it.
    with open(filename) as handler:
        soup = Soup(handler.read(), "lxml")
    extensions = soup.find_all('extension')
    # Reuse the shared search instead of repeating its loop here.
    return getCongMemberExtension(extensions, last_name)

In [23]:
def getCongMemberTag(congMemberExtension):
    """Return the first entry of ``congMemberExtension.contents`` whose
    ``str()`` form contains 'congmember', or None when no entry does."""
    children = congMemberExtension.contents
    return next((child for child in children if 'congmember' in str(child)), None)

In [24]:
def getParty(congMemberTag):
    """Return the 'party' entry of ``congMemberTag.attrs`` (KeyError if absent)."""
    attributes = congMemberTag.attrs
    return attributes['party']
def getType(congMemberTag):
    """Return the 'type' entry of ``congMemberTag.attrs`` (KeyError if absent)."""
    attributes = congMemberTag.attrs
    return attributes['type']
def getAuthorityId(congMemberTag):
    """Return the 'authorityid' entry of ``congMemberTag.attrs`` (KeyError if absent)."""
    attributes = congMemberTag.attrs
    return attributes['authorityid']
def getBioGuideId(congMemberTag):
    """Return the 'bioguideid' entry of ``congMemberTag.attrs`` (KeyError if absent)."""
    attributes = congMemberTag.attrs
    return attributes['bioguideid']
def getState(congMemberTag):
    """Return the 'state' entry of ``congMemberTag.attrs`` (KeyError if absent)."""
    attributes = congMemberTag.attrs
    return attributes['state']
def getCongressId(congMemberTag):
    """Return the 'congress' entry of ``congMemberTag.attrs`` (KeyError if absent)."""
    attributes = congMemberTag.attrs
    return attributes['congress']

In [25]:
def getDistrictTag(congMemberExtension):
    """Return the first entry of ``congMemberExtension.contents`` whose
    ``str()`` form contains 'district', or None when no entry does."""
    children = congMemberExtension.contents
    return next((child for child in children if 'district' in str(child)), None)

In [26]:
def getFirstName(congMemberTag):
    """Return the first whitespace-separated word of the second name entry.

    The entries of ``congMemberTag.contents`` whose ``str()`` form contains
    'name' are collected in order; the ``.string`` of the second one is split
    on whitespace and its first word is returned.

    Returns:
        The first word, or None when there are fewer than two such entries,
        the second entry has no usable ``.string``, or that string is blank.
    """
    name_tags = [tag for tag in congMemberTag.contents if 'name' in str(tag)]
    try:
        return name_tags[1].string.split()[0]
    except (IndexError, AttributeError):
        # IndexError: fewer than two name entries, or a blank string.
        # AttributeError: ``.string`` is missing or None.
        # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
        return None

In [30]:
def getCongMemberInfoFromMaster(last_name, mods_filename):
    """Build a member-info array for ``last_name`` from ``master_extensions``.

    The first master extension whose string form contains ``last_name`` is
    used. The lookup is delegated to
    ``getCongMemberInfoFromLocal(last_name, mods_filename)`` when the master
    list has no matching extension, when that extension has no congmember
    tag, or when the member's type attribute is 'DELEGATE'.

    Returns:
        A numpy array of nine values in this order: authority id, first name,
        last name, type, party, state, district, bioguide id, congress id
        (the same order as the columns of the ``speakers`` table). District is
        'N/A' unless the type is 'REPRESENTATIVE' and a district tag exists.
        If the local lookup for a delegate fails, a three-value placeholder
        ``['', '', last_name]`` is returned instead.
    """
    extension = getCongMemberExtension(master_extensions, last_name)
    congMemberTag = getCongMemberTag(extension) if extension is not None else None
    if congMemberTag is None:
        # Nothing usable in the master list. The previous code reached this
        # fallback through a bare except and then called the misspelled name
        # "getcongMemberInfoFromLocal", which raised NameError.
        return getCongMemberInfoFromLocal(last_name, mods_filename)

    congMemType = getType(congMemberTag)
    if congMemType == 'DELEGATE':
        try:
            return getCongMemberInfoFromLocal(last_name, mods_filename)
        except Exception:
            # Best-effort placeholder when the local file cannot supply the
            # member. NOTE(review): this has three values while every other
            # return path has nine; confirm callers handle the short form.
            info = make_array()
            for value in ('', '', last_name):
                info = np.append(info, value)
            return info

    district = 'N/A'
    if congMemType == 'REPRESENTATIVE':
        district_tag = getDistrictTag(extension)
        if district_tag is not None:
            district = district_tag.string

    fields = (
        getAuthorityId(congMemberTag),
        getFirstName(congMemberTag),
        last_name,
        congMemType,
        getParty(congMemberTag),
        getState(congMemberTag),
        district,
        getBioGuideId(congMemberTag),
        getCongressId(congMemberTag),
    )
    info = make_array()
    # Appended one value at a time, exactly as before, so the resulting
    # array's dtype and element types are unchanged.
    for value in fields:
        info = np.append(info, value)
    return info

In [31]:
def getChamber(congMemberTag):
    """Return the 'chamber' entry of ``congMemberTag.attrs`` (KeyError if absent)."""
    attributes = congMemberTag.attrs
    return attributes['chamber']

In [32]:
def getCongMemberInfoFromLocal(last_name, mods_filename):
    """Build a member-info array for ``last_name`` from the file ``mods_filename``.

    The member type is derived from the congmember tag's 'chamber' attribute:
    'H' gives 'REPRESENTATIVE', 'S' gives 'SENATOR', anything else 'N/A'.

    Returns:
        A numpy array of nine values in this order: authority id, first name,
        last name, type, party, state, district, bioguide id, congress id.
        District is 'N/A' unless the type is 'REPRESENTATIVE' and a district
        tag exists.

    Raises:
        AttributeError: if the file has no extension containing ``last_name``
            (the tag lookup then runs on None), or if that extension has no
            congmember tag.
        KeyError: if the congmember tag lacks one of the read attributes.
    """
    extension = getCongMemberExtensionFromFile(last_name, mods_filename)
    congMemberTag = getCongMemberTag(extension)

    congMemChamber = getChamber(congMemberTag)
    if congMemChamber == 'H':
        congMemType = 'REPRESENTATIVE'
    elif congMemChamber == 'S':
        congMemType = "SENATOR"
    else:
        congMemType = 'N/A'

    district = 'N/A'
    if congMemType == 'REPRESENTATIVE':
        # Explicit None check instead of a bare except around the lookup,
        # which would also have hidden KeyboardInterrupt and real bugs.
        district_tag = getDistrictTag(extension)
        if district_tag is not None:
            district = district_tag.string

    fields = (
        getAuthorityId(congMemberTag),
        getFirstName(congMemberTag),
        last_name,
        congMemType,
        getParty(congMemberTag),
        getState(congMemberTag),
        district,
        getBioGuideId(congMemberTag),
        getCongressId(congMemberTag),
    )
    info = make_array()
    # Appended one value at a time, exactly as before, so the resulting
    # array's dtype and element types are unchanged.
    for value in fields:
        info = np.append(info, value)
    return info