# Searchlight

## Imports

In [1]:
import pandas as pd
import re
import numpy as np
from datascience import *
import urllib
from selenium import webdriver
from time import sleep
import requests
from bs4 import BeautifulSoup as Soup
import os

## Initialize DataScience Tables

In [2]:
#Speech Table
speeches = Table().with_columns("speech_id", make_array(), 
                                "speaker_id", make_array(), 
                                "proceeding_id", make_array(), 
                                "topic_id", make_array(), 
                                "word_count", make_array(), 
                                "speech_text", make_array(),
                                'file_name', make_array())

In [3]:
speeches

speech_id,speaker_id,proceeding_id,topic_id,word_count,speech_text,file_name


In [4]:
#Speaker Table
speakers = Table().with_columns("speaker_id", make_array(), 
                                "first_name", make_array(), 
                                "last_name", make_array(), 
                                "type" , make_array(),
                                "party", make_array(), 
                                "state", make_array(), 
                                "district", make_array(),
                                "bio_guide_id", make_array(),
                                "congress_id", make_array())

In [5]:
speakers

speaker_id,first_name,last_name,type,party,state,district,bio_guide_id,congress_id


In [6]:
topics = Table().with_columns("topic_id", make_array(), 
                                "topic_name", make_array())

In [7]:
topics

topic_id,topic_name


In [8]:
#Proceedings Table
proceedings = Table().with_columns("proceeding_id", make_array(), 
                              "date", make_array(),
                              "title", make_array())

In [9]:
proceedings

proceeding_id,date,title


## Initializing Parsing

### Text Parsing

In [10]:
def remove_space(regex):
    return regex.group().replace(' ', '')

In [11]:
def sep_speech(string):
    parse_file = ''
    with open(string) as file:
        for line in file:
            parse_file += line
    parse_file = parse_file.replace('\n', '')
#     parse_file = parse_file.replace('Mr. President', 'MrPresident')
#     parse_file = parse_file.replace('Mr. Short', 'Mr.Short')
    parse_file = re.sub('Mr. [A-Z][a-z]', remove_space, parse_file)
    
    split = re.split(r'Mr. |Ms. |Mrs. ', parse_file)
    split.pop(0)
    name_and_speech = make_array()
    for i in np.arange(len(split)):
        try:
            lastname = re.match('[A-Z]*\. ', split[i]).group(0)[:-2]
            name_and_speech = np.append(name_and_speech, lastname)
            value = re.sub('[A-Z]\w*\. ', '', split[i])
            name_and_speech = np.append(name_and_speech, value)
        except:
            abcabcabc = 1
    return name_and_speech

In [12]:
def sep_date_from_file(file):
    abcdef = re.findall('[0-9]{4}-[0-9]{2}-[0-9]{2}', file)
    return re.split('-', abcdef[0])

In [13]:
def find_title(file_name):
    parse_file = ''
    with open(file_name) as file:
        for line in file:
            parse_file += line
    parse_file = parse_file.replace('Mr. President', 'MrPresident')
    title = re.findall('[A-Z \'-]+[A-Z0-9-,\. ]*[Continued]*\\n', parse_file)
    return title[0].strip()

### Mods Parsing

In [14]:
def getAllExtensions(file):
    handler = open(file).read()
    soup = Soup(handler, "lxml")
    return soup.find_all('extension')

In [15]:
master_extensions = getAllExtensions("mastermods.xml")

In [16]:
def getCongMemberExtension(extensions, last_name):
    for extension in extensions:
        ext = str(extension)
        if last_name in ext:
            return extension

In [17]:
def getCongMemberExtensionFromFile(last_name, filename):
    handler = open(filename).read()
    soup = Soup(handler, "lxml")
    extensions = soup.find_all('extension')
    for extension in extensions:
        ext = str(extension)
        if last_name in ext:
            return extension

In [18]:
def getCongMemberTag(last_name, congMemberExtension):
    contents = congMemberExtension.contents
    for tag in contents:
        tag_str = str(tag)
        if 'congmember' in tag_str:
            if last_name in tag_str:
                return tag

In [19]:
def getParty(congMemberTag):
    return congMemberTag.attrs['party']
def getType(congMemberTag):
    return congMemberTag.attrs['type']
def getAuthorityId(congMemberTag):
    return congMemberTag.attrs['authorityid']
def getBioGuideId(congMemberTag):
    return congMemberTag.attrs['bioguideid']
def getState(congMemberTag):
    return congMemberTag.attrs['state']
def getCongressId(congMemberTag):
    return congMemberTag.attrs['congress']

In [20]:
def getDistrictTag(congMemberExtension):
    contents = congMemberExtension.contents
    for tag in contents:
        tag_str = str(tag)
        if 'district' in tag_str:
            return tag

In [21]:
def getFirstName(congMemberTag):
    contents = congMemberTag.contents
    name_tags = []
    for tag in contents:
        tag_str = str(tag)
        if 'name' in tag_str:
            name_tags += [tag]
    try:
        first_name = name_tags[1].string.split()[0]
    except:
        first_name = None
    return first_name

In [40]:
def getCongMemberInfoFromMaster(last_name, mods_filename):
    info = make_array()
    print(mods_filename)
    if last_name == 'FISHCER':
        last_name = 'FISCHER'
    if last_name == 'VANHOLLEN':
        last_name = 'VAN HOLLEN'
    if last_name == 'BORDALLO' or last_name == 'CAPITO':
        return getCongMemberInfoFromLocal(last_name, mods_filename)
    if last_name == 'SOUZZI':
        print(mods_filename)
        return [2341, 'Thomas', 'SUOZZI', 'REPESENTATIVE', 'R', 'NY', 'N/A', 'S001201', 115 ]
    if last_name == 'ROUNDS':
        print(mods_filename)
        return [2288, 'Mike', 'ROUNDS', 'SENATOR', 'R', 'SD', 'N/A', "R000605", 115]
    try:
        extension = getCongMemberExtension(master_extensions, last_name)
        if extension is None:
            return getCongMemberInfoFromLocal(last_name, mods_filename)
        congMemberTag = getCongMemberTag(last_name, extension)
    except:
        return getCongMemberInfoFromLocal(last_name, mods_filename)
    
    
    
    congMemType = getType(congMemberTag)
    district = 'N/A'
    if congMemType == 'DELEGATE':
        try:
            info = getcongMemberInfoFromLocal(last_name, mods_filename)
        except:
            info = [99999999999999, 'First Name unavailable', last_name, congMemType, 'Party Info Unavailable','state info unavailable', district, 'BioGuideID unavailable', 'CongressID unavailable']
    else:
        if congMemType == 'REPRESENTATIVE':
            try:
                district_tag = getDistrictTag(extension)
                district = district_tag.string
            except:
                district = 'N/A'
    try:
        bioGuideID = getBioGuideId(congMemberTag)
    except:
        bioGuideID = 99999999999999999
    info = [getAuthorityId(congMemberTag), getFirstName(congMemberTag), last_name, congMemType, getParty(congMemberTag), getState(congMemberTag), district, bioGuideID, getCongressId(congMemberTag) ]
        
    return info

In [23]:
def getChamber(congMemberTag):
    return congMemberTag.attrs['chamber']

In [24]:
def getCongMemberInfoFromLocal(last_name, mods_filename):
    print('checking local file')
    extension = getCongMemberExtensionFromFile(last_name, mods_filename)
    info = make_array()
    congMemberTag = getCongMemberTag(last_name, extension)
    
    congMemChamber = getChamber(congMemberTag)
    congMemType = 'N/A'
    if congMemChamber == 'H':
        congMemType = 'REPRESENTATIVE'
    elif congMemChamber == 'S':
        congMemType = "SENATOR"
    
    district = 'N/A'
    if congMemType == 'REPRESENTATIVE':
        try:
            district_tag = getDistrictTag(extension)
            district = district_tag.string
        except:
            district = 'N/A'
            
#     info = np.append(info, getAuthorityId(congMemberTag))
#     info = np.append(info, getFirstName(congMemberTag))
#     info = np.append(info, last_name)
#     info = np.append(info, congMemType)
#     info = np.append(info, getParty(congMemberTag))
#     info = np.append(info, getState(congMemberTag))
#     info = np.append(info, district)
#     info = np.append(info, getBioGuideId(congMemberTag))
#     info = np.append(info, getCongressId(congMemberTag))
    try:
        authID = getAuthorityId(congMemberTag)
    except:
        authID = 99999999999999
    try:
        party = getParty(congMemberTag)
    except:
        party = 'Party information Unavailable'
    try:
        state = getState(congMemberTag)
    except:
        state = 'State Info Unavailable'
    try:
        bioID = getBioGuideId(congMemberTag)
    except:
        bioID = 9999999999999999
    info = [authID, getFirstName(congMemberTag), last_name, congMemType, party, state, district, bioID, getCongressId(congMemberTag) ]
    return info

In [25]:
getCongMemberInfoFromLocal('JAYAPAL', 'CREC-2017-01-05-pt1-PgH101-5.xml')

checking local file


['2354',
 'Pramila',
 'JAYAPAL',
 'REPRESENTATIVE',
 'D',
 'WA',
 'N/A',
 'J000298',
 '115']

## Parsing

def divide_chunks(l, n):
     
    # looping till length l
    for i in range(0, len(l), n): 
        yield l[i:i + n]

In [26]:
#Populate the Speech Table
count = 0
list_of_files = os.listdir("/Users/cun-yuwang/Desktop/scrapeX")

for file in list_of_files:
    print(file)
    if file.endswith(".txt"):
        separated = sep_speech(file)
        i = 0
        while i < len(separated):
            row = make_array()
            text = separated[i+1]
            text = text.replace('MrPresident', 'Mr. President')
            if len(text) > 30:
                row = [count, separated[i], 'proceeding_id', 'topic-id', len(text.split()), text, file] 
                count += 1
                speeches = speeches.with_row(row)     
            i +=2
        count+= 1
        print('finished with file ', count)

.ipynb_checkpoints
CREC-2017-01-05-pt1-PgH101-5.txt
finished with file  190
CREC-2017-01-05-pt1-PgH101-5.xml
mastermods.xml
Searchlight Master.ipynb


In [27]:
speeches.show()

speech_id,speaker_id,proceeding_id,topic_id,word_count,speech_text,file_name
0,GOODLATTE,proceeding_id,topic-id,295,"Mr.Speaker, this morning, for the fourth time in the his ...",CREC-2017-01-05-pt1-PgH101-5.txt
1,HULTGREN,proceeding_id,topic-id,29,"Article I, section 1: ``All legislative powers herein g ...",CREC-2017-01-05-pt1-PgH101-5.txt
2,GOODLATTE,proceeding_id,topic-id,9,I now yield to the gentlewoman from Texas (,CREC-2017-01-05-pt1-PgH101-5.txt
3,GOODLATTE,proceeding_id,topic-id,9,I now yield to the gentleman from Maine (Mr.Poliquin).,CREC-2017-01-05-pt1-PgH101-5.txt
4,POLIQUIN,proceeding_id,topic-id,82,``No person shall be a Representative who shall not have ...,CREC-2017-01-05-pt1-PgH101-5.txt
5,GOODLATTE,proceeding_id,topic-id,9,I now yield to the gentleman from Minnesota (Mr.Walz).,CREC-2017-01-05-pt1-PgH101-5.txt
6,WALZ,proceeding_id,topic-id,69,``The number of Representatives shall not exceed one for ...,CREC-2017-01-05-pt1-PgH101-5.txt
7,GOODLATTE,proceeding_id,topic-id,10,I now yield to the gentleman from New Jersey (Mr.Lance).,CREC-2017-01-05-pt1-PgH101-5.txt
8,LANCE,proceeding_id,topic-id,41,``When vacancies happen in the representation from any S ...,CREC-2017-01-05-pt1-PgH101-5.txt
9,GOODLATTE,proceeding_id,topic-id,10,I now yield to the gentleman from Texas (Mr.Gene Green).,CREC-2017-01-05-pt1-PgH101-5.txt


In [28]:
#create dictionairy of unique last names to files
distinct_lastname_table = speeches.group('speaker_id')
lastname_file_table = speeches.join('speaker_id', distinct_lastname_table, 'speaker_id')
lastname_file_table = lastname_file_table.drop('count').drop('speech_id').drop('proceeding_id').drop('topic_id').drop('word_count').drop('speech_text')
name_to_xml = {}
lastnames = lastname_file_table.column(0)
files = lastname_file_table.column(1)
count = 0
while count < len(lastnames):
    name_to_xml[lastnames[count]] = files[count].replace('.txt', '.xml')
    count += 1

In [None]:
#Populate Speaker Table

for name in list(name_to_xml.keys()):
    print(name)
    row = getCongMemberInfoFromMaster(name, name_to_xml[name])
    speakers = speakers.with_row(row)
speakers.show(5)
speeches.show(5)

ALLEN
CREC-2017-01-05-pt1-PgH101-5.xml
ARRINGTON
CREC-2017-01-05-pt1-PgH101-5.xml
checking local file
BABIN
CREC-2017-01-05-pt1-PgH101-5.xml
BACON
CREC-2017-01-05-pt1-PgH101-5.xml
checking local file
BARRAGAN
CREC-2017-01-05-pt1-PgH101-5.xml
checking local file
BERA
CREC-2017-01-05-pt1-PgH101-5.xml
BEYER
CREC-2017-01-05-pt1-PgH101-5.xml
BIGGS
CREC-2017-01-05-pt1-PgH101-5.xml
checking local file
BLUMENAUER
CREC-2017-01-05-pt1-PgH101-5.xml
BONAMICI
CREC-2017-01-05-pt1-PgH101-5.xml
BOST
CREC-2017-01-05-pt1-PgH101-5.xml
CARBAJAL
CREC-2017-01-05-pt1-PgH101-5.xml
checking local file
CARDENAS
CREC-2017-01-05-pt1-PgH101-5.xml
CARTWRIGHT
CREC-2017-01-05-pt1-PgH101-5.xml
CORREA
CREC-2017-01-05-pt1-PgH101-5.xml
checking local file
CROWLEY
CREC-2017-01-05-pt1-PgH101-5.xml
DAVIDSON
CREC-2017-01-05-pt1-PgH101-5.xml
checking local file
DEMINGS
CREC-2017-01-05-pt1-PgH101-5.xml
checking local file
DENHAM
CREC-2017-01-05-pt1-PgH101-5.xml
DUNN
CREC-2017-01-05-pt1-PgH101-5.xml
checking local file
ESTY
CRE

In [31]:
speakers.show()

speaker_id,first_name,last_name,type,party,state,district,bio_guide_id,congress_id
2239,Rick,ALLEN,REPRESENTATIVE,R,GA,,A000372,115
2350,Jodey,ARRINGTON,REPRESENTATIVE,R,TX,,A000375,115
2270,Brian,BABIN,REPRESENTATIVE,R,TX,,B001291,115
2337,Don,BACON,REPRESENTATIVE,R,NE,,B001298,115
2311,Nanette,BARRAGAN,REPRESENTATIVE,D,CA,,B001300,115
2102,Ami,BERA,REPRESENTATIVE,D,CA,,B001287,115
2272,Donald,BEYER,REPRESENTATIVE,D,VA,,B001292,115
2307,Andy,BIGGS,REPRESENTATIVE,R,AZ,,B001302,115
99,Earl,BLUMENAUER,REPRESENTATIVE,D,OR,,B000574,115
2092,Suzanne,BONAMICI,REPRESENTATIVE,D,OR,,B001278,115


In [32]:
names = speakers.column('last_name')
ids = speakers.column('speaker_id')
name_to_id = dict(zip(names, ids))
name_to_id['CAPITO'] = 1676

In [34]:
newcol = make_array()
for name in speeches.sort('speaker_id').column('speaker_id'):
    if name == 'SOUZZI':
        name = 'SUOZZI'
    if name == 'VANHOLLEN':
        name = 'VAN HOLLEN'
    if name == 'FISHCER':
        name = 'FISCHER'
    newcol = np.append(name_to_id[name], newcol)
speeches = speeches.sort('speaker_id').drop('speaker_id').with_column('speaker_id', np.flip(newcol, 0))

In [35]:
title_column = make_array()
year_column = make_array()
month_column = make_array()
day_column = make_array()
for file_name in speeches.column('file_name'):
    title_column = np.append(find_title(file_name), title_column)
    year, month, day = sep_date_from_file(file_name)
    year_column = np.append(int(year), year_column)
    month_column = np.append(int(month), month_column)
    day_column = np.append(int(day), day_column)

    title_column = np.flip(title_column, 0)
year_column = np.flip(year_column, 0)
mont_column = np.flip(month_column, 0)
day_column = np.flip(day_column, 0)


In [36]:
speeches = speeches.drop('proceeding_id')
speeches = speeches.with_columns('session_title', title_column, 'year', year_column, 'month', month_column, 'day', day_column)

In [37]:
speeches.show(10)

speech_id,topic_id,word_count,speech_text,file_name,speaker_id,session_title,year,month,day
80,topic-id,59,"``Each State shall appoint, in such manner as the legisl ...",CREC-2017-01-05-pt1-PgH101-5.txt,2239,READING OF THE CONSTITUTION,2017,1,5
151,topic-id,115,Amendment XII: ``The electors shall meet in their respe ...,CREC-2017-01-05-pt1-PgH101-5.txt,2350,READING OF THE CONSTITUTION,2017,1,5
84,topic-id,62,"``No person except a natural born citizen, or a citizen ...",CREC-2017-01-05-pt1-PgH101-5.txt,2270,READING OF THE CONSTITUTION,2017,1,5
98,topic-id,44,"``. . . he may, on extraordinary occasions, convene both ...",CREC-2017-01-05-pt1-PgH101-5.txt,2337,READING OF THE CONSTITUTION,2017,1,5
17,topic-id,53,``The Senate shall have the sole power to try all impeac ...,CREC-2017-01-05-pt1-PgH101-5.txt,2311,READING OF THE CONSTITUTION,2017,1,5
54,topic-id,46,"``. . . to declare war, grant letters of marque and repr ...",CREC-2017-01-05-pt1-PgH101-5.txt,2102,READING OF THE CONSTITUTION,2017,1,5
89,topic-id,85,Section 2: ``The President shall be Commander in Chief ...,CREC-2017-01-05-pt1-PgH101-5.txt,2272,READING OF THE CONSTITUTION,2017,1,5
140,topic-id,54,Amendment V: ``No person shall be held to answer for a ...,CREC-2017-01-05-pt1-PgH101-5.txt,2307,READING OF THE CONSTITUTION,2017,1,5
93,topic-id,35,``. . . but the Congress may by law vest the appointment ...,CREC-2017-01-05-pt1-PgH101-5.txt,99,READING OF THE CONSTITUTION,2017,1,5
96,topic-id,35,Section 3: ``He shall from time to time give the Congre ...,CREC-2017-01-05-pt1-PgH101-5.txt,2092,READING OF THE CONSTITUTION,2017,1,5


In [38]:
month_int_name = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 7: 'July', 8: 'August', 9: 'September', 
                 10: 'October', 11: 'November', 12: 'December'}
new_month = make_array()
for month in speeches.column('month'):
    new_month = np.append(month_int_name[month], new_month)
new_month = np.flip(new_month, 0)
new_month
speeches = speeches.drop('month').with_column('month', new_month)
    

In [39]:
speeches.show(10)
speakers.where('speaker_id', are.equal_to('99999999999999'))

speech_id,topic_id,word_count,speech_text,file_name,speaker_id,session_title,year,day,month
80,topic-id,59,"``Each State shall appoint, in such manner as the legisl ...",CREC-2017-01-05-pt1-PgH101-5.txt,2239,READING OF THE CONSTITUTION,2017,5,January
151,topic-id,115,Amendment XII: ``The electors shall meet in their respe ...,CREC-2017-01-05-pt1-PgH101-5.txt,2350,READING OF THE CONSTITUTION,2017,5,January
84,topic-id,62,"``No person except a natural born citizen, or a citizen ...",CREC-2017-01-05-pt1-PgH101-5.txt,2270,READING OF THE CONSTITUTION,2017,5,January
98,topic-id,44,"``. . . he may, on extraordinary occasions, convene both ...",CREC-2017-01-05-pt1-PgH101-5.txt,2337,READING OF THE CONSTITUTION,2017,5,January
17,topic-id,53,``The Senate shall have the sole power to try all impeac ...,CREC-2017-01-05-pt1-PgH101-5.txt,2311,READING OF THE CONSTITUTION,2017,5,January
54,topic-id,46,"``. . . to declare war, grant letters of marque and repr ...",CREC-2017-01-05-pt1-PgH101-5.txt,2102,READING OF THE CONSTITUTION,2017,5,January
89,topic-id,85,Section 2: ``The President shall be Commander in Chief ...,CREC-2017-01-05-pt1-PgH101-5.txt,2272,READING OF THE CONSTITUTION,2017,5,January
140,topic-id,54,Amendment V: ``No person shall be held to answer for a ...,CREC-2017-01-05-pt1-PgH101-5.txt,2307,READING OF THE CONSTITUTION,2017,5,January
93,topic-id,35,``. . . but the Congress may by law vest the appointment ...,CREC-2017-01-05-pt1-PgH101-5.txt,99,READING OF THE CONSTITUTION,2017,5,January
96,topic-id,35,Section 3: ``He shall from time to time give the Congre ...,CREC-2017-01-05-pt1-PgH101-5.txt,2092,READING OF THE CONSTITUTION,2017,5,January


speaker_id,first_name,last_name,type,party,state,district,bio_guide_id,congress_id


In [32]:
speeches.to_csv('speeches_2017_Apr_3_to_5.csv')

In [33]:
speakers.to_csv('speakers_2017_Apr_3_to_5.csv')