# Searchlight

## Imports

In [None]:
import pandas as pd
import re
import numpy as np
from datascience import *
import urllib
from selenium import webdriver
from time import sleep
import requests
from bs4 import BeautifulSoup as Soup
import os

## Initialize DataScience Tables

In [None]:
# Speech table: starts with no rows. Each column is created from
# make_array() called with no elements; rows are added further down once
# the transcript files have been parsed.
speeches = Table().with_columns("speech_id", make_array(), 
                                "speaker_id", make_array(), 
                                "proceeding_id", make_array(), 
                                "topic_id", make_array(), 
                                "word_count", make_array(), 
                                "speech_text", make_array(),
                                'file_name', make_array())

In [None]:
# Bare expression: in a notebook this renders the table as the cell output.
speeches

In [None]:
# Speaker table: starts with no rows. The column order here is the order in
# which getCongMemberInfoFromMaster / getCongMemberInfoFromLocal return
# their row values.
speakers = Table().with_columns("speaker_id", make_array(), 
                                "first_name", make_array(), 
                                "last_name", make_array(), 
                                "type" , make_array(),
                                "party", make_array(), 
                                "state", make_array(), 
                                "district", make_array(),
                                "bio_guide_id", make_array(),
                                "congress_id", make_array())

In [None]:
# Bare expression: in a notebook this renders the table as the cell output.
speakers

In [None]:
# Topic table: starts with no rows.
# NOTE(review): nothing in the visible notebook adds rows to it.
topics = Table().with_columns("topic_id", make_array(), 
                                "topic_name", make_array())

In [None]:
# Bare expression: in a notebook this renders the table as the cell output.
topics

In [None]:
# Proceedings table: starts with no rows.
# NOTE(review): nothing in the visible notebook adds rows to it; the title
# and date are attached to the speeches table instead.
proceedings = Table().with_columns("proceeding_id", make_array(), 
                              "date", make_array(),
                              "title", make_array())

In [None]:
# Bare expression: in a notebook this renders the table as the cell output.
proceedings

## Initializing Parsing

### Text Parsing

In [None]:
def remove_space(regex):
    """Return the text matched by ``regex`` with every space removed.

    ``regex`` is a match object; the function is suitable as the replacement
    callback of ``re.sub``.
    """
    matched_text = regex.group()
    return ''.join(matched_text.split(' '))

In [None]:
def sep_speech(string):
    """Split a transcript file into alternating speaker names and speeches.

    The file at path ``string`` is read, its newlines are removed, and the
    text is cut at every "Mr. ", "Ms. " or "Mrs. " title.  A piece counts as
    a speech only when it starts with an upper-case last name followed by
    ". " (for example "SMITH. "); any other piece is skipped.

    Returns a flat NumPy array ``[name_0, speech_0, name_1, speech_1, ...]``
    where each speech has its leading "NAME. " prefix removed.  The array is
    empty when no speech is found.
    """
    with open(string) as file:
        text = file.read().replace('\n', '')

    # Glue title-case forms of address ("Mr. President", "Mr. Speaker") into
    # "Mr.President" so the split below does not treat them as a new speaker.
    text = re.sub(r'Mr\. (?=[A-Z][a-z])', 'Mr.', text)

    # Everything before the first title is preamble, not a speech.
    pieces = re.split(r'Mr\. |Ms\. |Mrs\. ', text)[1:]

    name_and_speech = []
    for piece in pieces:
        speaker = re.match(r'[A-Z]*\. ', piece)
        if speaker is None:
            # Not of the form "LASTNAME. ..." -> not the start of a speech.
            continue
        name_and_speech.append(speaker.group(0)[:-2])
        # Drop only the leading name prefix; the speech body is kept intact.
        name_and_speech.append(piece[speaker.end():])
    return np.array(name_and_speech)

In [None]:
def sep_date_from_file(file):
    """Return the first ``YYYY-MM-DD`` date found in the string ``file``.

    The result is the list ``[year, month, day]`` of digit strings.  An
    ``IndexError`` is raised when ``file`` contains no such date.
    """
    iso_dates = re.findall('[0-9]{4}-[0-9]{2}-[0-9]{2}', file)
    first_date = iso_dates[0]
    return first_date.split('-')

In [None]:
def find_title(file_name):
    """Return the first title-looking line of the text file ``file_name``.

    A title is the first match of the pattern below: a run of upper-case
    letters, spaces, apostrophes or hyphens, optionally followed by
    upper-case letters, digits and punctuation, ending at a newline.  The
    match is returned with surrounding whitespace stripped.

    Raises ``IndexError`` when the file contains no such line.
    """
    with open(file_name) as file:
        text = file.read()
    text = text.replace('Mr. President', 'MrPresident')
    # NOTE(review): ``[Continued]*`` is a character class (any of those
    # letters), not the literal word "Continued" - confirm this is intended.
    titles = re.findall(r"[A-Z '-]+[A-Z0-9-,\. ]*[Continued]*\n", text)
    if not titles:
        raise IndexError('no title line found in %r' % (file_name,))
    return titles[0].strip()

### Mods Parsing

In [None]:
def getAllExtensions(file):
    """Parse the file at path ``file`` with the "lxml" parser and return the
    result of ``find_all('extension')`` on the parsed document.
    """
    # Context manager so the file handle is closed as soon as it is read.
    with open(file) as handle:
        soup = Soup(handle.read(), "lxml")
    return soup.find_all('extension')

In [None]:
# Parse the master MODS file once; getCongMemberInfoFromMaster searches
# this module-level result on every call.
master_extensions = getAllExtensions("mastermods.xml")

In [None]:
def getCongMemberExtension(extensions, last_name):
    """Return the first item of ``extensions`` whose string form contains
    ``last_name`` (plain substring test), or ``None`` when none does.
    """
    matches = (candidate for candidate in extensions
               if last_name in str(candidate))
    return next(matches, None)

In [None]:
def getCongMemberExtensionFromFile(last_name, filename):
    """Return the first ``extension`` element of the file at ``filename``
    whose string form contains ``last_name``, or ``None`` when none does.

    The file is parsed with the "lxml" parser.
    """
    # Context manager so the file handle is closed as soon as it is read.
    with open(filename) as handle:
        soup = Soup(handle.read(), "lxml")
    # Same substring search as the master-file lookup.
    return getCongMemberExtension(soup.find_all('extension'), last_name)

In [None]:
def getCongMemberTag(congMemberExtension):
    """Return the first child in ``congMemberExtension.contents`` whose
    string form contains 'congmember', or ``None`` when there is none.
    """
    children = congMemberExtension.contents
    return next(
        (child for child in children if 'congmember' in str(child)),
        None,
    )

In [None]:
# Thin accessors over ``congMemberTag.attrs``.  None of them handles a
# missing attribute: the lookup error propagates to the caller.

def getParty(congMemberTag):
    """Return the tag's 'party' attribute."""
    attributes = congMemberTag.attrs
    return attributes['party']


def getType(congMemberTag):
    """Return the tag's 'type' attribute."""
    attributes = congMemberTag.attrs
    return attributes['type']


def getAuthorityId(congMemberTag):
    """Return the tag's 'authorityid' attribute."""
    attributes = congMemberTag.attrs
    return attributes['authorityid']


def getBioGuideId(congMemberTag):
    """Return the tag's 'bioguideid' attribute."""
    attributes = congMemberTag.attrs
    return attributes['bioguideid']


def getState(congMemberTag):
    """Return the tag's 'state' attribute."""
    attributes = congMemberTag.attrs
    return attributes['state']


def getCongressId(congMemberTag):
    """Return the tag's 'congress' attribute."""
    attributes = congMemberTag.attrs
    return attributes['congress']

In [None]:
def getDistrictTag(congMemberExtension):
    """Return the first child in ``congMemberExtension.contents`` whose
    string form contains 'district', or ``None`` when there is none.
    """
    children = congMemberExtension.contents
    return next(
        (child for child in children if 'district' in str(child)),
        None,
    )

In [None]:
def getFirstName(congMemberTag):
    """Return the first word of the second name-like child of the tag.

    Children of ``congMemberTag.contents`` whose string form contains 'name'
    are collected in order; the ``.string`` of the second one is split on
    whitespace and its first word returned.

    Returns ``None`` when there is no second such child, when its
    ``.string`` is missing or ``None``, or when it holds no words.
    """
    name_tags = [tag for tag in congMemberTag.contents if 'name' in str(tag)]
    try:
        return name_tags[1].string.split()[0]
    except (IndexError, AttributeError):
        # IndexError: fewer than two name children, or an empty string.
        # AttributeError: the child has no usable ``.string``.
        return None

In [None]:
def getCongMemberInfoFromMaster(last_name, mods_filename):
    """Build a speakers-table row for ``last_name``.

    The member is looked up in the module-level ``master_extensions``.  The
    per-file MODS data in ``mods_filename`` is used instead (via
    ``getCongMemberInfoFromLocal``) when:

    * ``last_name`` is 'BORDALLO' or 'CAPITO';
    * the master lookup raises, finds no matching extension, or finds no
      congmember tag in it;
    * the member's type is 'DELEGATE' (a placeholder row is returned if the
      local lookup fails as well).

    Returns a list in speakers-table column order: authority id, first name,
    last name, type, party, state, district, bioguide id, congress id.
    """
    # These two names are always resolved from the per-file MODS data.
    if last_name in ('BORDALLO', 'CAPITO'):
        return getCongMemberInfoFromLocal(last_name, mods_filename)

    # Best effort: any failure of the master lookup falls back to the
    # per-file data rather than aborting the whole run.
    try:
        extension = getCongMemberExtension(master_extensions, last_name)
        congMemberTag = None if extension is None else getCongMemberTag(extension)
    except Exception:
        congMemberTag = None
    if congMemberTag is None:
        return getCongMemberInfoFromLocal(last_name, mods_filename)

    congMemType = getType(congMemberTag)
    district = 'N/A'

    if congMemType == 'DELEGATE':
        # Delegates are resolved from the per-file data; the placeholder row
        # is used only when that lookup fails.
        # NOTE(review): the local lookup derives the type from the chamber,
        # so it does not report 'DELEGATE' - confirm that is acceptable.
        try:
            return getCongMemberInfoFromLocal(last_name, mods_filename)
        except Exception:
            return [99999999999999, 'First Name unavailable', last_name, congMemType, 'Party Info Unavailable', 'state info unavailable', district, 'BioGuideID unavailable', 'CongressID unavailable']

    if congMemType == 'REPRESENTATIVE':
        try:
            district = getDistrictTag(extension).string
        except AttributeError:
            # No district child in this extension.
            district = 'N/A'

    try:
        bioGuideID = getBioGuideId(congMemberTag)
    except KeyError:
        bioGuideID = 99999999999999999

    return [getAuthorityId(congMemberTag), getFirstName(congMemberTag), last_name, congMemType, getParty(congMemberTag), getState(congMemberTag), district, bioGuideID, getCongressId(congMemberTag)]

In [None]:
def getChamber(congMemberTag):
    """Return the tag's 'chamber' attribute.

    A missing attribute is not handled here: the lookup error propagates.
    """
    attributes = congMemberTag.attrs
    return attributes['chamber']

In [None]:
def getCongMemberInfoFromLocal(last_name, mods_filename):
    """Build a speakers-table row for ``last_name`` from one MODS file.

    The member type is derived from the tag's chamber ('H' ->
    'REPRESENTATIVE', 'S' -> 'SENATOR', anything else -> 'N/A').  Missing
    authority id, party, state or bioguide id attributes are replaced by
    placeholder values; a missing chamber or congress attribute is not
    handled.

    Returns a list in speakers-table column order: authority id, first name,
    last name, type, party, state, district, bioguide id, congress id.

    Raises ``AttributeError`` when the file has no extension mentioning
    ``last_name`` or that extension has no congmember tag.
    """
    print('checking local file')
    extension = getCongMemberExtensionFromFile(last_name, mods_filename)
    congMemberTag = getCongMemberTag(extension)

    chamber_to_type = {'H': 'REPRESENTATIVE', 'S': 'SENATOR'}
    congMemType = chamber_to_type.get(getChamber(congMemberTag), 'N/A')

    district = 'N/A'
    if congMemType == 'REPRESENTATIVE':
        try:
            district = getDistrictTag(extension).string
        except AttributeError:
            # No district child in this extension.
            district = 'N/A'

    def attribute_or(getter, placeholder):
        """Return ``getter(congMemberTag)``, or ``placeholder`` if the
        attribute is absent."""
        try:
            return getter(congMemberTag)
        except KeyError:
            return placeholder

    authID = attribute_or(getAuthorityId, 99999999999999)
    party = attribute_or(getParty, 'Party information Unavailable')
    state = attribute_or(getState, 'State Info Unavailable')
    bioID = attribute_or(getBioGuideId, 9999999999999999)

    return [authID, getFirstName(congMemberTag), last_name, congMemType, party, state, district, bioID, getCongressId(congMemberTag)]

## Parsing

In [None]:
def divide_chunks(l, n):
    """Yield consecutive slices of ``l``, each of length ``n``.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    total = len(l)
    for start in range(0, total, n):
        stop = start + n
        yield l[start:stop]

In [None]:
# Populate the speech table: one row per parsed speech longer than 30
# characters, for every .txt file in the scrape directory.
# `count` supplies speech_id.
# NOTE(review): `count` is incremented once per stored speech AND once more
# after each file, so speech ids have gaps and the number printed below is
# not a file count.
count = 0
list_of_files = os.listdir("/Users/cun-yuwang/Desktop/scraped/scrape10")

for file in list_of_files:
    if file.endswith(".txt"):
        # NOTE(review): `file` is a bare name from os.listdir, so sep_speech
        # opens it relative to the current working directory - this only
        # works when the notebook runs inside the scrape directory.
        separated = sep_speech(file)
        # `separated` alternates speaker name (even index) and speech text
        # (odd index), hence the stride of 2.
        i = 0
        while i < len(separated):
            row = make_array()
            text = separated[i+1]
            # NOTE(review): sep_speech rewrites "Mr. President" to
            # "Mr.President" (with the dot), so this replace looks like a
            # no-op - verify.
            text = text.replace('MrPresident', 'Mr. President')
            if len(text) > 30:
                # speaker_id temporarily holds the speaker's last name;
                # 'proceeding_id' and 'topic-id' are literal placeholders.
                row = [count, separated[i], 'proceeding_id', 'topic-id', len(text.split()), text, file] 
                count += 1
                speeches = speeches.with_row(row)     
            i +=2
        count+= 1
        print('finished with file ', count)

In [None]:
# Bare expression: in a notebook this renders the table as the cell output.
speeches

In [None]:
# Spot check: rows whose speaker_id (a last name at this point) is BORDALLO.
speeches.where('speaker_id', are.equal_to('BORDALLO'))

In [None]:
# Create a dictionary mapping each speaker last name to the .xml file that
# sits next to one of the .txt files the name appears in.
distinct_lastname_table = speeches.group('speaker_id')
lastname_file_table = speeches.join('speaker_id', distinct_lastname_table, 'speaker_id')
# Drop everything except the name and file columns.
lastname_file_table = lastname_file_table.drop('count').drop('speech_id').drop('proceeding_id').drop('topic_id').drop('word_count').drop('speech_text')
name_to_xml = {}
# presumably column 0 is speaker_id and column 1 is file_name after the
# drops above - verify against the join's column order.
lastnames = lastname_file_table.column(0)
files = lastname_file_table.column(1)
# NOTE: `count` is reused here as a plain loop index.
count = 0
while count < len(lastnames):
    # A name that occurs in several files keeps the last file seen.
    name_to_xml[lastnames[count]] = files[count].replace('.txt', '.xml')
    count += 1

In [None]:
# Populate the speaker table: one row per distinct last name, looked up in
# the master MODS data with the name's .xml file passed as the fallback
# source.

for name in list(name_to_xml.keys()):
    print(name)
    row = getCongMemberInfoFromMaster(name, name_to_xml[name])
    speakers = speakers.with_row(row)
speakers.show(5)
speeches.show(5)

In [None]:
#populating speaker_id column of the speeches table.
# newcol = make_array()
# names = speeches.column('speaker_id')
# for i in np.arange(len(names)):
#     newcol = np.append(speakers.where('last_name', are.equal_to(names[i])).column('speaker_id').item(0), newcol)
# copy_speeches = speeches.with_column('num_speaker_id', newcol)
# copy_speeches.show(20)

In [None]:
# Map each last name to its speaker_id from the speakers table.
names = speakers.column('last_name')
ids = speakers.column('speaker_id')
name_to_id = dict(zip(names, ids))
# Hard-coded override for CAPITO.
# NOTE(review): the reason for the value 1676 is not visible here - confirm.
name_to_id['CAPITO'] = 1676

In [None]:
# Replace the last names stored in speeches.speaker_id with the ids from
# name_to_id.  Note that this leaves `speeches` sorted by the original
# speaker_id values.
newcol = make_array()
for name in speeches.sort('speaker_id').column('speaker_id'):
    # np.append(value, newcol) puts each id in FRONT of the array, so
    # newcol ends up in reverse row order ...
    newcol = np.append(name_to_id[name], newcol)
# ... and np.flip restores the order of the sorted table.
speeches = speeches.sort('speaker_id').drop('speaker_id').with_column('speaker_id', np.flip(newcol, 0))

In [None]:
# Build the session title and the year/month/day columns for the speeches
# table, one value per row and in row order.
titles = []
years = []
months = []
days = []
# A file holds many speeches, so each file's title is read only once.
title_by_file = {}
for file_name in speeches.column('file_name'):
    if file_name not in title_by_file:
        title_by_file[file_name] = find_title(file_name)
    titles.append(title_by_file[file_name])
    year, month, day = sep_date_from_file(file_name)
    years.append(int(year))
    months.append(int(month))
    days.append(int(day))

title_column = np.array(titles)
year_column = np.array(years)
month_column = np.array(months)
day_column = np.array(days)


In [None]:
# Replace the placeholder proceeding_id column with the session title and
# the date parts computed from each row's file.
speeches = speeches.drop('proceeding_id')
speeches = speeches.with_columns('session_title', title_column, 'year', year_column, 'month', month_column, 'day', day_column)

In [None]:
# Bare expression: in a notebook this renders the table as the cell output.
speeches

In [None]:
# Replace the numeric month column with English month names.
month_int_name = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 7: 'July', 8: 'August', 9: 'September', 
                 10: 'October', 11: 'November', 12: 'December'}
new_month = make_array()
for month in speeches.column('month'):
    # Each name is put in FRONT of the array (reverse row order) ...
    new_month = np.append(month_int_name[month], new_month)
# ... and the flip restores row order.
new_month = np.flip(new_month, 0)
# NOTE(review): bare expression in the middle of the cell; it does not
# change any state.
new_month
speeches = speeches.drop('month').with_column('month', new_month)
    

In [None]:
# Write the finished speeches table to a CSV file (path is relative to the
# current working directory).
speeches.to_csv('small_data_speeches.csv')