In [5]:
import pandas as pd
import wikipediaapi
import re
import os
import json

In [6]:
# Load the lookup table that maps two-letter state abbreviations to full state names.
# JSON text is UTF-8, so the encoding is pinned explicitly instead of being left
# to the platform's locale default.
with open(os.path.join('public', '50_states.json'), 'r', encoding='utf-8') as f:
    states = json.load(f)
states

{'AL': 'Alabama',
 'AK': 'Alaska',
 'AZ': 'Arizona',
 'AR': 'Arkansas',
 'CA': 'California',
 'CO': 'Colorado',
 'CT': 'Connecticut',
 'DE': 'Delaware',
 'FL': 'Florida',
 'GA': 'Georgia',
 'HI': 'Hawaii',
 'ID': 'Idaho',
 'IL': 'Illinois',
 'IN': 'Indiana',
 'IA': 'Iowa',
 'KS': 'Kansas',
 'KY': 'Kentucky',
 'LA': 'Louisiana',
 'ME': 'Maine',
 'MD': 'Maryland',
 'MA': 'Massachusetts',
 'MI': 'Michigan',
 'MN': 'Minnesota',
 'MS': 'Mississippi',
 'MO': 'Missouri',
 'MT': 'Montana',
 'NE': 'Nebraska',
 'NV': 'Nevada',
 'NH': 'New Hampshire',
 'NJ': 'New Jersey',
 'NM': 'New Mexico',
 'NY': 'New York',
 'NC': 'North Carolina',
 'ND': 'North Dakota',
 'OH': 'Ohio',
 'OK': 'Oklahoma',
 'OR': 'Oregon',
 'PA': 'Pennsylvania',
 'RI': 'Rhode Island',
 'SC': 'South Carolina',
 'SD': 'South Dakota',
 'TN': 'Tennessee',
 'TX': 'Texas',
 'UT': 'Utah',
 'VT': 'Vermont',
 'VA': 'Virginia',
 'WA': 'Washington',
 'WV': 'West Virginia',
 'WI': 'Wisconsin',
 'WY': 'Wyoming'}

In [7]:
# Read the tab-separated signature list (the file has no header row) and give
# its first three positional columns readable labels in one chained expression.
signatures_path = os.path.join('public', '22march2024', 'all_signatures.txt')
all_signatures = pd.read_csv(signatures_path, sep='\t', header=None).rename(
    columns={0: 'name', 1: 'state', 2: 'district'}
)
all_signatures

Unnamed: 0,name,state,district
0,1. James P. McGovern,Massachusetts,2
1,2. Gabe Vasquez,New Mexico,2
2,3. Stephen F. Lynch,Massachusetts,8
3,4. Matt Cartwright,Pennsylvania,8
4,5. Earl Blumenauer,Oregon,3
...,...,...,...
186,187. Lloyd Doggett,Texas,37
187,188. Ken Buck,Colorado,4
188,189. Josh Harder,California,9
189,190. Kevin Mullin,California,15


In [8]:
# regex to match garbage, such as numbers '1. ...' before the full name
garbage = re.compile(r'^\d+\.\s+')
all_signatures['name'] = all_signatures['name'].apply(lambda x: garbage.sub('', x))
all_signatures

Unnamed: 0,name,state,district
0,James P. McGovern,Massachusetts,2
1,Gabe Vasquez,New Mexico,2
2,Stephen F. Lynch,Massachusetts,8
3,Matt Cartwright,Pennsylvania,8
4,Earl Blumenauer,Oregon,3
...,...,...,...
186,Lloyd Doggett,Texas,37
187,Ken Buck,Colorado,4
188,Josh Harder,California,9
189,Kevin Mullin,California,15


# Parse Dem Holdouts from PNG

In [None]:
# Read the comma-separated holdout list (the file has no header row) and label
# its first two positional columns in one chained expression.
holdouts_path = os.path.join('public', '22march2024', 'dem_holdouts.txt')
dem_holdouts = pd.read_csv(holdouts_path, sep=',', header=None).rename(
    columns={0: 'name', 1: 'state'}
)
dem_holdouts

In [None]:
# Normalise the state column:
#   1. remove every "(" and ")" character and trim surrounding whitespace;
#   2. expand values that are keys of `states` to the mapped full name,
#      leaving any other value unchanged.
# The .str accessor replaces the per-row lambdas so that missing values stay NaN
# instead of raising a TypeError inside re.sub.
dem_holdouts['state'] = (
    dem_holdouts['state']
    .str.replace(r'\(|\)', '', regex=True)
    .str.strip()
    .map(lambda abbr: states.get(abbr, abbr))
)
dem_holdouts

In [None]:
# Fuzzy search: pair every holdout name with its closest match among the names in all_signatures
from fuzzywuzzy import process


def closest_signature(holdout_name):
    """Return (holdout_name, best match) where the match comes from process.extractOne over all_signatures['name']."""
    best_match = process.extractOne(holdout_name, all_signatures['name'])
    return (holdout_name, best_match)


dem_holdouts['name'].apply(closest_signature)

In [None]:
import flor

# Log one record per row of dem_holdouts: the name, the state, signed=False and
# party="Democrat", then commit.
# Fields are read by column label. Integer keys such as row[0] on a
# string-labelled Series depend on the deprecated positional fallback of
# Series.__getitem__ and stop working once integer keys are treated as labels.
for i, row in flor.loop("us_rep", dem_holdouts.iterrows()):
    flor.log("name", row['name'])
    flor.log("state", row['state'])
    flor.log("signed", False)
    flor.log("party", "Democrat")
flor.commit()

# Enter Signatures into DB

In [9]:
# Display the signatures frame (name, state, district) as the cell's rich output.
all_signatures

Unnamed: 0,name,state,district
0,James P. McGovern,Massachusetts,2
1,Gabe Vasquez,New Mexico,2
2,Stephen F. Lynch,Massachusetts,8
3,Matt Cartwright,Pennsylvania,8
4,Earl Blumenauer,Oregon,3
...,...,...,...
186,Lloyd Doggett,Texas,37
187,Ken Buck,Colorado,4
188,Josh Harder,California,9
189,Kevin Mullin,California,15


In [11]:
# Render the whole frame as plain text so that no rows are elided with "...".
full_listing = all_signatures.to_string()
print(full_listing)

                           name           state  district
0             James P. McGovern   Massachusetts         2
1                  Gabe Vasquez      New Mexico         2
2              Stephen F. Lynch   Massachusetts         8
3               Matt Cartwright    Pennsylvania         8
4               Earl Blumenauer          Oregon         3
5           Jonathan L. Jackson        Illinois         1
6                Steny H. Hoyer        Maryland         5
7               Donald G. Davis  North Carolina         1
8               Scott H. Peters      California        50
9                  Wiley Nickel  North Carolina        13
10           Katherine M. Clark   Massachusetts         5
11              Morgan McGarvey        Kentucky         3
12           Hakeem S. Jeffries        New York         8
13             Suzan K. DelBene      Washington         1
14             Thomas R. Suozzi        New York         3
15                   Susan Wild    Pennsylvania         7
16            

In [12]:
# Regex for party mentions in free text. Alternatives, tried in order:
#   1. "<term> Party" where <term> is Democrat/Democratic/Republican/DNC/RNC/Democracy/GOP
#   2. a standalone "Democrat" or "Democratic"
#   3. a standalone "Republican"
# The bare word "Republic" is deliberately NOT matched: it is not a party term
# and would produce false hits on text such as "Dominican Republic".
# Only non-capturing groups are used, so re.findall returns the full matched text.
pattern = (
    r'\b(?:Democrat(?:ic)?|Republican|DNC|RNC|Democracy|GOP)\sParty\b'
    r'|\bDemocrat(?:ic)?\b'
    r'|\bRepublican\b'
)

In [13]:
# For every signer, fetch the Wikipedia page titled with the signer's full name
# and print the party terms that `pattern` finds in the page summary
# (case-insensitive). Names without a page of that exact title print "not found".
wiki_wiki = wikipediaapi.Wikipedia("QuorumFinder (rogarcia@berkeley.edu; rlnsanz.github.io)")

for i, row in all_signatures.iterrows():
    # Read fields by column label: integer keys such as row[0] on a
    # string-labelled Series depend on the deprecated positional fallback of
    # Series.__getitem__.
    full_name = row['name']
    us_state = row['state']  # not used by the lookup below
    p = wiki_wiki.page(full_name)
    if p.exists():
        print(full_name, re.findall(pattern, p.summary, re.IGNORECASE))
    else:
        # We need to search for the page
        print(full_name, "not found")

James P. McGovern ['Democratic Party', 'Democratic', 'Republican']
Gabe Vasquez ['Democratic Party']
Stephen F. Lynch ['Democrat', 'Democratic']
Matt Cartwright ['Democratic Party', 'Democratic', 'Republican']
Earl Blumenauer ['Democratic Party']
Jonathan L. Jackson not found
Steny H. Hoyer ['Democrat', 'Democrat', 'Republican', 'Democrat']
Donald G. Davis ['Democratic Party']
Scott H. Peters ['Democratic Party']
Wiley Nickel ['Democratic']
Katherine M. Clark ['Democratic']
Morgan McGarvey ['Democrat', 'Democrat']
Hakeem S. Jeffries ['Democratic', 'Democratic']
Suzan K. DelBene ['Democratic', 'Republican', 'Republican', 'Democratic', 'Democrat']
Thomas R. Suozzi ['Democratic Party', 'Republican', 'Democratic', 'Democratic', 'Democratic']
Susan Wild ['Democrat', 'Democrat']
Lucy McBath ['Democratic Party', 'Democratic', 'Republican', 'Democratic']
Mike Thompson []
Bennie G. Thompson ['Democratic Party', 'Democrat', 'Democrat']
Emanuel Cleaver ['Democratic Party']
Diana DeGette ['Democra