In [1]:
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup as BS

In [2]:
URL = 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN07&spec=N'

# An explicit timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(URL, timeout=30)

In [3]:
response.status_code

200

In [4]:
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed on the machine running the notebook.
soup_TN07 = BS(response.text, 'html.parser')

In [5]:
#print(soup_TN07.prettify())

Goal: extract the following for each candidate:
- the percentage of the vote that candidate received
- the total amount raised by that candidate (as a numeric variable)
- the total amount spent by the candidate (as a numeric variable)

In [6]:
# note1: the first [0] selects the first candidate's finance table
# note2: the second [0] is there because read_html always returns a list of tables
# '$' and ':' are stripped from the markup first so the amounts parse as
# integers and the row labels come out as 'Raised' / 'Spent' / 'Cash on Hand'.
first_table_html = (
    str(soup_TN07.find_all('table', attrs={'class': 'Members--table'})[0])
    .replace('$', '')
    .replace(':', '')
)

finances = (
    # read_html expects a file-like object; passing literal HTML is deprecated.
    pd.read_html(StringIO(first_table_html))[0]
    # the labels in column 0 become the column headers, giving a single row
    .pivot(index=2, columns=0, values=1)
    .reset_index(drop=True)
)

In [7]:
finances.head()

Unnamed: 0,Cash on Hand,Raised,Spent
0,287889,1194960,935487


In [8]:
finances.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Cash on Hand  1 non-null      int64
 1   Raised        1 non-null      int64
 2   Spent         1 non-null      int64
dtypes: int64(3)
memory usage: 152.0 bytes


In [9]:
# Allow up to three integer digits so that '100.0%' is not cut down to '00.0%',
# and make the decimal part optional.
re.search(
    r'\d{1,3}(?:\.\d+)?%',
    soup_TN07.find_all('span', attrs={'class': 'Members--vote-pct'})[0].text,
)[0]

'69.9%'

In [10]:
soup_TN07.findAll('h2')[0].find('i').text

'Incumbent'

In [11]:
soup_TN07.findAll('h2')[0].find('span', attrs={'class' : 'winner'}).text

'Winner'

In [12]:
# Text of the <strong> element inside the first <h2> on the page; the cells
# below pull the name, party and status flags out of this string.
candidate_info = soup_TN07.findAll('h2')[0].find('strong').text

In [13]:
print(candidate_info)


Mark Green (R) • Incumbent
 • Winner 
(69.9% of vote)



The regex below should also pick up a name suffix such as Jr or Sr when one is present.

In [14]:
# Capture everything on the line before the "(party)" marker, so middle names
# and suffixes (e.g. 'Blair Nicole Walsingham', 'Scott Vieira Jr') stay intact.
re.findall(r'(.+ .+) \(\w', candidate_info)[0].strip()

'Mark Green'

In [15]:
# Any single-character party code in parentheses (R, D, I, L, ...).
# A character class like [R,D] would also accept a literal comma and miss the rest.
re.findall(r'\((\w)\)', candidate_info)[0]

'R'

In [16]:
# re.search returns None when the word is absent, so test the match object
# itself; subscripting it (None[0]) would raise TypeError for non-incumbents.
if re.search(r'Incumbent', candidate_info):
    print('Incumbent')
else:
    print('Not Incumbent')

Incumbent


In [17]:
# re.search returns None when the word is absent, so test the match object
# itself; subscripting it (None[0]) would raise TypeError for losing candidates.
if re.search(r'Winner', candidate_info):
    print('Winner')
else:
    print('Loser')

Winner


In [18]:
# First capitalised word of the page's <h1> text, which is the state name here.
# NOTE(review): the pattern stops at the first space, so a multi-word state
# such as "New York" would come back as just "New".
re.search(r'[A-Z][a-z]+', soup_TN07.find('h1').text)[0]

'Tennessee'

In [19]:
# Two-digit district number that follows the word "District" in the <h1> text.
# The capture group keeps only the digits; [0] takes the first occurrence.
re.findall(r'District (\d{2})', soup_TN07.find('h1').text)[0]

'07'

## working together

In [20]:
URL = 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN07&spec=N'
# Explicit timeout so a stalled connection cannot hang the notebook.
response = requests.get(URL, timeout=30)
# Explicit parser so the result does not depend on which parsers are installed.
soup = BS(response.text, 'html.parser')

In [21]:
candidate_df = pd.DataFrame()

def district_scraper(soup):
    """Fill the module-level ``candidate_df`` from a parsed race page.

    Every ``<strong>`` element in ``soup`` is treated as one candidate header
    of the form ``"First Last (P) ... (NN.N% of vote)"``.  Three columns are
    written in place on ``candidate_df``:

    * ``name``  - the text before the ``(party)`` marker
    * ``party`` - the single-character code inside the parentheses
    * ``vote``  - the vote share, kept as a string such as ``'69.9%'``

    Parameters
    ----------
    soup : object exposing ``find_all('strong')``
        Typically the BeautifulSoup tree of an OpenSecrets race page.

    Raises
    ------
    IndexError
        If a ``<strong>`` element lacks a name, party code or vote share.

    Notes
    -----
    Nothing is returned; the columns are assigned onto the existing frame, so
    ``candidate_df`` must be empty (or already have one row per candidate).
    A later cell redefines ``district_scraper`` with a ``(state, district)``
    signature, which replaces this draft once that cell has run.
    """
    candidate_list = [x.text.strip() for x in soup.find_all('strong')]
    candidate_df['name'] = [re.findall(r'(.+ .+) \(\w', item)[0] for item in candidate_list]
    candidate_df['party'] = [re.findall(r'\((\w)\)', item)[0] for item in candidate_list]
    # The decimal part is optional, so a bare "100%" is accepted as well as "69.9%".
    candidate_df['vote'] = [re.findall(r'(\d+(?:\.\d+)?%)', item)[0] for item in candidate_list]

In [22]:
# For every candidate header, show the raw text and what re.search gives back
# for 'Winner': a Match object when the word is present, None otherwise.
for strong_tag in soup.find_all('strong'):
    header_text = strong_tag.text.strip()
    print(header_text)
    print(type(re.search(r'Winner', header_text)))

Mark Green (R) • Incumbent
 • Winner 
(69.9% of vote)
<class 're.Match'>
Kiran Sreepada (D)
										
(27.3% of vote)
<class 'NoneType'>
Ronald Brown (I)
										
(2.2% of vote)
<class 'NoneType'>
Scott Vieira Jr (I)
										
(0.6% of vote)
<class 'NoneType'>


In [23]:
candidate_df.head()

The code below reshapes each candidate's finances table into a single row and stacks the rows in candidate order.

In [24]:
# initialize empty data frame
final_finances = pd.DataFrame()

# create the list of all dataframe tables for candidates
# ('$' and ':' are stripped first so amounts parse as numbers and labels are clean)
all_tables_html = (
    str(soup_TN07.find_all('table', attrs={'class': 'Members--table'}))
    .replace('$', '')
    .replace(':', '')
)
# read_html expects a file-like object; passing literal HTML is deprecated.
finances = pd.read_html(StringIO(all_tables_html))

In [27]:
print(finances)

[              0        1   2
0        Raised  1194960 NaN
1         Spent   935487 NaN
2  Cash on Hand   287889 NaN,               0       1   2
0        Raised  206644 NaN
1         Spent  207191 NaN
2  Cash on Hand       0 NaN,               0     1   2
0        Raised  1750 NaN
1         Spent     0 NaN
2  Cash on Hand  9006 NaN,               0     1   2
0        Raised   655 NaN
1         Spent  1049 NaN
2  Cash on Hand  -197 NaN]


In [25]:
# Reshape every candidate table into a single row, then stack them with one
# concat call.  Rebuilding from `finances` (instead of appending to the
# previous value of final_finances) keeps this cell idempotent on re-run.
final_finances = pd.concat(
    [fin.pivot(index=2, columns=0, values=1).reset_index(drop=True) for fin in finances]
)

print(final_finances)
    

0  Cash on Hand   Raised   Spent
0        287889  1194960  935487
0             0   206644  207191
0          9006     1750       0
0          -197      655    1049


## Code below after working with team (final version)

In [44]:
candidate_df = pd.DataFrame()

def district_scraper(state, district):
    """Scrape one 2020 House race from OpenSecrets and append it to ``candidate_df``.

    Parameters
    ----------
    state : str
        Two-letter code used in the race id, e.g. ``'TN'``.
    district : str
        Zero-padded two-digit district number, e.g. ``'07'``.

    Returns
    -------
    pandas.DataFrame
        One row per candidate on the page with the columns ``name``, ``party``,
        ``vote``, ``winner``, ``incumbent``, ``state``, ``district``,
        ``raised``, ``spent`` and ``cash_on_hand``.  The same rows are also
        appended to the module-level ``candidate_df`` (the original side
        effect, kept so existing callers keep working).

    Raises
    ------
    IndexError
        If a ``<strong>`` element lacks a name, party code or vote share, if
        the page title has no ``District NN`` part, or if the page has fewer
        finance tables than candidates.
    """
    global candidate_df

    URL = f'https://www.opensecrets.org/races/candidates?cycle=2020&id={state}{district}&spec=N'
    # Explicit timeout so a stalled connection cannot hang the loop, and an
    # explicit parser so results do not depend on which parsers are installed.
    soup = BS(requests.get(URL, timeout=30).text, 'html.parser')

    # Every <strong> element is treated as one candidate header.
    candidate_list = [tag.text.strip() for tag in soup.find_all('strong')]

    # '$' and ':' are stripped so the amounts parse as numbers and the labels are clean.
    tables_html = (
        str(soup.find_all('table', attrs={'class': 'Members--table'}))
        .replace('$', '')
        .replace(':', '')
    )
    # read_html expects a file-like object; passing literal HTML is deprecated.
    finances = pd.read_html(StringIO(tables_html))
    # One table per candidate, in page order; indexing raises IndexError if one is missing.
    candidate_tables = [finances[i] for i in range(len(candidate_list))]

    title = soup.find_all('title')[0].text
    # Preferred pattern: all capitalised words directly before " District", which
    # keeps names such as "West Virginia" whole.  The second pattern is the
    # earlier heuristic, kept as a fallback for titles without that shape.
    state_match = (
        re.search(r'[A-Z][a-z]+(?: [A-Z][a-z]+)*(?= District)', title)
        or re.search(r'[A-Z][a-z]+(?: [A-Z][a-hj-z][a-z]+)?', title)
    )

    temp_df = pd.DataFrame({
        'name': [re.findall(r'(.+ .+) \(\w', item)[0] for item in candidate_list],
        'party': [re.findall(r'\((\w)\)', item)[0] for item in candidate_list],
        # decimal part optional, so "100%" parses as well as "69.9%"
        'vote': [re.findall(r'(\d+(?:\.\d+)?%)', item)[0] for item in candidate_list],
        'winner': ['Winner' if re.search(r'(Winner)', item) is not None
                   else 'Not Winner' for item in candidate_list],
        'incumbent': ['Incumbent' if re.search(r'(Incumbent)', item) is not None
                      else 'Not Incumbent' for item in candidate_list],
        'state': str(state_match[0]),
        'district': str(re.findall(r'District ([0-9]{2})', title)[0]),
        # Rows are read by position: 0 = Raised, 1 = Spent, 2 = Cash on Hand.
        'raised': [table.iloc[0, 1] for table in candidate_tables],
        'spent': [table.iloc[1, 1] for table in candidate_tables],
        'cash_on_hand': [table.iloc[2, 1] for table in candidate_tables],
    })

    candidate_df = pd.concat([candidate_df, temp_df]).reset_index(drop=True)
    return temp_df

In [45]:
# District ids on the site are two digits wide, so 1..9 are zero-padded to '01'..'09'.
for district_number in range(1, 10):
    district_scraper('TN', f'{district_number:02d}')

In [46]:
candidate_df

Unnamed: 0,name,party,vote,winner,incumbent,state,district,raised,spent,cash_on_hand
0,Diana Harshbarger,R,74.8%,Winner,Not Incumbent,Tennessee,1,2126946,1869100,257846
1,Blair Nicole Walsingham,D,22.4%,Not Winner,Not Incumbent,Tennessee,1,140209,134995,5215
2,Tim Burchett,R,67.7%,Winner,Incumbent,Tennessee,2,1336276,878488,593678
3,Renee Hoyos,D,31.0%,Not Winner,Not Incumbent,Tennessee,2,812784,816793,210
4,Chuck Fleischmann,R,67.3%,Winner,Incumbent,Tennessee,3,1051653,381411,1880341
5,Meg Gorman,D,30.5%,Not Winner,Not Incumbent,Tennessee,3,85843,77760,8083
6,Scott Desjarlais,R,66.7%,Winner,Incumbent,Tennessee,4,331464,392499,302649
7,Christopher Hale,D,33.3%,Not Winner,Not Incumbent,Tennessee,4,308731,302996,5735
8,Jim Cooper,D,100.0%,Winner,Incumbent,Tennessee,5,936569,1332131,272934
9,John Rose,R,73.7%,Winner,Incumbent,Tennessee,6,1050429,625688,454375


In [33]:
URL2 = 'https://www.britannica.com/topic/United-States-House-of-Representatives-Seats-by-State-1787120'

# Explicit timeout so a stalled connection cannot hang the notebook, and an
# explicit parser so the result does not depend on which parsers are installed.
soup_house_rep = BS(requests.get(URL2, timeout=30).text, 'html.parser')

In [47]:
# read_html expects a file-like object; passing literal HTML is deprecated.
# NOTE(review): row 50 of the first table is dropped - presumably a non-state
# row such as a total; confirm against the page.
house_rep = pd.read_html(StringIO(str(soup_house_rep.find_all('table'))))[0].drop(index=50)

In [39]:
URL3 = 'https://www.infoplease.com/us/postal-information/state-abbreviations-and-state-postal-codes'

# Explicit timeout so a stalled connection cannot hang the notebook, and an
# explicit parser so the result does not depend on which parsers are installed.
soup_stateabbr = BS(requests.get(URL3, timeout=30).text, 'html.parser')

In [54]:
postal = (
    # read_html expects a file-like object; passing literal HTML is deprecated.
    pd.read_html(StringIO(str(soup_stateabbr.find_all('table'))))[0]
    # rename the key column to 'state' so it can serve as the merge key later on
    .rename(columns={'State Name/District': 'state'})
)

In [55]:
# Keep only the states present in both tables, matched on the shared 'state' column.
house_rep_postal = house_rep.merge(postal, how='inner', on='state')

In [62]:
house_rep_postal.head()

Unnamed: 0,state,representatives,Postal Abbreviation,Postal Code
0,Alabama,7,Ala.,AL
1,Alaska,1,Alaska,AK
2,Arizona,9,Ariz.,AZ
3,Arkansas,4,Ark.,AR
4,California,53,Calif.,CA
