# Web Scraping Wikipedia to Get The Historical Data on US Government Party Demographic

## Web Scraping: BeautifulSoup 

#### Import Libraries

In [1]:
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd

#### Request the Web Page Content

In [3]:
URL = 'https://en.wikipedia.org/wiki/Divided_government_in_the_United_States'
# Fetch the article. The timeout keeps the call from blocking indefinitely on
# a stalled connection, and raise_for_status() surfaces HTTP 4xx/5xx responses
# here instead of letting an error page be parsed as if it were the article.
r = requests.get(URL, timeout=30)
r.raise_for_status()

#### Save the html content as Beautiful Soup Object

In [8]:
# Parse the response body with an explicitly named parser so the resulting
# tree does not depend on which optional parser libraries are installed
# (and so BeautifulSoup does not have to guess one).
content = soup(r.content, 'html.parser')


#### Locate the Content Div and the Data Table

In [13]:
# Restrict the search to the element with id "mw-content-text", collect the
# tables whose style attribute is exactly "text-align:center", and keep the
# <tbody> of the first match.
div = content.find('div', attrs={'id': 'mw-content-text'})
tables = div.find_all('table', style='text-align:center')
first_table = tables[0]
table = first_table.find('tbody')

#### Make a list of the Table Data

In [32]:
# Walk the table rows and collect, for every row that has at least one <td>,
# the plain cell texts (table_data) and the link texts of the row (president).
table_data = []
president = []
# Most recent list of link texts. Rows without a link of their own reuse it
# (presumably cells spanning several rows -- confirm against the page).
# Starting from an empty list avoids a NameError when the very first data row
# contains no <a> tag.
name = []
for row in table.find_all('tr'):
    # Rows with no <td> at all are skipped.
    if not row.find('td'):
        continue

    # Plain cells: <td> elements containing neither a link nor bold text.
    data = [
        td.string
        for td in row.find_all('td')
        if not td.find('a') and not td.find('b')
    ]
    if not data:
        # Every cell was filtered out: fall back to the row's bold elements
        # that do not themselves wrap a link.
        data = [b.string for b in row.find_all('b') if not b.find('a')]

    if row.find('a'):
        name = [p.string for p in row.find_all('a')]
    # Either the freshly read names or the ones carried over from above.
    president.append(name)
    table_data.append(data)

#### Save the List as Pandas Data Frame

In [33]:
# Build the frame from the scraped rows; one label per collected cell.
column_names = ['Year', 'Senate', 'House', 'President Party']
df = pd.DataFrame(data=table_data, columns=column_names)

## Cleaning Data: Pandas

#### Join the list Element for President Column

In [42]:
# Attach the per-row lists of link texts, then collapse each list into a
# single '/'-separated string.
df['President'] = president
joined_names = df['President'].str.join('/')
df['President'] = joined_names

In [60]:
# Preview the first 30 rows of the frame.
df.head(n=30)

Unnamed: 0,Year,Senate,House,President Party,President
0,1861–1863,R,R,R,Lincoln
1,1863–1865,R,R,R,Lincoln
2,1865–1867,R,R,D,A. Johnson
3,1867–1869,R,R,D,A. Johnson
4,1869–1871,R,R,R,Grant
5,1871–1873,R,R,R,Grant
6,1873–1875,R,R,R,Grant
7,1875–1877,R,D,R,Grant
8,1877–1879,R,D,R,Hayes
9,1879–1881,D,D,R,Hayes


#### Regular Expressions: Removing Newlines and Leftover Bracketed Prefixes

In [48]:
# Two regex passes over every cell, applied in order: first drop newline
# characters, then drop everything up to and including a "]/" sequence.
df = (
    df
    .replace(to_replace=r'\n', value='', regex=True)
    .replace(to_replace=r'.*\]\/', value='', regex=True)
)

#### Locating specific data points and changing them

In [53]:
# Manual correction of one cell. A single .loc[row, column] call writes into
# df itself; the chained form df.loc[row][column] assigns to an intermediate
# object that may be a copy, leaving df unchanged.
df.loc[80, 'President Party'] = 'D'

In [56]:
# Manual correction of one cell. A single .loc[row, column] call writes into
# df itself; the chained form df.loc[row][column] assigns to an intermediate
# object that may be a copy, leaving df unchanged.
df.loc[10, 'President Party'] = 'R'

In [57]:
# Manual correction of one cell. A single .loc[row, column] call writes into
# df itself; the chained form df.loc[row][column] assigns to an intermediate
# object that may be a copy, leaving df unchanged.
df.loc[28, 'President Party'] = 'D'

In [58]:
# Manual correction of one cell. A single .loc[row, column] call writes into
# df itself; the chained form df.loc[row][column] assigns to an intermediate
# object that may be a copy, leaving df unchanged.
df.loc[28, 'President'] = 'Wilson'

In [59]:
# Manual correction of one cell. A single .loc[row, column] call writes into
# df itself; the chained form df.loc[row][column] assigns to an intermediate
# object that may be a copy, leaving df unchanged.
df.loc[29, 'President'] = 'Wilson'

#### To CSV

In [61]:
# Persist the cleaned frame to disk, leaving the row index out of the file.
output_path = 'Government-Demographic.csv'
df.to_csv(output_path, index=False)

#### Read CSV

In [62]:
# Load the saved CSV back into a DataFrame and display it.
csv_path = 'Government-Demographic.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,Year,Senate,House,President Party,President
0,1861–1863,R,R,R,Lincoln
1,1863–1865,R,R,R,Lincoln
2,1865–1867,R,R,D,A. Johnson
3,1867–1869,R,R,D,A. Johnson
4,1869–1871,R,R,R,Grant
...,...,...,...,...,...
77,2015–2017,R,R,D,Obama
78,2017–2019,R,R,R,Trump
79,2019–2021,R,D,R,Trump
80,2021–2023,D,D,D,Biden
