# Web Scraping Wikipedia to Get Historical Data on US Government Party Demographics

## Web Scraping: BeautifulSoup 

#### Import Libraries

In [188]:
from bs4 import BeautifulSoup as Soup
import requests
import pandas as pd

#### Request the Web Page Content

In [177]:
# Wikipedia article that holds the table scraped in the cells below.
URL = 'https://en.wikipedia.org/wiki/Divided_government_in_the_United_States'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(URL, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page later on.
r.raise_for_status()

#### Save the HTML Content as a BeautifulSoup Object

In [179]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and emits a warning), so the parse tree can differ
# between machines. 'html.parser' ships with Python itself.
content = Soup(r.content, 'html.parser')
# prettify is a method; call it to get the indented markup rather than the
# bound-method object.
content.prettify()

#### Locate the Content Div and the Data Table

In [180]:
# Find the article body, then the first table styled "text-align:center"
# inside it. `tables` ends up holding the element whose <tr> rows are read
# by the parsing loop.
div = content.find(id='mw-content-text')
if div is None:
    raise ValueError("element with id 'mw-content-text' not found in the page")

centered_tables = div.find_all('table', attrs={"style": "text-align:center"})
if not centered_tables:
    raise ValueError("no table with style 'text-align:center' found in the page")

first_table = centered_tables[0]
tables = first_table.find('tbody')
if tables is None:
    # Some markup/parser combinations carry no explicit <tbody>; the rows are
    # then direct descendants of the <table> element itself.
    tables = first_table

#### Make a list of the Table Data

In [181]:
# Walk the table rows and collect two parallel lists:
#   data      - per row, the text of the plain cells (no link, no bold)
#   President - per row, the list of link texts found in that row; rows
#               without a link reuse the list from the most recent row that
#               had one
data = []
President = []
# Start with an empty list so a first data row without any link cannot raise
# NameError on `title`.
title = []
for row in tables.find_all('tr'):
    # Rows without a single <td> cell contribute nothing.
    if not row.find('td'):
        continue

    row_data = [
        td.string
        for td in row.find_all('td')
        if not td.find('a') and not td.find('b')
    ]
    if not row_data:
        # No plain cell in this row: fall back to the text of its <b> tags.
        row_data = [b.string for b in row.find_all('b') if not b.find('a')]

    if row.find('a'):
        title = [a.string for a in row.find_all('a')]

    President.append(title)
    data.append(row_data)

#### Save the List as Pandas Data Frame

In [182]:
# Turn the scraped rows into a frame, then attach the president names that
# were collected alongside them as a fifth column.
column_names = ['Year', 'Senate', 'House', 'President_p']
df = pd.DataFrame(data=data, columns=column_names)
df['President'] = President

In [183]:
# Show the freshly built frame as this cell's output.
df

Unnamed: 0,Year,Senate,House,President_p,President
0,1861–1863\n,R\n,R\n,R\n,[Lincoln]
1,1863–1865\n,R\n,R\n,R\n,[Lincoln]
2,1865–1867,R,R,D,[A. Johnson]
3,1867–1869,R,R,D,[A. Johnson]
4,1869–1871\n,R\n,R\n,R\n,[Grant]
...,...,...,...,...,...
77,2015–2017,R,R,D,[Obama]
78,2017–2019\n,R\n,R\n,R\n,[Trump]
79,2019–2021,R,D,R,[Trump]
80,2021–2023\n,D\n,D\n,,"[[e], Biden]"


## Cleaning Data: Pandas

#### Join the List Elements in the President Column

In [112]:
# Each President entry is a list of strings; collapse it into one
# '/'-separated string per row.
president_lists = df['President']
df['President'] = president_lists.str.join('/')

#### Regular Expression: Removing Newlines and the Leftover Bracket

In [160]:
# Newlines inside a value become a single space ...
df = df.replace(r'\n', ' ', regex=True)
# ... and whitespace left at either end of a value is dropped, so a cell such
# as 'R\n' ends up as 'R' rather than 'R '.
df = df.replace(r'^\s+|\s+$', '', regex=True)
# Remove everything up to and including a ']/' sequence, e.g. the '[e]/'
# prefix produced by joining '[e]' and 'Biden'.
df = df.replace(r'.*\]\/', '', regex=True)

#### Locating a Specific Data Point and Changing It

In [169]:
# Overwrite the President value of the row at position 29. A single .iloc
# call with both row and column positions writes into df itself; the chained
# form `df.iloc[29].President = ...` assigns to a temporary row object and
# can leave df unchanged.
df.iloc[29, df.columns.get_loc('President')] = 'Wilson'

#### To CSV

In [None]:
# Export of the cleaned frame, currently disabled. The "Read CSV" cell loads
# 'US-President-Parties.csv', so this line has to run at least once (or the
# file must already exist) before that cell can succeed.
#df.to_csv('US-President-Parties.csv', index=False)

#### Read CSV

In [204]:
# Load the previously exported table back from disk into `df`.
csv_path = 'US-President-Parties.csv'
df = pd.read_csv(csv_path)

# Creating a Row for Each Year in Every Year Range

In [210]:
# Preview the first five rows of df.
df.head()

Unnamed: 0,Year,Senate,House,President_p,President
0,"[1861, 1863 ]",R,R,R,Lincoln
1,"[1863, 1865 ]",R,R,R,Lincoln
2,"[1865, 1867]",R,R,D,A. Johnson
3,"[1867, 1869]",R,R,D,A. Johnson
4,"[1869, 1871 ]",R,R,R,Grant


In [205]:
# Trim surrounding whitespace first so the parts come out as clean year
# strings ('1863' rather than '1863 '), then split each range on the en dash.
df['Year'] = df['Year'].str.strip().str.split('–')

In [207]:
# Inspect the Year value of the row labelled 0 to check the result of the split.
df.loc[0].Year

['1861', '1863 ']

In [222]:
# Expand every year range into one row per year. The end year is excluded,
# so ['1861', '1863'] produces rows for 1861 and 1862; the other four values
# of the source row are copied unchanged onto each generated row.
expanded_rows = []
for year_span, senate, house, president_party, president in df.values:
    first_year = int(year_span[0])
    end_year = int(year_span[1])
    for year in range(first_year, end_year):
        expanded_rows.append([year, senate, house, president_party, president])

_df = pd.DataFrame(expanded_rows, columns=df.columns)

In [224]:
# Preview the first 30 rows of the per-year frame.
_df.head(30)

Unnamed: 0,Year,Senate,House,President_p,President
0,1861,R,R,R,Lincoln
1,1862,R,R,R,Lincoln
2,1863,R,R,R,Lincoln
3,1864,R,R,R,Lincoln
4,1865,R,R,D,A. Johnson
5,1866,R,R,D,A. Johnson
6,1867,R,R,D,A. Johnson
7,1868,R,R,D,A. Johnson
8,1869,R,R,R,Grant
9,1870,R,R,R,Grant


In [215]:
# NOTE(review): the original line referenced `new_df`, which no visible cell
# defines; `_df` (the per-year frame) is assumed to be the intended target.
# Confirm before relying on this cell.
# The years are parsed with an explicit '%Y' format because pd.to_datetime
# reads bare integers as nanosecond offsets from 1970-01-01 by default.
_df['Year'] = pd.to_datetime(_df['Year'].astype(str), format='%Y')

In [225]:
# Save the per-year frame (_df). `df` still holds one row per year range,
# which is not what a file named "by-Year" should contain.
_df.to_csv('US-President-Parties-by-Year.csv', index=False)

In [213]:
#df['Year'].apply(pd.Series).reset_index().melt(id_vars='index')