# Data gathering and integrating for Computational Thinking Project
## Pulling data from: 
* (A)  Wiki population by county, 2020
   
* (B) Office of Financial Management Health Care Research Report

* (C) ACS Household income, by county

In [1]:
import os
import pandas as pd
#import xlrd
#import openpyxl

#os.getcwd()

## Pulling and Cleaning Data

### A. Pulling data from Wikipedia -- 2020 Washington State population, by county

In [2]:
from IPython.display import IFrame   
from sodapy import Socrata

In [30]:
# Wikipedia page that lists Washington's counties
popLink = "https://en.wikipedia.org/wiki/List_of_counties_in_Washington"

# Parse the page's tables whose class attribute is "wikitable sortable",
# treating the first row of each as the header; the result is a list of tables.
popdata = pd.read_html(
    popLink,
    header=0,
    flavor="bs4",
    attrs={"class": "wikitable sortable"},
)

In [40]:
# Work on an independent copy of the first table returned by read_html,
# so the cleaning steps never touch the originally fetched data.
popdt = popdata[0].copy(deep=True)

Clean the data for the sake of merging

In [42]:
## Step 1: remove unnecessary columns
# Positions of the columns to discard; what remains is the county name
# and the 2020 population.
whichToDrop = [1, 2, 3, 4, 5, 7, 8]
popdt.drop(labels=popdt.columns[whichToDrop], axis=1, inplace=True)

## Step 2: remove the trailing 'County' from each county name
# Split once from the right and keep the left part, so multi-word names stay
# intact. `n` is passed by keyword because pandas 2.x accepts only `pat`
# positionally in str.rsplit.
popdt['County'] = popdt['County'].str.rsplit(" ", n=1).str[0]

## Step 3: remove the last row (full state total)
popdt.drop(popdt.tail(1).index, inplace=True)

In [189]:
# Preview the first five rows to confirm the population table is ready to merge
popdt.head(n=5)

Unnamed: 0,County,Population (2020)
0,Adams,20613
1,Asotin,22285
2,Benton,206873
3,Chelan,79074
4,Clallam,77155


### B. Pulling data from OFM Report

In [47]:
## using the tabula-py library to read the table from the PDF
import tabula

In [50]:
# Location of the OFM county uninsured-rates chart book (PDF)
ofmLink = 'https://ofm.wa.gov/sites/default/files/public/dataresearch/healthcare/healthcoverage/2012-19_County_Uninsured_Rates_Chart_Book.pdf'

# The data table of interest is on page 52; extract only that page
ofmData = tabula.read_pdf(ofmLink, pages=52)

In [58]:
# Inspect what read_pdf returned (a list, per the output below)
type(ofmData)
#print(ofmData)

list

In [59]:
# Display the first (index 0) element of the extracted list: the raw table
ofmData[0]

Unnamed: 0.1,County,2012,2013,2014,2015,2016,2017,2018,2019,County 2012,2013.1,2014.1,2015.1,Unnamed: 0,2016.1,2017.1,2018.1,2019.1
0,Adams,23.3,18.0,16.7,18.2,15.9,12.9,14.5,7.4,Lincoln 9.6,11.5,7.5,4.1,,5.1,4.3,3.9,3.0
1,Asotin,10.5,12.2,8.9,4.4,5.5,4.0,4.3,2.7,Mason 17.2,21.4,9.1,7.3,,5.9,6.2,8.4,4.0
2,Benton,13.0,16.9,8.5,3.4,4.8,5.3,4.8,6.1,Okanogan 21.3,20.3,13.7,10.4,,12.0,14.1,12.6,8.4
3,Chelan,16.0,19.7,11.8,6.1,6.2,10.0,10.6,10.4,Pacific 12.5,13.2,8.5,5.7,,5.3,4.1,5.3,4.0
4,Clallam,14.5,15.5,11.1,6.9,7.0,7.4,8.3,9.1,Pend Oreille 15.5,16.6,10.6,6.9,,8.8,7.6,9.3,6.6
5,Clark,13.4,12.5,8.6,6.6,4.6,5.1,6.1,5.5,Pierce 13.8,13.2,8.3,5.7,,4.7,5.5,4.9,5.3
6,Columbia,11.2,12.2,8.9,4.3,5.8,4.0,4.6,3.0,San Juan 10.2,11.6,7.6,3.7,,4.1,4.2,4.1,5.0
7,Cowlitz,14.5,14.8,9.7,6.4,5.5,4.4,5.8,5.0,Skagit 14.2,13.8,10.8,4.9,,5.2,5.6,6.0,6.7
8,Douglas,16.7,20.8,12.6,6.4,7.0,10.6,11.7,11.7,Skamania 13.3,14.5,10.9,5.4,,6.5,8.5,5.8,6.0
9,Ferry,16.9,18.0,11.8,8.5,9.8,8.5,10.9,7.3,Snohomish 13.5,12.8,7.5,5.4,,4.7,4.4,6.5,6.6


In [62]:
# Wrap the first extracted table in a DataFrame to use for the cleaning steps
ofm_dt = pd.DataFrame(data=ofmData[0])

#### This data is pretty messy -- here is my approach to cleaning this data:
1. Separate out the first 9 columns that look good
2. Separate out the 10th column (county names + 2012 estimate)   
    2a. split the text and 2012 estimates into 2 columns
3. Separate out the 11th - 13th columns & 15th - 18th (eliminate the Unnamed NaN col)   
    3a. Bind the two datatables back together for 2013 -- 2019 clean estimates   
4. Bind the cleaned 10th column (with county names and 2012 est) back to the 2013-2019 data
5. Append the completed right datatable to the bottom of the left datatable

Steps 1, 2 and 3

In [176]:
# Each piece is taken as an explicit copy so that later in-place edits act on
# independent objects rather than on slices of ofm_dt (which can raise
# SettingWithCopyWarning).

## this one is good, the format we want: County + 2012-2019 (step 1)
ofm_dtA = ofm_dt.iloc[:, 0:9].copy()

## this col has both county names and the 2012 estimate (step 2)
ofm_dtB = ofm_dt.iloc[:, 9].copy()

## the remaining year columns are separated by an all-NaN "Unnamed" col
## at position 13, which is skipped here (step 3)
ofm_dtC = ofm_dt.iloc[:, 10:13].copy()
ofm_dtD = ofm_dt.iloc[:, 14:18].copy()

Step 2a: Time to clean the column with both county names and 2012 estimates

In [177]:
#Step 1: remove the bottom 3 rows
# (presumably padding, since the right half of the table lists fewer counties
# than the left -- verify against the PDF)
# Non-in-place drop: avoids mutating a slice of ofm_dt.
ofm_dtB = ofm_dtB.drop(ofm_dtB.tail(3).index)

#re-set the object to a dataframe (single column named 'County 2012')
ofm_dtB = pd.DataFrame(ofm_dtB)

#Step 2: split once on the last space to separate the number from the county name
# `n` is passed by keyword because pandas 2.x accepts only `pat` positionally.
# Splitting from the right keeps multi-word names such as "Pend Oreille" intact.
county_and_rate = ofm_dtB['County 2012'].str.rsplit(" ", n=1)
ofm_dtB['County'] = county_and_rate.str[0]
# Convert the estimate to a number so this column has the same type as the
# other year columns; anything that is not a number becomes NaN instead of
# stopping the notebook.
ofm_dtB['2012'] = pd.to_numeric(county_and_rate.str[-1], errors="coerce")

#Step 3: remove the unnecessary combined column
ofm_dtB = ofm_dtB.drop(['County 2012'], axis=1)

Step 3a: Now clean and bind together the C and D so they can fit the structure of A

In [178]:
# Both frames get the same treatment:
#   * remove the bottom 3 rows, as is done for the county-name column
#   * strip the ".1" suffix from the duplicated year headers so the column
#     names match the left half of the table
# drop() is used without inplace so that a slice of ofm_dt is never mutated.

# 2013-2015
ofm_dtC = (
    ofm_dtC
    .drop(ofm_dtC.tail(3).index)
    .rename(columns={'2013.1': '2013', '2014.1': '2014', '2015.1': '2015'})
)

# 2016-2019
ofm_dtD = (
    ofm_dtD
    .drop(ofm_dtD.tail(3).index)
    .rename(columns={'2016.1': '2016', '2017.1': '2017', '2018.1': '2018', '2019.1': '2019'})
)

Step 4: Now bind all the year data for the second half of the counties

In [179]:
# Place the county/2012 frame, the 2013-2015 frame and the 2016-2019 frame
# side by side (aligned on row label) to rebuild the right half of the table.
ofm_dtE = pd.concat(
    [ofm_dtB, ofm_dtC, ofm_dtD],
    axis="columns",
)

Step 5: And then bind the first and second halves together (A and E) 

In [180]:
# Stack the right half (E) underneath the left half (A).
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. Row labels are kept as-is, exactly as append did, so the labels
# of the two halves overlap.
ofm_clean = pd.concat([ofm_dtA, ofm_dtE])

In [182]:
# Preview the first five rows of the combined OFM table
ofm_clean.head(n=5)

Unnamed: 0,County,2012,2013,2014,2015,2016,2017,2018,2019
0,Adams,23.3,18.0,16.7,18.2,15.9,12.9,14.5,7.4
1,Asotin,10.5,12.2,8.9,4.4,5.5,4.0,4.3,2.7
2,Benton,13.0,16.9,8.5,3.4,4.8,5.3,4.8,6.1
3,Chelan,16.0,19.7,11.8,6.1,6.2,10.0,10.6,10.4
4,Clallam,14.5,15.5,11.1,6.9,7.0,7.4,8.3,9.1


### C. Now read in CSV of ACS data 

In [190]:
# Raw GitHub URL of the ACS CSV file
ACS_filePath = 'https://github.com/varshaskrish/comp-thinking/raw/main/ACS_houshold_income_WA_county_2019_2012.csv'

# Load the CSV straight from the URL with default parsing options.
# NOTE(review): the preview of this frame shows "Unnamed: N" column labels,
# so the file's real header row may not be on its first line -- confirm
# before merging.
acs_dt = pd.read_csv(filepath_or_buffer=ACS_filePath)

In [196]:
# Preview the last five rows of the ACS table
acs_dt.tail(n=5)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,ACS_houshold_income_WA_county_2019_2012
0500000US53061,Snohomish County,299827,300215,292430,286116,284176,278683,270616,270568
0500000US53063,Spokane County,211723,206191,200949,195807,193117,187603,186456,189004
0500000US53067,Thurston County,112909,110713,112075,105863,105159,103319,99815,102335
0500000US53073,Whatcom County,88794,87080,88636,84011,80023,81973,78330,79029
0500000US53077,Yakima County,83992,83320,82107,81084,81709,79700,79742,78472


pandas.core.frame.DataFrame

## Merging data

In [None]:
### SAVE DATA AS AN RDS FILE!