# Data gathering and integrating for Computational Thinking Project
## Pulling data from: 
* (A)  Wikipedia -- Washington State population by county, 2020
   
* (B) Office of Financial Management Health Care Research Report -- 2020 access coverage

* (C) ACS Household income, by county -- 5 year estimates, for midpoint yearly household income estimate

In [5]:
import os
import pandas as pd
#import xlrd
#import openpyxl

#os.getcwd()

## Pulling and Cleaning Data

### A. Pulling data from Wikipedia -- 2020 Washington State population, by county

In [33]:
from IPython.display import IFrame   
from sodapy import Socrata

In [34]:
# Wikipedia page that lists Washington's counties
popLink = "https://en.wikipedia.org/wiki/List_of_counties_in_Washington"

# Parse every table on the page carrying the "wikitable sortable" class,
# treating the first row of each as the header
popdata = pd.read_html(
    popLink,
    attrs={"class": "wikitable sortable"},
    flavor="bs4",
    header=0,
)

In [35]:
# Work on an independent copy of the first parsed table so the raw
# read_html result stays untouched
popdt = popdata[0].copy(deep=True)

Clean the data for the sake of merging

In [36]:
## Step 1: remove unnecessary columns
whichToDrop = [1, 2, 3, 4, 5, 7, 8]  # keep only county name & population (2020)
popdt = popdt.drop(columns=popdt.columns[whichToDrop])

## Step 2: remove the trailing 'County' from each county name
# `n` must be passed by keyword: it is keyword-only in pandas 2.x
popdt['County'] = popdt['County'].str.rsplit(" ", n=1).str[0]

## Step 3: remove the last row (full state total)
popdt = popdt.drop(popdt.tail(1).index)

In [37]:
# Preview the first five rows to confirm the population table is ready to merge
popdt.head(n=5)

Unnamed: 0,County,Population (2020)
0,Adams,20613
1,Asotin,22285
2,Benton,206873
3,Chelan,79074
4,Clallam,77155


### B. Pulling data from OFM Report

In [3]:
## using the tabula-py library to read a table from the PDF
import tabula

In [4]:
# Location of the OFM county uninsured-rates chart book (PDF)
ofmLink = 'https://ofm.wa.gov/sites/default/files/public/dataresearch/healthcare/healthcoverage/2012-19_County_Uninsured_Rates_Chart_Book.pdf'

# The data table we need is on page 52
ofmData = tabula.read_pdf(ofmLink, pages=52)

In [6]:
# Inspect what read_pdf returned; the recorded output shows a list
type(ofmData)
#print(ofmData)

list

In [7]:
# Display the first element of the list returned by read_pdf (the table from page 52)
ofmData[0]

Unnamed: 0.1,County,2012,2013,2014,2015,2016,2017,2018,2019,County 2012,2013.1,2014.1,2015.1,Unnamed: 0,2016.1,2017.1,2018.1,2019.1
0,Adams,23.3,18.0,16.7,18.2,15.9,12.9,14.5,7.4,Lincoln 9.6,11.5,7.5,4.1,,5.1,4.3,3.9,3.0
1,Asotin,10.5,12.2,8.9,4.4,5.5,4.0,4.3,2.7,Mason 17.2,21.4,9.1,7.3,,5.9,6.2,8.4,4.0
2,Benton,13.0,16.9,8.5,3.4,4.8,5.3,4.8,6.1,Okanogan 21.3,20.3,13.7,10.4,,12.0,14.1,12.6,8.4
3,Chelan,16.0,19.7,11.8,6.1,6.2,10.0,10.6,10.4,Pacific 12.5,13.2,8.5,5.7,,5.3,4.1,5.3,4.0
4,Clallam,14.5,15.5,11.1,6.9,7.0,7.4,8.3,9.1,Pend Oreille 15.5,16.6,10.6,6.9,,8.8,7.6,9.3,6.6
5,Clark,13.4,12.5,8.6,6.6,4.6,5.1,6.1,5.5,Pierce 13.8,13.2,8.3,5.7,,4.7,5.5,4.9,5.3
6,Columbia,11.2,12.2,8.9,4.3,5.8,4.0,4.6,3.0,San Juan 10.2,11.6,7.6,3.7,,4.1,4.2,4.1,5.0
7,Cowlitz,14.5,14.8,9.7,6.4,5.5,4.4,5.8,5.0,Skagit 14.2,13.8,10.8,4.9,,5.2,5.6,6.0,6.7
8,Douglas,16.7,20.8,12.6,6.4,7.0,10.6,11.7,11.7,Skamania 13.3,14.5,10.9,5.4,,6.5,8.5,5.8,6.0
9,Ferry,16.9,18.0,11.8,8.5,9.8,8.5,10.9,7.3,Snohomish 13.5,12.8,7.5,5.4,,4.7,4.4,6.5,6.6


In [8]:
# Wrap the first extracted table in a DataFrame to work from
ofm_dt = pd.DataFrame(data=ofmData[0])

#### This data is pretty messy -- here is my approach to cleaning this data:
1. Separate out the first 9 columns that look good
2. Separate out the 10th column (county names + 2012 estimate)   
    2a. Split the text and 2012 estimates into 2 columns
3. Separate out the 11th - 13th columns & 15th - 18th (eliminate the Unnamed NaN col)   
    3a. Bind the two datatables back together for 2013 -- 2019 clean estimates   
4. Bind the cleaned 10th column (with county names and 2012 est) back to the 2013-2019 data
5. Append the completed right datatable to the bottom of the left datatable

**Steps 1, 2 and 3:**  
Separate out the data into usable chunks

In [9]:
# Each chunk is taken as an explicit copy: the chunks are trimmed and extended
# in later cells, and modifying a plain slice of ofm_dt can trigger
# SettingWithCopyWarning.

## this one is good, the format we want (step 1)
ofm_dtA = ofm_dt.iloc[:, 0:9].copy()

## this col has both county names and the 2012 estimate (step 2)
ofm_dtB = ofm_dt.iloc[:, 9].copy()

## these are separated by an NA col, which is skipped (step 3)
ofm_dtC = ofm_dt.iloc[:, 10:13].copy()
ofm_dtD = ofm_dt.iloc[:, 14:18].copy()

**Step 2a:**   
Time to clean the column with both county names and 2012 estimates

In [10]:
# Step 1: remove the bottom 3 rows
# (presumably padding, since the right half of the table lists fewer counties -- TODO confirm)
ofm_dtB = ofm_dtB.drop(ofm_dtB.tail(3).index)

# re-set the object to a dataframe so new columns can be added
ofm_dtB = pd.DataFrame(ofm_dtB)

# Step 2: split once at the last space -> county name on the left, 2012 estimate on the right
# (`n` must be passed by keyword: it is keyword-only in pandas 2.x)
name_and_2012 = ofm_dtB['County 2012'].str.rsplit(" ", n=1)
ofm_dtB['County'] = name_and_2012.str[0]
# convert the estimate to a number so pui_2012 has the same dtype as the other year columns
ofm_dtB['pui_2012'] = pd.to_numeric(name_and_2012.str[-1])

# Step 3: remove the unnecessary combined column
ofm_dtB = ofm_dtB.drop(columns=['County 2012'])

**Step 3a:**   
Now clean C and D and bind them together so they fit the structure of A

In [11]:
# 2013-2015 block: remove the bottom 3 rows, then give the columns their pui_<year> names.
# drop() returns a new frame here instead of mutating a slice of ofm_dt in place.
ofm_dtC = (
    ofm_dtC
    .drop(ofm_dtC.tail(3).index)
    .rename(columns={'2013.1': 'pui_2013', '2014.1': 'pui_2014', '2015.1': 'pui_2015'})
)

# 2016-2019 block: same treatment
ofm_dtD = (
    ofm_dtD
    .drop(ofm_dtD.tail(3).index)
    .rename(columns={'2016.1': 'pui_2016', '2017.1': 'pui_2017', '2018.1': 'pui_2018', '2019.1': 'pui_2019'})
)

**Step 4:**   
Now bind all the year data for the second half of the counties

In [12]:
# Place the county/2012 columns, the 2013-2015 columns and the 2016-2019
# columns side by side (aligned on the row index)
ofm_dtE = pd.concat(objs=[ofm_dtB, ofm_dtC, ofm_dtD], axis="columns")

**Step 5:**   
And then bind the first and second halves together (A and E) 

In [13]:
# Give the left-half year columns the same pui_<year> names used for the right half
ofm_dtA = ofm_dtA.rename(columns={str(year): f'pui_{year}' for year in range(2012, 2020)})

# Stack the two halves into one table. DataFrame.append was removed in pandas 2.0,
# so use pd.concat; ignore_index gives the result a fresh 0..n-1 index instead of
# repeating the row labels of each half.
ofm_clean = pd.concat([ofm_dtA, ofm_dtE], ignore_index=True)

In [14]:
# Preview the first five rows of the combined uninsured-rate table
ofm_clean.head(n=5)

Unnamed: 0,County,pui_2012,pui_2013,pui_2014,pui_2015,pui_2016,pui_2017,pui_2018,pui_2019
0,Adams,23.3,18.0,16.7,18.2,15.9,12.9,14.5,7.4
1,Asotin,10.5,12.2,8.9,4.4,5.5,4.0,4.3,2.7
2,Benton,13.0,16.9,8.5,3.4,4.8,5.3,4.8,6.1
3,Chelan,16.0,19.7,11.8,6.1,6.2,10.0,10.6,10.4
4,Clallam,14.5,15.5,11.1,6.9,7.0,7.4,8.3,9.1


### C. Now read in CSV of ACS data 

In [25]:
# Raw CSV of the ACS data, hosted in the project's GitHub repository
ACS_filePath = 'https://github.com/varshaskrish/comp-thinking/raw/main/Project/ACS_houshold_income_WA_county_2019_2012.csv'

# Load the CSV straight from the URL
acs_dt = pd.read_csv(filepath_or_buffer=ACS_filePath)

In [26]:
# Preview the first five rows of the ACS table
acs_dt.head(n=5)

Unnamed: 0,GEO_ID,county,he_2019,he_2018,he_2017,he_2016,he_2015,he_2014,he_2013,he_2012
0,0500000US53001,"Adams County, Washington",5973,5881,5824,5733,5802,5827,5738,5722
1,0500000US53003,"Asotin County, Washington",9101,9171,9235,9297,9341,9405,9058,9046
2,0500000US53005,"Benton County, Washington",72121,70983,70363,68418,67430,66625,65675,64660
3,0500000US53007,"Chelan County, Washington",28384,28038,27383,27200,27052,27183,27220,27048
4,0500000US53009,"Clallam County, Washington",32958,32732,32280,31438,31321,30963,30814,30877


Now clean the data for the merge 

In [27]:
# remove the unnecessary first column (GEO_ID) by name; unlike a positional slice,
# this fails loudly instead of silently discarding 'county' if the cell is re-run
acs_dt = acs_dt.drop(columns=['GEO_ID'])

In [28]:
# remove the ' County, Washington' suffix from each county name
acs_dt = pd.DataFrame(acs_dt)
# split once at the last comma to remove ', Washington'
# (`n` must be passed by keyword: it is keyword-only in pandas 2.x)
acs_dt['county'] = acs_dt['county'].str.rsplit(",", n=1).str[0]
# split once at the last space to remove ' County'
acs_dt['county'] = acs_dt['county'].str.rsplit(" ", n=1).str[0]

# rename 'county' to "County" so the merge key matches the other tables
acs_dt = acs_dt.rename(columns={'county': 'County'})

## Merging data

Steps: 
1. Merge the percent uninsured (pui) data to the (household income) data
2. Merge the population data to these data

In [29]:
# Step 1: join the uninsured-rate table to the ACS table on county name,
# keeping counties that appear in either table
df_fin = ofm_clean.merge(acs_dt, how='outer', on='County')

In [30]:
# Step 2: join the population table (left) to the result of step 1 (right)
# on county name, again keeping counties found in either table
df_fin = popdt.merge(df_fin, how='outer', on="County")

In [31]:
# Preview the first five rows of the fully merged table
df_fin.head(n=5)

Unnamed: 0,County,Population (2020),pui_2012,pui_2013,pui_2014,pui_2015,pui_2016,pui_2017,pui_2018,pui_2019,he_2019,he_2018,he_2017,he_2016,he_2015,he_2014,he_2013,he_2012
0,Adams,20613,23.3,18.0,16.7,18.2,15.9,12.9,14.5,7.4,5973,5881,5824,5733,5802,5827,5738,5722
1,Asotin,22285,10.5,12.2,8.9,4.4,5.5,4.0,4.3,2.7,9101,9171,9235,9297,9341,9405,9058,9046
2,Benton,206873,13.0,16.9,8.5,3.4,4.8,5.3,4.8,6.1,72121,70983,70363,68418,67430,66625,65675,64660
3,Chelan,79074,16.0,19.7,11.8,6.1,6.2,10.0,10.6,10.4,28384,28038,27383,27200,27052,27183,27220,27048
4,Clallam,77155,14.5,15.5,11.1,6.9,7.0,7.4,8.3,9.1,32958,32732,32280,31438,31321,30963,30814,30877


### **Alternative version -- make the year data long**

In [17]:
## OFM data
#ofm_wide = ofm_clean.copy()

#ofm_long = pd.wide_to_long(ofm_wide, ['pui'], i='County', j='year', sep='_')

#ofm_long.head()

In [29]:
## ACS Data
#acs_wide = acs_dt.copy()

#acs_long = pd.wide_to_long(acs_wide, ['he'], i='County', j='year', sep='_')

#acs_long.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,he
County,year,Unnamed: 2_level_1
Adams,2019,5973
Asotin,2019,9101
Benton,2019,72121
Chelan,2019,28384
Clallam,2019,32958


In [38]:
## merge the 3 datasets together 

#df_fin_long = pd.merge(ofm_long, acs_long, on=['County','year'], how='outer') 

#df_fin_long = pd.merge(popdt, df_fin_long, on="County", how='outer')

### Save data in .Rds file format

In [33]:
# Step 1: write the merged table to a CSV file, leaving out the row index
df_fin.to_csv(path_or_buf="deliverable1_data_clean_check.csv", index=False)

In [34]:
# Step 2: reload the CSV into a fresh DataFrame and summarise its columns,
# non-null counts and dtypes
df_fin_check = pd.read_csv(filepath_or_buffer="deliverable1_data_clean_check.csv")
df_fin_check.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   County             39 non-null     object 
 1   Population (2020)  39 non-null     int64  
 2   pui_2012           39 non-null     float64
 3   pui_2013           39 non-null     float64
 4   pui_2014           39 non-null     float64
 5   pui_2015           39 non-null     float64
 6   pui_2016           39 non-null     float64
 7   pui_2017           39 non-null     float64
 8   pui_2018           39 non-null     float64
 9   pui_2019           39 non-null     float64
 10  he_2019            39 non-null     int64  
 11  he_2018            39 non-null     int64  
 12  he_2017            39 non-null     int64  
 13  he_2016            39 non-null     int64  
 14  he_2015            39 non-null     int64  
 15  he_2014            39 non-null     int64  
 16  he_2013            39 non-nu

In [36]:
# Step 3: Write out as an .rds
# Requires the rpy2 package (and an R installation it can find).
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr

base = importr('base')

# Apply the pandas <-> R conversion only inside this block, rather than
# switching it on globally with pandas2ri.activate(), which is deprecated
# in newer rpy2 releases. The DataFrame is converted to an R data.frame
# when it is passed to saveRDS.
with localconverter(ro.default_converter + pandas2ri.converter):
    base.saveRDS(df_fin_check, file="alldata_OK.RDS")

Collecting rpy2
  Downloading rpy2-3.4.5.tar.gz (194 kB)
[K     |████████████████████████████████| 194 kB 3.6 MB/s eta 0:00:01
Collecting tzlocal
  Downloading tzlocal-4.1-py3-none-any.whl (19 kB)
Collecting pytz-deprecation-shim
  Downloading pytz_deprecation_shim-0.1.0.post0-py2.py3-none-any.whl (15 kB)
Collecting tzdata
  Downloading tzdata-2021.5-py2.py3-none-any.whl (339 kB)
[K     |████████████████████████████████| 339 kB 1.7 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: rpy2
  Building wheel for rpy2 (setup.py) ... [?25ldone
[?25h  Created wheel for rpy2: filename=rpy2-3.4.5-py3-none-any.whl size=198821 sha256=7a98030453cd19c115815d9702ef9cbf7b596b983ad5078f0dd2c56b87b7a17f
  Stored in directory: /Users/Varsha/Library/Caches/pip/wheels/36/d1/63/882c1f63d21bcf817b7db960b9536a747d4258daeaace0edd4
Successfully built rpy2
Installing collected packages: tzdata, pytz-deprecation-shim, tzlocal, rpy2
Successfully installed pytz-deprecation-shim-0.1.0.post0 rpy2-3.4.

  values, tz_parsed = conversion.datetime_to_datetime64(data.ravel("K"))


NameError: name 'allDataFull' is not defined