In [1]:
import pandas as pd
import numpy as np
import requests as req
import psycopg2
from sqlalchemy import create_engine

# Extraction

### We downloaded csv files of 2016 presidential election results by county, unemployment rate by county, and minimum wage by State data sets from kaggle.com. After a preeliminary inspection we noticed data sets had different use of State identifiers, some full name and others in abbreviations, so we looked for a dataset containing both. We decided to compare tables from www.infoplease.com and https://www.50states.com/abbreviations.htm . We gathered the tables with a python Request get() method.

# Transformation

### 1. Read into pandas and make data frames

#### We downloaded the csv file from kaggle, read it with read_csv, and made a data frame. We inspected the new data frame with .head()

### Unemployment Rate by County

In [2]:
unemployment_df = pd.read_csv('Resources/unemp.csv')
unemployment_df.head()

Unnamed: 0,Year,Month,State,County,Rate
0,2015,February,Mississippi,Newton County,6.1
1,2015,February,Mississippi,Panola County,9.4
2,2015,February,Mississippi,Monroe County,7.9
3,2015,February,Mississippi,Hinds County,6.1
4,2015,February,Mississippi,Kemper County,10.6


### 2016 Presidential Results by County

In [3]:
presidential_results_df = pd.read_csv('Resources/pres16results.csv')
presidential_results_df.head(5)

Unnamed: 0,county,fips,cand,st,pct_report,votes,total_votes,pct,lead
0,,US,Donald Trump,US,0.9951,60350241.0,127592176.0,0.472993,Donald Trump
1,,US,Hillary Clinton,US,0.9951,60981118.0,127592176.0,0.477938,Donald Trump
2,,US,Gary Johnson,US,0.9951,4164589.0,127592176.0,0.03264,Donald Trump
3,,US,Jill Stein,US,0.9951,1255968.0,127592176.0,0.009844,Donald Trump
4,,US,Evan McMullin,US,0.9951,451636.0,127592176.0,0.00354,Donald Trump


### Minimum Wage by State

### We did a three steps trial for reading this dataset into pandas with                            pd.read_csv('Resources/Minimum Wage Data.csv')

### First time we got an UnicodeDecodeError, so we tried first with encoding="utc-8", but got a "LookupError: unknown encoding: utc-8". After searching on google we tried endcoding='latin' and it worked. The three steps are below

minimum_wage_df = pd.read_csv('Resources/Minimum Wage Data.csv')

minimum_wage_df = pd.read_csv('Resources/Minimum Wage Data.csv', encoding="utc-8")

In [4]:
minimum_wage_df = pd.read_csv('Resources/Minimum Wage Data.csv', encoding="latin")

### Export the encoded dataframe to a new csv file

In [5]:
minimum_wage_df.to_csv('Resources/minimumw.csv', encoding='utf-8')

### Read the new csv file and explore columns

In [6]:
min_wage_df = pd.read_csv('Resources/minimumw.csv')
min_wage_df.head(5)

Unnamed: 0.1,Unnamed: 0,Year,State,Table_Data,Footnote,High.Value,Low.Value,CPI.Average,High.2018,Low.2018
0,0,1968,Alabama,...,,0.0,0.0,34.783333,0.0,0.0
1,1,1968,Alaska,2.10,,2.1,2.1,34.783333,15.12,15.12
2,2,1968,Arizona,18.72 - 26.40/wk(b),(b),0.66,0.468,34.783333,4.75,3.37
3,3,1968,Arkansas,1.25/day(b),(b),0.15625,0.15625,34.783333,1.12,1.12
4,4,1968,California,1.65(b),(b),1.65,1.65,34.783333,11.88,11.88


# Select and rename columns

In [7]:
unemployment_df.columns

Index(['Year', 'Month', 'State', 'County', 'Rate'], dtype='object')

In [8]:
presidential_results_df["st"].unique()

array(['US', 'CA', 'FL', 'TX', 'NY', 'PA', 'IL', 'OH', 'MI', 'NC', 'GA',
       'VA', 'NJ', 'MA', 'WI', 'MN', 'MO', 'WA', 'IN', 'CO', 'TN', 'MD',
       'SC', 'AL', 'AZ', 'LA', 'KY', 'OR', 'CT', 'IA', 'OK', 'MS', 'KS',
       'NV', 'AR', 'UT', 'NE', 'NM', 'ME', 'NH', 'WV', 'ID', 'MT', 'RI',
       'DE', 'HI', 'SD', 'ND', 'VT', 'DC', 'WY', 'AK', nan], dtype=object)

In [9]:
min_wage_df.columns

Index(['Unnamed: 0', 'Year', 'State', 'Table_Data', 'Footnote', 'High.Value',
       'Low.Value', 'CPI.Average', 'High.2018', 'Low.2018'],
      dtype='object')

In [10]:
unemploy = unemployment_df
pres_res = presidential_results_df.copy()[['county', 'st', 'lead']]
min_wage = min_wage_df.copy()[['Year', 'State','Low.2018']]

In [11]:
pres_res.rename(columns={'county': "County", 'st': "State", 'lead': "Lead"}, inplace=True)

In [12]:
min_wage.rename(columns={'Low.2018':"Minimun_Wage"}, inplace=True)

### Replace content of states columns to state abbreviations for consistency

### For this, we scrapped two websites to get a table of the states and their abbreviations, and we decided to use the results from infoplease.com

### Scrape for States' abbreviations table

In [13]:
#Get the states abbreviations from infoplease
html = req.get ('https://www.infoplease.com/us/postal-information/state-abbreviations-and-state-postal-codes')
#Read the response with read_html and make a dataframe
states_df = pd.read_html(html.text)
#print(states_df)
states_abbv = states_df[0]
states_abbv.to_csv('Resources/states_abbreviations.csv', index=False)

state_url = req.get('https://www.50states.com/abbreviations.htm')
state_abv = pd.read_html(state_url.text)
state_abvs = state_abv[0]

### Read states_abbv aa a dict and map the abbreviation into the 3 dataframes

In [14]:
states_abbv = pd.read_csv('Resources/states_abbreviations.csv', index_col=0)
#Inspect columns with states_abbv.columns and create df to use
states_postalcodes = states_abbv[["Postal Code"]]

In [15]:
#Make a dict for mapping later into the dataframes
states_postalcodes_dict = states_postalcodes.to_dict()["Postal Code"]
# # Inspect dict
# states_postalcodes_dict

### Map states dict into dataframes

In [16]:
unemploy['State'] = unemploy['State'].map(states_postalcodes_dict)
min_wage['State'] = min_wage['State'].map(states_postalcodes_dict)
#Check replacement
min_wage['State'].unique()

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', nan, 'FL',
       'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
       'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM',
       'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN',
       'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'], dtype=object)

### Drop NaN

In [17]:
unemploy.dropna(inplace=True)
unemploy

Unnamed: 0,Year,Month,State,County,Rate
0,2015,February,MS,Newton County,6.1
1,2015,February,MS,Panola County,9.4
2,2015,February,MS,Monroe County,7.9
3,2015,February,MS,Hinds County,6.1
4,2015,February,MS,Kemper County,10.6
...,...,...,...,...,...
885543,2009,November,ME,Somerset County,10.5
885544,2009,November,ME,Oxford County,10.5
885545,2009,November,ME,Knox County,7.5
885546,2009,November,ME,Piscataquis County,11.3


In [28]:
pres_res.dropna(inplace=True)

In [19]:
#min_wage.dropna()
#State AL shows minimum wage at 0, so we decided to replace and drop
min_wage.replace(0, np.NaN).dropna(inplace=True)
min_wage

Unnamed: 0,Year,State,Minimun_Wage
0,1968,AL,0.00
1,1968,AK,15.12
2,1968,AZ,3.37
3,1968,AR,1.12
4,1968,CA,11.88
...,...,...,...
2745,2017,VA,7.41
2746,2017,WA,11.24
2747,2017,WV,8.94
2748,2017,WI,7.41


# Load into a Database

In [20]:
#Create engine and connection with database
engine = create_engine('postgres://USER:PASSWORD@localhost:5433/etl_project')
conn = engine.connect()

In [21]:
#Verify tables
engine.table_names()

['min_wage', 'election', 'unemployment']

In [22]:
unemploy.to_sql(name="unemployment", con=engine, if_exists="replace", index=False)

In [23]:
#confirm data has been added by querying table
pd.read_sql_query('select * from unemployment', con=engine).head()

Unnamed: 0,Year,Month,State,County,Rate
0,2015,February,MS,Newton County,6.1
1,2015,February,MS,Panola County,9.4
2,2015,February,MS,Monroe County,7.9
3,2015,February,MS,Hinds County,6.1
4,2015,February,MS,Kemper County,10.6


In [24]:
pres_res.to_sql(name="election", con=engine, if_exists="replace", index=True)

In [25]:
#confirm data has been added by querying table
pd.read_sql_query('select * from election', con=engine).head()

Unnamed: 0,index,County,State,Lead
0,159,Los Angeles County,CA,Hillary Clinton
1,160,Los Angeles County,CA,Hillary Clinton
2,161,Los Angeles County,CA,Hillary Clinton
3,162,Los Angeles County,CA,Hillary Clinton
4,163,Los Angeles County,CA,Hillary Clinton


In [26]:
min_wage.to_sql(name="min_wage", con=engine, if_exists="replace", index=False)

In [27]:
#confirm data has been added by querying table
pd.read_sql_query('select * from min_wage', con=engine).head()

Unnamed: 0,Year,State,Minimun_Wage
0,1968,AL,0.0
1,1968,AK,15.12
2,1968,AZ,3.37
3,1968,AR,1.12
4,1968,CA,11.88
