In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import requests as req

# Extraction

### We downloaded CSV files of the 2016 presidential election results by county, the unemployment rate by county, and the minimum wage by state from kaggle.com. After a preliminary inspection we noticed that the data sets identify states differently (some use the full name, others the abbreviation), so we looked for a data set containing both. We decided to compare the tables from www.infoplease.com and https://www.50states.com/abbreviations.htm . We retrieved the tables with the `get()` method of the Python `requests` library.

# Transformation

### 1. Read into pandas and make data frames

#### We downloaded the CSV files from Kaggle, read each one with `read_csv` into its own data frame, and inspected the new data frames with `.head()`.

### Original files

In [2]:
# Load the two raw Kaggle exports from the local Resources folder.
# Both files carry a previously saved index, which pandas surfaces as an
# 'Unnamed: 0' column (visible in the .head() previews below).
unemployment_csv = 'Resources/unemp.csv'
presidential_csv = 'Resources/pres16results.csv'

unemployment_df = pd.read_csv(unemployment_csv)
presidential_results_df = pd.read_csv(presidential_csv)

In [3]:
# Preview the first three rows; note that 'State' holds full state names here.
unemployment_df.head(n=3)

Unnamed: 0,Year,Month,State,County,Rate
0,2015,February,Mississippi,Newton County,6.1
1,2015,February,Mississippi,Panola County,9.4
2,2015,February,Mississippi,Monroe County,7.9


In [6]:
# Preview the first three rows; the 'st' column already uses abbreviated codes.
presidential_results_df.head(n=3)

Unnamed: 0,county,fips,cand,st,pct_report,votes,total_votes,pct,lead
0,,US,Donald Trump,US,0.9951,60350241.0,127592176.0,0.472993,Donald Trump
1,,US,Hillary Clinton,US,0.9951,60981118.0,127592176.0,0.477938,Donald Trump
2,,US,Gary Johnson,US,0.9951,4164589.0,127592176.0,0.03264,Donald Trump


### Create a states abbreviation dict for mapping into the dataframes to change states columns to state abbreviations for consistency

###### For this, we scraped two websites to get a table of the states and their abbreviations, and we decided to use the results from infoplease.com. The code for the other website is shown below for reference.

state_url = req.get('https://www.50states.com/abbreviations.htm')  # holds the HTTP response for the page, not a URL string
state_abv = pd.read_html(state_url.text)  # read_html returns a list of DataFrames, one per HTML table it finds
state_abvs = state_abv[0]  # keep the first table found on the page

### Scrape for States' abbreviations table

In [7]:
# Fetch the state-name / postal-code table from infoplease.
# A timeout keeps the cell from hanging forever if the site does not respond.
html = req.get(
    'https://www.infoplease.com/us/postal-information/state-abbreviations-and-state-postal-codes',
    timeout=30,
)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the table.
html.raise_for_status()

# read_html returns a list with one DataFrame per <table> in the page.
# The markup is wrapped in StringIO because passing a literal HTML string
# directly is deprecated in recent pandas releases.
states_df = pd.read_html(StringIO(html.text))
states_abbv = states_df[0]

# Persist the scraped table, then read it back with the first column as the
# index so that the dict built below is keyed by that column (the full state
# name, which is what the 'State' column of the unemployment data contains).
states_abbv.to_csv('Resources/states_abbreviations.csv', index=False)
states_abbv = pd.read_csv('Resources/states_abbreviations.csv', index_col=0)

# Keep only the postal-code column and turn it into {index value: postal code}
# for mapping into the data frames later.
states_postalcodes = states_abbv[["Postal Code"]]
states_postalcodes_dict = states_postalcodes.to_dict()["Postal Code"]

### Map unemployment and minimum wage data frames 

In [8]:
# Replace full state names with their postal abbreviations.
# Values that are not keys of the dict are kept as they are instead of becoming
# NaN. This makes the cell safe to re-run: on a second execution the column
# already holds abbreviations, which a plain .map() would wipe out to NaN.
state_values = unemployment_df['State']
unemployment_df['State'] = state_values.map(states_postalcodes_dict).fillna(state_values)
# Check replacement: every entry should now be a two-letter code.
unemployment_df['State'].unique()

array(['MS', 'OK', 'DE', 'MN', 'IL', 'AR', 'NM', 'IN', 'MD', 'LA', 'ID',
       'WY', 'TN', 'AZ', 'IA', 'KS', 'UT', 'VA', 'OR', 'CT', 'MT', 'CA',
       'MA', 'WV', 'SC', 'NH', 'WI', 'VT', 'ND', 'PA', 'KY', 'HI', 'NE',
       'MO', 'OH', 'AL', 'RI', 'SD', 'CO', 'NJ', 'WA', 'NC', 'NY', 'TX',
       'NV', 'ME', 'MI'], dtype=object)

### Export the data frames as new CSV files for further transformation in a second Jupyter notebook.

In [9]:
# Write both data frames out for the follow-up notebook.
# NOTE(review): to_csv is called with its default index=True, so the current
# index is written as an extra leading column next to the existing
# 'Unnamed: 0' one; confirm the second notebook expects that layout.
exports = {
    'Resources/unemployment.csv': unemployment_df,
    'Resources/presidential.csv': presidential_results_df,
}
for csv_path, frame in exports.items():
    frame.to_csv(csv_path)

##### The exercise continues in a second Jupyter notebook, for ease of use, after the states have been mapped.