#### Notebooks  
- [Data Collection](./01_data_collection.ipynb)
- [Data Cleaning](./02_data_cleaning.ipynb)
- [Data Preprocessing](./03_data_preprocessing.ipynb)
- [EDA Five States](./04_eda_five_states.ipynb)
- [EDA California](./05_eda_ca.ipynb)
- [EDA Florida](./05_eda_fl.ipynb)
- [EDA Illinois](./05_eda_il.ipynb)
- [EDA New York](./05_eda_ny.ipynb)
- [EDA Texas](./05_eda_tx.ipynb)
- [Modeling Five States](./06_modeling_five_states.ipynb)
- [Modeling California](./07_modeling_ca.ipynb)
- [Modeling Florida](./07_modeling_fl.ipynb)
- [Modeling Illinois](./07_modeling_il.ipynb)
- [Modeling New York](./07_modeling_ny.ipynb)
- [Modeling Texas](./07_modeling_tx.ipynb)
- [Conclusions](./08_conclusions.ipynb)

#### This Notebook's Contents  
- [Pulling DP05 From Census API](#Pulling-DP05-From-Census-API)
- [Pulling DP03 From Census API](#Pulling-DP03-From-Census-API)

In [1]:
# Import the required libraries.
import os

import pandas as pd
import requests

# Pulling DP05 From Census API
States: CA (06), FL (12), IL (17), NY (36), TX (48)  
ACS 5-YEAR DEMOGRAPHIC AND HOUSING ESTIMATES   
Survey/Program: American Community Survey   
2018: ACS 5-Year Estimates Data Profiles  
TableID: DP05  

## California: 06

In [2]:
# Base URL for the 2018 ACS 5-Year Data Profiles endpoint of the Census API.
url = 'https://api.census.gov/data/2018/acs/acs5/profile?'

# Query parameters: every DP05 variable plus the area name, for all counties
# in California (state code 06).
params = {
    'get': 'group(DP05),NAME',
    'for': 'county:*',
    'in': 'state:06',
    # Prefer a key supplied through the CENSUS_API_KEY environment variable so
    # it never has to be saved in the notebook; otherwise keep the placeholder.
    'key': os.environ.get('CENSUS_API_KEY', 'YOURKEYHERE')
}

# Make the request. The timeout prevents a stalled connection from hanging
# the kernel, and raise_for_status() stops the run on an HTTP error response
# instead of passing an error body on to the next cell.
res = requests.get(url, params=params, timeout=60)
res.raise_for_status()
# Display the response code.
res

<Response [200]>

In [3]:
# Load the parsed JSON response into a dataframe.
df_ca = pd.DataFrame(data=res.json())
# Preview the top two rows; the first one still holds the column names.
df_ca.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,707,708,709,710,711,712,713,714,715,716
0,NAME,DP05_0031PM,DP05_0032E,DP05_0032M,DP05_0032PE,DP05_0032PM,DP05_0033E,DP05_0033M,DP05_0033PE,DP05_0033PM,...,DP05_0029PEA,DP05_0030MA,DP05_0030EA,DP05_0030PMA,DP05_0030PEA,DP05_0031MA,DP05_0031EA,DP05_0031PEA,state,county
1,"Lake County, California",0.3,94.1,1.1,-888888888,-888888888,64148,-555555555,64148,-888888888,...,,,,,,,,,06,033


In [4]:
# Promote the first row (the variable codes) to the column labels.
df_ca.columns = df_ca.iloc[0, :]

In [5]:
# Discard the header row now that it has been copied into the column labels.
# Re-running this cell removes one more row, so execute it only once.
df_ca = df_ca.iloc[1:]
# Preview the top two rows.
df_ca.head(n=2)

Unnamed: 0,NAME,DP05_0031PM,DP05_0032E,DP05_0032M,DP05_0032PE,DP05_0032PM,DP05_0033E,DP05_0033M,DP05_0033PE,DP05_0033PM,...,DP05_0029PEA,DP05_0030MA,DP05_0030EA,DP05_0030PMA,DP05_0030PEA,DP05_0031MA,DP05_0031EA,DP05_0031PEA,state,county
1,"Lake County, California",0.3,94.1,1.1,-888888888,-888888888,64148,-555555555,64148,-888888888,...,,,,,,,,,6,33
2,"Mariposa County, California",1.1,98.1,4.2,-888888888,-888888888,17540,-555555555,17540,-888888888,...,,,,,,,,,6,43


## Florida: 12

In [6]:
# Base URL for the 2018 ACS 5-Year Data Profiles endpoint of the Census API.
url = 'https://api.census.gov/data/2018/acs/acs5/profile?'

# Query parameters: every DP05 variable plus the area name, for all counties
# in Florida (state code 12).
params = {
    'get': 'group(DP05),NAME',
    'for': 'county:*',
    'in': 'state:12',
    # Prefer a key supplied through the CENSUS_API_KEY environment variable so
    # it never has to be saved in the notebook; otherwise keep the placeholder.
    'key': os.environ.get('CENSUS_API_KEY', 'YOURKEYHERE')
}

# Make the request. The timeout prevents a stalled connection from hanging
# the kernel, and raise_for_status() stops the run on an HTTP error response
# instead of passing an error body on to the next cell.
res = requests.get(url, params=params, timeout=60)
res.raise_for_status()
# Display the response code.
res

<Response [200]>

In [7]:
# Load the parsed JSON response into a dataframe.
df_fl = pd.DataFrame(data=res.json())
# Preview the top two rows; the first one still holds the column names.
df_fl.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,707,708,709,710,711,712,713,714,715,716
0,NAME,DP05_0031PM,DP05_0032E,DP05_0032M,DP05_0032PE,DP05_0032PM,DP05_0033E,DP05_0033M,DP05_0033PE,DP05_0033PM,...,DP05_0029PEA,DP05_0030MA,DP05_0030EA,DP05_0030PMA,DP05_0030PEA,DP05_0031MA,DP05_0031EA,DP05_0031PEA,state,county
1,"Okaloosa County, Florida",0.1,83.2,0.3,-888888888,-888888888,200737,-555555555,200737,-888888888,...,,,,,,,,,12,091


In [8]:
# Promote the first row (the variable codes) to the column labels.
df_fl.columns = df_fl.iloc[0, :]

In [9]:
# Discard the header row now that it has been copied into the column labels.
# Re-running this cell removes one more row, so execute it only once.
df_fl = df_fl.iloc[1:]
# Preview the top two rows.
df_fl.head(n=2)

Unnamed: 0,NAME,DP05_0031PM,DP05_0032E,DP05_0032M,DP05_0032PE,DP05_0032PM,DP05_0033E,DP05_0033M,DP05_0033PE,DP05_0033PM,...,DP05_0029PEA,DP05_0030MA,DP05_0030EA,DP05_0030PMA,DP05_0030PEA,DP05_0031MA,DP05_0031EA,DP05_0031PEA,state,county
1,"Okaloosa County, Florida",0.1,83.2,0.3,-888888888,-888888888,200737,-555555555,200737,-888888888,...,,,,,,,,,12,91
2,"Taylor County, Florida",0.5,88.1,1.8,-888888888,-888888888,22098,-555555555,22098,-888888888,...,,,,,,,,,12,123


## Illinois: 17

In [10]:
# Base URL for the 2018 ACS 5-Year Data Profiles endpoint of the Census API.
url = 'https://api.census.gov/data/2018/acs/acs5/profile?'

# Query parameters: every DP05 variable plus the area name, for all counties
# in Illinois (state code 17).
params = {
    'get': 'group(DP05),NAME',
    'for': 'county:*',
    'in': 'state:17',
    # Prefer a key supplied through the CENSUS_API_KEY environment variable so
    # it never has to be saved in the notebook; otherwise keep the placeholder.
    'key': os.environ.get('CENSUS_API_KEY', 'YOURKEYHERE')
}

# Make the request. The timeout prevents a stalled connection from hanging
# the kernel, and raise_for_status() stops the run on an HTTP error response
# instead of passing an error body on to the next cell.
res = requests.get(url, params=params, timeout=60)
res.raise_for_status()
# Display the response code.
res

<Response [200]>

In [11]:
# Load the parsed JSON response into a dataframe.
df_il = pd.DataFrame(data=res.json())
# Preview the top two rows; the first one still holds the column names.
df_il.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,707,708,709,710,711,712,713,714,715,716
0,NAME,DP05_0031PM,DP05_0032E,DP05_0032M,DP05_0032PE,DP05_0032PM,DP05_0033E,DP05_0033M,DP05_0033PE,DP05_0033PM,...,DP05_0029PEA,DP05_0030MA,DP05_0030EA,DP05_0030PMA,DP05_0030PEA,DP05_0031MA,DP05_0031EA,DP05_0031PEA,state,county
1,"Jersey County, Illinois",1.0,81.3,3.2,-888888888,-888888888,22069,-555555555,22069,-888888888,...,,,,,,,,,17,083


In [12]:
# Promote the first row (the variable codes) to the column labels.
df_il.columns = df_il.iloc[0, :]

In [13]:
# Discard the header row now that it has been copied into the column labels.
# Re-running this cell removes one more row, so execute it only once.
df_il = df_il.iloc[1:]
# Preview the top two rows.
df_il.head(n=2)

Unnamed: 0,NAME,DP05_0031PM,DP05_0032E,DP05_0032M,DP05_0032PE,DP05_0032PM,DP05_0033E,DP05_0033M,DP05_0033PE,DP05_0033PM,...,DP05_0029PEA,DP05_0030MA,DP05_0030EA,DP05_0030PMA,DP05_0030PEA,DP05_0031MA,DP05_0031EA,DP05_0031PEA,state,county
1,"Jersey County, Illinois",1.0,81.3,3.2,-888888888,-888888888,22069,-555555555,22069,-888888888,...,,,,,,,,,17,83
2,"Putnam County, Illinois",1.2,92.6,4.6,-888888888,-888888888,5746,-555555555,5746,-888888888,...,,,,,,,,,17,155


## New York: 36

In [14]:
# Base URL for the 2018 ACS 5-Year Data Profiles endpoint of the Census API.
url = 'https://api.census.gov/data/2018/acs/acs5/profile?'

# Query parameters: every DP05 variable plus the area name, for all counties
# in New York (state code 36).
params = {
    'get': 'group(DP05),NAME',
    'for': 'county:*',
    'in': 'state:36',
    # Prefer a key supplied through the CENSUS_API_KEY environment variable so
    # it never has to be saved in the notebook; otherwise keep the placeholder.
    'key': os.environ.get('CENSUS_API_KEY', 'YOURKEYHERE')
}

# Make the request. The timeout prevents a stalled connection from hanging
# the kernel, and raise_for_status() stops the run on an HTTP error response
# instead of passing an error body on to the next cell.
res = requests.get(url, params=params, timeout=60)
res.raise_for_status()
# Display the response code.
res

<Response [200]>

In [15]:
# Load the parsed JSON response into a dataframe.
df_ny = pd.DataFrame(data=res.json())
# Preview the top two rows; the first one still holds the column names.
df_ny.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,707,708,709,710,711,712,713,714,715,716
0,NAME,DP05_0031PM,DP05_0032E,DP05_0032M,DP05_0032PE,DP05_0032PM,DP05_0033E,DP05_0033M,DP05_0033PE,DP05_0033PM,...,DP05_0029PEA,DP05_0030MA,DP05_0030EA,DP05_0030PMA,DP05_0030PEA,DP05_0031MA,DP05_0031EA,DP05_0031PEA,state,county
1,"Schoharie County, New York",0.3,93.7,1.2,-888888888,-888888888,31364,-555555555,31364,-888888888,...,,,,,,,,,36,095


In [16]:
# Promote the first row (the variable codes) to the column labels.
df_ny.columns = df_ny.iloc[0, :]

In [17]:
# Discard the header row now that it has been copied into the column labels.
# Re-running this cell removes one more row, so execute it only once.
df_ny = df_ny.iloc[1:]
# Preview the top two rows.
df_ny.head(n=2)

Unnamed: 0,NAME,DP05_0031PM,DP05_0032E,DP05_0032M,DP05_0032PE,DP05_0032PM,DP05_0033E,DP05_0033M,DP05_0033PE,DP05_0033PM,...,DP05_0029PEA,DP05_0030MA,DP05_0030EA,DP05_0030PMA,DP05_0030PEA,DP05_0031MA,DP05_0031EA,DP05_0031PEA,state,county
1,"Schoharie County, New York",0.3,93.7,1.2,-888888888,-888888888,31364,-555555555,31364,-888888888,...,,,,,,,,,36,95
2,"Onondaga County, New York",0.1,75.6,0.1,-888888888,-888888888,464242,-555555555,464242,-888888888,...,,,,,,,,,36,67


## Texas: 48

In [18]:
# Base URL for the 2018 ACS 5-Year Data Profiles endpoint of the Census API.
url = 'https://api.census.gov/data/2018/acs/acs5/profile?'

# Query parameters: every DP05 variable plus the area name, for all counties
# in Texas (state code 48).
params = {
    'get': 'group(DP05),NAME',
    'for': 'county:*',
    'in': 'state:48',
    # Prefer a key supplied through the CENSUS_API_KEY environment variable so
    # it never has to be saved in the notebook; otherwise keep the placeholder.
    'key': os.environ.get('CENSUS_API_KEY', 'YOURKEYHERE')
}

# Make the request. The timeout prevents a stalled connection from hanging
# the kernel, and raise_for_status() stops the run on an HTTP error response
# instead of passing an error body on to the next cell.
res = requests.get(url, params=params, timeout=60)
res.raise_for_status()
# Display the response code.
res

<Response [200]>

In [19]:
# Load the parsed JSON response into a dataframe.
df_tx = pd.DataFrame(data=res.json())
# Preview the top two rows; the first one still holds the column names.
df_tx.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,707,708,709,710,711,712,713,714,715,716
0,NAME,DP05_0031PM,DP05_0032E,DP05_0032M,DP05_0032PE,DP05_0032PM,DP05_0033E,DP05_0033M,DP05_0033PE,DP05_0033PM,...,DP05_0029PEA,DP05_0030MA,DP05_0030EA,DP05_0030PMA,DP05_0030PEA,DP05_0031MA,DP05_0031EA,DP05_0031PEA,state,county
1,"Austin County, Texas",0.6,87.0,1.9,-888888888,-888888888,29565,-555555555,29565,-888888888,...,,,,,,,,,48,015


In [20]:
# Promote the first row (the variable codes) to the column labels.
df_tx.columns = df_tx.iloc[0, :]

In [21]:
# Discard the header row now that it has been copied into the column labels.
# Re-running this cell removes one more row, so execute it only once.
df_tx = df_tx.iloc[1:]
# Preview the top two rows.
df_tx.head(n=2)

Unnamed: 0,NAME,DP05_0031PM,DP05_0032E,DP05_0032M,DP05_0032PE,DP05_0032PM,DP05_0033E,DP05_0033M,DP05_0033PE,DP05_0033PM,...,DP05_0029PEA,DP05_0030MA,DP05_0030EA,DP05_0030PMA,DP05_0030PEA,DP05_0031MA,DP05_0031EA,DP05_0031PEA,state,county
1,"Austin County, Texas",0.6,87.0,1.9,-888888888,-888888888,29565,-555555555,29565,-888888888,...,,,,,,,,,48,15
2,"Kenedy County, Texas",20.5,34.7,38.9,-888888888,-888888888,595,181,595,-888888888,...,,,,,,,,,48,261


## Combining States

In [22]:
# Stack the five per-state DP05 dataframes row-wise into a single dataframe.
df = pd.concat(objs=[df_tx, df_ny, df_ca, df_fl, df_il], axis=0)

In [23]:
# Save the combined raw DP05 data to CSV, leaving out the row index.
df.to_csv(path_or_buf='../data/preprocessing/raw_dp05_five_states.csv', index=False)

## Pulling DP05 Headers

In [24]:
# Read the DP05 overlay file; its first data row holds the column descriptions.
header_df = pd.read_csv(filepath_or_buffer='../data/preprocessing/acs5y2018_dp05_data_with_overlays.csv')
# Show the dataframe.
header_df

Unnamed: 0,GEO_ID,NAME,DP05_0031PM,DP05_0032E,DP05_0032M,DP05_0032PE,DP05_0032PM,DP05_0033E,DP05_0033M,DP05_0033PE,...,DP05_0029M,DP05_0029PE,DP05_0029PM,DP05_0030E,DP05_0030M,DP05_0030PE,DP05_0030PM,DP05_0031E,DP05_0031M,DP05_0031PE
0,id,Geographic Area Name,Percent Margin of Error!!SEX AND AGE!!Total po...,Estimate!!SEX AND AGE!!Total population!!65 ye...,Margin of Error!!SEX AND AGE!!Total population...,Percent Estimate!!SEX AND AGE!!Total populatio...,Percent Margin of Error!!SEX AND AGE!!Total po...,Estimate!!RACE!!Total population,Margin of Error!!RACE!!Total population,Percent Estimate!!RACE!!Total population,...,Margin of Error!!SEX AND AGE!!Total population...,Percent Estimate!!SEX AND AGE!!Total populatio...,Percent Margin of Error!!SEX AND AGE!!Total po...,Estimate!!SEX AND AGE!!Total population!!65 ye...,Margin of Error!!SEX AND AGE!!Total population...,Percent Estimate!!SEX AND AGE!!Total populatio...,Percent Margin of Error!!SEX AND AGE!!Total po...,Estimate!!SEX AND AGE!!Total population!!65 ye...,Margin of Error!!SEX AND AGE!!Total population...,Percent Estimate!!SEX AND AGE!!Total populatio...
1,0100000US,United States,0.1,79.3,0.1,(X),(X),322903030,*****,322903030,...,5463,49238581,(X),21781300,3215,44.2,0.1,27457281,3635,55.8


In [25]:
# Drop the GEO_ID column by name. errors='ignore' makes the cell safe to
# re-run: a positional slice would remove the NAME column on a second run.
header_df = header_df.drop(columns='GEO_ID', errors='ignore')

In [26]:
# Save the DP05 headers to CSV.
# NOTE(review): unlike the other exports in this notebook, the row index is
# written as the first column here - confirm downstream readers expect it.
header_df.to_csv(path_or_buf='../data/preprocessing/dp05_headers.csv', index=True)

## Create a dictionary of the columns and their identifiers

In [27]:
# Collect the column codes of the header dataframe into a plain list.
header_cols = header_df.columns.tolist()

In [28]:
# Keep only the first row, which holds the description of each column.
row_one_df = header_df.head(1)
# Show the dataframe.
row_one_df

Unnamed: 0,NAME,DP05_0031PM,DP05_0032E,DP05_0032M,DP05_0032PE,DP05_0032PM,DP05_0033E,DP05_0033M,DP05_0033PE,DP05_0033PM,...,DP05_0029M,DP05_0029PE,DP05_0029PM,DP05_0030E,DP05_0030M,DP05_0030PE,DP05_0030PM,DP05_0031E,DP05_0031M,DP05_0031PE
0,Geographic Area Name,Percent Margin of Error!!SEX AND AGE!!Total po...,Estimate!!SEX AND AGE!!Total population!!65 ye...,Margin of Error!!SEX AND AGE!!Total population...,Percent Estimate!!SEX AND AGE!!Total populatio...,Percent Margin of Error!!SEX AND AGE!!Total po...,Estimate!!RACE!!Total population,Margin of Error!!RACE!!Total population,Percent Estimate!!RACE!!Total population,Percent Margin of Error!!RACE!!Total population,...,Margin of Error!!SEX AND AGE!!Total population...,Percent Estimate!!SEX AND AGE!!Total populatio...,Percent Margin of Error!!SEX AND AGE!!Total po...,Estimate!!SEX AND AGE!!Total population!!65 ye...,Margin of Error!!SEX AND AGE!!Total population...,Percent Estimate!!SEX AND AGE!!Total populatio...,Percent Margin of Error!!SEX AND AGE!!Total po...,Estimate!!SEX AND AGE!!Total population!!65 ye...,Margin of Error!!SEX AND AGE!!Total population...,Percent Estimate!!SEX AND AGE!!Total populatio...


In [29]:
# Pull the single row of descriptions out as a flat Python list
# (.values.tolist() yields a nested list, so take its only element).
descriptions = row_one_df.values.tolist()[0]

# Preview the first five descriptions.
descriptions[:5]

['Geographic Area Name',
 'Percent Margin of Error!!SEX AND AGE!!Total population!!65 years and over!!Female',
 'Estimate!!SEX AND AGE!!Total population!!65 years and over!!Sex ratio (males per 100 females)',
 'Margin of Error!!SEX AND AGE!!Total population!!65 years and over!!Sex ratio (males per 100 females)',
 'Percent Estimate!!SEX AND AGE!!Total population!!65 years and over!!Sex ratio (males per 100 females)']

In [30]:
# Map each column code to its description by pairing the two lists in order.
header_dict = {code: label for code, label in zip(header_cols, descriptions)}

# Preview the first five (code, description) pairs.
list(header_dict.items())[:5]

[('NAME', 'Geographic Area Name'),
 ('DP05_0031PM',
  'Percent Margin of Error!!SEX AND AGE!!Total population!!65 years and over!!Female'),
 ('DP05_0032E',
  'Estimate!!SEX AND AGE!!Total population!!65 years and over!!Sex ratio (males per 100 females)'),
 ('DP05_0032M',
  'Margin of Error!!SEX AND AGE!!Total population!!65 years and over!!Sex ratio (males per 100 females)'),
 ('DP05_0032PE',
  'Percent Estimate!!SEX AND AGE!!Total population!!65 years and over!!Sex ratio (males per 100 females)')]

In [31]:
# Rename the columns of the combined dataframe using the code-to-description
# dictionary; codes without an entry keep their original name. Reassigning
# (rather than inplace=True) matches how the DP03 section does the same step.
df = df.rename(columns=header_dict)

In [32]:
# Drop every column that contains at least one missing value. Reassigning
# (rather than inplace=True) matches how the DP03 section does the same step.
df = df.dropna(axis=1)

In [33]:
# Preview the top two rows of the renamed DP05 dataframe.
df.head(n=2)

Unnamed: 0,Geographic Area Name,Percent Margin of Error!!SEX AND AGE!!Total population!!65 years and over!!Female,Estimate!!SEX AND AGE!!Total population!!65 years and over!!Sex ratio (males per 100 females),Margin of Error!!SEX AND AGE!!Total population!!65 years and over!!Sex ratio (males per 100 females),Percent Estimate!!SEX AND AGE!!Total population!!65 years and over!!Sex ratio (males per 100 females),Percent Margin of Error!!SEX AND AGE!!Total population!!65 years and over!!Sex ratio (males per 100 females),Estimate!!RACE!!Total population,Margin of Error!!RACE!!Total population,Percent Estimate!!RACE!!Total population,Percent Margin of Error!!RACE!!Total population,...,DP05_0004PMA,DP05_0004PEA,DP05_0018PMA,DP05_0018PEA,DP05_0025PMA,DP05_0028PMA,DP05_0028PEA,DP05_0029PMA,state,county
1,"Austin County, Texas",0.6,87.0,1.9,-888888888,-888888888,29565,-555555555,29565,-888888888,...,(X),(X),(X),(X),(X),(X),(X),(X),48,15
2,"Kenedy County, Texas",20.5,34.7,38.9,-888888888,-888888888,595,181,595,-888888888,...,(X),(X),(X),(X),(X),(X),(X),(X),48,261


In [34]:
# Save the DP05 data with descriptive headers to CSV, leaving out the row index.
df.to_csv(path_or_buf='../data/preprocessing/raw_dp05_with_headers_five_states.csv', index=False)

# Pulling DP03 From Census API
States: CA (06), FL (12), IL (17), NY (36), TX (48)  
SELECTED ECONOMIC CHARACTERISTICS  
Survey/Program: American Community Survey   
2018: ACS 5-Year Estimates Data Profiles  
TableID: DP03  

## California: 06

In [35]:
# Base URL for the 2018 ACS 5-Year Data Profiles endpoint of the Census API.
url03 = 'https://api.census.gov/data/2018/acs/acs5/profile?'

# Query parameters: every DP03 variable plus the area name, for all counties
# in California (state code 06).
params03 = {
    'get': 'group(DP03),NAME',
    'for': 'county:*',
    'in': 'state:06',
    # Prefer a key supplied through the CENSUS_API_KEY environment variable so
    # it never has to be saved in the notebook; otherwise keep the placeholder.
    'key': os.environ.get('CENSUS_API_KEY', 'YOURKEYHERE')
}

# Make the request. The timeout prevents a stalled connection from hanging
# the kernel, and raise_for_status() stops the run on an HTTP error response
# instead of passing an error body on to the next cell.
res03 = requests.get(url03, params=params03, timeout=60)
res03.raise_for_status()
# Display the response code.
res03

<Response [200]>

In [36]:
# Load the parsed JSON response into a dataframe.
df03_ca = pd.DataFrame(data=res03.json())
# Preview the top two rows; the first one still holds the column names.
df03_ca.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1091,1092,1093,1094,1095,1096,1097,1098,1099,1100
0,NAME,GEO_ID,DP03_0001E,DP03_0001M,DP03_0001PE,DP03_0001PM,DP03_0002E,DP03_0002M,DP03_0002PE,DP03_0002PM,...,DP03_0136EA,DP03_0136MA,DP03_0136PEA,DP03_0136PMA,DP03_0137MA,DP03_0137EA,DP03_0137PEA,DP03_0137PMA,state,county
1,"Lake County, California",0500000US06033,52331,199,52331,-888888888,26160,858,50.0,1.6,...,(X),(X),,,(X),(X),,,06,033


In [37]:
# Promote the first row (the variable codes) to the column labels.
df03_ca.columns = df03_ca.iloc[0, :]

In [38]:
# Discard the header row now that it has been copied into the column labels.
# Re-running this cell removes one more row, so execute it only once.
df03_ca = df03_ca.iloc[1:]
# Preview the top two rows.
df03_ca.head(n=2)

Unnamed: 0,NAME,GEO_ID,DP03_0001E,DP03_0001M,DP03_0001PE,DP03_0001PM,DP03_0002E,DP03_0002M,DP03_0002PE,DP03_0002PM,...,DP03_0136EA,DP03_0136MA,DP03_0136PEA,DP03_0136PMA,DP03_0137MA,DP03_0137EA,DP03_0137PEA,DP03_0137PMA,state,county
1,"Lake County, California",0500000US06033,52331,199,52331,-888888888,26160,858,50.0,1.6,...,(X),(X),,,(X),(X),,,6,33
2,"Mariposa County, California",0500000US06043,15019,113,15019,-888888888,7735,395,51.5,2.5,...,(X),(X),,,(X),(X),,,6,43


## Florida: 12

In [39]:
# Base URL for the 2018 ACS 5-Year Data Profiles endpoint of the Census API.
url03 = 'https://api.census.gov/data/2018/acs/acs5/profile?'

# Query parameters: every DP03 variable plus the area name, for all counties
# in Florida (state code 12).
params03 = {
    'get': 'group(DP03),NAME',
    'for': 'county:*',
    'in': 'state:12',
    # Prefer a key supplied through the CENSUS_API_KEY environment variable so
    # it never has to be saved in the notebook; otherwise keep the placeholder.
    'key': os.environ.get('CENSUS_API_KEY', 'YOURKEYHERE')
}

# Make the request. The timeout prevents a stalled connection from hanging
# the kernel, and raise_for_status() stops the run on an HTTP error response
# instead of passing an error body on to the next cell.
res03 = requests.get(url03, params=params03, timeout=60)
res03.raise_for_status()
# Display the response code.
res03

<Response [200]>

In [40]:
# Load the parsed JSON response into a dataframe.
df03_fl = pd.DataFrame(data=res03.json())
# Preview the top two rows; the first one still holds the column names.
df03_fl.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1091,1092,1093,1094,1095,1096,1097,1098,1099,1100
0,NAME,GEO_ID,DP03_0001E,DP03_0001M,DP03_0001PE,DP03_0001PM,DP03_0002E,DP03_0002M,DP03_0002PE,DP03_0002PM,...,DP03_0136EA,DP03_0136MA,DP03_0136PEA,DP03_0136PMA,DP03_0137MA,DP03_0137EA,DP03_0137PEA,DP03_0137PMA,state,county
1,"Okaloosa County, Florida",0500000US12091,160704,276,160704,-888888888,102023,1281,63.5,0.8,...,(X),(X),,,(X),(X),,,12,091


In [41]:
# Promote the first row (the variable codes) to the column labels.
df03_fl.columns = df03_fl.iloc[0, :]

In [42]:
# Discard the header row now that it has been copied into the column labels.
# Re-running this cell removes one more row, so execute it only once.
df03_fl = df03_fl.iloc[1:]
# Preview the top two rows.
df03_fl.head(n=2)

Unnamed: 0,NAME,GEO_ID,DP03_0001E,DP03_0001M,DP03_0001PE,DP03_0001PM,DP03_0002E,DP03_0002M,DP03_0002PE,DP03_0002PM,...,DP03_0136EA,DP03_0136MA,DP03_0136PEA,DP03_0136PMA,DP03_0137MA,DP03_0137EA,DP03_0137PEA,DP03_0137PMA,state,county
1,"Okaloosa County, Florida",0500000US12091,160704,276,160704,-888888888,102023,1281,63.5,0.8,...,(X),(X),,,(X),(X),,,12,91
2,"Taylor County, Florida",0500000US12123,18353,161,18353,-888888888,6918,443,37.7,2.4,...,(X),(X),,,(X),(X),,,12,123


## Illinois: 17

In [43]:
# Base URL for the 2018 ACS 5-Year Data Profiles endpoint of the Census API.
url03 = 'https://api.census.gov/data/2018/acs/acs5/profile?'

# Query parameters: every DP03 variable plus the area name, for all counties
# in Illinois (state code 17).
params03 = {
    'get': 'group(DP03),NAME',
    'for': 'county:*',
    'in': 'state:17',
    # Prefer a key supplied through the CENSUS_API_KEY environment variable so
    # it never has to be saved in the notebook; otherwise keep the placeholder.
    'key': os.environ.get('CENSUS_API_KEY', 'YOURKEYHERE')
}

# Make the request. The timeout prevents a stalled connection from hanging
# the kernel, and raise_for_status() stops the run on an HTTP error response
# instead of passing an error body on to the next cell.
res03 = requests.get(url03, params=params03, timeout=60)
res03.raise_for_status()
# Display the response code.
res03

<Response [200]>

In [44]:
# Load the parsed JSON response into a dataframe.
df03_il = pd.DataFrame(data=res03.json())
# Preview the top two rows; the first one still holds the column names.
df03_il.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1091,1092,1093,1094,1095,1096,1097,1098,1099,1100
0,NAME,GEO_ID,DP03_0001E,DP03_0001M,DP03_0001PE,DP03_0001PM,DP03_0002E,DP03_0002M,DP03_0002PE,DP03_0002PM,...,DP03_0136EA,DP03_0136MA,DP03_0136PEA,DP03_0136PMA,DP03_0137MA,DP03_0137EA,DP03_0137PEA,DP03_0137PMA,state,county
1,"Jersey County, Illinois",0500000US17083,18079,74,18079,-888888888,11127,375,61.5,2.1,...,(X),(X),,,(X),(X),,,17,083


In [45]:
# Promote the first row (the variable codes) to the column labels.
df03_il.columns = df03_il.iloc[0, :]

In [46]:
# Discard the header row now that it has been copied into the column labels.
# Re-running this cell removes one more row, so execute it only once.
df03_il = df03_il.iloc[1:]
# Preview the top two rows.
df03_il.head(n=2)

Unnamed: 0,NAME,GEO_ID,DP03_0001E,DP03_0001M,DP03_0001PE,DP03_0001PM,DP03_0002E,DP03_0002M,DP03_0002PE,DP03_0002PM,...,DP03_0136EA,DP03_0136MA,DP03_0136PEA,DP03_0136PMA,DP03_0137MA,DP03_0137EA,DP03_0137PEA,DP03_0137PMA,state,county
1,"Jersey County, Illinois",0500000US17083,18079,74,18079,-888888888,11127,375,61.5,2.1,...,(X),(X),,,(X),(X),,,17,83
2,"Putnam County, Illinois",0500000US17155,4777,27,4777,-888888888,3107,117,65.0,2.4,...,(X),(X),,,(X),(X),,,17,155


## New York: 36

In [47]:
# Base URL for the 2018 ACS 5-Year Data Profiles endpoint of the Census API.
url03 = 'https://api.census.gov/data/2018/acs/acs5/profile?'

# Query parameters: every DP03 variable plus the area name, for all counties
# in New York (state code 36).
params03 = {
    'get': 'group(DP03),NAME',
    'for': 'county:*',
    'in': 'state:36',
    # Prefer a key supplied through the CENSUS_API_KEY environment variable so
    # it never has to be saved in the notebook; otherwise keep the placeholder.
    'key': os.environ.get('CENSUS_API_KEY', 'YOURKEYHERE')
}

# Make the request. The timeout prevents a stalled connection from hanging
# the kernel, and raise_for_status() stops the run on an HTTP error response
# instead of passing an error body on to the next cell.
res03 = requests.get(url03, params=params03, timeout=60)
res03.raise_for_status()
# Display the response code.
res03

<Response [200]>

In [48]:
# Load the parsed JSON response into a dataframe.
df03_ny = pd.DataFrame(data=res03.json())
# Preview the top two rows; the first one still holds the column names.
df03_ny.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1091,1092,1093,1094,1095,1096,1097,1098,1099,1100
0,NAME,GEO_ID,DP03_0001E,DP03_0001M,DP03_0001PE,DP03_0001PM,DP03_0002E,DP03_0002M,DP03_0002PE,DP03_0002PM,...,DP03_0136EA,DP03_0136MA,DP03_0136PEA,DP03_0136PMA,DP03_0137MA,DP03_0137EA,DP03_0137PEA,DP03_0137PMA,state,county
1,"Schoharie County, New York",0500000US36095,26508,75,26508,-888888888,15390,382,58.1,1.5,...,(X),(X),,,(X),(X),,,36,095


In [49]:
# Promote the first row (the variable codes) to the column labels.
df03_ny.columns = df03_ny.iloc[0, :]

In [50]:
# Discard the header row now that it has been copied into the column labels.
# Re-running this cell removes one more row, so execute it only once.
df03_ny = df03_ny.iloc[1:]
# Preview the top two rows.
df03_ny.head(n=2)

Unnamed: 0,NAME,GEO_ID,DP03_0001E,DP03_0001M,DP03_0001PE,DP03_0001PM,DP03_0002E,DP03_0002M,DP03_0002PE,DP03_0002PM,...,DP03_0136EA,DP03_0136MA,DP03_0136PEA,DP03_0136PMA,DP03_0137MA,DP03_0137EA,DP03_0137PEA,DP03_0137PMA,state,county
1,"Schoharie County, New York",0500000US36095,26508,75,26508,-888888888,15390,382,58.1,1.5,...,(X),(X),,,(X),(X),,,36,95
2,"Onondaga County, New York",0500000US36067,376244,342,376244,-888888888,235381,1621,62.6,0.4,...,(X),(X),,,(X),(X),,,36,67


## Texas: 48

In [51]:
# Base URL for the 2018 ACS 5-Year Data Profiles endpoint of the Census API.
url03 = 'https://api.census.gov/data/2018/acs/acs5/profile?'

# Query parameters: every DP03 variable plus the area name, for all counties
# in Texas (state code 48).
params03 = {
    'get': 'group(DP03),NAME',
    'for': 'county:*',
    'in': 'state:48',
    # Prefer a key supplied through the CENSUS_API_KEY environment variable so
    # it never has to be saved in the notebook; otherwise keep the placeholder.
    'key': os.environ.get('CENSUS_API_KEY', 'YOURKEYHERE')
}

# Make the request. The timeout prevents a stalled connection from hanging
# the kernel, and raise_for_status() stops the run on an HTTP error response
# instead of passing an error body on to the next cell.
res03 = requests.get(url03, params=params03, timeout=60)
res03.raise_for_status()
# Display the response code.
res03

<Response [200]>

In [52]:
# Load the parsed JSON response into a dataframe.
df03_tx = pd.DataFrame(data=res03.json())
# Preview the top two rows; the first one still holds the column names.
df03_tx.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1091,1092,1093,1094,1095,1096,1097,1098,1099,1100
0,NAME,GEO_ID,DP03_0001E,DP03_0001M,DP03_0001PE,DP03_0001PM,DP03_0002E,DP03_0002M,DP03_0002PE,DP03_0002PM,...,DP03_0136EA,DP03_0136MA,DP03_0136PEA,DP03_0136PMA,DP03_0137MA,DP03_0137EA,DP03_0137PEA,DP03_0137PMA,state,county
1,"Austin County, Texas",0500000US48015,23354,108,23354,-888888888,14475,413,62.0,1.8,...,(X),(X),,,(X),(X),,,48,015


In [53]:
# Promote the first row (the variable codes) to the column labels.
df03_tx.columns = df03_tx.iloc[0, :]

In [54]:
# Discard the header row now that it has been copied into the column labels.
# Re-running this cell removes one more row, so execute it only once.
df03_tx = df03_tx.iloc[1:]
# Preview the top two rows.
df03_tx.head(n=2)

Unnamed: 0,NAME,GEO_ID,DP03_0001E,DP03_0001M,DP03_0001PE,DP03_0001PM,DP03_0002E,DP03_0002M,DP03_0002PE,DP03_0002PM,...,DP03_0136EA,DP03_0136MA,DP03_0136PEA,DP03_0136PMA,DP03_0137MA,DP03_0137EA,DP03_0137PEA,DP03_0137PMA,state,county
1,"Austin County, Texas",0500000US48015,23354,108,23354,-888888888,14475,413,62.0,1.8,...,(X),(X),,,(X),(X),,,48,15
2,"Kenedy County, Texas",0500000US48261,428,122,428,-888888888,220,83,51.4,13.4,...,(X),(X),,,(X),(X),,,48,261


## Combining States

In [55]:
# Stack the five per-state DP03 dataframes row-wise into a single dataframe.
df03 = pd.concat(objs=[df03_tx, df03_ny, df03_ca, df03_fl, df03_il], axis=0)

In [56]:
# Save the combined raw DP03 data to CSV, leaving out the row index.
df03.to_csv(path_or_buf='../data/preprocessing/raw_dp03_five_states.csv', index=False)

## Pulling DP03 Headers

In [57]:
# Read the DP03 overlay file; its first data row holds the column descriptions.
header_df03 = pd.read_csv(filepath_or_buffer='../data/preprocessing/acs5y2018_dp03_data_with_overlays.csv')
# Show the dataframe.
header_df03

Unnamed: 0,GEO_ID,NAME,DP03_0001E,DP03_0001M,DP03_0001PE,DP03_0001PM,DP03_0002E,DP03_0002M,DP03_0002PE,DP03_0002PM,...,DP03_0135PE,DP03_0135PM,DP03_0136E,DP03_0136M,DP03_0136PE,DP03_0136PM,DP03_0137E,DP03_0137M,DP03_0137PE,DP03_0137PM
0,id,Geographic Area Name,Estimate!!EMPLOYMENT STATUS!!Population 16 yea...,Margin of Error!!EMPLOYMENT STATUS!!Population...,Percent Estimate!!EMPLOYMENT STATUS!!Populatio...,Percent Margin of Error!!EMPLOYMENT STATUS!!Po...,Estimate!!EMPLOYMENT STATUS!!Population 16 yea...,Margin of Error!!EMPLOYMENT STATUS!!Population...,Percent Estimate!!EMPLOYMENT STATUS!!Populatio...,Percent Margin of Error!!EMPLOYMENT STATUS!!Po...,...,Percent Estimate!!PERCENTAGE OF FAMILIES AND P...,Percent Margin of Error!!PERCENTAGE OF FAMILIE...,Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WH...,Margin of Error!!PERCENTAGE OF FAMILIES AND PE...,Percent Estimate!!PERCENTAGE OF FAMILIES AND P...,Percent Margin of Error!!PERCENTAGE OF FAMILIE...,Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WH...,Margin of Error!!PERCENTAGE OF FAMILIES AND PE...,Percent Estimate!!PERCENTAGE OF FAMILIES AND P...,Percent Margin of Error!!PERCENTAGE OF FAMILIE...
1,0100000US,United States,257754872,16354,257754872,(X),163276329,146596,63.3,0.1,...,9.3,0.1,(X),(X),11.3,0.1,(X),(X),25.6,0.1


In [58]:
# Drop the GEO_ID column by name. errors='ignore' makes the cell safe to
# re-run: a positional slice would remove the NAME column on a second run.
header_df03 = header_df03.drop(columns='GEO_ID', errors='ignore')

In [59]:
# Save the DP03 headers to CSV.
# NOTE(review): unlike the other exports in this notebook, the row index is
# written as the first column here - confirm downstream readers expect it.
header_df03.to_csv(path_or_buf='../data/preprocessing/dp03_headers.csv', index=True)

## Create a dictionary of the columns and their identifiers

In [60]:
# Collect the column codes of the DP03 header dataframe into a plain list.
header_cols = header_df03.columns.tolist()

In [61]:
# Keep only the first row, which holds the description of each column.
row_one_df = header_df03.head(1)
# Show the dataframe.
row_one_df

Unnamed: 0,NAME,DP03_0001E,DP03_0001M,DP03_0001PE,DP03_0001PM,DP03_0002E,DP03_0002M,DP03_0002PE,DP03_0002PM,DP03_0003E,...,DP03_0135PE,DP03_0135PM,DP03_0136E,DP03_0136M,DP03_0136PE,DP03_0136PM,DP03_0137E,DP03_0137M,DP03_0137PE,DP03_0137PM
0,Geographic Area Name,Estimate!!EMPLOYMENT STATUS!!Population 16 yea...,Margin of Error!!EMPLOYMENT STATUS!!Population...,Percent Estimate!!EMPLOYMENT STATUS!!Populatio...,Percent Margin of Error!!EMPLOYMENT STATUS!!Po...,Estimate!!EMPLOYMENT STATUS!!Population 16 yea...,Margin of Error!!EMPLOYMENT STATUS!!Population...,Percent Estimate!!EMPLOYMENT STATUS!!Populatio...,Percent Margin of Error!!EMPLOYMENT STATUS!!Po...,Estimate!!EMPLOYMENT STATUS!!Population 16 yea...,...,Percent Estimate!!PERCENTAGE OF FAMILIES AND P...,Percent Margin of Error!!PERCENTAGE OF FAMILIE...,Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WH...,Margin of Error!!PERCENTAGE OF FAMILIES AND PE...,Percent Estimate!!PERCENTAGE OF FAMILIES AND P...,Percent Margin of Error!!PERCENTAGE OF FAMILIE...,Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WH...,Margin of Error!!PERCENTAGE OF FAMILIES AND PE...,Percent Estimate!!PERCENTAGE OF FAMILIES AND P...,Percent Margin of Error!!PERCENTAGE OF FAMILIE...


In [62]:
# Pull the single row of descriptions out as a flat Python list
# (.values.tolist() yields a nested list, so take its only element).
descriptions = row_one_df.values.tolist()[0]

# Preview the first five descriptions.
descriptions[:5]

['Geographic Area Name',
 'Estimate!!EMPLOYMENT STATUS!!Population 16 years and over',
 'Margin of Error!!EMPLOYMENT STATUS!!Population 16 years and over',
 'Percent Estimate!!EMPLOYMENT STATUS!!Population 16 years and over',
 'Percent Margin of Error!!EMPLOYMENT STATUS!!Population 16 years and over']

In [63]:
# Map each column code to its description by pairing the two lists in order.
header_dict = {code: label for code, label in zip(header_cols, descriptions)}

# Preview the first five (code, description) pairs.
list(header_dict.items())[:5]

[('NAME', 'Geographic Area Name'),
 ('DP03_0001E', 'Estimate!!EMPLOYMENT STATUS!!Population 16 years and over'),
 ('DP03_0001M',
  'Margin of Error!!EMPLOYMENT STATUS!!Population 16 years and over'),
 ('DP03_0001PE',
  'Percent Estimate!!EMPLOYMENT STATUS!!Population 16 years and over'),
 ('DP03_0001PM',
  'Percent Margin of Error!!EMPLOYMENT STATUS!!Population 16 years and over')]

In [64]:
# Rename the columns of the combined DP03 dataframe using the
# code-to-description dictionary.
df03 = df03.rename(header_dict, axis='columns')

In [65]:
# Drop every column that contains at least one missing value.
df03 = df03.dropna(axis='columns', how='any')

In [66]:
# Preview the top three rows of the renamed DP03 dataframe.
df03.head(n=3)

Unnamed: 0,Geographic Area Name,GEO_ID,Estimate!!EMPLOYMENT STATUS!!Population 16 years and over,Margin of Error!!EMPLOYMENT STATUS!!Population 16 years and over,Percent Estimate!!EMPLOYMENT STATUS!!Population 16 years and over,Percent Margin of Error!!EMPLOYMENT STATUS!!Population 16 years and over,Estimate!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force,Margin of Error!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force,Percent Estimate!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force,Percent Margin of Error!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force,...,DP03_0134EA,DP03_0134MA,DP03_0135EA,DP03_0135MA,DP03_0136EA,DP03_0136MA,DP03_0137MA,DP03_0137EA,state,county
1,"Austin County, Texas",0500000US48015,23354,108,23354,-888888888,14475,413,62.0,1.8,...,(X),(X),(X),(X),(X),(X),(X),(X),48,15
2,"Kenedy County, Texas",0500000US48261,428,122,428,-888888888,220,83,51.4,13.4,...,(X),(X),(X),(X),(X),(X),(X),(X),48,261
3,"Nueces County, Texas",0500000US48355,280990,413,280990,-888888888,177352,1636,63.1,0.6,...,(X),(X),(X),(X),(X),(X),(X),(X),48,355


In [67]:
# Save the DP03 data with descriptive headers to CSV, leaving out the row index.
df03.to_csv(path_or_buf='../data/preprocessing/raw_dp03_with_headers_five_states.csv', index=False)