# Merge All Data to Prepare for Analysis

## Purpose: Merge the following data in preparation for analysis
* "Base" (Daily Google Mobility + COVID-19) 
* "VDE" (Locality Voter Preferences in 2020)
* "NOAA" (Daily Precipitation)
* "Census" (Locality 2019 Population Estimates)

## Dependencies

In [50]:
import pandas as pd

## For Multiple Outputs 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
## Google Mobility & COVID-19 (daily)
base = pd.read_csv("data/build/build1_base.csv")

base.head()

Unnamed: 0,Report Date,FIPS,Locality,VDH Health District,Total Cases,Hospitalizations,Deaths,country_region_code,country_region,sub_region_1,...,metro_area,iso_3166_2_code,census_fips_code,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,2020-03-17,51001,Accomack,Eastern Shore,0,0,0,US,United States,Virginia,...,,,51001.0,2020-03-17,-8.0,11.0,,,-15.0,5.0
1,2020-03-17,51003,Albemarle,Thomas Jefferson,0,0,0,US,United States,Virginia,...,,,51003.0,2020-03-17,-22.0,9.0,,-16.0,-38.0,14.0
2,2020-03-17,51005,Alleghany,Alleghany,0,0,0,US,United States,Virginia,...,,,51005.0,2020-03-17,-11.0,,,8.0,-12.0,
3,2020-03-17,51007,Amelia,Piedmont,0,0,0,US,United States,Virginia,...,,,51007.0,2020-03-17,-21.0,6.0,,,-18.0,
4,2020-03-17,51009,Amherst,Central Virginia,0,0,0,US,United States,Virginia,...,,,51009.0,2020-03-17,3.0,22.0,,,-13.0,6.0


In [6]:
## VDE - Voter Preferences in 2020 Election (county)
vde = pd.read_csv("data/build/build2_locality.csv")

vde.head()

Unnamed: 0,Locality,P_Democrat,P_Republican,S_Democrat,S_Republican
0,ACCOMACK COUNTY,0,1,0,1
1,ALBEMARLE COUNTY,1,0,1,0
2,ALEXANDRIA CITY,1,0,1,0
3,ALLEGHANY COUNTY,0,1,0,1
4,AMELIA COUNTY,0,1,0,1


In [36]:
## NOAA - Precipitation (daily)
noaa = pd.read_csv("data/build/build3_precipitation.csv")

noaa.head()

Unnamed: 0,COUNTY,FIPS,DATE,PRCP
0,Amelia County,51007,2020-03-17,0.0
1,Amelia County,51007,2020-03-18,0.0
2,Amelia County,51007,2020-03-19,0.095
3,Amelia County,51007,2020-03-20,0.0
4,Amelia County,51007,2020-03-21,0.145


In [35]:
## Census - Estimated 2019 population & 2018 5-year estimates of income (county)
census = pd.read_csv('data/build/build4_census.csv')

census.head()

Unnamed: 0,GEO_ID,NAME,DP03_0062E,DP03_0063E,FIPS,Locality,MEDIAN_INCOME,MEAN_INCOME,2019
0,0500000US51001,"Accomack County, Virginia",43210,57996,51001,Accomack County,43210,57996,32316
1,0500000US51003,"Albemarle County, Virginia",75394,107948,51003,Albemarle County,75394,107948,109330
2,0500000US51005,"Alleghany County, Virginia",47794,60513,51005,Alleghany County,47794,60513,14860
3,0500000US51007,"Amelia County, Virginia",58526,74185,51007,Amelia County,58526,74185,13145
4,0500000US51009,"Amherst County, Virginia",49170,61676,51009,Amherst County,49170,61676,31605


In [41]:
## Standardize VDE "LOCALITY"
vde['LOCALITY'] = vde['Locality'].str.replace(" COUNTY", "")
vde['LOCALITY'] = vde['LOCALITY'].str.replace('&', 'and')

vde.head()

Unnamed: 0,Locality,P_Democrat,P_Republican,S_Democrat,S_Republican,Locality_std,LOCALITY
0,ACCOMACK COUNTY,0,1,0,1,ACCOMACK,ACCOMACK
1,ALBEMARLE COUNTY,1,0,1,0,ALBEMARLE,ALBEMARLE
2,ALEXANDRIA CITY,1,0,1,0,ALEXANDRIA CITY,ALEXANDRIA CITY
3,ALLEGHANY COUNTY,0,1,0,1,ALLEGHANY,ALLEGHANY
4,AMELIA COUNTY,0,1,0,1,AMELIA,AMELIA


In [46]:
## Standardize Base "LOCALITY"

base["LOCALITY"] = base["Locality"].str.upper()

## [NOTE - CONSIDER CONVERTING THIS TO FUNCTION]
base.loc[base['LOCALITY'] == 'ALEXANDRIA', 'LOCALITY'] = 'ALEXANDRIA CITY'
base.loc[base['LOCALITY'] == 'BRISTOL', 'LOCALITY'] = 'BRISTOL CITY'
base.loc[base['LOCALITY'] == 'CHARLOTTESVILLE', 'LOCALITY'] = 'CHARLOTTESVILLE CITY'
base.loc[base['LOCALITY'] == 'CHESAPEAKE', 'LOCALITY'] = 'CHESAPEAKE CITY'
base.loc[base['LOCALITY'] == 'COLONIAL HEIGHTS', 'LOCALITY'] = 'COLONIAL HEIGHTS CITY'
base.loc[base['LOCALITY'] == 'COVINGTON', 'LOCALITY'] = 'COVINGTON CITY'
base.loc[base['LOCALITY'] == 'DANVILLE', 'LOCALITY'] = 'DANVILLE CITY'
base.loc[base['LOCALITY'] == 'EMPORIA', 'LOCALITY'] = 'EMPORIA CITY'
base.loc[base['LOCALITY'] == 'FAIRFAX', 'LOCALITY'] = 'FAIRFAX CITY'
base.loc[base['LOCALITY'] == 'FALLS CHURCH', 'LOCALITY'] = 'FALLS CHURCH CITY'
base.loc[base['LOCALITY'] == 'FRANKLIN', 'LOCALITY'] = 'FRANKLIN CITY'
base.loc[base['LOCALITY'] == 'FREDERICKSBURG', 'LOCALITY'] = 'FREDERICKSBURG CITY'
base.loc[base['LOCALITY'] == 'GALAX', 'LOCALITY'] = 'GALAX CITY'
base.loc[base['LOCALITY'] == 'HAMPTON', 'LOCALITY'] = 'HAMPTON CITY'
base.loc[base['LOCALITY'] == 'HARRISONBURG', 'LOCALITY'] = 'HARRISONBURG CITY'
base.loc[base['LOCALITY'] == 'HOPEWELL', 'LOCALITY'] = 'HOPEWELL CITY'
base.loc[base['LOCALITY'] == 'KING AND QUEEN', 'LOCALITY'] = 'KING AND QUEEN CITY'
base.loc[base['LOCALITY'] == 'LEXINGTON', 'LOCALITY'] = 'LEXINGTON CITY'
base.loc[base['LOCALITY'] == 'LYNCHBURG', 'LOCALITY'] = 'LYNCHBURG CITY'
base.loc[base['LOCALITY'] == 'MANASSAS PARK', 'LOCALITY'] = 'MANASSAS PARK CITY'
base.loc[base['LOCALITY'] == 'MARTINSVILLE', 'LOCALITY'] = 'MARTINSVILLE CITY'
base.loc[base['LOCALITY'] == 'NEWPORT NEWS', 'LOCALITY'] = 'NEWPORT NEWS CITY'
base.loc[base['LOCALITY'] == 'NORFOLK', 'LOCALITY'] = 'NORFOLK CITY'
base.loc[base['LOCALITY'] == 'NORTON', 'LOCALITY'] = 'NORTON CITY'
base.loc[base['LOCALITY'] == 'PETERSBURG', 'LOCALITY'] = 'PETERSBURG CITY'
base.loc[base['LOCALITY'] == 'POQUOSON', 'LOCALITY'] = 'POQUOSON CITY'
base.loc[base['LOCALITY'] == 'PORTSMOUTH', 'LOCALITY'] = 'PORTSMOUTH CITY'
base.loc[base['LOCALITY'] == 'RADFORD', 'LOCALITY'] = 'RADFORD CITY'
base.loc[base['LOCALITY'] == 'RICHMOND', 'LOCALITY'] = 'RICHMOND CITY'
base.loc[base['LOCALITY'] == 'ROANOKE', 'LOCALITY'] = 'ROANOKE CITY'
base.loc[base['LOCALITY'] == 'SALEM', 'LOCALITY'] = 'SALEM CITY'
base.loc[base['LOCALITY'] == 'STAUNTON', 'LOCALITY'] = 'STAUNTON CITY'
base.loc[base['LOCALITY'] == 'SUFFOLK', 'LOCALITY'] = 'SUFFOLK CITY'
base.loc[base['LOCALITY'] == 'VIRGINIA BEACH', 'LOCALITY'] = 'VIRGINIA BEACH CITY'
base.loc[base['LOCALITY'] == 'WAYNESBORO', 'LOCALITY'] = 'WAYNESBORO CITY'
base.loc[base['LOCALITY'] == 'WILLIAMSBURG', 'LOCALITY'] = 'WILLIAMSBURG CITY'
base.loc[base['LOCALITY'] == 'WINCHESTER', 'LOCALITY'] = 'WINCHESTER CITY'

base.head()

Unnamed: 0,Report Date,FIPS,Locality,VDH Health District,Total Cases,Hospitalizations,Deaths,country_region_code,country_region,sub_region_1,...,iso_3166_2_code,census_fips_code,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline,LOCALITY
0,2020-03-17,51001,Accomack,Eastern Shore,0,0,0,US,United States,Virginia,...,,51001.0,2020-03-17,-8.0,11.0,,,-15.0,5.0,ACCOMACK
1,2020-03-17,51003,Albemarle,Thomas Jefferson,0,0,0,US,United States,Virginia,...,,51003.0,2020-03-17,-22.0,9.0,,-16.0,-38.0,14.0,ALBEMARLE
2,2020-03-17,51005,Alleghany,Alleghany,0,0,0,US,United States,Virginia,...,,51005.0,2020-03-17,-11.0,,,8.0,-12.0,,ALLEGHANY
3,2020-03-17,51007,Amelia,Piedmont,0,0,0,US,United States,Virginia,...,,51007.0,2020-03-17,-21.0,6.0,,,-18.0,,AMELIA
4,2020-03-17,51009,Amherst,Central Virginia,0,0,0,US,United States,Virginia,...,,51009.0,2020-03-17,3.0,22.0,,,-13.0,6.0,AMHERST


In [51]:
## Pre-Merge dimensions
base.shape

## Inner Merge
base1 = base.merge(vde, how= "inner", on= "LOCALITY")

## Post-merge dimensions (Should be identical row count)
base1.shape

(33649, 22)

(33396, 28)