## Read in Data

In [1]:
# Import Statements
import pandas as pd
import numpy as np

### Master Row Data

In [2]:
# Read in Data

# Price Per Square Feet per [County, City, State, +More]
ppsft = pd.read_csv('property_value_cost/cols_price_per_sqft.csv', 
                    index_col=0)

# N Trips Ending per [County Name, State]
dest = pd.read_csv('travel_revenue/col_travel_n_times_city_is_dest_with_states.csv')

# Minimum Wage per [State]
min_wg = pd.read_csv('infrastructure_cost/cols_infra_min_wage.csv', 
                     index_col=0)

# Property Taxes per [State]
prp_taxes = pd.read_csv('taxes_cost/18_revenue_state_and_local_summary.csv')

# Natural Disasters by Type per [County Name, State]
dis = pd.read_csv('nat_disasters_cost/cols_disaster_stats.csv')

In [3]:
print(ppsft.shape)
ppsft.head()

(12243, 8)


Unnamed: 0,RegionID,RegionName,State,Metro,CountyName,SizeRank,pct_change,2017-09
0,6181,New York,NY,New York,Queens,1,0.092089,483
1,12447,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,2,0.080575,432
2,17426,Chicago,IL,Chicago,Cook,3,0.046394,173
3,13271,Philadelphia,PA,Philadelphia,Philadelphia,4,0.078478,107
4,40326,Phoenix,AZ,Phoenix,Maricopa,5,0.08265,143


In [4]:
print(dest.shape)
dest.head()

(3139, 3)


Unnamed: 0,dest_full,dest_state,n_trips
0,"Abbeville County, South Carolina",South Carolina,184624.0
1,"Acadia Parish, Louisiana",Louisiana,241374.0
2,"Accomack County, Virginia",Virginia,242825.0
3,"Ada County, Idaho",Idaho,1964790.0
4,"Adair County, Iowa",Iowa,69708.0


In [5]:
print(min_wg.shape)
min_wg.head()

(54, 5)


Unnamed: 0,State,State.Minimum.Wage,Federal.Minimum.Wage,Effective.Minimum.Wage,CPI.Average
2808,Alabama,0.0,7.25,7.25,258.66
2809,Alaska,10.19,7.25,10.19,258.66
2810,Arizona,12.0,7.25,12.0,258.66
2811,Arkansas,10.0,7.25,10.0,258.66
2812,California,13.0,7.25,13.0,258.66


In [6]:
print(prp_taxes.shape)
prp_taxes.head()

(52, 5)


Unnamed: 0,State,Total (millions of dollars),Per Capita (dollars),Percentage of General Revenue,Percentage of Personal Income
0,United States,"$547,039","$1,667",16.6%,3.1%
1,Alabama,2921,596,7.0%,1.4%
2,Alaska,1614,2206,14.0%,3.6%
3,Arizona,8053,1106,14.7%,2.5%
4,Arkansas,2334,774,9.0%,1.8%


In [7]:
print(dis.shape)
dis.head()

(3267, 24)


Unnamed: 0,loc_full,Biological,Chemical,Coastal Storm,Dam/Levee Break,Drought,Earthquake,Fire,Fishing Losses,Flood,...,Other,Severe Ice Storm,Severe Storm(s),Snow,Terrorist,Tornado,Toxic Substances,Tsunami,Typhoon,Volcano
0,"Abbeville County, South Carolina",2.0,,,,1.0,,,,1.0,...,,1.0,2.0,,,1.0,,,,
1,"Acadia Parish, Louisiana",2.0,,2.0,,,,,,3.0,...,1.0,,4.0,,,,,,,
2,"Accomack County, Virginia",2.0,,,,,,,,2.0,...,,,2.0,2.0,,,,,,
3,"Ada County, Idaho",2.0,,,,,,1.0,,2.0,...,,,,,,,,,,
4,"Adair County, Iowa",2.0,,,,,,,,4.0,...,,2.0,6.0,1.0,,,,,,


### Join Tables

In [8]:
# State Abbreviations + State Fips + State Name
states = pd.read_csv('join_tables/StatesFIPSCodes.csv')

In [9]:
print(states.shape)
states.head()

(57, 4)


Unnamed: 0,STATE_FIPS,STUSAB,STATE_NAME,STATENS
0,1,AL,Alabama,1779775
1,2,AK,Alaska,1785533
2,4,AZ,Arizona,1779777
3,5,AR,Arkansas,68085
4,6,CA,California,1779778


## Data Cleaning

### Price Per Square Foot `ppsft` 

In [10]:
ppsft.head(2)

Unnamed: 0,RegionID,RegionName,State,Metro,CountyName,SizeRank,pct_change,2017-09
0,6181,New York,NY,New York,Queens,1,0.092089,483
1,12447,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,2,0.080575,432


In [11]:
ppsft.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12243 entries, 0 to 12242
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   RegionID    12243 non-null  int64  
 1   RegionName  12243 non-null  object 
 2   State       12243 non-null  object 
 3   Metro       11297 non-null  object 
 4   CountyName  12243 non-null  object 
 5   SizeRank    12243 non-null  int64  
 6   pct_change  12242 non-null  float64
 7   2017-09     12243 non-null  int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 860.8+ KB


In [12]:
# Stick Together loc_full Column
loc_full = []
for i, row in ppsft.iterrows():
    loc_full.append(row[4] + ', ' + row[2])

ppsft.insert(8, 'loc_full', loc_full)

In [13]:
ppsft.head()

Unnamed: 0,RegionID,RegionName,State,Metro,CountyName,SizeRank,pct_change,2017-09,loc_full
0,6181,New York,NY,New York,Queens,1,0.092089,483,"Queens, NY"
1,12447,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,2,0.080575,432,"Los Angeles, CA"
2,17426,Chicago,IL,Chicago,Cook,3,0.046394,173,"Cook, IL"
3,13271,Philadelphia,PA,Philadelphia,Philadelphia,4,0.078478,107,"Philadelphia, PA"
4,40326,Phoenix,AZ,Phoenix,Maricopa,5,0.08265,143,"Maricopa, AZ"


### Destination Trip Data `dest`

In [14]:
dest.head(2)

Unnamed: 0,dest_full,dest_state,n_trips
0,"Abbeville County, South Carolina",South Carolina,184624.0
1,"Acadia Parish, Louisiana",Louisiana,241374.0


In [15]:
states.head()

Unnamed: 0,STATE_FIPS,STUSAB,STATE_NAME,STATENS
0,1,AL,Alabama,1779775
1,2,AK,Alaska,1785533
2,4,AZ,Arizona,1779777
3,5,AR,Arkansas,68085
4,6,CA,California,1779778


In [16]:
dest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3139 entries, 0 to 3138
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   dest_full   3139 non-null   object 
 1   dest_state  3139 non-null   object 
 2   n_trips     3139 non-null   float64
dtypes: float64(1), object(2)
memory usage: 73.7+ KB


In [17]:
# Get State Column
state = []
for i, row in dest.iterrows():
    state_row = states[row[1] == states['STATE_NAME']]
    state.append(state_row['STUSAB'].values[0])

dest.insert(3, 'state', state)

In [18]:
dest.head()

Unnamed: 0,dest_full,dest_state,n_trips,state
0,"Abbeville County, South Carolina",South Carolina,184624.0,SC
1,"Acadia Parish, Louisiana",Louisiana,241374.0,LA
2,"Accomack County, Virginia",Virginia,242825.0,VA
3,"Ada County, Idaho",Idaho,1964790.0,ID
4,"Adair County, Iowa",Iowa,69708.0,IA


### Minimum Wage `min_wg` 

In [19]:
min_wg.head(2)

Unnamed: 0,State,State.Minimum.Wage,Federal.Minimum.Wage,Effective.Minimum.Wage,CPI.Average
2808,Alabama,0.0,7.25,7.25,258.66
2809,Alaska,10.19,7.25,10.19,258.66


In [20]:
min_wg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54 entries, 2808 to 2861
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   54 non-null     object 
 1   State.Minimum.Wage      54 non-null     float64
 2   Federal.Minimum.Wage    54 non-null     float64
 3   Effective.Minimum.Wage  54 non-null     float64
 4   CPI.Average             54 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.5+ KB


In [21]:
# Get State Column
state = []
for i, row in min_wg.iterrows():
    state_row = states[row[0] == states['STATE_NAME']]
    state.append(state_row['STUSAB'].values[0])

min_wg.insert(5, 'state', state)

In [22]:
min_wg.head()

Unnamed: 0,State,State.Minimum.Wage,Federal.Minimum.Wage,Effective.Minimum.Wage,CPI.Average,state
2808,Alabama,0.0,7.25,7.25,258.66,AL
2809,Alaska,10.19,7.25,10.19,258.66,AK
2810,Arizona,12.0,7.25,12.0,258.66,AZ
2811,Arkansas,10.0,7.25,10.0,258.66,AR
2812,California,13.0,7.25,13.0,258.66,CA


### Property Taxes `prp_taxes` 

In [23]:
prp_taxes.head(2)

Unnamed: 0,State,Total (millions of dollars),Per Capita (dollars),Percentage of General Revenue,Percentage of Personal Income
0,United States,"$547,039","$1,667",16.6%,3.1%
1,Alabama,2921,596,7.0%,1.4%


In [24]:
prp_taxes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   State                          52 non-null     object
 1   Total (millions of dollars)    52 non-null     object
 2   Per Capita (dollars)           52 non-null     object
 3   Percentage of General Revenue  52 non-null     object
 4   Percentage of Personal Income  52 non-null     object
dtypes: object(5)
memory usage: 2.2+ KB


In [25]:
# Get State Column
state = []
for i, row in prp_taxes.iterrows():
    state_row = states[row[0] == states['STATE_NAME']]
    try:
        state.append(state_row['STUSAB'].values[0])
    except:
        state.append(np.nan)
        
prp_taxes.insert(5, 'state', state)

In [26]:
prp_taxes.head(2)

Unnamed: 0,State,Total (millions of dollars),Per Capita (dollars),Percentage of General Revenue,Percentage of Personal Income,state
0,United States,"$547,039","$1,667",16.6%,3.1%,
1,Alabama,2921,596,7.0%,1.4%,AL


### Natural Disasters `dis` 

In [27]:
dis.head(2)

Unnamed: 0,loc_full,Biological,Chemical,Coastal Storm,Dam/Levee Break,Drought,Earthquake,Fire,Fishing Losses,Flood,...,Other,Severe Ice Storm,Severe Storm(s),Snow,Terrorist,Tornado,Toxic Substances,Tsunami,Typhoon,Volcano
0,"Abbeville County, South Carolina",2.0,,,,1.0,,,,1.0,...,,1.0,2.0,,,1.0,,,,
1,"Acadia Parish, Louisiana",2.0,,2.0,,,,,,3.0,...,1.0,,4.0,,,,,,,


In [28]:
dis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3267 entries, 0 to 3266
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loc_full          3267 non-null   object 
 1   Biological        3249 non-null   float64
 2   Chemical          9 non-null      float64
 3   Coastal Storm     474 non-null    float64
 4   Dam/Levee Break   11 non-null     float64
 5   Drought           1193 non-null   float64
 6   Earthquake        168 non-null    float64
 7   Fire              975 non-null    float64
 8   Fishing Losses    34 non-null     float64
 9   Flood             2705 non-null   float64
 10  Freezing          171 non-null    float64
 11  Human Cause       7 non-null      float64
 12  Hurricane         2899 non-null   float64
 13  Mud/Landslide     28 non-null     float64
 14  Other             224 non-null    float64
 15  Severe Ice Storm  995 non-null    float64
 16  Severe Storm(s)   3042 non-null   float64


In [29]:
# Insert State Column
state = [s.split(',')[1].strip() for s in dis['loc_full']]
dis.insert(24, 'state', state)

In [30]:
dis.head()

Unnamed: 0,loc_full,Biological,Chemical,Coastal Storm,Dam/Levee Break,Drought,Earthquake,Fire,Fishing Losses,Flood,...,Severe Ice Storm,Severe Storm(s),Snow,Terrorist,Tornado,Toxic Substances,Tsunami,Typhoon,Volcano,state
0,"Abbeville County, South Carolina",2.0,,,,1.0,,,,1.0,...,1.0,2.0,,,1.0,,,,,South Carolina
1,"Acadia Parish, Louisiana",2.0,,2.0,,,,,,3.0,...,,4.0,,,,,,,,Louisiana
2,"Accomack County, Virginia",2.0,,,,,,,,2.0,...,,2.0,2.0,,,,,,,Virginia
3,"Ada County, Idaho",2.0,,,,,,1.0,,2.0,...,,,,,,,,,,Idaho
4,"Adair County, Iowa",2.0,,,,,,,,4.0,...,2.0,6.0,1.0,,,,,,,Iowa


## The Join Begins!

In [31]:
ppsft.head()

Unnamed: 0,RegionID,RegionName,State,Metro,CountyName,SizeRank,pct_change,2017-09,loc_full
0,6181,New York,NY,New York,Queens,1,0.092089,483,"Queens, NY"
1,12447,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,2,0.080575,432,"Los Angeles, CA"
2,17426,Chicago,IL,Chicago,Cook,3,0.046394,173,"Cook, IL"
3,13271,Philadelphia,PA,Philadelphia,Philadelphia,4,0.078478,107,"Philadelphia, PA"
4,40326,Phoenix,AZ,Phoenix,Maricopa,5,0.08265,143,"Maricopa, AZ"


In [32]:
dest.head()

Unnamed: 0,dest_full,dest_state,n_trips,state
0,"Abbeville County, South Carolina",South Carolina,184624.0,SC
1,"Acadia Parish, Louisiana",Louisiana,241374.0,LA
2,"Accomack County, Virginia",Virginia,242825.0,VA
3,"Ada County, Idaho",Idaho,1964790.0,ID
4,"Adair County, Iowa",Iowa,69708.0,IA


In [35]:
merge1 = ppsft.merge(right=dest, how='outer', left_on='loc_full', right_on='dest_full')

In [37]:
merge1 = merge1.drop('dest_full', axis=1)

In [42]:
merge2 = merge1.merge(right=min_wg, how='outer', on='state')

In [43]:
merge2.head()

Unnamed: 0,RegionID,RegionName,State_x,Metro,CountyName,SizeRank,pct_change,2017-09,loc_full,dest_state,n_trips,state,State_y,State.Minimum.Wage,Federal.Minimum.Wage,Effective.Minimum.Wage,CPI.Average
0,6181.0,New York,NY,New York,Queens,1.0,0.092089,483.0,"Queens, NY",,,,,,,,
1,12447.0,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,2.0,0.080575,432.0,"Los Angeles, CA",,,,,,,,
2,46298.0,Long Beach,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,34.0,0.068866,425.0,"Los Angeles, CA",,,,,,,,
3,45457.0,Glendale,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,122.0,0.059937,483.0,"Los Angeles, CA",,,,,,,,
4,5534.0,Lancaster,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,138.0,0.119016,149.0,"Los Angeles, CA",,,,,,,,


In [44]:
merge3 = merge2.merge(right=prp_taxes, how='outer', on='state')

In [45]:
merge3.head()

Unnamed: 0,RegionID,RegionName,State_x,Metro,CountyName,SizeRank,pct_change,2017-09,loc_full,dest_state,...,State_y,State.Minimum.Wage,Federal.Minimum.Wage,Effective.Minimum.Wage,CPI.Average,State,Total (millions of dollars),Per Capita (dollars),Percentage of General Revenue,Percentage of Personal Income
0,6181.0,New York,NY,New York,Queens,1.0,0.092089,483.0,"Queens, NY",,...,,,,,,United States,"$547,039","$1,667",16.6%,3.1%
1,12447.0,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,2.0,0.080575,432.0,"Los Angeles, CA",,...,,,,,,United States,"$547,039","$1,667",16.6%,3.1%
2,46298.0,Long Beach,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,34.0,0.068866,425.0,"Los Angeles, CA",,...,,,,,,United States,"$547,039","$1,667",16.6%,3.1%
3,45457.0,Glendale,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,122.0,0.059937,483.0,"Los Angeles, CA",,...,,,,,,United States,"$547,039","$1,667",16.6%,3.1%
4,5534.0,Lancaster,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,138.0,0.119016,149.0,"Los Angeles, CA",,...,,,,,,United States,"$547,039","$1,667",16.6%,3.1%


In [47]:
dis.head()

Unnamed: 0,loc_full,Biological,Chemical,Coastal Storm,Dam/Levee Break,Drought,Earthquake,Fire,Fishing Losses,Flood,...,Severe Ice Storm,Severe Storm(s),Snow,Terrorist,Tornado,Toxic Substances,Tsunami,Typhoon,Volcano,state
0,"Abbeville County, South Carolina",2.0,,,,1.0,,,,1.0,...,1.0,2.0,,,1.0,,,,,South Carolina
1,"Acadia Parish, Louisiana",2.0,,2.0,,,,,,3.0,...,,4.0,,,,,,,,Louisiana
2,"Accomack County, Virginia",2.0,,,,,,,,2.0,...,,2.0,2.0,,,,,,,Virginia
3,"Ada County, Idaho",2.0,,,,,,1.0,,2.0,...,,,,,,,,,,Idaho
4,"Adair County, Iowa",2.0,,,,,,,,4.0,...,2.0,6.0,1.0,,,,,,,Iowa


In [48]:
merge4 = merge3.merge(right=dis, how='outer', on='loc_full')

In [49]:
merge4

Unnamed: 0,RegionID,RegionName,State_x,Metro,CountyName,SizeRank,pct_change,2017-09,loc_full,dest_state,...,Severe Ice Storm,Severe Storm(s),Snow,Terrorist,Tornado,Toxic Substances,Tsunami,Typhoon,Volcano,state_y
0,6181.0,New York,NY,New York,Queens,1.0,0.092089,483.0,"Queens, NY",,...,,,,,,,,,,
1,12447.0,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,2.0,0.080575,432.0,"Los Angeles, CA",,...,,,,,,,,,,
2,46298.0,Long Beach,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,34.0,0.068866,425.0,"Los Angeles, CA",,...,,,,,,,,,,
3,45457.0,Glendale,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,122.0,0.059937,483.0,"Los Angeles, CA",,...,,,,,,,,,,
4,5534.0,Lancaster,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,138.0,0.119016,149.0,"Los Angeles, CA",,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18647,,,,,,,,,"Yuma County, Arizona",,...,,1.0,,,,,,,,Arizona
18648,,,,,,,,,"Yuma County, Colorado",,...,,1.0,,,1.0,,,,,Colorado
18649,,,,,,,,,"Zapata County, Texas",,...,,1.0,,,,,,,,Texas
18650,,,,,,,,,"Zavala County, Texas",,...,,3.0,,,,,,,,Texas


In [50]:
merge4.to_csv('giant_merge.csv')