In [1]:
# Import dependencies
import pandas as pd
import numpy as np

# Load the cleaned modelling table. GEOID is parsed as a string rather than a number.
ff_cancer_df = pd.read_csv(
    '../cleaned_data/ML_data_ff_cancer.csv',
    dtype={'GEOID': str},
)

# Preview the first five rows.
ff_cancer_df.head(n=5)

Unnamed: 0,latitude,longitude,GEOID,County_x,nameplate_capacity_MW1,NOx_tons1,SO2_tons1,CO2_tons1,CH4_lbs1,N2O_lbs1,...,SO2_norm4,CO2_norm4,CH4_norm4,N2O_norm4,PM2.5_norm4,SO2_norm5,CO2_norm5,CH4_norm5,N2O_norm5,PM2.5_norm5
0,32.53492,-86.642749,1001,Autauga County,939.4,50.521,3.849,762545.203,28447.358,2844.736,...,6.735791,0.0,600.173051,159.15902,0.093953,0.02429,0.0,156.466804,86.462937,0.01171
1,30.66097,-87.74984,1003,Baldwin County,50.0,450.864,4.58,167490.328,6318.013,631.801,...,0.900625,0.0,0.0,0.0,0.395226,0.008566,1697.107079,52.575239,5.257524,0.228696
2,31.869603,-85.393197,1005,Barbour County,120.5,312.818,0.59,0.0,134642.958,24655.8,...,0.00033,63.970049,2.404243,0.240427,0.004277,8.782955,622.152431,799.696591,164.486235,0.165484
3,32.998644,-87.126439,1007,Bibb County,13.0,16.113,2.219,0.011,12526.086,1644.049,...,0.02429,0.0,156.466804,86.462937,0.01171,0.564922,2597.835515,377.96268,53.46756,0.106137
4,33.980867,-86.567371,1009,Blount County,3.8,2.197,0.009,1006.565,37.975,3.79,...,0.100718,422.446337,82.916872,12.063848,0.062996,0.00142,280.552964,10.830925,1.11207,0.018389


In [2]:
# List the column labels of the loaded frame (the repr elides the middle of long indexes).
ff_cancer_df.columns

Index(['latitude', 'longitude', 'GEOID', 'County_x', 'nameplate_capacity_MW1',
       'NOx_tons1', 'SO2_tons1', 'CO2_tons1', 'CH4_lbs1', 'N2O_lbs1',
       ...
       'SO2_norm4', 'CO2_norm4', 'CH4_norm4', 'N2O_norm4', 'PM2.5_norm4',
       'SO2_norm5', 'CO2_norm5', 'CH4_norm5', 'N2O_norm5', 'PM2.5_norm5'],
      dtype='object', length=132)

In [3]:
# Replace any 0 distance values with 1 for next calculation (it divides by distance**2).
#
# The replaced values are assigned back to the frame. Calling
# .replace(..., inplace=True) on ff_cancer_df[[...]] only changes the temporary
# copy returned by the selection (pandas reports this as SettingWithCopyWarning),
# so ff_cancer_df itself would keep its zeros.
dist_cols = ["dist_from_county1", "dist_from_county2", "dist_from_county3",
             "dist_from_county4", "dist_from_county5"]
ff_cancer_df[dist_cols] = ff_cancer_df[dist_cols].replace(0, 1)

# Sanity check: the smallest value of every distance column should now be non-zero.
ff_cancer_df[dist_cols].min()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


dist_from_county1    1.0
dist_from_county2    1.0
dist_from_county3    1.0
dist_from_county4    1.0
dist_from_county5    1.0
dtype: float64

In [4]:
# For each feature, calculate the average of that feature divided by distance squared:
#     (f1/d1**2 + f2/d2**2 + f3/d3**2 + f4/d4**2 + f5/d5**2) / 5
# where fN is the feature column with suffix N and dN is dist_from_countyN.
#
# The work is done on whole columns instead of looping over rows with iterrows()
# and writing one cell at a time; the terms are added in the same order as before.
slots = range(1, 6)  # numeric suffixes shared by the feature and distance columns

# Column-wise division yields inf/NaN for a zero distance instead of failing,
# so check explicitly and stop before bad values reach the output columns.
assert all(
    (ff_cancer_df["dist_from_county" + str(slot)] != 0).all() for slot in slots
), "dist_from_county* contains 0; replace zero distances before dividing"

# (output column, prefix of the five numbered input columns it is computed from)
raw_features = [
    ("avg_capacity_over_d2", "nameplate_capacity_MW"),
    ("avg_NOx_over_d2", "NOx_tons"),
    ("avg_SO2_over_d2", "SO2_tons"),
    ("avg_CO2_over_d2", "CO2_tons"),
    ("avg_CH4_over_d2", "CH4_lbs"),
    ("avg_N2O_over_d2", "N2O_lbs"),
    ("avg_PM2.5_over_d2", "PM2.5_tons"),
]

for avg_col, prefix in raw_features:
    ff_cancer_df[avg_col] = sum(
        ff_cancer_df[prefix + str(slot)]
        / ff_cancer_df["dist_from_county" + str(slot)] ** 2
        for slot in slots
    ) / 5

In [5]:
# For each *_norm feature, calculate the average of that feature divided by
# distance squared:
#     (f1/d1**2 + f2/d2**2 + f3/d3**2 + f4/d4**2 + f5/d5**2) / 5
# where fN is the <name>_normN column and dN is dist_from_countyN.
#
# The work is done on whole columns instead of looping over rows with iterrows()
# and writing one cell at a time; the terms are added in the same order as before.
norm_slots = range(1, 6)  # numeric suffixes shared by the feature and distance columns

# Column-wise division yields inf/NaN for a zero distance instead of failing,
# so check explicitly and stop before bad values reach the output columns.
assert all(
    (ff_cancer_df["dist_from_county" + str(slot)] != 0).all() for slot in norm_slots
), "dist_from_county* contains 0; replace zero distances before dividing"

# Each name maps <name>_norm1..5 onto the output column avg_<name>_norm_over_d2.
norm_names = ["NOx", "SO2", "CO2", "CH4", "N2O", "PM2.5"]

for name in norm_names:
    ff_cancer_df["avg_" + name + "_norm_over_d2"] = sum(
        ff_cancer_df[name + "_norm" + str(slot)]
        / ff_cancer_df["dist_from_county" + str(slot)] ** 2
        for slot in norm_slots
    ) / 5

In [6]:
# Preview the first five rows, which now include the avg_*_over_d2 columns.
ff_cancer_df.head(n=5)

Unnamed: 0,latitude,longitude,GEOID,County_x,nameplate_capacity_MW1,NOx_tons1,SO2_tons1,CO2_tons1,CH4_lbs1,N2O_lbs1,...,avg_CO2_over_d2,avg_CH4_over_d2,avg_N2O_over_d2,avg_PM2.5_over_d2,avg_NOx_norm_over_d2,avg_SO2_norm_over_d2,avg_CO2_norm_over_d2,avg_CH4_norm_over_d2,avg_N2O_norm_over_d2,avg_PM2.5_norm_over_d2
0,32.53492,-86.642749,1001,Autauga County,939.4,50.521,3.849,762545.203,28447.358,2844.736,...,9090.126982,405.357721,57.630938,0.40477,0.005512,0.008032,7.723091,1.124036,0.292679,0.000477
1,30.66097,-87.74984,1003,Baldwin County,50.0,450.864,4.58,167490.328,6318.013,631.801,...,555.172427,19.5538,1.95538,0.043945,0.005227,0.000431,3.4755,0.126674,0.012667,0.000382
2,31.869603,-85.393197,1005,Barbour County,120.5,312.818,0.59,0.0,134642.958,24655.8,...,5.282991,35.93527,6.714078,0.001799,0.001069,0.000666,0.060757,0.308435,0.057802,3.1e-05
3,32.998644,-87.126439,1007,Bibb County,13.0,16.113,2.219,0.011,12526.086,1644.049,...,1157.221806,111.274622,15.659547,0.051646,0.000896,0.000105,0.894578,0.282251,0.044297,4.2e-05
4,33.980867,-86.567371,1009,Blount County,3.8,2.197,0.009,1006.565,37.975,3.79,...,3172.217969,659.893056,95.881832,0.032135,0.000698,6.5e-05,1.383653,0.245878,0.035385,3.3e-05


In [7]:
# Write the full frame (original columns plus the avg_* columns) to CSV,
# leaving out the row index.
ff_cancer_df.to_csv(
    '../cleaned_data/ML_data_ff_cancer_w_avgs.csv',
    index=False,
)

In [8]:
# Build the plotting frame by dropping the county-name columns, the numbered
# measurement and distance columns, and the fuel_type* columns.
# All other columns are kept; ff_cancer_df itself is not modified.
data_for_plot = ff_cancer_df.drop(
    columns=[
        # county name columns
        'County_x', 'County_y',
        # numbered measurement + distance columns, one group per suffix
        'nameplate_capacity_MW1', 'NOx_tons1', 'SO2_tons1', 'CO2_tons1',
        'CH4_lbs1', 'N2O_lbs1', 'PM2.5_tons1', 'dist_from_county1',
        'nameplate_capacity_MW2', 'NOx_tons2', 'SO2_tons2', 'CO2_tons2',
        'CH4_lbs2', 'N2O_lbs2', 'PM2.5_tons2', 'dist_from_county2',
        'nameplate_capacity_MW3', 'NOx_tons3', 'SO2_tons3', 'CO2_tons3',
        'CH4_lbs3', 'N2O_lbs3', 'PM2.5_tons3', 'dist_from_county3',
        'nameplate_capacity_MW4', 'NOx_tons4', 'SO2_tons4', 'CO2_tons4',
        'CH4_lbs4', 'N2O_lbs4', 'PM2.5_tons4', 'dist_from_county4',
        'nameplate_capacity_MW5', 'NOx_tons5', 'SO2_tons5', 'CO2_tons5',
        'CH4_lbs5', 'N2O_lbs5', 'PM2.5_tons5', 'dist_from_county5',
        # fuel_type columns, one group per suffix
        'fuel_type1_Biomass', 'fuel_type1_Coal', 'fuel_type1_Gas',
        'fuel_type1_Oil', 'fuel_type1_Other Fossil',
        'fuel_type2_Biomass', 'fuel_type2_Coal', 'fuel_type2_Gas',
        'fuel_type2_Oil', 'fuel_type2_Other Fossil',
        'fuel_type3_Biomass', 'fuel_type3_Coal', 'fuel_type3_Gas',
        'fuel_type3_Oil', 'fuel_type3_Other Fossil',
        'fuel_type4_Biomass', 'fuel_type4_Coal', 'fuel_type4_Gas',
        'fuel_type4_Oil', 'fuel_type4_Other Fossil',
        'fuel_type5_Biomass', 'fuel_type5_Coal', 'fuel_type5_Gas',
        'fuel_type5_Oil', 'fuel_type5_Other Fossil',
    ]
)

In [9]:
# Preview the first five rows of the reduced frame.
data_for_plot.head(n=5)

Unnamed: 0,latitude,longitude,GEOID,cardio_death,total_cancer,bladder,brain,breast,breast_insitu,cervix,...,avg_CO2_over_d2,avg_CH4_over_d2,avg_N2O_over_d2,avg_PM2.5_over_d2,avg_NOx_norm_over_d2,avg_SO2_norm_over_d2,avg_CO2_norm_over_d2,avg_CH4_norm_over_d2,avg_N2O_norm_over_d2,avg_PM2.5_norm_over_d2
0,32.53492,-86.642749,1001,263.9,506.4,15.8,7.0,124.4,23.9,0.0,...,9090.126982,405.357721,57.630938,0.40477,0.005512,0.008032,7.723091,1.124036,0.292679,0.000477
1,30.66097,-87.74984,1003,241.9,455.7,23.1,6.5,124.7,25.5,11.0,...,555.172427,19.5538,1.95538,0.043945,0.005227,0.000431,3.4755,0.126674,0.012667,0.000382
2,31.869603,-85.393197,1005,351.2,447.2,13.3,0.0,109.5,22.6,0.0,...,5.282991,35.93527,6.714078,0.001799,0.001069,0.000666,0.060757,0.308435,0.057802,3.1e-05
3,32.998644,-87.126439,1007,323.6,466.1,19.8,0.0,113.9,0.0,0.0,...,1157.221806,111.274622,15.659547,0.051646,0.000896,0.000105,0.894578,0.282251,0.044297,4.2e-05
4,33.980867,-86.567371,1009,283.6,438.7,17.4,6.7,113.6,21.6,0.0,...,3172.217969,659.893056,95.881832,0.032135,0.000698,6.5e-05,1.383653,0.245878,0.035385,3.3e-05


In [10]:
# Write the reduced plotting frame to the Tableau folder as CSV, leaving out the row index.
data_for_plot.to_csv(
    '../Tableau/avg_ff_cancer_data.csv',
    index=False,
)