In [1]:
# Import dependencies
import pandas as pd
import numpy as np

# Load the cleaned fossil-fuel plant / cancer table.
# GEOID is forced to text so pandas does not parse the identifier as a number.
ff_cancer_df = pd.read_csv(
    '../cleaned_data/ML_data_ff_cancer.csv',
    dtype={'GEOID': str},
)

# Preview the first five rows as the cell output.
ff_cancer_df.head()

Unnamed: 0,latitude,longitude,GEOID,County_x,nameplate_capacity_MW1,NOx_tons1,SO2_tons1,CO2_tons1,CH4_lbs1,N2O_lbs1,...,ovary,pancreas,prostate,stomach,thyroid,uterus,pediatric_asthma,adult_asthma,COPD,adult_chronic_lung_disease
0,32.53492,-86.642749,1001,Autauga County,939.4,50.521,3.849,762545.203,28447.358,2844.736,...,12.9,9.7,158.0,0.0,0.0,25.5,1718.0,3906.0,4274.0,8657.0
1,30.66097,-87.74984,1003,Baldwin County,50.0,450.864,4.58,167490.328,6318.013,631.801,...,11.8,10.0,91.8,8.1,3.8,17.6,6393.0,16246.0,19461.0,36546.0
2,31.869603,-85.393197,1005,Barbour County,120.5,312.818,0.59,0.0,134642.958,24655.8,...,0.0,0.0,162.6,0.0,0.0,22.7,664.0,1760.0,2001.0,3855.0
3,32.998644,-87.126439,1007,Bibb County,13.0,16.113,2.219,0.011,12526.086,1644.049,...,0.0,0.0,112.1,0.0,0.0,25.3,584.0,1603.0,1754.0,3433.0
4,33.980867,-86.567371,1009,Blount County,3.8,2.197,0.009,1006.565,37.975,3.79,...,14.1,11.5,96.9,0.0,9.4,23.5,1742.0,4028.0,4638.0,9075.0


In [2]:
# List every column name in the loaded table.
ff_cancer_df.columns

Index(['latitude', 'longitude', 'GEOID', 'County_x', 'nameplate_capacity_MW1',
       'NOx_tons1', 'SO2_tons1', 'CO2_tons1', 'CH4_lbs1', 'N2O_lbs1',
       'PM2.5_tons1', 'dist_from_county1', 'nameplate_capacity_MW2',
       'NOx_tons2', 'SO2_tons2', 'CO2_tons2', 'CH4_lbs2', 'N2O_lbs2',
       'PM2.5_tons2', 'dist_from_county2', 'nameplate_capacity_MW3',
       'NOx_tons3', 'SO2_tons3', 'CO2_tons3', 'CH4_lbs3', 'N2O_lbs3',
       'PM2.5_tons3', 'dist_from_county3', 'nameplate_capacity_MW4',
       'NOx_tons4', 'SO2_tons4', 'CO2_tons4', 'CH4_lbs4', 'N2O_lbs4',
       'PM2.5_tons4', 'dist_from_county4', 'nameplate_capacity_MW5',
       'NOx_tons5', 'SO2_tons5', 'CO2_tons5', 'CH4_lbs5', 'N2O_lbs5',
       'PM2.5_tons5', 'dist_from_county5', 'fuel_type1_Biomass',
       'fuel_type1_Coal', 'fuel_type1_Gas', 'fuel_type1_Oil',
       'fuel_type1_Other Fossil', 'fuel_type2_Biomass', 'fuel_type2_Coal',
       'fuel_type2_Gas', 'fuel_type2_Oil', 'fuel_type2_Other Fossil',
       'fuel_type3_Biom

In [3]:
# Start every distance bin as its own (distinct) empty DataFrame.
(zero_ten,
 ten_twenty,
 twenty_thirty,
 thirty_fourty,
 fourty_fifty,
 fifty_plus) = (pd.DataFrame() for _ in range(6))

In [5]:
# Bin the data by the distance of each of the five plants from the county.
#
# Every (row, plant) pair contributes one copy of the row to the bin that the
# plant's distance falls in, so a row with several plants in the same range
# appears several times in that bin.
#
# Bins (right-closed): <=10, (10, 20], (20, 30], (30, 40], (40, 50], everything
# else.  A missing (NaN) distance matches none of the closed ranges and lands
# in `fifty_plus`.
#
# This is vectorised rather than appending row by row: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop is quadratic and
# loses column dtypes.  The bins are rebuilt from scratch on every run, so
# re-executing the cell cannot stack duplicate rows.
dist_columns = ["dist_from_county" + str(i + 1) for i in range(5)]
bin_upper_edges = [10, 20, 30, 40, 50]

# One distance per (row, plant) pair, flattened row-major so pairs are visited
# row by row and, within a row, plant 1 through plant 5.
pair_distances = (
    ff_cancer_df[dist_columns]
    .to_numpy(dtype=float, na_value=np.nan)
    .ravel()
)
# Positional index of the source row for each flattened pair.
pair_row_positions = np.repeat(np.arange(len(ff_cancer_df)), len(dist_columns))

# right=True gives edges[i-1] < d <= edges[i]: code 0 is d <= 10 and
# code 5 is d > 50.  NaN sorts past the last edge, so it also gets code 5.
pair_bin_codes = np.digitize(pair_distances, bin_upper_edges, right=True)

(zero_ten,
 ten_twenty,
 twenty_thirty,
 thirty_fourty,
 fourty_fifty,
 fifty_plus) = (
    ff_cancer_df.iloc[pair_row_positions[pair_bin_codes == code]].reset_index(drop=True)
    for code in range(len(bin_upper_edges) + 1)
)

In [6]:
# Preview the first five rows of the bin holding distances <= 10.
zero_ten.head()

Unnamed: 0,latitude,longitude,GEOID,County_x,nameplate_capacity_MW1,NOx_tons1,SO2_tons1,CO2_tons1,CH4_lbs1,N2O_lbs1,...,ovary,pancreas,prostate,stomach,thyroid,uterus,pediatric_asthma,adult_asthma,COPD,adult_chronic_lung_disease
0,32.53492,-86.642749,1001,Autauga County,939.4,50.521,3.849,762545.203,28447.358,2844.736,...,12.9,9.7,158.0,0.0,0.0,25.5,1718.0,3906.0,4274.0,8657.0
1,32.53492,-86.642749,1001,Autauga County,939.4,50.521,3.849,762545.203,28447.358,2844.736,...,12.9,9.7,158.0,0.0,0.0,25.5,1718.0,3906.0,4274.0,8657.0
2,34.700434,-87.804985,1033,Colbert County,1826.0,11.153,3.672,3152.6,94.737,9.474,...,15.1,10.8,103.2,0.0,0.0,18.1,1531.0,3942.0,4600.0,8758.0
3,34.459773,-85.804143,1049,DeKalb County,4.8,0.0,6.48,0.0,0.0,0.0,...,8.8,11.3,86.9,9.2,0.0,17.3,2266.0,4919.0,5579.0,11156.0
4,31.126123,-87.16162,1053,Escambia County,115.9,200.808,456.756,70623.784,12812.246,7031.871,...,0.0,0.0,97.4,0.0,0.0,22.1,1068.0,2543.0,2871.0,5657.0


In [15]:
# Summarise each distance bin with describe(); the median ("50%") row of these
# summaries is what gets plotted later.
(zero_ten_stats,
 ten_twenty_stats,
 twenty_thirty_stats,
 thirty_fourty_stats,
 fourty_fifty_stats,
 fifty_plus_stats) = [
    binned.describe()
    for binned in (
        zero_ten,
        ten_twenty,
        twenty_thirty,
        thirty_fourty,
        fourty_fifty,
        fifty_plus,
    )
]

In [30]:
# Inspect the full describe() summary for the (40, 50] distance bin.
fourty_fifty_stats

Unnamed: 0,latitude,longitude,nameplate_capacity_MW1,NOx_tons1,SO2_tons1,CO2_tons1,CH4_lbs1,N2O_lbs1,PM2.5_tons1,dist_from_county1,...,ovary,pancreas,prostate,stomach,thyroid,uterus,pediatric_asthma,adult_asthma,COPD,adult_chronic_lung_disease
count,1404.0,1404.0,1404.0,1404.0,1404.0,1404.0,1404.0,1404.0,1404.0,1404.0,...,1404.0,1404.0,1404.0,1404.0,1404.0,1404.0,1404.0,1404.0,1404.0,1404.0
mean,37.525193,-93.395453,412.004986,656.172049,792.073999,936439.1,171901.0,24882.196126,106.81178,26.348291,...,2.292735,3.206909,96.482194,1.311467,0.866239,15.275071,872.433761,3525.495726,2714.854701,6262.026353
std,5.23879,11.490748,659.077836,1885.789933,3662.126717,2325829.0,515717.1,75022.7873,669.449935,12.848775,...,4.729744,5.596234,43.729977,3.331436,2.623627,14.975515,2165.749576,9100.800746,5918.402945,15132.666744
min,20.867735,-164.244323,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,5.0,11.0
25%,33.282993,-99.108733,9.45,1.90525,0.32,48.016,20.701,3.206,0.281222,17.0,...,0.0,0.0,80.35,0.0,0.0,0.0,153.75,660.5,550.0,1219.75
50%,36.793897,-91.138898,82.1,68.5315,3.145,16712.17,3695.137,432.78,2.818659,26.0,...,0.0,0.0,100.3,0.0,0.0,19.6,354.5,1424.0,1256.0,2661.0
75%,40.895302,-85.262575,568.35,286.628,25.369,566880.5,59900.94,8643.60125,28.762995,37.0,...,0.0,8.5,122.85,0.0,0.0,27.625,707.0,2827.25,2454.25,5204.25
max,64.835451,-68.649428,3718.2,16093.224,34474.822,15508280.0,3411480.0,496195.059,9763.664552,50.0,...,21.5,22.4,258.0,17.6,15.8,55.4,38788.0,152317.0,81573.0,248573.0


In [29]:
# Store only the median data for each bin.
#
# describe() labels the median row "50%"; it is selected by that label rather
# than by position so the lookup cannot silently pick a different statistic.
# The table is built in a single DataFrame construction because
# DataFrame.append was removed in pandas 2.0.
bin_summaries = {
    "zero_ten": zero_ten_stats,
    "ten_twenty": ten_twenty_stats,
    "twenty_thirty": twenty_thirty_stats,
    "thirty_fourty": thirty_fourty_stats,
    "fourty_fifty": fourty_fifty_stats,
    "fifty_plus": fifty_plus_stats,
}

# One row per bin; each row keeps the "50%" index label it had in describe().
stats = pd.DataFrame([summary.loc["50%"] for summary in bin_summaries.values()])

# Label each row with the bin it came from (dicts preserve insertion order).
stats["bin"] = list(bin_summaries)
stats.head(6)

Unnamed: 0,latitude,longitude,nameplate_capacity_MW1,NOx_tons1,SO2_tons1,CO2_tons1,CH4_lbs1,N2O_lbs1,PM2.5_tons1,dist_from_county1,...,pancreas,prostate,stomach,thyroid,uterus,pediatric_asthma,adult_asthma,COPD,adult_chronic_lung_disease,bin
50%,40.268281,-86.527534,33.35,18.849,1.118,5962.787,452.113,45.4875,1.267721,5.0,...,11.3,111.4,7.0,5.2,27.9,1731.5,7628.5,5583.0,13890.5,zero_ten
50%,39.741527,-87.142042,48.5,26.159,1.813,7106.704,578.934,72.276,1.901484,11.0,...,9.5,108.5,0.0,0.0,26.1,821.0,3436.0,2694.0,6047.0,ten_twenty
50%,39.307617,-87.509684,44.75,29.249,1.791,7641.335,822.9815,96.116,1.886672,17.0,...,0.0,106.2,0.0,0.0,24.8,466.5,2092.5,1778.0,3803.0,twenty_thirty
50%,37.697925,-88.557716,63.2,53.649,2.531,10764.533,2351.38,245.095,2.332344,22.0,...,0.0,103.7,0.0,0.0,22.4,424.0,1805.0,1563.0,3342.0,thirty_fourty
50%,36.793897,-91.138898,82.1,68.5315,3.145,16712.174,3695.137,432.78,2.818659,26.0,...,0.0,100.3,0.0,0.0,19.6,354.5,1424.0,1256.0,2661.0,fourty_fifty
50%,40.666433,-101.657774,54.9,50.966,2.313,14379.205,3191.101,319.11,2.002282,45.0,...,0.0,87.9,0.0,0.0,0.0,166.0,718.0,537.0,1293.0,fifty_plus


In [31]:
# List the columns carried into the per-bin median table.
stats.columns

Index(['latitude', 'longitude', 'nameplate_capacity_MW1', 'NOx_tons1',
       'SO2_tons1', 'CO2_tons1', 'CH4_lbs1', 'N2O_lbs1', 'PM2.5_tons1',
       'dist_from_county1', 'nameplate_capacity_MW2', 'NOx_tons2', 'SO2_tons2',
       'CO2_tons2', 'CH4_lbs2', 'N2O_lbs2', 'PM2.5_tons2', 'dist_from_county2',
       'nameplate_capacity_MW3', 'NOx_tons3', 'SO2_tons3', 'CO2_tons3',
       'CH4_lbs3', 'N2O_lbs3', 'PM2.5_tons3', 'dist_from_county3',
       'nameplate_capacity_MW4', 'NOx_tons4', 'SO2_tons4', 'CO2_tons4',
       'CH4_lbs4', 'N2O_lbs4', 'PM2.5_tons4', 'dist_from_county4',
       'nameplate_capacity_MW5', 'NOx_tons5', 'SO2_tons5', 'CO2_tons5',
       'CH4_lbs5', 'N2O_lbs5', 'PM2.5_tons5', 'dist_from_county5',
       'fuel_type1_Biomass', 'fuel_type1_Coal', 'fuel_type1_Gas',
       'fuel_type1_Oil', 'fuel_type1_Other Fossil', 'fuel_type2_Biomass',
       'fuel_type2_Coal', 'fuel_type2_Gas', 'fuel_type2_Oil',
       'fuel_type2_Other Fossil', 'fuel_type3_Biomass', 'fuel_type3_Coal',
   

In [34]:
# For each cancer column, print the largest of the per-bin medians.
print(
    stats.loc[:, [
        'bladder',
        'brain',
        'breast',
        'breast_insitu',
        'cervix',
        'colon',
        'esophagus',
        'kidney_and_renal',
        'leukemia',
        'liver',
        'lung',
        'melanoma',
        'non-hodgkins_lymphoma',
        'oral_cavity',
        'ovary',
        'pancreas',
        'prostate',
        'stomach',
        'thyroid',
        'uterus',
    ]].max()
)

bladder                   20.10
brain                      6.10
breast                   126.80
breast_insitu             26.10
cervix                     5.40
colon                     33.80
esophagus                  0.00
kidney_and_renal          11.40
leukemia                   9.85
liver                      0.00
lung                      55.60
melanoma                  16.50
non-hodgkins_lymphoma     14.80
oral_cavity                5.60
ovary                      9.30
pancreas                  11.30
prostate                 111.40
stomach                    7.00
thyroid                    5.20
uterus                    27.90
dtype: float64


In [36]:
# Write the per-bin median table to disk.
# The row labels are dropped (index=False); the "bin" column identifies each row.
stats.to_csv(
    '../cleaned_data/ff_plants_binned_stats.csv',
    index=False,
)