In [2]:
# Import dependencies
import pandas as pd
import numpy as np

# Load the per-county table: county location (latitude/longitude), the closest
# nuclear plant and the distance to it, plus cancer / cardio / respiratory metrics.
# GEOID is forced to str so pandas does not parse the identifier as a number.
nuc_cancer_df = pd.read_csv(
    '../cleaned_data/ML_data_nuc_cancer.csv',
    dtype={'GEOID': str},
)

nuc_cancer_df.head()

Unnamed: 0,latitude,longitude,GEOID,County_State,closest_plant,distance,plant_capacity,County,cardio_death,total_cancer,...,ovary,pancreas,prostate,stomach,thyroid,uterus,pediatric_asthma,adult_asthma,COPD,adult_chronic_lung_disease
0,32.53492,-86.642749,1001,"Autauga County, Alabama",Joseph M. Farley Nuclear Plant,128.0,1776.4,"Autauga, Alabama",263.9,506.4,...,12.9,9.7,158.0,0.0,0.0,25.5,1718.0,3906.0,4274.0,8657.0
1,30.66097,-87.74984,1003,"Baldwin County, Alabama",Joseph M. Farley Nuclear Plant,161.0,1776.4,"Baldwin, Alabama",241.9,455.7,...,11.8,10.0,91.8,8.1,3.8,17.6,6393.0,16246.0,19461.0,36546.0
2,31.869603,-85.393197,1005,"Barbour County, Alabama",Joseph M. Farley Nuclear Plant,48.0,1776.4,"Barbour, Alabama",351.2,447.2,...,0.0,0.0,162.6,0.0,0.0,22.7,664.0,1760.0,2001.0,3855.0
3,32.998644,-87.126439,1007,"Bibb County, Alabama",Browns Ferry Nuclear Plant,118.0,3567.5,"Bibb, Alabama",323.6,466.1,...,0.0,0.0,112.1,0.0,0.0,25.3,584.0,1603.0,1754.0,3433.0
4,33.980867,-86.567371,1009,"Blount County, Alabama",Browns Ferry Nuclear Plant,59.0,3567.5,"Blount, Alabama",283.6,438.7,...,14.1,11.5,96.9,0.0,9.4,23.5,1742.0,4028.0,4638.0,9075.0


In [3]:
# List every column name of the loaded table (head() above elides the middle ones).
nuc_cancer_df.columns

Index(['latitude', 'longitude', 'GEOID', 'County_State', 'closest_plant',
       'distance', 'plant_capacity', 'County', 'cardio_death', 'total_cancer',
       'bladder', 'brain', 'breast', 'breast_insitu', 'cervix', 'colon',
       'esophagus', 'kidney_and_renal', 'leukemia', 'liver', 'lung',
       'melanoma', 'non-hodgkins_lymphoma', 'oral_cavity', 'ovary', 'pancreas',
       'prostate', 'stomach', 'thyroid', 'uterus', 'pediatric_asthma',
       'adult_asthma', 'COPD', 'adult_chronic_lung_disease'],
      dtype='object')

In [4]:
# One empty DataFrame per distance bin; each name gets its own independent object.
(
    zero_ten,
    ten_twenty,
    twenty_thirty,
    thirty_fourty,
    fourty_fifty,
    fifty_plus,
) = (pd.DataFrame() for _ in range(6))

In [6]:
# Bin the counties by their distance to the closest plant.
#
# Each bin is selected with a vectorized boolean mask instead of appending rows
# one at a time: DataFrame.append was removed in pandas 2.0, growing a frame
# row-by-row is quadratic, and appending made this cell non-idempotent (running
# it twice duplicated every row). Assigning from the source frame is safe to re-run.
#
# Bin edges are half-open on the left and closed on the right: (lo, hi].
# reset_index(drop=True) gives each bin a fresh 0..n-1 index, matching the
# previous ignore_index=True behaviour.
distance = nuc_cancer_df["distance"]

zero_ten = nuc_cancer_df[distance <= 10].reset_index(drop=True)
ten_twenty = nuc_cancer_df[(distance > 10) & (distance <= 20)].reset_index(drop=True)
twenty_thirty = nuc_cancer_df[(distance > 20) & (distance <= 30)].reset_index(drop=True)
thirty_fourty = nuc_cancer_df[(distance > 30) & (distance <= 40)].reset_index(drop=True)
fourty_fifty = nuc_cancer_df[(distance > 40) & (distance <= 50)].reset_index(drop=True)
# Catch-all bin: every row not matched above. Written as the complement of
# "<= 50" (rather than "> 50") so rows with a missing distance still land here,
# exactly as they did in the final `else` branch of the original loop.
fifty_plus = nuc_cancer_df[~(distance <= 50)].reset_index(drop=True)

In [7]:
# Spot-check the closest bin: every row shown should have distance <= 10.
zero_ten.head()

Unnamed: 0,latitude,longitude,GEOID,County_State,closest_plant,distance,plant_capacity,County,cardio_death,total_cancer,...,ovary,pancreas,prostate,stomach,thyroid,uterus,pediatric_asthma,adult_asthma,COPD,adult_chronic_lung_disease
0,39.57833,-75.638978,10003,"New Castle County, Delaware",Hope Creek Generating Station,9.0,1290.7,"New Castle, Delaware",211.5,474.4,...,9.3,13.5,125.9,9.3,7.7,31.7,8882.0,48620.0,25956.0,74295.0
1,40.174629,-88.904089,17039,"De Witt County, Illinois",Clinton Power Station,4.0,1138.3,"De Witt, Illinois",228.6,500.6,...,0.0,0.0,67.3,0.0,0.0,38.2,223.0,1032.0,792.0,1865.0
2,41.285103,-88.418496,17063,"Grundy County, Illinois",Braidwood Station,10.0,2449.8,"Grundy, Illinois",214.3,539.9,...,0.0,0.0,130.3,0.0,0.0,26.5,873.0,3302.0,2283.0,5919.0
3,42.042566,-89.320727,17141,"Ogle County, Illinois",Byron Station,3.0,2449.8,"Ogle, Illinois",229.9,478.8,...,11.0,8.2,90.9,0.0,0.0,31.2,775.0,3327.0,2526.0,6047.0
4,29.905528,-90.35822,22089,"St. Charles Parish, Louisiana",Waterford Steam Electric Station,9.0,1199.8,"St. Charles, Louisiana",252.9,475.9,...,0.0,17.0,140.9,0.0,0.0,24.8,1121.0,3200.0,3419.0,6834.0


In [8]:
# Summarise every distance bin with describe(); the "50%" row of each summary
# is the median that gets plotted later.
(
    zero_ten_stats,
    ten_twenty_stats,
    twenty_thirty_stats,
    thirty_fourty_stats,
    fourty_fifty_stats,
    fifty_plus_stats,
) = [
    binned.describe()
    for binned in (
        zero_ten,
        ten_twenty,
        twenty_thirty,
        thirty_fourty,
        fourty_fifty,
        fifty_plus,
    )
]

In [9]:
# Display the describe() summary for one bin to check its layout (rows: count, mean, std, min, 25%, 50%, 75%, max).
fourty_fifty_stats

Unnamed: 0,latitude,longitude,distance,plant_capacity,cardio_death,total_cancer,bladder,brain,breast,breast_insitu,...,ovary,pancreas,prostate,stomach,thyroid,uterus,pediatric_asthma,adult_asthma,COPD,adult_chronic_lung_disease
count,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,...,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0
mean,38.047766,-84.58909,45.496241,1749.227068,237.466165,442.13985,18.229323,3.939098,117.91203,22.827068,...,5.022556,7.117293,106.133835,4.257895,2.968421,23.075188,3680.285714,15252.81203,9923.383459,25842.06015
std,4.511377,7.39718,2.767846,760.979908,46.046944,112.842033,7.691522,3.651501,35.390478,14.177928,...,5.678236,6.451879,33.712539,4.980328,4.239914,12.202815,9313.573709,40356.084342,25057.758403,67572.040586
min,28.439109,-118.478403,41.0,614.0,148.8,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,40.0,172.0,164.0,317.0
25%,34.265645,-90.212595,43.0,1035.9,201.7,435.4,16.6,0.0,114.7,17.2,...,0.0,0.0,97.0,0.0,0.0,19.8,445.0,1725.0,1353.0,3183.0
50%,39.250717,-84.066367,45.0,1776.4,227.8,467.2,19.7,5.6,126.1,26.7,...,0.0,10.0,110.7,0.0,0.0,26.8,1028.0,3366.0,2667.0,6503.0
75%,41.508838,-79.237291,48.0,2430.0,266.4,492.5,23.0,7.2,136.0,32.6,...,10.6,12.2,124.5,8.9,6.5,31.0,2481.0,10309.0,7528.0,19048.0
max,46.229843,-71.280979,50.0,3567.5,361.4,568.9,30.6,11.3,173.8,63.3,...,15.8,20.7,166.9,18.5,19.9,47.8,75700.0,346474.0,232759.0,600127.0


In [10]:
# Store only the median data for each bin.
#
# The median is taken by its describe() label ("50%") instead of the positional
# iloc[5], so a change in the summary layout raises a KeyError rather than
# silently picking a different statistic. All rows are assembled in a single
# DataFrame construction because DataFrame.append was removed in pandas 2.0.
bin_summaries = {
    "zero_ten": zero_ten_stats,
    "ten_twenty": ten_twenty_stats,
    "twenty_thirty": twenty_thirty_stats,
    "thirty_fourty": thirty_fourty_stats,
    "fourty_fifty": fourty_fifty_stats,
    "fifty_plus": fifty_plus_stats,
}

# One row per bin, in the insertion order of the dict above; each row keeps
# the "50%" index label it had in its describe() summary.
stats = pd.DataFrame([summary.loc["50%"] for summary in bin_summaries.values()])
stats["bin"] = list(bin_summaries)
stats.head(6)

Unnamed: 0,latitude,longitude,distance,plant_capacity,cardio_death,total_cancer,bladder,brain,breast,breast_insitu,...,pancreas,prostate,stomach,thyroid,uterus,pediatric_asthma,adult_asthma,COPD,adult_chronic_lung_disease,bin
50%,38.835517,-86.306415,8.0,1846.8,243.9,474.4,21.8,6.0,129.4,25.6,...,10.7,109.3,0.0,0.0,25.1,903.0,3341.0,3332.0,6834.0,zero_ten
50%,39.536436,-82.331293,16.0,1311.0,222.2,473.5,20.6,5.8,133.5,26.5,...,11.3,117.4,6.3,0.0,28.2,1567.0,5447.0,4440.0,9991.0,ten_twenty
50%,39.56242,-83.261133,26.0,2018.6,233.3,470.2,19.9,5.9,123.1,26.9,...,11.2,113.4,6.4,6.1,26.4,1522.0,6938.0,5181.0,12077.0,twenty_thirty
50%,38.473681,-83.531132,36.0,1776.4,230.5,463.1,20.0,5.3,126.4,23.5,...,10.8,111.9,0.0,0.0,25.0,1021.0,4003.0,3557.0,7602.0,thirty_fourty
50%,39.250717,-84.066367,45.0,1776.4,227.8,467.2,19.7,5.6,126.1,26.7,...,10.0,110.7,0.0,0.0,26.8,1028.0,3366.0,2667.0,6503.0,fourty_fifty
50%,39.160918,-91.844324,132.0,1311.0,230.8,455.4,17.8,0.0,118.0,0.0,...,0.0,101.9,0.0,0.0,21.9,388.0,1708.0,1421.0,3128.0,fifty_plus


In [11]:
# List the columns of the per-bin median table, including the added "bin" label column.
stats.columns

Index(['latitude', 'longitude', 'distance', 'plant_capacity', 'cardio_death',
       'total_cancer', 'bladder', 'brain', 'breast', 'breast_insitu', 'cervix',
       'colon', 'esophagus', 'kidney_and_renal', 'leukemia', 'liver', 'lung',
       'melanoma', 'non-hodgkins_lymphoma', 'oral_cavity', 'ovary', 'pancreas',
       'prostate', 'stomach', 'thyroid', 'uterus', 'pediatric_asthma',
       'adult_asthma', 'COPD', 'adult_chronic_lung_disease', 'bin'],
      dtype='object')

In [12]:
# Largest per-bin median for each cancer site.
cancer_sites = [
    'bladder', 'brain', 'breast', 'breast_insitu', 'cervix',
    'colon', 'esophagus', 'kidney_and_renal', 'leukemia', 'liver',
    'lung', 'melanoma', 'non-hodgkins_lymphoma', 'oral_cavity', 'ovary',
    'pancreas', 'prostate', 'stomach', 'thyroid', 'uterus',
]
site_maxima = stats.loc[:, cancer_sites].max()
print(site_maxima)

bladder                   21.8
brain                      6.0
breast                   133.5
breast_insitu             26.9
cervix                     5.6
colon                     33.7
esophagus                  0.0
kidney_and_renal          12.1
leukemia                  10.0
liver                      0.0
lung                      59.2
melanoma                  18.6
non-hodgkins_lymphoma     14.7
oral_cavity                4.8
ovary                      9.3
pancreas                  11.3
prostate                 117.4
stomach                    6.4
thyroid                    6.1
uterus                    28.2
dtype: float64


In [13]:
# Export the per-bin median table as the final csv file.
# index=False leaves the row index out of the file; the "bin" column is what
# identifies each row.
stats.to_csv('../cleaned_data/nuc_plants_binned_stats.csv', index=False)