In [1]:
# Import dependencies
import pandas as pd
import numpy as np

# Read in csv containing county/nuclear plant data
nuc_df = pd.read_csv('../cleaned_data/closest_nuc_plant_in_each_county.csv', dtype={'GEOID': str})

# Read in csv containing county/fossil fuel plant data
ff_df = pd.read_csv('../cleaned_data/closest_ff_plant_in_each_county.csv', dtype={'GEOID': str})

# Read in csv containing county/cancer data
cancer_df = pd.read_csv('../cleaned_data/cardio_cancer_resp.csv', dtype={'FIPS': str})

In [18]:
nuc_df.head()

Unnamed: 0,latitude,longitude,GEOID,County_State,closest_plant,distance,plant_capacity
0,32.53492,-86.642749,1001,"Autauga County, Alabama",Joseph M. Farley Nuclear Plant,128.0,1776.4
1,30.66097,-87.74984,1003,"Baldwin County, Alabama",Joseph M. Farley Nuclear Plant,161.0,1776.4
2,31.869603,-85.393197,1005,"Barbour County, Alabama",Joseph M. Farley Nuclear Plant,48.0,1776.4
3,32.998644,-87.126439,1007,"Bibb County, Alabama",Browns Ferry Nuclear Plant,118.0,3567.5
4,33.980867,-86.567371,1009,"Blount County, Alabama",Browns Ferry Nuclear Plant,59.0,3567.5


In [3]:
ff_df.head()

Unnamed: 0,latitude,longitude,GEOID,County,fuel_type1,nameplate_capacity_MW1,NOx_tons1,SO2_tons1,CO2_tons1,CH4_lbs1,...,dist_from_county4,fuel_type5,nameplate_capacity_MW5,NOx_tons5,SO2_tons5,CO2_tons5,CH4_lbs5,N2O_lbs5,PM2.5_tons5,dist_from_county5
0,32.53492,-86.642749,1001,Autauga County,Gas,939.4,50.521,3.849,762545.203,28447.358,...,13.0,Gas,104.2,367.256,2.531,0.0,16303.841,9009.438,1.220169,15.0
1,30.66097,-87.74984,1003,Baldwin County,Gas,50.0,450.864,4.58,167490.328,6318.013,...,22.0,Gas,317.4,35.096,2.719,538661.787,16687.381,1668.738,72.588035,24.0
2,31.869603,-85.393197,1005,Barbour County,Biomass,120.5,312.818,0.59,0.0,134642.958,...,51.0,Biomass,101.2,350.852,888.835,62961.826,80929.295,16646.007,16.746958,52.0
3,32.998644,-87.126439,1007,Bibb County,Biomass,13.0,16.113,2.219,0.011,12526.086,...,42.0,Gas,2034.0,3462.81,1149.051,5283997.438,768776.092,108753.017,215.882167,42.0
4,33.980867,-86.567371,1009,Blount County,Other Fossil,3.8,2.197,0.009,1006.565,37.975,...,43.0,Gas,748.0,60.07,1.062,209853.617,8101.532,831.828,13.755226,44.0


In [4]:
# Rename FIPS column to GEOID to match other DatFrames
cancer_df.rename(columns={'FIPS': 'GEOID'},inplace=True)
cancer_df.columns

Index(['GEOID', 'County', 'cardio_death', 'total_cancer', 'bladder', 'brain',
       'breast', 'breast_insitu', 'cervix', 'colon', 'esophagus',
       'kidney_and_renal', 'leukemia', 'liver', 'lung', 'melanoma',
       'non-hodgkins_lymphoma', 'oral_cavity', 'ovary', 'pancreas', 'prostate',
       'stomach', 'thyroid', 'uterus', 'Rate_range', 'bladder_range',
       'brain_range', 'breast_range', 'breast_insitu_range', 'cervix_range',
       'colon_range', 'esophagus_range', 'kidney and renal_range',
       'leukemia_range', 'liver_range', 'lung_range', 'melanoma_range',
       'non-hudgkin lymphoma_range', 'oral cavity_range', 'ovay_range',
       'pancreas_range', 'prostate_range', 'stomach_range', 'thyroid_range',
       'uteras_range', 'Total Population', 'pediatric_asthma', 'adult_asthma',
       'COPD', 'adult_chronic_lung_disease', 'lung_cancer'],
      dtype='object')

In [5]:
# Correct GEOIDs that dropped the leading 0
for index, row in cancer_df.iterrows():
    if len(row["GEOID"]) == 4:
        cancer_df.loc[index,"GEOID"] = "0" + row["GEOID"]
cancer_df.tail(50)

Unnamed: 0,GEOID,County,cardio_death,total_cancer,bladder,brain,breast,breast_insitu,cervix,colon,...,prostate_range,stomach_range,thyroid_range,uteras_range,Total Population,pediatric_asthma,adult_asthma,COPD,adult_chronic_lung_disease,lung_cancer
3084,27017,"Carlton, Minnesota",195.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,35769.0,338.0,2355.0,1335.0,3629.0,19.0
3085,27025,"Chisago, Minnesota",186.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,56794.0,542.0,3747.0,2078.0,5733.0,30.0
3086,27031,"Cook, Minnesota",148.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5417.0,35.0,378.0,258.0,599.0,3.0
3087,27037,"Dakota, Minnesota",147.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,431807.0,4429.0,27956.0,14864.0,42727.0,225.0
3088,27053,"Hennepin, Minnesota",151.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1268408.0,11687.0,85036.0,42915.0,126688.0,661.0
3089,27059,"Isanti, Minnesota",178.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,41429.0,415.0,2690.0,1491.0,4146.0,22.0
3090,27065,"Kanabec, Minnesota",187.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,16416.0,147.0,1083.0,665.0,1703.0,9.0
3091,27075,"Lake, Minnesota",183.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,10639.0,86.0,709.0,474.0,1137.0,6.0
3092,27095,"Mille Lacs, Minnesota",185.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,26146.0,260.0,1688.0,981.0,2640.0,14.0
3093,27115,"Pine, Minnesota",191.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,29359.0,239.0,1987.0,1214.0,3086.0,15.0


In [6]:
# Drop all of the 'range' columns since they won't be used in the ML model, then sort by GEOID
cancer_df.drop(columns=['Rate_range',
       'bladder_range', 'brain_range', 'breast_range', 'breast_insitu_range',
       'cervix_range', 'colon_range', 'esophagus_range',
       'kidney and renal_range', 'leukemia_range', 'liver_range', 'lung_range',
       'melanoma_range', 'non-hudgkin lymphoma_range', 'oral cavity_range',
       'ovay_range', 'pancreas_range', 'prostate_range', 'stomach_range',
       'thyroid_range', 'uteras_range','lung_cancer'],inplace=True)
cancer_df.sort_values(by=['GEOID'],inplace=True)
cancer_df.head()

Unnamed: 0,GEOID,County,cardio_death,total_cancer,bladder,brain,breast,breast_insitu,cervix,colon,...,pancreas,prostate,stomach,thyroid,uterus,Total Population,pediatric_asthma,adult_asthma,COPD,adult_chronic_lung_disease
1957,1001,"Autauga, Alabama",263.9,506.4,15.8,7.0,124.4,23.9,0.0,45.8,...,9.7,158.0,0.0,0.0,25.5,56145.0,1718.0,3906.0,4274.0,8657.0
1242,1003,"Baldwin, Alabama",241.9,455.7,23.1,6.5,124.7,25.5,11.0,33.3,...,10.0,91.8,8.1,3.8,17.6,229287.0,6393.0,16246.0,19461.0,36546.0
1243,1005,"Barbour, Alabama",351.2,447.2,13.3,0.0,109.5,22.6,0.0,41.9,...,0.0,162.6,0.0,0.0,22.7,24589.0,664.0,1760.0,2001.0,3855.0
1455,1007,"Bibb, Alabama",323.6,466.1,19.8,0.0,113.9,0.0,0.0,26.4,...,0.0,112.1,0.0,0.0,25.3,22136.0,584.0,1603.0,1754.0,3433.0
1958,1009,"Blount, Alabama",283.6,438.7,17.4,6.7,113.6,21.6,0.0,34.4,...,11.5,96.9,0.0,9.4,23.5,57879.0,1742.0,4028.0,4638.0,9075.0


In [7]:
# Count and then drop all NaN values in the cancer_df
print(cancer_df.isna().sum())
cancer_df.dropna(inplace=True)
print(cancer_df.isna().sum())

GEOID                           0
County                          0
cardio_death                    2
total_cancer                  254
bladder                       254
brain                         254
breast                        254
breast_insitu                 254
cervix                        254
colon                         254
esophagus                     254
kidney_and_renal              254
leukemia                      254
liver                         254
lung                          254
melanoma                      254
non-hodgkins_lymphoma         254
oral_cavity                   254
ovary                         254
pancreas                      254
prostate                      254
stomach                       254
thyroid                       254
uterus                        254
Total Population              276
pediatric_asthma              276
adult_asthma                  276
COPD                          276
adult_chronic_lung_disease    276
dtype: int64
G

In [8]:
# Use get_dummes() to encode the fuel_type columns
ff_df_encoded = pd.get_dummies(ff_df, columns=["fuel_type1","fuel_type2","fuel_type3","fuel_type4","fuel_type5"])
ff_df_encoded.head()

Unnamed: 0,latitude,longitude,GEOID,County,nameplate_capacity_MW1,NOx_tons1,SO2_tons1,CO2_tons1,CH4_lbs1,N2O_lbs1,...,fuel_type4_Biomass,fuel_type4_Coal,fuel_type4_Gas,fuel_type4_Oil,fuel_type4_Other Fossil,fuel_type5_Biomass,fuel_type5_Coal,fuel_type5_Gas,fuel_type5_Oil,fuel_type5_Other Fossil
0,32.53492,-86.642749,1001,Autauga County,939.4,50.521,3.849,762545.203,28447.358,2844.736,...,1,0,0,0,0,0,0,1,0,0
1,30.66097,-87.74984,1003,Baldwin County,50.0,450.864,4.58,167490.328,6318.013,631.801,...,1,0,0,0,0,0,0,1,0,0
2,31.869603,-85.393197,1005,Barbour County,120.5,312.818,0.59,0.0,134642.958,24655.8,...,0,0,1,0,0,1,0,0,0,0
3,32.998644,-87.126439,1007,Bibb County,13.0,16.113,2.219,0.011,12526.086,1644.049,...,0,0,1,0,0,0,0,1,0,0
4,33.980867,-86.567371,1009,Blount County,3.8,2.197,0.009,1006.565,37.975,3.79,...,0,1,0,0,0,0,0,1,0,0


In [9]:
# Merge the cancer_df into the nuc_df on the GEOID
nuc_cancer_df = nuc_df.merge(cancer_df, how="right",on="GEOID")
nuc_cancer_df.head()

Unnamed: 0,latitude,longitude,GEOID,County_State,closest_plant,distance,plant_capacity,County,cardio_death,total_cancer,...,pancreas,prostate,stomach,thyroid,uterus,Total Population,pediatric_asthma,adult_asthma,COPD,adult_chronic_lung_disease
0,32.53492,-86.642749,1001,"Autauga County, Alabama",Joseph M. Farley Nuclear Plant,128.0,1776.4,"Autauga, Alabama",263.9,506.4,...,9.7,158.0,0.0,0.0,25.5,56145.0,1718.0,3906.0,4274.0,8657.0
1,30.66097,-87.74984,1003,"Baldwin County, Alabama",Joseph M. Farley Nuclear Plant,161.0,1776.4,"Baldwin, Alabama",241.9,455.7,...,10.0,91.8,8.1,3.8,17.6,229287.0,6393.0,16246.0,19461.0,36546.0
2,31.869603,-85.393197,1005,"Barbour County, Alabama",Joseph M. Farley Nuclear Plant,48.0,1776.4,"Barbour, Alabama",351.2,447.2,...,0.0,162.6,0.0,0.0,22.7,24589.0,664.0,1760.0,2001.0,3855.0
3,32.998644,-87.126439,1007,"Bibb County, Alabama",Browns Ferry Nuclear Plant,118.0,3567.5,"Bibb, Alabama",323.6,466.1,...,0.0,112.1,0.0,0.0,25.3,22136.0,584.0,1603.0,1754.0,3433.0
4,33.980867,-86.567371,1009,"Blount County, Alabama",Browns Ferry Nuclear Plant,59.0,3567.5,"Blount, Alabama",283.6,438.7,...,11.5,96.9,0.0,9.4,23.5,57879.0,1742.0,4028.0,4638.0,9075.0


In [10]:
nuc_cancer_df["pediatric_asthma_per_100k"] = nuc_cancer_df["pediatric_asthma"] / nuc_cancer_df["Total Population"] * 100000
nuc_cancer_df["adult_asthma_per_100k"] = nuc_cancer_df["adult_asthma"] / nuc_cancer_df["Total Population"] * 100000
nuc_cancer_df["COPD_per_100k"] = nuc_cancer_df["COPD"] / nuc_cancer_df["Total Population"] * 100000
nuc_cancer_df["cardio_death_per_100k"] = nuc_cancer_df["cardio_death"] / nuc_cancer_df["Total Population"] * 100000
nuc_cancer_df["adult_chronic_lung_disease_per_100k"] = nuc_cancer_df["adult_chronic_lung_disease"] / nuc_cancer_df["Total Population"] * 100000
nuc_cancer_df.head()

Unnamed: 0,latitude,longitude,GEOID,County_State,closest_plant,distance,plant_capacity,County,cardio_death,total_cancer,...,Total Population,pediatric_asthma,adult_asthma,COPD,adult_chronic_lung_disease,pediatric_asthma_per_100k,adult_asthma_per_100k,COPD_per_100k,cardio_death_per_100k,adult_chronic_lung_disease_per_100k
0,32.53492,-86.642749,1001,"Autauga County, Alabama",Joseph M. Farley Nuclear Plant,128.0,1776.4,"Autauga, Alabama",263.9,506.4,...,56145.0,1718.0,3906.0,4274.0,8657.0,3059.934099,6956.986375,7612.432095,470.03295,15419.004364
1,30.66097,-87.74984,1003,"Baldwin County, Alabama",Joseph M. Farley Nuclear Plant,161.0,1776.4,"Baldwin, Alabama",241.9,455.7,...,229287.0,6393.0,16246.0,19461.0,36546.0,2788.208664,7085.443135,8487.615957,105.500966,15938.976043
2,31.869603,-85.393197,1005,"Barbour County, Alabama",Joseph M. Farley Nuclear Plant,48.0,1776.4,"Barbour, Alabama",351.2,447.2,...,24589.0,664.0,1760.0,2001.0,3855.0,2700.394485,7157.67213,8137.785188,1428.280939,15677.74208
3,32.998644,-87.126439,1007,"Bibb County, Alabama",Browns Ferry Nuclear Plant,118.0,3567.5,"Bibb, Alabama",323.6,466.1,...,22136.0,584.0,1603.0,1754.0,3433.0,2638.236357,7241.597398,7923.744127,1461.872064,15508.673654
4,33.980867,-86.567371,1009,"Blount County, Alabama",Browns Ferry Nuclear Plant,59.0,3567.5,"Blount, Alabama",283.6,438.7,...,57879.0,1742.0,4028.0,4638.0,9075.0,3009.727189,6959.346222,8013.269061,489.987733,15679.261908


In [29]:
for index, row in nuc_cancer_df.iterrows():
    nuc_cancer_df.loc[index,"cap_over_d2"] = row["plant_capacity"] / row["distance"] ** 2
nuc_cancer_df.head()

Unnamed: 0,latitude,longitude,GEOID,County_State,closest_plant,distance,plant_capacity,County,cardio_death,total_cancer,...,pediatric_asthma,adult_asthma,COPD,adult_chronic_lung_disease,pediatric_asthma_per_100k,adult_asthma_per_100k,COPD_per_100k,cardio_death_per_100k,adult_chronic_lung_disease_per_100k,cap_over_d2
0,32.53492,-86.642749,1001,"Autauga County, Alabama",Joseph M. Farley Nuclear Plant,128.0,1776.4,"Autauga, Alabama",263.9,506.4,...,1718.0,3906.0,4274.0,8657.0,3059.934099,6956.986375,7612.432095,470.03295,15419.004364,0.108423
1,30.66097,-87.74984,1003,"Baldwin County, Alabama",Joseph M. Farley Nuclear Plant,161.0,1776.4,"Baldwin, Alabama",241.9,455.7,...,6393.0,16246.0,19461.0,36546.0,2788.208664,7085.443135,8487.615957,105.500966,15938.976043,0.068531
2,31.869603,-85.393197,1005,"Barbour County, Alabama",Joseph M. Farley Nuclear Plant,48.0,1776.4,"Barbour, Alabama",351.2,447.2,...,664.0,1760.0,2001.0,3855.0,2700.394485,7157.67213,8137.785188,1428.280939,15677.74208,0.771007
3,32.998644,-87.126439,1007,"Bibb County, Alabama",Browns Ferry Nuclear Plant,118.0,3567.5,"Bibb, Alabama",323.6,466.1,...,584.0,1603.0,1754.0,3433.0,2638.236357,7241.597398,7923.744127,1461.872064,15508.673654,0.256212
4,33.980867,-86.567371,1009,"Blount County, Alabama",Browns Ferry Nuclear Plant,59.0,3567.5,"Blount, Alabama",283.6,438.7,...,1742.0,4028.0,4638.0,9075.0,3009.727189,6959.346222,8013.269061,489.987733,15679.261908,1.024849


In [11]:
# Merge the cancer_df into the ff_df on the GEOID
ff_cancer_df = ff_df_encoded.merge(cancer_df, how="right",on="GEOID")
ff_cancer_df.head()

Unnamed: 0,latitude,longitude,GEOID,County_x,nameplate_capacity_MW1,NOx_tons1,SO2_tons1,CO2_tons1,CH4_lbs1,N2O_lbs1,...,pancreas,prostate,stomach,thyroid,uterus,Total Population,pediatric_asthma,adult_asthma,COPD,adult_chronic_lung_disease
0,32.53492,-86.642749,1001,Autauga County,939.4,50.521,3.849,762545.203,28447.358,2844.736,...,9.7,158.0,0.0,0.0,25.5,56145.0,1718.0,3906.0,4274.0,8657.0
1,30.66097,-87.74984,1003,Baldwin County,50.0,450.864,4.58,167490.328,6318.013,631.801,...,10.0,91.8,8.1,3.8,17.6,229287.0,6393.0,16246.0,19461.0,36546.0
2,31.869603,-85.393197,1005,Barbour County,120.5,312.818,0.59,0.0,134642.958,24655.8,...,0.0,162.6,0.0,0.0,22.7,24589.0,664.0,1760.0,2001.0,3855.0
3,32.998644,-87.126439,1007,Bibb County,13.0,16.113,2.219,0.011,12526.086,1644.049,...,0.0,112.1,0.0,0.0,25.3,22136.0,584.0,1603.0,1754.0,3433.0
4,33.980867,-86.567371,1009,Blount County,3.8,2.197,0.009,1006.565,37.975,3.79,...,11.5,96.9,0.0,9.4,23.5,57879.0,1742.0,4028.0,4638.0,9075.0


In [12]:
ff_cancer_df["pediatric_asthma_per_100k"] = ff_cancer_df["pediatric_asthma"] / ff_cancer_df["Total Population"] * 100000
ff_cancer_df["adult_asthma_per_100k"] = ff_cancer_df["adult_asthma"] / ff_cancer_df["Total Population"] * 100000
ff_cancer_df["COPD_per_100k"] = ff_cancer_df["COPD"] / ff_cancer_df["Total Population"] * 100000
ff_cancer_df["cardio_death_per_100k"] = ff_cancer_df["cardio_death"] / ff_cancer_df["Total Population"] * 100000
ff_cancer_df["adult_chronic_lung_disease_per_100k"] = ff_cancer_df["adult_chronic_lung_disease"] / ff_cancer_df["Total Population"] * 100000
ff_cancer_df.head()

Unnamed: 0,latitude,longitude,GEOID,County_x,nameplate_capacity_MW1,NOx_tons1,SO2_tons1,CO2_tons1,CH4_lbs1,N2O_lbs1,...,Total Population,pediatric_asthma,adult_asthma,COPD,adult_chronic_lung_disease,pediatric_asthma_per_100k,adult_asthma_per_100k,COPD_per_100k,cardio_death_per_100k,adult_chronic_lung_disease_per_100k
0,32.53492,-86.642749,1001,Autauga County,939.4,50.521,3.849,762545.203,28447.358,2844.736,...,56145.0,1718.0,3906.0,4274.0,8657.0,3059.934099,6956.986375,7612.432095,470.03295,15419.004364
1,30.66097,-87.74984,1003,Baldwin County,50.0,450.864,4.58,167490.328,6318.013,631.801,...,229287.0,6393.0,16246.0,19461.0,36546.0,2788.208664,7085.443135,8487.615957,105.500966,15938.976043
2,31.869603,-85.393197,1005,Barbour County,120.5,312.818,0.59,0.0,134642.958,24655.8,...,24589.0,664.0,1760.0,2001.0,3855.0,2700.394485,7157.67213,8137.785188,1428.280939,15677.74208
3,32.998644,-87.126439,1007,Bibb County,13.0,16.113,2.219,0.011,12526.086,1644.049,...,22136.0,584.0,1603.0,1754.0,3433.0,2638.236357,7241.597398,7923.744127,1461.872064,15508.673654
4,33.980867,-86.567371,1009,Blount County,3.8,2.197,0.009,1006.565,37.975,3.79,...,57879.0,1742.0,4028.0,4638.0,9075.0,3009.727189,6959.346222,8013.269061,489.987733,15679.261908


In [26]:
for index, row in ff_cancer_df.iterrows():
    for i in np.arange(5):
        cap_index = "nameplate_capacity_MW" + str(i+1)
        old_NOx_index = "NOx_tons" + str(i+1)
        old_SO2_index = "SO2_tons" + str(i+1)
        old_CO2_index = "CO2_tons" + str(i+1)
        old_CH4_index = "CH4_lbs" + str(i+1)
        old_N2O_index = "N2O_lbs" + str(i+1)
        old_PM25_index = "PM2.5_tons" + str(i+1)
        new_NOx_index = "NOx_norm" + str(i+1)
        new_SO2_index = "SO2_norm" + str(i+1)
        new_CO2_index = "CO2_norm" + str(i+1)
        new_CH4_index = "CH4_norm" + str(i+1)
        new_N2O_index = "N2O_norm" + str(i+1)
        new_PM25_index = "PM2.5_norm" + str(i+1)
        ff_cancer_df.loc[index,new_NOx_index] = row[old_NOx_index] / row[cap_index]
        ff_cancer_df.loc[index,new_SO2_index] = row[old_SO2_index] / row[cap_index]
        ff_cancer_df.loc[index,new_CO2_index] = row[old_CO2_index] / row[cap_index]
        ff_cancer_df.loc[index,new_CH4_index] = row[old_CH4_index] / row[cap_index]
        ff_cancer_df.loc[index,new_N2O_index] = row[old_N2O_index] / row[cap_index]
        ff_cancer_df.loc[index,new_PM25_index] = row[old_PM25_index] / row[cap_index]
ff_cancer_df.head()

Unnamed: 0,latitude,longitude,GEOID,County_x,nameplate_capacity_MW1,NOx_tons1,SO2_tons1,CO2_tons1,CH4_lbs1,N2O_lbs1,...,SO2_norm4,CO2_norm4,CH4_norm4,N2O_norm4,PM2.5_norm4,SO2_norm5,CO2_norm5,CH4_norm5,N2O_norm5,PM2.5_norm5
0,32.53492,-86.642749,1001,Autauga County,939.4,50.521,3.849,762545.203,28447.358,2844.736,...,6.735791,0.0,600.173051,159.15902,0.093953,0.02429,0.0,156.466804,86.462937,0.01171
1,30.66097,-87.74984,1003,Baldwin County,50.0,450.864,4.58,167490.328,6318.013,631.801,...,0.900625,0.0,0.0,0.0,0.395226,0.008566,1697.107079,52.575239,5.257524,0.228696
2,31.869603,-85.393197,1005,Barbour County,120.5,312.818,0.59,0.0,134642.958,24655.8,...,0.00033,63.970049,2.404243,0.240427,0.004277,8.782955,622.152431,799.696591,164.486235,0.165484
3,32.998644,-87.126439,1007,Bibb County,13.0,16.113,2.219,0.011,12526.086,1644.049,...,0.02429,0.0,156.466804,86.462937,0.01171,0.564922,2597.835515,377.96268,53.46756,0.106137
4,33.980867,-86.567371,1009,Blount County,3.8,2.197,0.009,1006.565,37.975,3.79,...,0.100718,422.446337,82.916872,12.063848,0.062996,0.00142,280.552964,10.830925,1.11207,0.018389


In [30]:
# Drop any post-merge NaN values and export DataFrames to csvs
# nuc_cancer_df.dropna(inplace=True)
ff_cancer_df.dropna(inplace=True)
nuc_cancer_df.to_csv('../cleaned_data/ML_data_nuc_cancer.csv',index=False)
ff_cancer_df.to_csv('../cleaned_data/ML_data_ff_cancer.csv',index=False)