In [1]:
# Initial imports.
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load the cleaned county-level dataset. GEOID is read as a string so pandas
# does not coerce the identifier to a numeric dtype.
ff_cancer_df = pd.read_csv(
    '../cleaned_data/ML_data_ff_cancer_w_avgs.csv',
    dtype={'GEOID': str},
)
ff_cancer_df.head()

Unnamed: 0,latitude,longitude,GEOID,County_x,nameplate_capacity_MW1,NOx_tons1,SO2_tons1,CO2_tons1,CH4_lbs1,N2O_lbs1,...,avg_CO2_over_d2,avg_CH4_over_d2,avg_N2O_over_d2,avg_PM2.5_over_d2,avg_NOx_norm_over_d2,avg_SO2_norm_over_d2,avg_CO2_norm_over_d2,avg_CH4_norm_over_d2,avg_N2O_norm_over_d2,avg_PM2.5_norm_over_d2
0,32.53492,-86.642749,1001,Autauga County,939.4,50.521,3.849,762545.203,28447.358,2844.736,...,9090.126982,405.357721,57.630938,0.40477,0.005512,0.008032,7.723091,1.124036,0.292679,0.000477
1,30.66097,-87.74984,1003,Baldwin County,50.0,450.864,4.58,167490.328,6318.013,631.801,...,555.172427,19.5538,1.95538,0.043945,0.005227,0.000431,3.4755,0.126674,0.012667,0.000382
2,31.869603,-85.393197,1005,Barbour County,120.5,312.818,0.59,0.0,134642.958,24655.8,...,5.282991,35.93527,6.714078,0.001799,0.001069,0.000666,0.060757,0.308435,0.057802,3.1e-05
3,32.998644,-87.126439,1007,Bibb County,13.0,16.113,2.219,0.011,12526.086,1644.049,...,1157.221806,111.274622,15.659547,0.051646,0.000896,0.000105,0.894578,0.282251,0.044297,4.2e-05
4,33.980867,-86.567371,1009,Blount County,3.8,2.197,0.009,1006.565,37.975,3.79,...,3172.217969,659.893056,95.881832,0.032135,0.000698,6.5e-05,1.383653,0.245878,0.035385,3.3e-05


In [3]:
# Show every column name on one line (the default DataFrame repr elides the middle ones).
print(list(ff_cancer_df.columns))

['latitude', 'longitude', 'GEOID', 'County_x', 'nameplate_capacity_MW1', 'NOx_tons1', 'SO2_tons1', 'CO2_tons1', 'CH4_lbs1', 'N2O_lbs1', 'PM2.5_tons1', 'dist_from_county1', 'nameplate_capacity_MW2', 'NOx_tons2', 'SO2_tons2', 'CO2_tons2', 'CH4_lbs2', 'N2O_lbs2', 'PM2.5_tons2', 'dist_from_county2', 'nameplate_capacity_MW3', 'NOx_tons3', 'SO2_tons3', 'CO2_tons3', 'CH4_lbs3', 'N2O_lbs3', 'PM2.5_tons3', 'dist_from_county3', 'nameplate_capacity_MW4', 'NOx_tons4', 'SO2_tons4', 'CO2_tons4', 'CH4_lbs4', 'N2O_lbs4', 'PM2.5_tons4', 'dist_from_county4', 'nameplate_capacity_MW5', 'NOx_tons5', 'SO2_tons5', 'CO2_tons5', 'CH4_lbs5', 'N2O_lbs5', 'PM2.5_tons5', 'dist_from_county5', 'fuel_type1_Biomass', 'fuel_type1_Coal', 'fuel_type1_Gas', 'fuel_type1_Oil', 'fuel_type1_Other Fossil', 'fuel_type2_Biomass', 'fuel_type2_Coal', 'fuel_type2_Gas', 'fuel_type2_Oil', 'fuel_type2_Other Fossil', 'fuel_type3_Biomass', 'fuel_type3_Coal', 'fuel_type3_Gas', 'fuel_type3_Oil', 'fuel_type3_Other Fossil', 'fuel_type4_Biom

In [4]:
# Define the features set.
# Columns that must not be used as model inputs: the GEOID/county identifiers
# and the disease/outcome columns, including 'uterus', which is the prediction
# target. Each label appears exactly once (the original list repeated
# 'adult_chronic_lung_disease').
non_feature_cols = [
    'GEOID', 'County_x', 'County_y', 'cardio_death',
    'total_cancer', 'bladder', 'brain',
    'breast', 'breast_insitu', 'cervix', 'colon', 'esophagus',
    'kidney_and_renal', 'leukemia', 'liver', 'lung', 'melanoma',
    'non-hodgkins_lymphoma', 'oral_cavity', 'ovary', 'pancreas', 'prostate',
    'pediatric_asthma', 'adult_asthma', 'COPD', 'adult_chronic_lung_disease',
    'stomach', 'thyroid', 'uterus', 'pediatric_asthma_per_100k',
    'adult_asthma_per_100k', 'COPD_per_100k',
    'adult_chronic_lung_disease_per_100k', 'cardio_death_per_100k',
]

# DataFrame.drop returns a new frame, so ff_cancer_df itself is left unchanged
# and no explicit .copy() is needed beforehand.
X = ff_cancer_df.drop(columns=non_feature_cols)
X.head()

Unnamed: 0,latitude,longitude,nameplate_capacity_MW1,NOx_tons1,SO2_tons1,CO2_tons1,CH4_lbs1,N2O_lbs1,PM2.5_tons1,dist_from_county1,...,avg_CO2_over_d2,avg_CH4_over_d2,avg_N2O_over_d2,avg_PM2.5_over_d2,avg_NOx_norm_over_d2,avg_SO2_norm_over_d2,avg_CO2_norm_over_d2,avg_CH4_norm_over_d2,avg_N2O_norm_over_d2,avg_PM2.5_norm_over_d2
0,32.53492,-86.642749,939.4,50.521,3.849,762545.203,28447.358,2844.736,36.930022,10.0,...,9090.126982,405.357721,57.630938,0.40477,0.005512,0.008032,7.723091,1.124036,0.292679,0.000477
1,30.66097,-87.74984,50.0,450.864,4.58,167490.328,6318.013,631.801,7.857154,19.0,...,555.172427,19.5538,1.95538,0.043945,0.005227,0.000431,3.4755,0.126674,0.012667,0.000382
2,31.869603,-85.393197,120.5,312.818,0.59,0.0,134642.958,24655.8,1.386006,30.0,...,5.282991,35.93527,6.714078,0.001799,0.001069,0.000666,0.060757,0.308435,0.057802,3.1e-05
3,32.998644,-87.126439,13.0,16.113,2.219,0.011,12526.086,1644.049,0.0,31.0,...,1157.221806,111.274622,15.659547,0.051646,0.000896,0.000105,0.894578,0.282251,0.044297,4.2e-05
4,33.980867,-86.567371,3.8,2.197,0.009,1006.565,37.975,3.79,0.154237,30.0,...,3172.217969,659.893056,95.881832,0.032135,0.000698,6.5e-05,1.383653,0.245878,0.035385,3.3e-05


In [5]:
# Define the target set: the 'uterus' column of the full dataset.
# NOTE(review): the displayed tail contains 0.0 values; confirm whether these
# are genuine zeros or placeholders for missing data.
y = ff_cancer_df.loc[:, "uterus"]
y

0       25.5
1       17.6
2       22.7
3       25.3
4       23.5
        ... 
2584    16.6
2585     0.0
2586     0.0
2587     0.0
2588     0.0
Name: uterus, Length: 2589, dtype: float64

In [6]:
# Hold out a test set. No test_size is passed, so scikit-learn's default
# split proportion applies; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [7]:
# Standardise the features. The scaler is fitted on the training split only,
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)  # fit() hands back the fitted scaler

# Apply the fitted scaler to the training split first, then the test split.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

### Random Forest Model

In [8]:
# Random forest regressor with 1024 trees and a fixed seed.
rf_model = RandomForestRegressor(
    n_estimators=1024,
    random_state=78,
)

In [9]:
# Train the forest on the scaled training features. The result is assigned
# back to rf_model, which also keeps the cell from echoing the estimator.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [10]:
# Random-forest predictions for the held-out (scaled) test features.
predictions = rf_model.predict(X=X_test_scaled)

In [11]:
# Regressor .score() values: training split on the first line, test split on the second.
print(
    rf_model.score(X_train_scaled, y_train),
    rf_model.score(X_test_scaled, y_test),
    sep="\n",
)

0.950391802731808
0.6636867780516436


### Lasso Model

In [12]:
# Build a Lasso regressor (alpha=0.02) and fit it on the scaled
# training data in a single chained expression.
lasso = Lasso(alpha=0.02).fit(X_train_scaled, y_train)

In [13]:
# Lasso predictions for the held-out (scaled) test features.
# NOTE: this rebinds `predictions`, so the random-forest predictions stored
# under the same name by the earlier cell are no longer reachable after this.
predictions = lasso.predict(X=X_test_scaled)

In [14]:
# Regressor .score() values: training split on the first line, test split on the second.
print(
    lasso.score(X_train_scaled, y_train),
    lasso.score(X_test_scaled, y_test),
    sep="\n",
)

0.28247675143780826
0.18828243053977023
