In [2]:
# Initial imports.
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, r2_score

In [5]:
# Load the cleaned dataset. GEOID is read as text so pandas does not parse it
# as a number.
# NOTE(review): the preview below shows GEOIDs such as 1001 (no leading zero);
# confirm whether the zero padding was already lost when the CSV was written.
nuc_cancer_df = pd.read_csv(
    '../../cleaned_data/us_clean/SW16 - ML_data_nuc_cancer.csv',
    dtype={'GEOID': str},
)
nuc_cancer_df.head()

Unnamed: 0,latitude,longitude,GEOID,County_State,closest_plant,distance,plant_capacity,County,cardio_death,total_cancer,...,pediatric_asthma,adult_asthma,COPD,adult_chronic_lung_disease,pediatric_asthma_per_100k,adult_asthma_per_100k,COPD_per_100k,cardio_death_per_100k,adult_chronic_lung_disease_per_100k,cap_over_d2
0,32.53492,-86.642749,1001,"Autauga County, Alabama",Joseph M. Farley Nuclear Plant,128.0,1776.4,"Autauga, Alabama",263.9,506.4,...,1718.0,3906.0,4274.0,8657.0,3059.934099,6956.986375,7612.432095,470.03295,15419.004364,0.108423
1,30.66097,-87.74984,1003,"Baldwin County, Alabama",Joseph M. Farley Nuclear Plant,161.0,1776.4,"Baldwin, Alabama",241.9,455.7,...,6393.0,16246.0,19461.0,36546.0,2788.208664,7085.443135,8487.615957,105.500966,15938.976043,0.068531
2,31.869603,-85.393197,1005,"Barbour County, Alabama",Joseph M. Farley Nuclear Plant,48.0,1776.4,"Barbour, Alabama",351.2,447.2,...,664.0,1760.0,2001.0,3855.0,2700.394485,7157.67213,8137.785188,1428.280939,15677.74208,0.771007
3,32.998644,-87.126439,1007,"Bibb County, Alabama",Browns Ferry Nuclear Plant,118.0,3567.5,"Bibb, Alabama",323.6,466.1,...,584.0,1603.0,1754.0,3433.0,2638.236357,7241.597398,7923.744127,1461.872064,15508.673654,0.256212
4,33.980867,-86.567371,1009,"Blount County, Alabama",Browns Ferry Nuclear Plant,59.0,3567.5,"Blount, Alabama",283.6,438.7,...,1742.0,4028.0,4638.0,9075.0,3009.727189,6959.346222,8013.269061,489.987733,15679.261908,1.024849


In [6]:
# List every column of the loaded frame; the feature and target cells below
# pick their columns from these names.
nuc_cancer_df.columns

Index(['latitude', 'longitude', 'GEOID', 'County_State', 'closest_plant',
       'distance', 'plant_capacity', 'County', 'cardio_death', 'total_cancer',
       'bladder', 'brain', 'breast', 'breast_insitu', 'cervix', 'colon',
       'esophagus', 'kidney_and_renal', 'leukemia', 'liver', 'lung',
       'melanoma', 'non-hodgkins_lymphoma', 'oral_cavity', 'ovary', 'pancreas',
       'prostate', 'stomach', 'thyroid', 'uterus', 'Total Population',
       'pediatric_asthma', 'adult_asthma', 'COPD',
       'adult_chronic_lung_disease', 'pediatric_asthma_per_100k',
       'adult_asthma_per_100k', 'COPD_per_100k', 'cardio_death_per_100k',
       'adult_chronic_lung_disease_per_100k', 'cap_over_d2'],
      dtype='object')

In [7]:
# Define the features set.
# The predictors are selected by name instead of dropping every identifier and
# target column: a long drop list has to be kept in sync with the target list
# by hand, and any column later added to the CSV would silently become a
# feature.
feature_cols = ['latitude', 'longitude', 'distance', 'plant_capacity', 'cap_over_d2']
# .copy() keeps X independent of nuc_cancer_df.
X = nuc_cancer_df[feature_cols].copy()
X.head()

Unnamed: 0,latitude,longitude,distance,plant_capacity,cap_over_d2
0,32.53492,-86.642749,128.0,1776.4,0.108423
1,30.66097,-87.74984,161.0,1776.4,0.068531
2,31.869603,-85.393197,48.0,1776.4,0.771007
3,32.998644,-87.126439,118.0,3567.5,0.256212
4,33.980867,-86.567371,59.0,3567.5,1.024849


In [9]:
# Define the target set.
# The order of this list fixes the column order of y, and therefore the order
# in which the per-target models are fitted and reported further down.
tar_list = [
    'cardio_death', 'total_cancer',
    # individual cancer columns
    'bladder', 'brain', 'breast', 'breast_insitu', 'cervix', 'colon',
    'esophagus', 'kidney_and_renal', 'leukemia', 'liver', 'lung', 'melanoma',
    'non-hodgkins_lymphoma', 'oral_cavity', 'ovary', 'pancreas', 'prostate',
    'stomach', 'thyroid', 'uterus',
    # respiratory columns
    'pediatric_asthma', 'adult_asthma', 'COPD', 'adult_chronic_lung_disease',
    # per-100k columns
    'pediatric_asthma_per_100k', 'adult_asthma_per_100k', 'COPD_per_100k',
    'adult_chronic_lung_disease_per_100k',
    # NOTE(review): 'Total Population' is modelled as a target here, not used
    # as a feature -- confirm that this is intended.
    'Total Population', 'cardio_death_per_100k',
]
y = nuc_cancer_df[tar_list]
y

Unnamed: 0,cardio_death,total_cancer,bladder,brain,breast,breast_insitu,cervix,colon,esophagus,kidney_and_renal,...,pediatric_asthma,adult_asthma,COPD,adult_chronic_lung_disease,pediatric_asthma_per_100k,adult_asthma_per_100k,COPD_per_100k,adult_chronic_lung_disease_per_100k,Total Population,cardio_death_per_100k
0,263.9,506.4,15.8,7.0,124.4,23.9,0.0,45.8,0.0,10.6,...,1718.0,3906.0,4274.0,8657.0,3059.934099,6956.986375,7612.432095,15419.004364,56145.0,470.032950
1,241.9,455.7,23.1,6.5,124.7,25.5,11.0,33.3,0.0,11.6,...,6393.0,16246.0,19461.0,36546.0,2788.208664,7085.443135,8487.615957,15938.976043,229287.0,105.500966
2,351.2,447.2,13.3,0.0,109.5,22.6,0.0,41.9,0.0,0.0,...,664.0,1760.0,2001.0,3855.0,2700.394485,7157.672130,8137.785188,15677.742080,24589.0,1428.280939
3,323.6,466.1,19.8,0.0,113.9,0.0,0.0,26.4,0.0,0.0,...,584.0,1603.0,1754.0,3433.0,2638.236357,7241.597398,7923.744127,15508.673654,22136.0,1461.872064
4,283.6,438.7,17.4,6.7,113.6,21.6,0.0,34.4,0.0,11.2,...,1742.0,4028.0,4638.0,9075.0,3009.727189,6959.346222,8013.269061,15679.261908,57879.0,489.987733
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2637,204.1,427.8,23.0,8.2,86.4,14.9,0.0,25.7,0.0,15.1,...,932.0,3218.0,1824.0,5399.0,2184.050805,7541.068123,4274.365524,12652.028215,42673.0,478.288379
2638,118.9,410.8,18.7,0.0,155.2,32.7,0.0,37.9,0.0,0.0,...,357.0,1945.0,1149.0,3094.0,1519.342895,8277.652466,4889.985956,13167.638422,23497.0,506.022045
2639,221.3,353.7,20.6,0.0,112.2,0.0,0.0,43.3,0.0,0.0,...,482.0,1460.0,902.0,2565.0,2384.368044,7222.359634,4462.033144,12688.597576,20215.0,1094.731635
2640,207.3,351.0,0.0,0.0,101.7,0.0,0.0,0.0,0.0,0.0,...,147.0,591.0,433.0,1043.0,1894.329897,7615.979381,5579.896907,13440.721649,7760.0,2671.391753


In [10]:
# Split features and targets into train and test partitions. The fixed
# random_state makes the shuffle reproducible; the split proportions are
# train_test_split's defaults.
split_parts = train_test_split(X, y, random_state=78)
X_train, X_test, y_train, y_test = split_parts

In [11]:
# Standardise the features. The scaler is fitted on the training rows only and
# the same fitted transform is then applied to both partitions, so no
# statistics from the test rows enter the scaling.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

### Random Forest Model

In [14]:
# Fit one random forest regressor per target column and record how well it
# scores on the rows it was trained on versus the held-out rows.
#
# RF      : target name -> fitted RandomForestRegressor
# eval_RF : target name -> [R2 on the training split, R2 on the test split]
#
# Every forest uses the same hyper-parameters and seed, so a re-run reproduces
# the same numbers. The recorded output shows training R2 far above test R2 for
# the targets listed, i.e. these forests overfit the training rows.
RF = {}
eval_RF = {}
for target in y_train.columns:
    forest = RandomForestRegressor(n_estimators=1024, random_state=78)
    forest.fit(X_train_scaled, y_train[target])
    pred_train = forest.predict(X_train_scaled)
    pred = forest.predict(X_test_scaled)
    RF[target] = forest
    eval_RF[target] = [
        r2_score(y_train[target], pred_train),
        r2_score(y_test[target], pred),
    ]
eval_RF

{'cardio_death': [0.9488556384443473, 0.6491803279874103],
 'total_cancer': [0.959113213314835, 0.6641219488281753],
 'bladder': [0.9164968316180541, 0.3735707251305612],
 'brain': [0.9009807780284089, 0.1970302231226274],
 'breast': [0.9131570704125099, 0.3137212089683985],
 'breast_insitu': [0.9182811101064937, 0.3331402441683714],
 'cervix': [0.8821483695746061, 0.060237968322484625],
 'colon': [0.8964012216012133, 0.26468479012509516],
 'esophagus': [0.8873126096705646, 0.06702600824673521],
 'kidney_and_renal': [0.8869799536645866, 0.19986204039565214],
 'leukemia': [0.895531412776757, 0.13041715777379237],
 'liver': [0.896672985794133, 0.10350532128998358],
 'lung': [0.9311077556958605, 0.5276775162771157],
 'melanoma': [0.9050318977653057, 0.24777330505900597],
 'non-hodgkins_lymphoma': [0.9048463441448599, 0.2080026335495121],
 'oral_cavity': [0.8912052383640356, 0.12041864259519885],
 'ovary': [0.9021034865225097, 0.18585084365830962],
 'pancreas': [0.8990306339484537, 0.18473

In [15]:
# Tabulate the per-target scores: one column per target, with row 0 holding
# the first entry of each eval_RF list (train R2) and row 1 the second (test R2).
df = pd.DataFrame(eval_RF)

In [17]:
# Label the two score rows; the list must have exactly one entry per row of df.
r2_row_labels = ["R2 train", "R2 test"]
df["R2"] = r2_row_labels

In [33]:
# Move the last column of df (the "R2" label column when the preceding cell
# has been run) to the front, then write the table without the integer index.
ordered_cols = df.columns.to_list()
ordered_cols = ordered_cols[-1:] + ordered_cols[:-1]
df2 = df[ordered_cols]
df2.to_csv("R2_RF_nuclear.csv", index=False)

In [9]:
# Fitting the model
# A single multi-output forest trained on all target columns at once, as
# opposed to the per-target forests stored in RF. The estimator is constructed
# in this cell so that it runs on a fresh kernel (Restart & Run All) without
# relying on a variable left over from an earlier session.
rf_model = RandomForestRegressor(n_estimators=1024, random_state=78)
rf_model = rf_model.fit(X_train_scaled, y_train)

In [10]:
# Making predictions using the testing data.
# rf_model was fitted on every column of y_train, so it predicts all target
# columns for each test row.
# NOTE: the name `predictions` is reassigned by the Lasso cell further down;
# after that cell runs it no longer refers to the random forest output.
predictions = rf_model.predict(X_test_scaled)

In [11]:
# Report the multi-output forest's score on the training split, then on the
# test split (one number each, printed in that order).
rf_train_score = rf_model.score(X_train_scaled, y_train)
rf_test_score = rf_model.score(X_test_scaled, y_test)
print(rf_train_score)
print(rf_test_score)

0.910458716647331
0.31177163034647315


### Lasso Model

In [12]:
# Fit an L1-regularised linear model (alpha=0.001) on the scaled training
# features against all target columns. fit() returns the estimator itself, so
# construction and fitting are chained.
lasso = Lasso(alpha=0.001).fit(X_train_scaled, y_train)

In [13]:
# Making predictions using the testing data.
# NOTE: this rebinds `predictions`, which the random forest cell above also
# assigns; from here on it holds the Lasso output.
predictions = lasso.predict(X_test_scaled)

In [14]:
# Report the Lasso model's score on the training split, then on the test
# split (one number each, printed in that order).
lasso_train_score = lasso.score(X_train_scaled, y_train)
lasso_test_score = lasso.score(X_test_scaled, y_test)
print(lasso_train_score)
print(lasso_test_score)

0.10889685990720466
0.13140374413264233
