In [1]:
# Initial imports.
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load the cleaned dataset. GEOID is forced to a string dtype so pandas does
# not parse it as a number.
csv_path = '../cleaned_data/ML_data_nuc_cancer.csv'
column_dtypes = {'GEOID': str}
nuc_cancer_df = pd.read_csv(csv_path, dtype=column_dtypes)
nuc_cancer_df.head()

Unnamed: 0,latitude,longitude,GEOID,County_State,closest_plant,distance,plant_capacity,County,cardio_death,total_cancer,...,pediatric_asthma,adult_asthma,COPD,adult_chronic_lung_disease,pediatric_asthma_per_100k,adult_asthma_per_100k,COPD_per_100k,cardio_death_per_100k,adult_chronic_lung_disease_per_100k,cap_over_d2
0,32.53492,-86.642749,1001,"Autauga County, Alabama",Joseph M. Farley Nuclear Plant,128.0,1776.4,"Autauga, Alabama",263.9,506.4,...,1718.0,3906.0,4274.0,8657.0,3059.934099,6956.986375,7612.432095,470.03295,15419.004364,0.108423
1,30.66097,-87.74984,1003,"Baldwin County, Alabama",Joseph M. Farley Nuclear Plant,161.0,1776.4,"Baldwin, Alabama",241.9,455.7,...,6393.0,16246.0,19461.0,36546.0,2788.208664,7085.443135,8487.615957,105.500966,15938.976043,0.068531
2,31.869603,-85.393197,1005,"Barbour County, Alabama",Joseph M. Farley Nuclear Plant,48.0,1776.4,"Barbour, Alabama",351.2,447.2,...,664.0,1760.0,2001.0,3855.0,2700.394485,7157.67213,8137.785188,1428.280939,15677.74208,0.771007
3,32.998644,-87.126439,1007,"Bibb County, Alabama",Browns Ferry Nuclear Plant,118.0,3567.5,"Bibb, Alabama",323.6,466.1,...,584.0,1603.0,1754.0,3433.0,2638.236357,7241.597398,7923.744127,1461.872064,15508.673654,0.256212
4,33.980867,-86.567371,1009,"Blount County, Alabama",Browns Ferry Nuclear Plant,59.0,3567.5,"Blount, Alabama",283.6,438.7,...,1742.0,4028.0,4638.0,9075.0,3009.727189,6959.346222,8013.269061,489.987733,15679.261908,1.024849


In [3]:
# Show every column name in the loaded frame so the feature and target
# selections can be checked against what is actually in the file.
nuc_cancer_df.columns

Index(['latitude', 'longitude', 'GEOID', 'County_State', 'closest_plant',
       'distance', 'plant_capacity', 'County', 'cardio_death', 'total_cancer',
       'bladder', 'brain', 'breast', 'breast_insitu', 'cervix', 'colon',
       'esophagus', 'kidney_and_renal', 'leukemia', 'liver', 'lung',
       'melanoma', 'non-hodgkins_lymphoma', 'oral_cavity', 'ovary', 'pancreas',
       'prostate', 'stomach', 'thyroid', 'uterus', 'Total Population',
       'pediatric_asthma', 'adult_asthma', 'COPD',
       'adult_chronic_lung_disease', 'pediatric_asthma_per_100k',
       'adult_asthma_per_100k', 'COPD_per_100k', 'cardio_death_per_100k',
       'adult_chronic_lung_disease_per_100k', 'cap_over_d2'],
      dtype='object')

In [4]:
# Define the features set.
# The predictors are selected by name rather than by dropping every other
# column: a column added to the CSV later (for example another outcome
# measure) can then never slip into X unnoticed, and a missing predictor
# fails immediately with a KeyError instead of silently changing the model.
feature_columns = ['latitude', 'longitude', 'distance', 'plant_capacity', 'cap_over_d2']
# .copy() keeps X independent of nuc_cancer_df, as the original cell did.
X = nuc_cancer_df[feature_columns].copy()
X.head()

Unnamed: 0,latitude,longitude,distance,plant_capacity,cap_over_d2
0,32.53492,-86.642749,128.0,1776.4,0.108423
1,30.66097,-87.74984,161.0,1776.4,0.068531
2,31.869603,-85.393197,48.0,1776.4,0.771007
3,32.998644,-87.126439,118.0,3567.5,0.256212
4,33.980867,-86.567371,59.0,3567.5,1.024849


In [5]:
# Define the target set: the single column the models will try to predict.
# NOTE(review): the recorded output for this cell ends with several rows equal
# to 0.0 -- confirm whether these are genuine zeros or placeholders for
# missing values before trusting the scores below.
target_column = "uterus"
y = nuc_cancer_df[target_column]
y

0       25.5
1       17.6
2       22.7
3       25.3
4       23.5
        ... 
2637    16.6
2638     0.0
2639     0.0
2640     0.0
2641     0.0
Name: uterus, Length: 2642, dtype: float64

In [6]:
# Splitting into Train and Test sets.
# random_state pins the shuffle so the same rows land in each set on every
# run; no test_size is passed, so scikit-learn's default proportion applies.
split_parts = train_test_split(X, y, random_state=78)
X_train, X_test, y_train, y_test = split_parts

In [7]:
# Build the scaler and fit it on the training rows only, so the test rows
# have no influence on the scaling parameters.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the same fitted transformation to both partitions.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

### Random Forest Model

In [8]:
# Configure the random forest regressor: number of trees in the ensemble and
# a fixed seed so repeated runs build the same forest.
rf_params = {"n_estimators": 1024, "random_state": 78}
rf_model = RandomForestRegressor(**rf_params)

In [9]:
# Train the forest on the scaled training features.
# The result is assigned back so the cell displays no estimator repr.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [10]:
# Predict the target for the held-out test rows with the fitted forest.
# NOTE(review): `predictions` is assigned again in the Lasso section further
# down, so these random-forest predictions are lost once that cell runs.
predictions = rf_model.predict(X=X_test_scaled)

In [11]:
# Score the forest on the rows it was trained on, then on the held-out rows.
# NOTE(review): the recorded run shows a large gap between the two numbers
# (about 0.91 vs 0.31), which suggests the forest is overfitting.
rf_train_score = rf_model.score(X_train_scaled, y_train)
rf_test_score = rf_model.score(X_test_scaled, y_test)
print(rf_train_score)
print(rf_test_score)

0.910458716647331
0.31177163034647315


### Lasso Model

In [12]:
# Build the Lasso regressor with a small regularisation strength and train it
# on the scaled training features in a single chained expression.
lasso = Lasso(alpha=0.001).fit(X_train_scaled, y_train)

In [13]:
# Predict the target for the held-out test rows with the fitted Lasso model.
# NOTE(review): this reuses the name `predictions` from the random forest
# section, replacing whatever that cell stored.
predictions = lasso.predict(X=X_test_scaled)

In [14]:
# Score the Lasso model on the training rows, then on the held-out rows.
lasso_train_score = lasso.score(X_train_scaled, y_train)
lasso_test_score = lasso.score(X_test_scaled, y_test)
print(lasso_train_score)
print(lasso_test_score)

0.10889685990720466
0.13140374413264233
