In [6]:
from google.colab import drive
import os
import pandas
import seaborn
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize, LabelEncoder, OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from torch.nn.functional import softmax
from sklearn.utils.random import sample_without_replacement
import torch
import joblib

In [30]:
print(sklearn.__version__, joblib.__version__, pandas.__version__, np.__version__)

1.2.2 1.4.0 2.0.3 1.25.2


In [3]:
drive.mount('/content/gdrive', force_remount=True)
os.environ['CONFIG_DIR'] = '/content/gdrive/MyDrive/CSE 6242 Project'

Mounted at /content/gdrive


In [4]:
raw = pandas.read_csv(os.environ["CONFIG_DIR"] + "/heart_2022_no_nans.csv")
raw.head(5)
print(raw.columns)

Index(['State', 'Sex', 'GeneralHealth', 'PhysicalHealthDays',
       'MentalHealthDays', 'LastCheckupTime', 'PhysicalActivities',
       'SleepHours', 'RemovedTeeth', 'HadHeartAttack', 'HadAngina',
       'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD',
       'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis',
       'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
       'DifficultyConcentrating', 'DifficultyWalking',
       'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus',
       'ECigaretteUsage', 'ChestScan', 'RaceEthnicityCategory', 'AgeCategory',
       'HeightInMeters', 'WeightInKilograms', 'BMI', 'AlcoholDrinkers',
       'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'TetanusLast10Tdap',
       'HighRiskLastYear', 'CovidPos'],
      dtype='object')


In [5]:
unimportant_features = ["RemovedTeeth", "DeafOrHardOfHearing", "BlindOrVisionDifficulty", "DifficultyDressingBathing", \
                        "DifficultyErrands", "DifficultyWalking"]
raw = raw.drop(unimportant_features, axis=1)
print(raw.columns)

Index(['State', 'Sex', 'GeneralHealth', 'PhysicalHealthDays',
       'MentalHealthDays', 'LastCheckupTime', 'PhysicalActivities',
       'SleepHours', 'HadHeartAttack', 'HadAngina', 'HadStroke', 'HadAsthma',
       'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder', 'HadKidneyDisease',
       'HadArthritis', 'HadDiabetes', 'DifficultyConcentrating',
       'SmokerStatus', 'ECigaretteUsage', 'ChestScan', 'RaceEthnicityCategory',
       'AgeCategory', 'HeightInMeters', 'WeightInKilograms', 'BMI',
       'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver',
       'TetanusLast10Tdap', 'HighRiskLastYear', 'CovidPos'],
      dtype='object')


In [None]:
y = raw["HadHeartAttack"]
X = raw.drop("HadHeartAttack", axis=1)
encoder = LabelEncoder()
y = encoder.fit_transform(y)

for i in X.columns:
  X[i] = encoder.fit_transform(X[i])

In [None]:
X.head(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,HadAngina,HadStroke,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,0,0,4,4,0,3,1,8,0,0,...,47,121,1191,0,0,1,1,1,0,0
1,0,1,4,0,0,3,1,5,0,0,...,65,196,1382,0,0,1,1,2,0,0
2,0,1,4,0,0,3,0,7,0,0,...,72,238,1514,1,0,0,1,0,0,2
3,0,0,1,5,0,3,1,8,0,0,...,57,181,1482,0,0,1,1,0,0,2
4,0,0,2,3,15,3,1,4,0,0,...,42,146,1635,0,0,1,1,0,0,0


In [None]:
PCA_COMPONENTS = 17

pca = PCA(n_components=PCA_COMPONENTS).fit(X)
pca_X = pca.transform(X)
print(pca_X.shape)
print(pca_X[0:5, :])

(246022, 17)
[[-4.32092805e+01  3.45796889e+01  2.73373619e+01 -3.47967304e+00
  -3.44990343e+00 -1.69497835e+00  5.27003554e-02 -1.82372101e+00
   1.83638934e+00 -1.02194930e-01 -2.88293325e-01  3.80487711e-01
  -3.34996219e-01 -2.97999686e-01 -3.19708700e-01 -2.78907250e-01
   2.68788740e-01]
 [ 1.54571121e+02 -2.29889621e+01  2.76998840e+01 -5.81088935e+00
  -8.52888153e-01 -3.59683092e+00 -7.45313702e-01 -1.63843694e+00
  -1.23065724e+00 -2.98288129e-01  6.48550792e-01  5.11334773e-01
  -4.26155085e-01 -6.69844812e-01  1.51582522e+00 -2.71786514e-01
  -6.35183288e-02]
 [ 2.90234383e+02 -5.17704559e+01  2.78958690e+01 -5.62993385e+00
  -9.68365358e-01 -4.86820557e+00  9.36365180e-01 -1.55283582e+00
   6.35883453e-01  4.05213674e-01 -9.40366835e-01 -4.24154989e-01
   1.81066913e+00  6.29991209e-02 -5.14579579e-01 -1.81742314e-01
   5.80825645e-03]
 [ 2.52465656e+02  3.68708695e+00  2.76125541e+01 -2.84832238e+00
  -4.48484979e+00 -4.70680274e+00 -7.91809236e-01  1.23378033e+00
   1.7

## WITHOUT PCA

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

In [None]:
ESTIMATORS = 32
MAX_DEPTH = 5
rf = RandomForestClassifier(n_estimators=ESTIMATORS, max_depth=MAX_DEPTH)
rf = rf.fit(X_train, y_train)

In [None]:
y_predict_train = rf.predict(X_train)
y_predict_test = rf.predict(X_test)
train_accuracy = accuracy_score(y_train, y_predict_train)
test_accuracy = accuracy_score(y_test, y_predict_test)
print("Train Accuracy: " + str(train_accuracy))
print("Test Accuracy: " + str(test_accuracy))

Train Accuracy: 0.945132537816102
Test Accuracy: 0.94610267318818


In [20]:
depths = [5, 8, 10, 12]
estimators = [32, 64, 100, 128]
rf_gs = RandomForestClassifier()

parameters = {"max_depth": depths, "n_estimators": estimators}
gs = GridSearchCV(rf_gs, parameters)
gs = gs.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
y_predict_train_gs = gs.predict(X_train)
y_predict_test_gs = gs.predict(X_test)
train_accuracy_gs = accuracy_score(y_train, y_predict_train_gs)
test_accuracy_gs = accuracy_score(y_test, y_predict_test_gs)
print("Train Accuracy: " + str(train_accuracy_gs))
print("Test Accuracy: " + str(test_accuracy_gs))

Train Accuracy: 0.9596608890050228
Test Accuracy: 0.9484195266031678


In [None]:
print(gs.best_params_)

{'max_depth': 12, 'n_estimators': 128}


In [None]:
feature_imp = rf.feature_importances_
feature_imp_sm = softmax(torch.Tensor(feature_imp), dim=0)
mapping = {}
for i, value in enumerate(X_train.columns):
  mapping[value] = feature_imp_sm[i].item()

print(mapping)

{'State': 0.02927371859550476, 'Sex': 0.029715562239289284, 'GeneralHealth': 0.02967844344675541, 'PhysicalHealthDays': 0.030194193124771118, 'MentalHealthDays': 0.029275700449943542, 'LastCheckupTime': 0.02930067479610443, 'PhysicalActivities': 0.02936900593340397, 'SleepHours': 0.029305562376976013, 'HadAngina': 0.05176640674471855, 'HadStroke': 0.031881485134363174, 'HadAsthma': 0.02925657294690609, 'HadSkinCancer': 0.02924789860844612, 'HadCOPD': 0.030507760122418404, 'HadDepressiveDisorder': 0.029234513640403748, 'HadKidneyDisease': 0.02967216819524765, 'HadArthritis': 0.02951102703809738, 'HadDiabetes': 0.029820343479514122, 'DifficultyConcentrating': 0.029269013553857803, 'SmokerStatus': 0.02957776002585888, 'ECigaretteUsage': 0.029239028692245483, 'ChestScan': 0.03136112540960312, 'RaceEthnicityCategory': 0.02924414351582527, 'AgeCategory': 0.03116634115576744, 'HeightInMeters': 0.029369773343205452, 'WeightInKilograms': 0.02927291765809059, 'BMI': 0.029271550476551056, 'Alcoho

# WITH PCA

In [None]:
X_train, X_test, y_train, y_test = train_test_split(pca_X, y, train_size=0.7)
ESTIMATORS = 32
MAX_DEPTH = 5
rf = RandomForestClassifier(n_estimators=ESTIMATORS, max_depth=MAX_DEPTH)
rf = rf.fit(X_train, y_train)
y_predict_train = rf.predict(X_train)
y_predict_test = rf.predict(X_test)
train_accuracy = accuracy_score(y_train, y_predict_train)
test_accuracy = accuracy_score(y_test, y_predict_test)
print("Train Accuracy: " + str(train_accuracy))
print("Test Accuracy: " + str(test_accuracy))

Train Accuracy: 0.9452777051940888
Test Accuracy: 0.9456555611256385


In [None]:
neg = 0
for i in range(y_train.shape[0]):
  if y_train[i] == 0:
    neg += 1

print(neg / y_train.shape[0])

for i in range(y_predict_test.shape[0]):
  if y_predict_test[i] == 1:
    print("here")

0.9452777051940888


# ~94% OF THE DATA IS 0 SO MODEL ALWAYS PREDICTS 0

We are sampling 15000 data points with 0 label and taking all of the data points with 1 label. 232632 have 0 label, 13390.

In [12]:
encoder = LabelEncoder()
standard_scaler = StandardScaler()
resampled = raw.copy()
continuous_cols = ["PhysicalHealthDays", "MentalHealthDays", "SleepHours", "HeightInMeters", "WeightInKilograms", "BMI"]
for i in raw.columns:
  if i in continuous_cols:
    resampled[i] = standard_scaler.fit_transform(raw[i].values.reshape(-1, 1))
  else:
    resampled[i] = encoder.fit_transform(raw[i])

resampled.head(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,HadHeartAttack,HadAngina,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,0,0,4,-0.01416,-0.514292,3,1,1.373428,0,0,...,-0.985904,-0.560199,-0.104105,0,0,1,1,1,0,0
1,0,1,4,-0.49002,-0.514292,3,1,-0.708924,0,0,...,0.701799,0.545644,0.22442,0,0,1,1,2,0,0
2,0,1,4,-0.49002,-0.514292,3,0,0.679311,0,0,...,1.358128,1.183918,0.4593,1,0,0,1,0,0,2
3,0,0,1,0.104805,-0.514292,3,1,1.373428,0,0,...,-0.048291,0.333198,0.407105,0,0,1,1,0,0,2
4,0,0,2,-0.133125,1.336949,3,1,-1.403041,0,0,...,-1.45471,-0.198619,0.675759,0,0,1,1,0,0,0


In [13]:
mask = resampled["HadHeartAttack"] == 1
pos = np.flatnonzero(mask)
resampled_1 = resampled.iloc[pos]
resampled_1.head(5)
print(resampled_1.shape)

(13435, 34)


In [14]:
mask = resampled["HadHeartAttack"] == 0
pos = np.flatnonzero(mask)
resampled_0 = resampled.iloc[pos]
resampled_0.head(5)
print(resampled_0.shape)
resampled_0 = resampled_0.sample(n=resampled_1.shape[0], axis=0, replace=False)
print(resampled_0.shape)

(232587, 34)
(13435, 34)


In [15]:
resampled_0.head(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,HadHeartAttack,HadAngina,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
194094,45,1,4,-0.49002,-0.26746,3,1,-0.014807,0,0,...,0.889322,-0.092162,-0.547768,1,1,1,0,0,0,0
101386,23,0,4,-0.49002,-0.514292,3,0,-0.708924,0,0,...,-2.392323,-1.475168,-0.580006,1,0,1,1,2,0,0
178361,42,0,2,-0.49002,-0.514292,3,0,-0.708924,0,0,...,-0.70462,0.205637,0.711067,0,0,0,1,0,0,2
118211,26,1,2,2.008247,-0.514292,3,0,0.679311,0,0,...,-0.048291,1.183918,1.369653,0,1,1,1,2,0,1
75280,18,1,4,-0.49002,-0.26746,1,1,-0.014807,0,0,...,-0.048291,-0.517991,-0.553908,1,1,1,1,1,0,0


In [16]:
resampled = pandas.concat([resampled_0, resampled_1])
print(resampled.shape)

(26870, 34)


In [17]:
y = resampled["HadHeartAttack"]
X = resampled.drop("HadHeartAttack", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

In [22]:
ESTIMATORS = 32
MAX_DEPTH = 5
rf = RandomForestClassifier(n_estimators=ESTIMATORS, max_depth=MAX_DEPTH)
rf = rf.fit(X_train, y_train)

In [28]:
y_predict_train_rf = rf.predict(X_train)
y_predict_test_rf = rf.predict(X_test)
train_accuracy_rf = accuracy_score(y_train, y_predict_train_rf)
test_accuracy_rf = accuracy_score(y_test, y_predict_test_rf)
print("Train Accuracy: " + str(train_accuracy_rf))
print("Test Accuracy: " + str(test_accuracy_rf))

Train Accuracy: 0.7946195970014355
Test Accuracy: 0.7958069718397222


In [23]:
depths = [5, 7, 9, 10]
estimators = [32, 64, 100, 128, 200]
rf_gs = RandomForestClassifier()

parameters = {"max_depth": depths, "n_estimators": estimators}
gs = GridSearchCV(rf_gs, parameters)
gs = gs.fit(X_train, y_train)

In [24]:
y_predict_train_gs = gs.predict(X_train)
y_predict_test_gs = gs.predict(X_test)
train_accuracy_gs = accuracy_score(y_train, y_predict_train_gs)
test_accuracy_gs = accuracy_score(y_test, y_predict_test_gs)
print("Train Accuracy: " + str(train_accuracy_gs))
print("Test Accuracy: " + str(test_accuracy_gs))

Train Accuracy: 0.8445956722845447
Test Accuracy: 0.8039945416201464


In [25]:
print(gs.best_params_)

{'max_depth': 10, 'n_estimators': 200}


In [26]:
feature_imp = rf.feature_importances_
feature_imp_sm = softmax(torch.Tensor(feature_imp), dim=0)
mapping = {}
for i, value in enumerate(X_train.columns):
  mapping[value] = feature_imp_sm[i].item()

print(mapping)

{'State': 0.02935226447880268, 'Sex': 0.029798226431012154, 'GeneralHealth': 0.03015824779868126, 'PhysicalHealthDays': 0.02994614467024803, 'MentalHealthDays': 0.029358334839344025, 'LastCheckupTime': 0.029459748417139053, 'PhysicalActivities': 0.029475022107362747, 'SleepHours': 0.02934924326837063, 'HadAngina': 0.04185376688838005, 'HadStroke': 0.031076563522219658, 'HadAsthma': 0.029333118349313736, 'HadSkinCancer': 0.02934161014854908, 'HadCOPD': 0.02989214099943638, 'HadDepressiveDisorder': 0.029336366802453995, 'HadKidneyDisease': 0.02948157861828804, 'HadArthritis': 0.03034832328557968, 'HadDiabetes': 0.03107606992125511, 'DifficultyConcentrating': 0.029363363981246948, 'SmokerStatus': 0.02999257855117321, 'ECigaretteUsage': 0.029330993071198463, 'ChestScan': 0.03441629186272621, 'RaceEthnicityCategory': 0.02932823821902275, 'AgeCategory': 0.03453536331653595, 'HeightInMeters': 0.02935885824263096, 'WeightInKilograms': 0.029417583718895912, 'BMI': 0.02941913902759552, 'AlcoholD

In [29]:
joblib.dump(gs, os.environ["CONFIG_DIR"] + "/random_forest.joblib")

['/content/gdrive/MyDrive/CSE 6242 Project/random_forest.joblib']