In [56]:
from google.colab import drive
import os
import pandas
import seaborn
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from torch.nn.functional import softmax
from sklearn.utils.random import sample_without_replacement
import torch
import joblib

In [57]:
print(sklearn.__version__, joblib.__version__, pandas.__version__)

1.2.2 1.4.0 2.0.3


In [58]:
drive.mount('/content/gdrive', force_remount=True)
os.environ['CONFIG_DIR'] = '/content/gdrive/MyDrive/CSE 6242 Project'

Mounted at /content/gdrive


In [59]:
raw = pandas.read_csv(os.environ["CONFIG_DIR"] + "/heart_2022_no_nans.csv")
raw.head(5)
print(raw.columns)

Index(['State', 'Sex', 'GeneralHealth', 'PhysicalHealthDays',
       'MentalHealthDays', 'LastCheckupTime', 'PhysicalActivities',
       'SleepHours', 'RemovedTeeth', 'HadHeartAttack', 'HadAngina',
       'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD',
       'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis',
       'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
       'DifficultyConcentrating', 'DifficultyWalking',
       'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus',
       'ECigaretteUsage', 'ChestScan', 'RaceEthnicityCategory', 'AgeCategory',
       'HeightInMeters', 'WeightInKilograms', 'BMI', 'AlcoholDrinkers',
       'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'TetanusLast10Tdap',
       'HighRiskLastYear', 'CovidPos'],
      dtype='object')


In [60]:
unimportant_features = ["RemovedTeeth", "DeafOrHardOfHearing", "BlindOrVisionDifficulty", "DifficultyDressingBathing", \
                        "DifficultyErrands", "DifficultyWalking"]
raw = raw.drop(unimportant_features, axis=1)
print(raw.columns)

Index(['State', 'Sex', 'GeneralHealth', 'PhysicalHealthDays',
       'MentalHealthDays', 'LastCheckupTime', 'PhysicalActivities',
       'SleepHours', 'HadHeartAttack', 'HadAngina', 'HadStroke', 'HadAsthma',
       'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder', 'HadKidneyDisease',
       'HadArthritis', 'HadDiabetes', 'DifficultyConcentrating',
       'SmokerStatus', 'ECigaretteUsage', 'ChestScan', 'RaceEthnicityCategory',
       'AgeCategory', 'HeightInMeters', 'WeightInKilograms', 'BMI',
       'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver',
       'TetanusLast10Tdap', 'HighRiskLastYear', 'CovidPos'],
      dtype='object')


In [61]:
y = raw["HadHeartAttack"]
X = raw.drop("HadHeartAttack", axis=1)
encoder = LabelEncoder()
y = encoder.fit_transform(y)

for i in X.columns:
  X[i] = encoder.fit_transform(X[i])

In [62]:
X.head(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,HadAngina,HadStroke,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,0,0,4,4,0,3,1,8,0,0,...,47,121,1191,0,0,1,1,1,0,0
1,0,1,4,0,0,3,1,5,0,0,...,65,196,1382,0,0,1,1,2,0,0
2,0,1,4,0,0,3,0,7,0,0,...,72,238,1514,1,0,0,1,0,0,2
3,0,0,1,5,0,3,1,8,0,0,...,57,181,1482,0,0,1,1,0,0,2
4,0,0,2,3,15,3,1,4,0,0,...,42,146,1635,0,0,1,1,0,0,0


In [63]:
PCA_COMPONENTS = 17

pca = PCA(n_components=PCA_COMPONENTS).fit(X)
pca_X = pca.transform(X)
print(pca_X.shape)
print(pca_X[0:5, :])

(246022, 17)
[[-4.32092805e+01  3.45796889e+01  2.73373619e+01 -3.47967304e+00
  -3.44990343e+00 -1.69497835e+00  5.27003554e-02 -1.82372101e+00
   1.83638934e+00 -1.02194930e-01 -2.88293325e-01  3.80487711e-01
  -3.34996219e-01 -2.97999686e-01 -3.19708700e-01 -2.78907250e-01
   2.68788740e-01]
 [ 1.54571121e+02 -2.29889621e+01  2.76998840e+01 -5.81088935e+00
  -8.52888153e-01 -3.59683092e+00 -7.45313702e-01 -1.63843694e+00
  -1.23065724e+00 -2.98288129e-01  6.48550792e-01  5.11334773e-01
  -4.26155085e-01 -6.69844812e-01  1.51582522e+00 -2.71786514e-01
  -6.35183288e-02]
 [ 2.90234383e+02 -5.17704559e+01  2.78958690e+01 -5.62993385e+00
  -9.68365358e-01 -4.86820557e+00  9.36365180e-01 -1.55283582e+00
   6.35883453e-01  4.05213674e-01 -9.40366835e-01 -4.24154989e-01
   1.81066913e+00  6.29991209e-02 -5.14579579e-01 -1.81742314e-01
   5.80825645e-03]
 [ 2.52465656e+02  3.68708695e+00  2.76125541e+01 -2.84832238e+00
  -4.48484979e+00 -4.70680274e+00 -7.91809236e-01  1.23378033e+00
   1.7

## WITHOUT PCA

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

In [65]:
ESTIMATORS = 32
MAX_DEPTH = 5
rf = RandomForestClassifier(n_estimators=ESTIMATORS, max_depth=MAX_DEPTH)
rf = rf.fit(X_train, y_train)

In [66]:
y_predict_train = rf.predict(X_train)
y_predict_test = rf.predict(X_test)
train_accuracy = accuracy_score(y_train, y_predict_train)
test_accuracy = accuracy_score(y_test, y_predict_test)
print("Train Accuracy: " + str(train_accuracy))
print("Test Accuracy: " + str(test_accuracy))

Train Accuracy: 0.945132537816102
Test Accuracy: 0.94610267318818


In [67]:
depths = [5, 8, 10, 12]
estimators = [32, 64, 100, 128]
rf_gs = RandomForestClassifier()

parameters = {"max_depth": depths, "n_estimators": estimators}
gs = GridSearchCV(rf_gs, parameters)
gs = gs.fit(X_train, y_train)

In [68]:
y_predict_train_gs = gs.predict(X_train)
y_predict_test_gs = gs.predict(X_test)
train_accuracy_gs = accuracy_score(y_train, y_predict_train_gs)
test_accuracy_gs = accuracy_score(y_test, y_predict_test_gs)
print("Train Accuracy: " + str(train_accuracy_gs))
print("Test Accuracy: " + str(test_accuracy_gs))

Train Accuracy: 0.9596608890050228
Test Accuracy: 0.9484195266031678


In [69]:
print(gs.best_params_)

{'max_depth': 12, 'n_estimators': 128}


In [70]:
feature_imp = rf.feature_importances_
feature_imp_sm = softmax(torch.Tensor(feature_imp), dim=0)
mapping = {}
for i, value in enumerate(X_train.columns):
  mapping[value] = feature_imp_sm[i].item()

print(mapping)

{'State': 0.02927371859550476, 'Sex': 0.029715562239289284, 'GeneralHealth': 0.02967844344675541, 'PhysicalHealthDays': 0.030194193124771118, 'MentalHealthDays': 0.029275700449943542, 'LastCheckupTime': 0.02930067479610443, 'PhysicalActivities': 0.02936900593340397, 'SleepHours': 0.029305562376976013, 'HadAngina': 0.05176640674471855, 'HadStroke': 0.031881485134363174, 'HadAsthma': 0.02925657294690609, 'HadSkinCancer': 0.02924789860844612, 'HadCOPD': 0.030507760122418404, 'HadDepressiveDisorder': 0.029234513640403748, 'HadKidneyDisease': 0.02967216819524765, 'HadArthritis': 0.02951102703809738, 'HadDiabetes': 0.029820343479514122, 'DifficultyConcentrating': 0.029269013553857803, 'SmokerStatus': 0.02957776002585888, 'ECigaretteUsage': 0.029239028692245483, 'ChestScan': 0.03136112540960312, 'RaceEthnicityCategory': 0.02924414351582527, 'AgeCategory': 0.03116634115576744, 'HeightInMeters': 0.029369773343205452, 'WeightInKilograms': 0.02927291765809059, 'BMI': 0.029271550476551056, 'Alcoho

# WITH PCA

In [71]:
X_train, X_test, y_train, y_test = train_test_split(pca_X, y, train_size=0.7)
ESTIMATORS = 32
MAX_DEPTH = 5
rf = RandomForestClassifier(n_estimators=ESTIMATORS, max_depth=MAX_DEPTH)
rf = rf.fit(X_train, y_train)
y_predict_train = rf.predict(X_train)
y_predict_test = rf.predict(X_test)
train_accuracy = accuracy_score(y_train, y_predict_train)
test_accuracy = accuracy_score(y_test, y_predict_test)
print("Train Accuracy: " + str(train_accuracy))
print("Test Accuracy: " + str(test_accuracy))

Train Accuracy: 0.9452777051940888
Test Accuracy: 0.9456555611256385


In [72]:
neg = 0
for i in range(y_train.shape[0]):
  if y_train[i] == 0:
    neg += 1

print(neg / y_train.shape[0])

for i in range(y_predict_test.shape[0]):
  if y_predict_test[i] == 1:
    print("here")

0.9452777051940888


# ~94% OF THE DATA IS 0 SO MODEL ALWAYS PREDICTS 0

We are sampling 15000 data points with 0 label and taking all of the data points with 1 label. 232632 have 0 label, 13390.

In [73]:
encoder = LabelEncoder()
resampled = raw.copy()

for i in raw.columns:
  resampled[i] = encoder.fit_transform(raw[i])

resampled.head(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,HadHeartAttack,HadAngina,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,0,0,4,4,0,3,1,8,0,0,...,47,121,1191,0,0,1,1,1,0,0
1,0,1,4,0,0,3,1,5,0,0,...,65,196,1382,0,0,1,1,2,0,0
2,0,1,4,0,0,3,0,7,0,0,...,72,238,1514,1,0,0,1,0,0,2
3,0,0,1,5,0,3,1,8,0,0,...,57,181,1482,0,0,1,1,0,0,2
4,0,0,2,3,15,3,1,4,0,0,...,42,146,1635,0,0,1,1,0,0,0


In [74]:
mask = resampled["HadHeartAttack"] == 1
pos = np.flatnonzero(mask)
resampled_1 = resampled.iloc[pos]
resampled_1.head(5)
print(resampled_1.shape)

(13435, 34)


In [75]:
mask = resampled["HadHeartAttack"] == 0
pos = np.flatnonzero(mask)
resampled_0 = resampled.iloc[pos]
resampled_0.head(5)
print(resampled_0.shape)
resampled_0 = resampled_0.sample(n=resampled_1.shape[0], axis=0, replace=False)
print(resampled_0.shape)

(232587, 34)
(13435, 34)


In [76]:
resampled_0.head(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,HadHeartAttack,HadAngina,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
3948,1,1,2,0,0,2,1,5,0,0,...,70,265,1819,1,0,0,0,1,0,2
107795,24,0,4,0,0,1,1,7,0,0,...,60,153,1134,1,0,1,0,2,0,2
103139,23,1,2,0,0,3,1,5,0,0,...,80,302,1813,1,0,0,0,1,0,0
60589,15,0,1,6,2,3,1,5,0,1,...,39,172,2005,0,0,1,1,1,0,2
1151,0,0,1,3,12,3,1,4,0,0,...,50,124,1142,0,0,1,1,2,0,2


In [77]:
resampled = pandas.concat([resampled_0, resampled_1])
print(resampled.shape)

(26870, 34)


In [78]:
y = resampled["HadHeartAttack"]
X = resampled.drop("HadHeartAttack", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

In [79]:
ESTIMATORS = 32
MAX_DEPTH = 5
rf = RandomForestClassifier(n_estimators=ESTIMATORS, max_depth=MAX_DEPTH)
rf = rf.fit(X_train, y_train)

In [80]:
y_predict_train_gs = gs.predict(X_train)
y_predict_test_gs = gs.predict(X_test)
train_accuracy_gs = accuracy_score(y_train, y_predict_train_gs)
test_accuracy_gs = accuracy_score(y_test, y_predict_test_gs)
print("Train Accuracy: " + str(train_accuracy_gs))
print("Test Accuracy: " + str(test_accuracy_gs))

Train Accuracy: 0.6086447977032272
Test Accuracy: 0.6130753008311624


In [81]:
depths = [5, 7, 9, 10]
estimators = [32, 64, 100, 128, 200]
rf_gs = RandomForestClassifier()

parameters = {"max_depth": depths, "n_estimators": estimators}
gs = GridSearchCV(rf_gs, parameters)
gs = gs.fit(X_train, y_train)

In [82]:
y_predict_train_gs = gs.predict(X_train)
y_predict_test_gs = gs.predict(X_test)
train_accuracy_gs = accuracy_score(y_train, y_predict_train_gs)
test_accuracy_gs = accuracy_score(y_test, y_predict_test_gs)
print("Train Accuracy: " + str(train_accuracy_gs))
print("Test Accuracy: " + str(test_accuracy_gs))

Train Accuracy: 0.8270508798979213
Test Accuracy: 0.8011412976057561


In [83]:
print(gs.best_params_)

{'max_depth': 9, 'n_estimators': 200}


In [84]:
feature_imp = rf.feature_importances_
feature_imp_sm = softmax(torch.Tensor(feature_imp), dim=0)
mapping = {}
for i, value in enumerate(X_train.columns):
  mapping[value] = feature_imp_sm[i].item()

print(mapping)

{'State': 0.02933443896472454, 'Sex': 0.029637716710567474, 'GeneralHealth': 0.029688579961657524, 'PhysicalHealthDays': 0.02997865341603756, 'MentalHealthDays': 0.029337940737605095, 'LastCheckupTime': 0.029798012226819992, 'PhysicalActivities': 0.029604293406009674, 'SleepHours': 0.029379144310951233, 'HadAngina': 0.04432046413421631, 'HadStroke': 0.03100547380745411, 'HadAsthma': 0.02931283414363861, 'HadSkinCancer': 0.029328282922506332, 'HadCOPD': 0.02973254956305027, 'HadDepressiveDisorder': 0.029314551502466202, 'HadKidneyDisease': 0.02947290800511837, 'HadArthritis': 0.03066699393093586, 'HadDiabetes': 0.03073950856924057, 'DifficultyConcentrating': 0.029351145029067993, 'SmokerStatus': 0.029956335201859474, 'ECigaretteUsage': 0.02931501343846321, 'ChestScan': 0.03356216847896576, 'RaceEthnicityCategory': 0.029315264895558357, 'AgeCategory': 0.033494751900434494, 'HeightInMeters': 0.029357189312577248, 'WeightInKilograms': 0.029359055683016777, 'BMI': 0.029377546161413193, 'Alc

In [85]:
estimators_2 = [50, 64, 70, 80, 90]
rf_gs_2 = RandomForestClassifier(max_depth=7)

parameters_2 = {"n_estimators": estimators_2}
gs_2 = GridSearchCV(rf_gs_2, parameters_2)
gs_2 = gs_2.fit(X_train, y_train)

In [86]:
y_predict_train_gs = gs_2.predict(X_train)
y_predict_test_gs = gs_2.predict(X_test)
train_accuracy_gs = accuracy_score(y_train, y_predict_train_gs)
test_accuracy_gs = accuracy_score(y_test, y_predict_test_gs)
print("Train Accuracy: " + str(train_accuracy_gs))
print("Test Accuracy: " + str(test_accuracy_gs))

Train Accuracy: 0.8064756233717901
Test Accuracy: 0.7975437290658727


In [87]:
print(gs_2.best_params_)

{'n_estimators': 80}


In [88]:
joblib.dump(gs_2, os.environ["CONFIG_DIR"] + "/random_forest.joblib")

['/content/gdrive/MyDrive/CSE 6242 Project/random_forest.joblib']