In [59]:
# Import dependencies
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter
import psycopg2
from psycopg2 import Error

In [60]:
# Connection parameters, yours will be different
param_dic = {
    "host"      : "ec2-34-239-241-121.compute-1.amazonaws.com",
    "database"  : "dfm419pf436p66",
    "user"      : "eulpledavzofae",
    "password"  : "07be1edaf3f3f3fef6702e318a67f7bfb3dde0268efc076985820060aef10f96"
}
def connect(params_dic):
    """ Connect to the PostgreSQL database server """
    conn = None
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = psycopg2.connect(**params_dic)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1) 
    print("Connection successful")
    return conn

In [61]:
def postgresql_to_dataframe(conn, select_query, column_names):
    """
    Tranform a SELECT query into a pandas dataframe
    """
    cursor = conn.cursor()
    try:
        cursor.execute(select_query)
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        cursor.close()
        return 1
    
    #list of tupples
    tupples = cursor.fetchall()
    cursor.close()
    
    # turn it into a pandas dataframe
    heart_dp = pd.DataFrame(tupples, columns=column_names)
    return heart_dp

In [64]:
# Connect to the database
conn = connect(param_dic)
column_names = ["Pt_ID", "Age", "Sex", "ChestPainType", "RestingBP", "Cholesterol", "FastingBS","RestingEKG","MaxHR","ExerciseAngina","OldPeak","ST_Slope", "HeartDisease"]
# Execute the "SELECT *" query
heart_dp_test = postgresql_to_dataframe(conn, "select * from heart_dp_test", column_names)
heart_dp_test.head()

Connecting to the PostgreSQL database...
Connection successful


Unnamed: 0,Pt_ID,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingEKG,MaxHR,ExerciseAngina,OldPeak,ST_Slope,HeartDisease
0,10157,43,M,ASY,110,211,0,Normal,161,N,0.0,Up,Absence
1,10436,45,M,ASY,115,260,0,LVH,185,N,0.0,Up,Absence
2,10437,64,M,ASY,145,212,0,LVH,132,N,2.0,flat,Presence
3,10474,52,M,ASY,108,233,1,Normal,147,N,0.1,Up,Absence
4,10477,48,M,NAP,124,255,1,Normal,175,N,0.0,Up,Absence


In [65]:
heart_dp_train = postgresql_to_dataframe(conn, "select * from heart_dp_train", column_names)
heart_dp_train.head()

Unnamed: 0,Pt_ID,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingEKG,MaxHR,ExerciseAngina,OldPeak,ST_Slope,HeartDisease
0,25047,74,M,NAP,138,0,0,Normal,116,N,0.2,Up,Absence
1,25177,60,M,ASY,130,186,1,ST,140,Y,0.5,Flat,Presence
2,25498,40,M,ASY,125,0,1,Normal,165,N,0.0,Flat,Presence
3,25189,69,M,ASY,140,208,0,ST,140,Y,2.0,Flat,Presence
4,25434,62,M,ASY,158,210,1,Normal,112,Y,3.0,Down,Presence


In [30]:
# Filter df for 65+
filtered_df = heart_disease_df[heart_disease_df['Age'] >= 65]

In [31]:
# Check the distribution of our target variable
heart_disease_df['HeartDisease'].value_counts()

1    508
0    410
Name: HeartDisease, dtype: int64

In [68]:
# Encode target variable with Scikit Learn
object_columns = heart_dp_test.dtypes[heart_dp_test.dtypes == "object"].index.tolist()
le = LabelEncoder()
test_encoded_df = heart_dp_test.copy()

for column in object_columns:
    test_encoded_df[column] = le.fit_transform(test_encoded_df[column])
    
test_encoded_df.head()

Unnamed: 0,Pt_ID,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingEKG,MaxHR,ExerciseAngina,OldPeak,ST_Slope,HeartDisease
0,10157,43,1,0,110,211,0,1,161,0,0,0,0
1,10436,45,1,0,115,260,0,0,185,0,0,0,0
2,10437,64,1,0,145,212,0,0,132,0,19,2,1
3,10474,52,1,0,108,233,1,1,147,0,1,0,0
4,10477,48,1,2,124,255,1,1,175,0,0,0,0


In [69]:
# Encode target variable with Scikit Learn
object_columns = heart_dp_train.dtypes[heart_dp_train.dtypes == "object"].index.tolist()
le = LabelEncoder()
train_encoded_df = heart_dp_train.copy()

for column in object_columns:
    train_encoded_df[column] = le.fit_transform(train_encoded_df[column])
    
train_encoded_df.head()

Unnamed: 0,Pt_ID,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingEKG,MaxHR,ExerciseAngina,OldPeak,ST_Slope,HeartDisease
0,25047,74,1,2,138,0,0,1,116,0,12,2,0
1,25177,60,1,0,130,186,1,2,140,1,15,1,1
2,25498,40,1,0,125,0,1,1,165,0,10,1,1
3,25189,69,1,0,140,208,0,2,140,1,30,1,1
4,25434,62,1,0,158,210,1,1,112,1,39,0,1


In [70]:
# Create train and test groups
X_train = train_encoded_df.drop('HeartDisease', axis = 1)
X_test = test_encoded_df.drop('HeartDisease', axis = 1)
y_train = train_encoded_df['HeartDisease'].ravel()
y_test = test_encoded_df['HeartDisease'].ravel()

In [71]:
# Create random forest classifier
model = RandomForestClassifier(n_estimators = 128, random_state = 20)

In [72]:
# Fit the model
model = model.fit(X_train, y_train)

In [73]:
# Make predictions using the testing data
predictions = model.predict(X_test)

In [74]:
# Calculate confusion matrix
matrix = confusion_matrix(y_test, predictions)
matrix

array([[104,  46],
       [ 40,  80]], dtype=int64)

In [75]:
# Calculate accuracy score
accuracy_score = accuracy_score(y_test, predictions)
print(f"Accuracy score: {accuracy_score}")

Accuracy score: 0.6814814814814815


In [76]:
classification_report = classification_report(y_test, predictions)
print(classification_report)

              precision    recall  f1-score   support

           0       0.72      0.69      0.71       150
           1       0.63      0.67      0.65       120

    accuracy                           0.68       270
   macro avg       0.68      0.68      0.68       270
weighted avg       0.68      0.68      0.68       270



In [77]:
# Calculate feature importance
importances = model.feature_importances_
importances

array([0.06733644, 0.07047511, 0.03480573, 0.12622438, 0.06460155,
       0.10446331, 0.02324026, 0.02308124, 0.09467028, 0.09888383,
       0.10161636, 0.19060151])

In [79]:
# Sort the features by their importance.
sorted(zip(model.feature_importances_, X.columns), reverse=True)

[(0.12622438042641004, 'Cholesterol'),
 (0.07047510970987561, 'ChestPainType'),
 (0.06733644345562911, 'ST_Slope'),
 (0.034805731858299695, 'MaxHR')]

In [85]:
all_X_df = pd.concat([X_train,X_test])
all_y_df = pd.concat([train_encoded_df['HeartDisease'],test_encoded_df['HeartDisease']])

In [83]:
all_predictions = model.predict(all_X_df)

In [87]:
# Calculate confusion matrix
matrix = confusion_matrix(all_y_df, all_predictions)
matrix

array([[514,  46],
       [ 40, 588]], dtype=int64)

In [88]:
# Calculate accuracy score
accuracy_score = accuracy_score(all_y_df, all_predictions)
print(f"Accuracy score: {accuracy_score}")

TypeError: 'numpy.float64' object is not callable

## Eliminate Unecessary Variables

In [43]:
columns_keep = ['ST_Slope','ChestPainType','MaxHR','Cholesterol']

new_db = encoded_df[columns_keep]

new_db.head()

Unnamed: 0,ST_Slope,ChestPainType,MaxHR,Cholesterol
0,2,1,172,289
1,1,2,156,180
2,2,1,98,283
3,1,0,108,214
4,2,2,122,195


In [51]:
# Define the target and features
y = encoded_df['HeartDisease'].ravel()

X = new_db

In [52]:
# Split in to test and train sets
new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(X, y, random_state = 22)

In [53]:
# Create random forest classifier
new_model = RandomForestClassifier(n_estimators = 128, random_state = 20)

In [54]:
# Fit the model
new_model = model.fit(new_X_train, new_y_train)

In [55]:
# Make predictions using the testing data
predictions = new_model.predict(new_X_test)

In [56]:
# Calculate confusion matrix
matrix = confusion_matrix(new_y_test, predictions)
matrix

array([[ 76,  23],
       [ 21, 110]], dtype=int64)

In [57]:
# Calculate accuracy score
accuracy_score = accuracy_score(new_y_test, predictions)
print(f"Accuracy score: {accuracy_score}")

TypeError: 'numpy.float64' object is not callable

In [27]:
classification_report = classification_report(y_test, predictions)
print(classification_report)

TypeError: 'str' object is not callable