In [3]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

s3 = boto3.resource('s3')
bucket_name = 'data-445-timlincoln'
bucket = s3.Bucket(bucket_name)

file_key = 'framingham.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Reading the data-file
heart = pd.read_csv(file_content_stream)
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
# Dropping N/A variables
heart = heart.dropna()

In [5]:
# Defining the input and target variables
X = heart[['age', 'currentSmoker', 'totChol', 'BMI', 'heartRate']]
Y = heart['TenYearCHD']

# Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [7]:
# Building the logistic model
logit_md = LogisticRegression().fit(X_train, Y_train)

# Predicting on the test dataset
logit_pred = logit_md.predict_proba(X_test)[:, 1]
logit_pred

array([0.05311206, 0.29802904, 0.15098592, 0.23139418, 0.0865068 ,
       0.12136778, 0.23392585, 0.1189662 , 0.14616151, 0.16620518,
       0.08439073, 0.25208038, 0.04976175, 0.16343605, 0.0432696 ,
       0.06759932, 0.14895421, 0.06371273, 0.25377248, 0.09704343,
       0.12881536, 0.20797038, 0.09392893, 0.28696214, 0.18487526,
       0.20305217, 0.06600963, 0.26986676, 0.17358585, 0.07884335,
       0.14516574, 0.28035744, 0.09648252, 0.08468647, 0.12198432,
       0.07748619, 0.33973788, 0.15774106, 0.05819158, 0.09304183,
       0.10627703, 0.12808969, 0.09359943, 0.22813442, 0.14010363,
       0.09070799, 0.09114195, 0.16097883, 0.12978833, 0.04875517,
       0.26942636, 0.11148424, 0.22099304, 0.27624173, 0.08861903,
       0.18806   , 0.26865749, 0.30555123, 0.20772979, 0.30217466,
       0.03369496, 0.04285856, 0.21412106, 0.25110141, 0.18797837,
       0.12120376, 0.03912967, 0.05612444, 0.16290346, 0.24965887,
       0.13266909, 0.06923969, 0.37473816, 0.18123206, 0.23840

In [8]:
# Changing liklihoods to labels
logit_label = np.where(logit_pred < 0.25, 0, 1)
logit_label

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,

In [9]:
# Constructing the confusion matrix
confusion_matrix(Y_test, logit_label)

array([[535,  87],
       [ 70,  40]])

In [10]:
# Computing the accuracy
accuracy_score(Y_test, logit_label)

0.7855191256830601