# Problem Statement: 
- The project involves creating an effective tool for early detection and risk assessment of heart
disease, utilising key variables such as ‘HadHeartAttack’, among others. We have to predict whether a person is likely to suffer a heart attack in the future based on his/her present health condition.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the survey dataset from CSV and preview the first rows.
data = pd.read_csv('heart_2022_no_nans.csv')
data.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.6,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.7,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [3]:
# (rows, columns) of the loaded frame.
data.shape

(246022, 40)

In [4]:
# Column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246022 entries, 0 to 246021
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      246022 non-null  object 
 1   Sex                        246022 non-null  object 
 2   GeneralHealth              246022 non-null  object 
 3   PhysicalHealthDays         246022 non-null  float64
 4   MentalHealthDays           246022 non-null  float64
 5   LastCheckupTime            246022 non-null  object 
 6   PhysicalActivities         246022 non-null  object 
 7   SleepHours                 246022 non-null  float64
 8   RemovedTeeth               246022 non-null  object 
 9   HadHeartAttack             246022 non-null  object 
 10  HadAngina                  246022 non-null  object 
 11  HadStroke                  246022 non-null  object 
 12  HadAsthma                  246022 non-null  object 
 13  HadSkinCancer              24

In [5]:
# Summary statistics of the float-typed columns.
data.describe()

Unnamed: 0,PhysicalHealthDays,MentalHealthDays,SleepHours,HeightInMeters,WeightInKilograms,BMI
count,246022.0,246022.0,246022.0,246022.0,246022.0,246022.0
mean,4.119026,4.16714,7.021331,1.70515,83.615179,28.668136
std,8.405844,8.102687,1.440681,0.106654,21.323156,6.513973
min,0.0,0.0,1.0,0.91,28.12,12.02
25%,0.0,0.0,6.0,1.63,68.04,24.27
50%,0.0,0.0,7.0,1.7,81.65,27.46
75%,3.0,4.0,8.0,1.78,95.25,31.89
max,30.0,30.0,24.0,2.41,292.57,97.65


In [6]:
# List all column names.
data.columns

Index(['State', 'Sex', 'GeneralHealth', 'PhysicalHealthDays',
       'MentalHealthDays', 'LastCheckupTime', 'PhysicalActivities',
       'SleepHours', 'RemovedTeeth', 'HadHeartAttack', 'HadAngina',
       'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD',
       'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis',
       'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
       'DifficultyConcentrating', 'DifficultyWalking',
       'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus',
       'ECigaretteUsage', 'ChestScan', 'RaceEthnicityCategory', 'AgeCategory',
       'HeightInMeters', 'WeightInKilograms', 'BMI', 'AlcoholDrinkers',
       'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'TetanusLast10Tdap',
       'HighRiskLastYear', 'CovidPos'],
      dtype='object')

In [None]:
# Count of respondents per sex.
# The column is passed as `x=` because seaborn >= 0.12 interprets the first
# positional argument as `data` (everything after it is keyword-only).
sns.countplot(x=data['Sex'])
plt.show()

In [None]:
# Class balance of the target column.
# The column is passed as `x=` because seaborn >= 0.12 interprets the first
# positional argument as `data` (everything after it is keyword-only).
sns.countplot(x=data['HadHeartAttack'])
plt.show()

In [None]:
# Percentage of rows whose target value is 'Yes'.
heart_attack_rows = data['HadHeartAttack'] == 'Yes'
(len(data[heart_attack_rows]) / len(data)) * 100

- As we can see, the classes are unevenly distributed: only 5.46% of the records are 'Yes' and the rest are 'No'.

In [None]:
# Collect every column that takes exactly two distinct values.
col = [name for name in data.columns if data[name].nunique() == 2]
len(col)

In [None]:
def binary_map(x):
  """Encode a 'Yes'/'No' column as 1/0.

  Args:
    x: A pandas Series (one column when used with ``DataFrame.apply``).

  Returns:
    A Series in which 'Yes' is 1 and 'No' is 0; any other value has no
    entry in the mapping and becomes NaN.
  """
  yes_no_codes = {'Yes' : 1, 'No': 0}
  return x.map(yes_no_codes)

# 'Sex' holds Male/Female rather than Yes/No, so it is mapped separately.
# Exclude it by name instead of assuming it is the first entry of `col`,
# which would silently break if the column order ever changed.
yes_no_cols = [name for name in col if name != 'Sex']
data[yes_no_cols] = data[yes_no_cols].apply(binary_map)
data['Sex'] = data['Sex'].map({'Male': 1, 'Female': 0})

data[col].head()

In [None]:
# Multi-valued text columns that are label-encoded further below.
categ = [
  'State',
  'GeneralHealth',
  'LastCheckupTime',
  'RemovedTeeth',
  'HadDiabetes',
  'SmokerStatus',
  'ECigaretteUsage',
  'AgeCategory',
  'RaceEthnicityCategory',
  'TetanusLast10Tdap',
  'CovidPos',
]

# Preview of those columns before encoding.
categ_col = data[categ]
categ_col.head()

In [None]:
# Label-encode each categorical column with its own LabelEncoder and keep the
# fitted encoders, keyed by column name. Refitting a single shared encoder
# would discard every mapping except the last, making it impossible to
# invert the codes or to encode new inputs consistently later on.
encoders = {}

for i in categ:
  encoder = LabelEncoder()
  data[i] = encoder.fit_transform(data[i])
  encoders[i] = encoder

data[categ].head()

In [None]:
# Print the sorted distinct codes of every label-encoded column.
for column in categ:
  codes = np.unique(data[column])
  print(codes)

In [None]:
# Columns that are neither label-encoded (`categ`) nor two-valued (`col`).
already_encoded = set(categ) | set(col)
numeric = [name for name in data.columns if name not in already_encoded]

numeric

In [None]:
# Re-check the dtypes: after the mappings and label encoding above,
# every column should now be numeric.
data.info()

In [None]:
# Summary statistics of the columns in `numeric` at a fine-grained set of
# percentiles, used to judge where outliers start.
percentile_grid = [0.01, 0.10, .15, .25, .50, 0.65, .75, 0.80, 0.85, 0.88, .90, .95, .99]
data[numeric].describe(percentiles=percentile_grid)

In [None]:
# One box plot per column in `numeric`, laid out on a 2x3 grid.
plt.figure(figsize=(20, 12))

for position, column in enumerate(numeric, start=1):
  plt.subplot(2, 3, position)
  # Passed as `x=` because seaborn >= 0.12 interprets the first positional
  # argument as `data` (everything after it is keyword-only).
  sns.boxplot(x=data[column])
plt.show()

- As we can see from the above plots, the outliers in some columns have to be capped at the 90th percentile, in others at the 95th percentile, and so on.
---



In [None]:
# Cap outliers by clipping each column to quantile-based thresholds.
# Each entry maps a column to (lower quantile or None, upper quantile);
# None means the column is only capped from above.
clip_quantiles = {
  'PhysicalHealthDays': (None, 0.84),
  'MentalHealthDays': (None, 0.86),
  'SleepHours': (0.01, 0.99),
  'HeightInMeters': (0.01, 0.99),
  'WeightInKilograms': (None, 0.97),
  'BMI': (0.01, 0.96),
}

for column, (lower_q, upper_q) in clip_quantiles.items():
  upper_threshold = data[column].quantile(upper_q)
  lower_threshold = None if lower_q is None else data[column].quantile(lower_q)
  data[column] = data[column].clip(lower=lower_threshold, upper=upper_threshold)

In [None]:
# Redraw the box plots of the columns in `numeric` (2x3 grid) to check the
# effect of the clipping.
plt.figure(figsize=(20, 12))

for position, column in enumerate(numeric, start=1):
  plt.subplot(2, 3, position)
  # Passed as `x=` because seaborn >= 0.12 interprets the first positional
  # argument as `data` (everything after it is keyword-only).
  sns.boxplot(x=data[column])
plt.show()

- Now all outliers have been clipped and the dataset is ready for the next step: splitting.

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(28,18))

correlations = data.corr()
sns.heatmap(correlations, annot=True, linewidth=3, cmap='Greens')
plt.show()

- As the heatmap above shows, some columns are highly correlated, but these are columns that are inherently related to each other. For example, BMI and WeightInKilograms have a correlation of 0.86, but BMI is calculated using weight and height.
Sex and HeightInMeters are also highly correlated, which is expected in general, as males are on average taller than females.
That's why we don't need to drop these highly correlated columns.

## Train Test Split

In [None]:
# Separate the target column from the predictor columns.
y = data['HadHeartAttack']
X = data.drop(columns='HadHeartAttack')

In [None]:
smote = SMOTE(random_state=42)
X_over, y_over = smote.fit_resample(X, y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=42, stratify=y_over)
X_train.head()

In [None]:
# (rows, columns) of the training feature matrix.
X_train.shape

In [None]:
def draw_roc(actual, probs):
  """Plot the ROC curve for a set of predicted scores.

  Args:
    actual: True binary labels.
    probs: Predicted scores (e.g. probabilities) for the positive class,
      aligned with ``actual``.

  Returns:
    None. The figure is shown with ``plt.show()``.
  """
  fpr, tpr, _ = metrics.roc_curve(actual, probs, drop_intermediate=False)
  auc_score = metrics.roc_auc_score(actual, probs)
  plt.figure(figsize=(5,5))
  # Legend entry reports the AUC rounded to two decimals.
  plt.plot(fpr, tpr, label=f'ROC curve (area = {auc_score:.2f})')
  # Dashed diagonal: the line where TPR equals FPR.
  plt.plot([0,1], [0,1], 'r--')
  plt.xlim([0.0, 1.0])
  plt.ylim([0.0, 1.05])
  plt.xlabel('False Positive rate or (1 - specificity)')
  plt.ylabel('True Positive rate')
  plt.title('Receiver Operating Characterisitcs example')
  plt.legend(loc='lower right')
  plt.show()

In [None]:
# Initialize XGBoost classifier (500 estimators, binary logistic objective, fixed seed)
xgb_classifier = xgb.XGBClassifier(n_estimators=500, objective='binary:logistic', random_state=42)

# Train the classifier on the training split
xgb_classifier.fit(X_train, y_train)

# Predict class labels on the test set
y_pred = xgb_classifier.predict(X_test)

# Calculate accuracy of the predicted labels against the true test labels
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Per-class probabilities on the test set; column 1 (passed to draw_roc) is
# the probability of the positive class.
proba = xgb_classifier.predict_proba(X_test)
draw_roc(y_test, proba[:, 1])

In [None]:
# Confusion matrix of the test-set predictions
# (rows: actual class, columns: predicted class).
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

# Individual cells of the confusion matrix
TP = cm[1,1]
TN = cm[0,0]
FP = cm[0,1]
FN = cm[1,0]
total = TP + TN + FP + FN

# sensitivity (recall of the positive class)
sensi = TP / (TP + FN)
print(f'Sensitvity is : {sensi}')

# specificity (recall of the negative class)
speci = TN / (TN + FP)
print(f'Specificity is : {speci}')

# F1 score of the positive class
f1_score = metrics.f1_score(y_true=y_test, y_pred=y_pred)
print(f'F1 Score is : {f1_score}')

# positive predictive value (precision)
PPV = TP / (TP + FP)
print(f'Positive Predicted Value is : {PPV}')

# false positive rate
FPR = FP / (FP + TN)
print(f'False Positive Rate is : {FPR}')

# prevalence: share of actual positives in the test set
prevalence = (TP + FN) / total

# share of actual negatives in the test set (the weight of specificity below)
actual_negative_share = (TN + FP) / total

# negative predictive value: share of predicted negatives that are truly negative
negative_predicted_value = TN / (TN + FN)
print(f'Negative Predicted Value is : {negative_predicted_value}')

# accuracy as the class-share-weighted mix of sensitivity and specificity
true_accu = sensi * prevalence + speci * actual_negative_share
print(f'True accuracy of the model is : {true_accu}')

In [None]:
import pickle

# Persist the trained classifier to disk. The `with` block closes the file
# on exit, so no explicit close() call is needed.
with open('model.pkl', 'wb') as f:
  pickle.dump(xgb_classifier, f)