## Importing the dataset and libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import make_column_selector

In [2]:
df = pd.read_csv('heart_attack_prediction_dataset.csv')

In [3]:
df

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,MSV9918,60,Male,121,94/76,61,1,1,1,0,...,10.806373,235420,19.655895,67,7,7,Thailand,Asia,Northern Hemisphere,0
8759,QSV6764,28,Female,120,157/102,73,1,0,0,1,...,3.833038,217881,23.993866,617,4,9,Canada,North America,Northern Hemisphere,0
8760,XKA5925,47,Male,250,161/75,105,0,1,1,1,...,2.375214,36998,35.406146,527,4,4,Brazil,South America,Southern Hemisphere,1
8761,EPE6801,36,Male,178,119/67,60,1,0,1,0,...,0.029104,209943,27.294020,114,2,8,Brazil,South America,Southern Hemisphere,0


In [4]:
df.isnull().sum()

Patient ID                         0
Age                                0
Sex                                0
Cholesterol                        0
Blood Pressure                     0
Heart Rate                         0
Diabetes                           0
Family History                     0
Smoking                            0
Obesity                            0
Alcohol Consumption                0
Exercise Hours Per Week            0
Diet                               0
Previous Heart Problems            0
Medication Use                     0
Stress Level                       0
Sedentary Hours Per Day            0
Income                             0
BMI                                0
Triglycerides                      0
Physical Activity Days Per Week    0
Sleep Hours Per Day                0
Country                            0
Continent                          0
Hemisphere                         0
Heart Attack Risk                  0
dtype: int64

## Droping the unnecessary columns and Preprocessing

In [5]:
df = df.drop(['Patient ID'], axis =1)

In [6]:
df.columns

Index(['Age', 'Sex', 'Cholesterol', 'Blood Pressure', 'Heart Rate', 'Diabetes',
       'Family History', 'Smoking', 'Obesity', 'Alcohol Consumption',
       'Exercise Hours Per Week', 'Diet', 'Previous Heart Problems',
       'Medication Use', 'Stress Level', 'Sedentary Hours Per Day', 'Income',
       'BMI', 'Triglycerides', 'Physical Activity Days Per Week',
       'Sleep Hours Per Day', 'Country', 'Continent', 'Hemisphere',
       'Heart Attack Risk'],
      dtype='object')

In [7]:
# Assuming 'Blood Pressure' is the column containing fraction values
df['Blood Pressure'] = df['Blood Pressure'].astype(str)  # Convert to string

In [8]:
# Split the 'Blood Pressure' column into two separate columns
pressure_split = df['Blood Pressure'].str.split('/', expand=True)

In [9]:
# Concatenate the new columns with the original dataframe
pressure_split.columns = ['systolic_pressure', 'diastolic_pressure']
df = pd.concat([df, pressure_split], axis=1)

In [10]:
# Convert the new columns to numeric format
df[['systolic_pressure', 'diastolic_pressure']] = df.iloc[:, -2:].apply(pd.to_numeric, errors='coerce')

In [11]:
# Drop multiple columns
columns_to_drop = ['Blood Pressure', 'Country', 'Continent']
df = df.drop(columns=columns_to_drop, axis=1)

In [12]:
categories = ['Sex', 'Diet', 'Hemisphere']

In [13]:
# Apply label encoding to categorical columns
label_encoder = LabelEncoder()
for category in categories:
    df[category] = label_encoder.fit_transform(df[category])

In [14]:
df.head(2)

Unnamed: 0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Hemisphere,Heart Attack Risk,systolic_pressure,diastolic_pressure
0,67,1,208,72,0,0,1,0,0,4.168189,...,6.615001,261404,31.251233,286,0,6,1,0,158,88
1,21,1,389,98,1,1,1,1,1,1.813242,...,4.963459,285768,27.194973,235,1,7,0,0,165,93


In [15]:
df.columns

Index(['Age', 'Sex', 'Cholesterol', 'Heart Rate', 'Diabetes', 'Family History',
       'Smoking', 'Obesity', 'Alcohol Consumption', 'Exercise Hours Per Week',
       'Diet', 'Previous Heart Problems', 'Medication Use', 'Stress Level',
       'Sedentary Hours Per Day', 'Income', 'BMI', 'Triglycerides',
       'Physical Activity Days Per Week', 'Sleep Hours Per Day', 'Hemisphere',
       'Heart Attack Risk', 'systolic_pressure', 'diastolic_pressure'],
      dtype='object')

In [16]:
df.shape

(8763, 24)

In [17]:
# Specify the columns to be standardized
selected_columns = ['Age', 'Cholesterol', 'Heart Rate', 'Exercise Hours Per Week', 'Income', 'BMI','Triglycerides', 'systolic_pressure', 'diastolic_pressure']

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform the selected columns
#df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [18]:
#X = df.iloc[:, :-1]
X = df.iloc[:, list(range(0, 21)) + list(range(22, 24))]
y = df.iloc[:, 21]

In [19]:
y

0       0
1       0
2       0
3       0
4       0
       ..
8758    0
8759    0
8760    1
8761    0
8762    1
Name: Heart Attack Risk, Length: 8763, dtype: int64

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
#X_train, X_test, y_train, y_test = train_test_split(df.drop('Patient ID', axis=1), df['Patient ID'], test_size=0.2, random_state=42)

In [21]:
X_train[selected_columns] = scaler.fit_transform(X_train[selected_columns])
X_test[selected_columns] = scaler.fit_transform(X_test[selected_columns])

In [22]:
#corr_matrix = X_train.corrwith(y_train, axis=0)
corr_matrix = X_train.corr()

In [23]:
print(corr_matrix)

                                      Age       Sex  Cholesterol  Heart Rate  \
Age                              1.000000  0.014346     0.000529   -0.016336   
Sex                              0.014346  1.000000     0.015163   -0.010525   
Cholesterol                      0.000529  0.015163     1.000000   -0.002963   
Heart Rate                      -0.016336 -0.010525    -0.002963    1.000000   
Diabetes                        -0.024148 -0.012469    -0.025931    0.005494   
Family History                   0.004047  0.000115    -0.019460   -0.016214   
Smoking                          0.387507  0.510212     0.026923   -0.024577   
Obesity                         -0.013891 -0.002800    -0.013842    0.018449   
Alcohol Consumption             -0.005837  0.009586    -0.010336    0.005361   
Exercise Hours Per Week         -0.001411 -0.009356     0.033360    0.002820   
Diet                            -0.001133 -0.013989     0.005182   -0.009837   
Previous Heart Problems          0.00631

In [24]:
# Get the column names of the dataframe
columns = corr_matrix.columns

#Create an empty list to keep track of the columns to drop
columns_to_drop = []

# loop over the columns
for i in range (len(columns)):
    for j in range(i +1, len(columns)):
        # Access the cell of the dataframe
        if corr_matrix.loc[columns[i], columns[j]] > 0.095:
            columns_to_drop.append(columns[j])

print(len(columns_to_drop))

2


In [25]:
columns_to_drop = set(columns_to_drop)

In [26]:
columns_to_drop

{'Smoking'}

In [27]:
df.shape

(8763, 24)

## Model selection

In [28]:
def models(X_train, y_train):
    #Logistic regression
    from sklearn.linear_model import LogisticRegression
    log=LogisticRegression(random_state=0)
    log.fit(X_train, y_train)
    
    #Decision Tree
    from sklearn.tree import DecisionTreeClassifier
    tree=DecisionTreeClassifier(random_state =0, criterion="entropy")
    tree.fit(X_train, y_train)
    
    #Random Forest
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(random_state=0, criterion="entropy", n_estimators =10)
    forest.fit(X_train, y_train)

    #SVM
    from sklearn import svm
    clf = svm.SVC()
    clf.fit(X_train, y_train)
    
    print('[0]Logistic regression accuracy: ', log.score(X_train, y_train))
    print('[0]Decision Tree: ', tree.score(X_train, y_train))
    print('[0]Random Forest accuracy: ', forest.score(X_train, y_train))
    print('[0]SVM accuracy: ', clf.score(X_train, y_train))
    return log,tree, forest, clf

In [29]:
model = models(X_train, y_train)

[0]Logistic regression accuracy:  0.6433666191155493
[0]Decision Tree:  1.0
[0]Random Forest accuracy:  0.9774607703281027
[0]SVM accuracy:  0.6433666191155493


In [30]:
# testing the models/ results

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [31]:
for i in range(len(model)):
    print("Model : ", i)
    print(classification_report(y_test, model[i].predict(X_test)))
    print('Accuracy : ', accuracy_score(y_test, model[i].predict(X_test)))

Model :  0
              precision    recall  f1-score   support

           0       0.64      1.00      0.78      1114
           1       0.00      0.00      0.00       639

    accuracy                           0.64      1753
   macro avg       0.32      0.50      0.39      1753
weighted avg       0.40      0.64      0.49      1753

Accuracy :  0.6354820308043354
Model :  1
              precision    recall  f1-score   support

           0       0.64      0.64      0.64      1114
           1       0.38      0.37      0.37       639

    accuracy                           0.55      1753
   macro avg       0.51      0.51      0.51      1753
weighted avg       0.54      0.55      0.55      1753

Accuracy :  0.5453508271534512
Model :  2
              precision    recall  f1-score   support

           0       0.64      0.89      0.75      1114
           1       0.42      0.15      0.22       639

    accuracy                           0.62      1753
   macro avg       0.53      0.52

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.64      1.00      0.78      1114
           1       0.00      0.00      0.00       639

    accuracy                           0.64      1753
   macro avg       0.32      0.50      0.39      1753
weighted avg       0.40      0.64      0.49      1753

Accuracy :  0.6354820308043354


## Saving the model for future use 

In [32]:
import pickle

# Assuming 'model[0]' is the object you want to save
with open("heart_saved", 'wb') as file:
    pickle.dump(model[0], file)


In [33]:
model_loaded =pickle.load(open('heart_saved', 'rb'))
model_loaded.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [34]:
import joblib 
joblib.dump(model[0], "Heart_attack_risk_prediction.joblib")

['Heart_attack_risk_prediction.joblib']

In [35]:
loaded_model2 = joblib.load('Heart_attack_risk_prediction.joblib')
loaded_model2.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## Fine Tuning logistic model

In [36]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()

In [37]:
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [50, 100, 200],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],  # Add l1_ratio only for elasticnet
    'random_state': [42]
}


In [38]:
# import the grid search method 
from sklearn.model_selection import GridSearchCV

In [39]:
grid_search = GridSearchCV(logmodel, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

In [40]:
bet_model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1800 candidates, totalling 9000 fits


3600 fits failed out of a total of 9000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
450 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\tirke\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\tirke\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\tirke\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [41]:
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'C': 0.001, 'l1_ratio': 0.1, 'max_iter': 50, 'penalty': 'l1', 'random_state': 42, 'solver': 'liblinear'}


In [42]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [43]:
 print(classification_report(y_test, best_model.predict(X_test)))
print('Accuracy : ', accuracy_score(y_test, best_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.64      1.00      0.78      1114
           1       0.00      0.00      0.00       639

    accuracy                           0.64      1753
   macro avg       0.32      0.50      0.39      1753
weighted avg       0.40      0.64      0.49      1753

Accuracy :  0.6354820308043354


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Fine Tune SVM

In [44]:
from sklearn.svm import SVC
svm_model = SVC()

# Set up a parameter grid to search through
param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': [0.1, 1, 'scale', 'auto'],
}

In [None]:
# Create a grid search object
grid_search_svm = GridSearchCV(svm_model, param_grid_svm, cv=5)

# Fit the grid search to the data
grid_search_svm.fit(X_train, y_train)

# Get the best parameters and best model
best_params = grid_search_svm.best_params_
best_svm_model = grid_search_svm.best_estimator_

# Print the best parameters
print("Best Parameters:", best_params)

# Evaluate the model on the test set
y_pred = best_svm_model.predict(X_test)

# Calculate and print accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
best_params_svm = grid_search_svm.best_params_
print("Best Parameters:", best_params)

In [None]:
best_model_svm = grid_search_svm.best_estimator_
y_pred = best_model_svm.predict(X_test)

In [None]:
 print(classification_report(y_test, best_model_svm.predict(X_test)))
print('Accuracy : ', accuracy_score(y_test, best_model_svm.predict(X_test)))