# cvd prediction dataset

In [None]:

import pandas as pd

# Load the dataset
file_path = '/content/Cvd_Prediction_Dataset.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head()


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [None]:

# Checking for missing values and data types
missing_values = data.isnull().sum()
data_types = data.dtypes

missing_values, data_types


(Age               0
 Sex               0
 ChestPainType     0
 RestingBP         0
 Cholesterol       0
 FastingBS         0
 RestingECG        0
 MaxHR             0
 ExerciseAngina    0
 Oldpeak           0
 ST_Slope          0
 HeartDisease      0
 dtype: int64,
 Age                 int64
 Sex                object
 ChestPainType      object
 RestingBP           int64
 Cholesterol         int64
 FastingBS           int64
 RestingECG         object
 MaxHR               int64
 ExerciseAngina     object
 Oldpeak           float64
 ST_Slope           object
 HeartDisease        int64
 dtype: object)

In [None]:

from sklearn.preprocessing import StandardScaler, LabelEncoder

# Creating a copy of the dataset for preprocessing
data_preprocessed = data.copy()

# Encoding categorical variables
label_encoders = {}
categorical_columns = data_preprocessed.select_dtypes(include=['object']).columns
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data_preprocessed[column] = label_encoders[column].fit_transform(data_preprocessed[column])

# Normalizing continuous variables
scaler = StandardScaler()
numerical_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
data_preprocessed[numerical_columns] = scaler.fit_transform(data_preprocessed[numerical_columns])

# Display the first few rows of the preprocessed dataset
data_preprocessed.head()


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,-1.43314,1,1,0.410909,0.82507,0,1,1.382928,0,-0.832432,2,0
1,-0.478484,0,2,1.491752,-0.171961,0,1,0.754157,0,0.105664,1,1
2,-1.751359,1,1,-0.129513,0.770188,0,2,-1.525138,0,-0.832432,2,0
3,-0.584556,0,0,0.302825,0.13904,0,1,-1.132156,1,0.574711,1,1
4,0.051881,1,2,0.951331,-0.034755,0,1,-0.581981,0,-0.832432,2,0


### SVM Model Application


Next, we apply the Support Vector Machine model to the preprocessed dataset. We split the data into training and testing sets, train the SVM model, and then evaluate its performance.

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Separating the features and target variable
X = data_preprocessed.drop('HeartDisease', axis=1)
y = data_preprocessed['HeartDisease']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Applying the SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

accuracy
classification_rep


'              precision    recall  f1-score   support\n\n           0       0.78      0.87      0.82        77\n           1       0.90      0.82      0.86       107\n\n    accuracy                           0.84       184\n   macro avg       0.84      0.85      0.84       184\nweighted avg       0.85      0.84      0.84       184\n'

In [None]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[67 10]
 [19 88]]







Cardio risk dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, auc, roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
data = pd.read_csv('/content/cardio_tej.csv')
data.head()

Unnamed: 0,id,age,education,sex,is_smoking,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,0,64,2.0,F,YES,3.0,0.0,0,0,0,221.0,148.0,85.0,,90.0,80.0,1
1,1,36,4.0,M,NO,0.0,0.0,0,1,0,212.0,168.0,98.0,29.77,72.0,75.0,0
2,2,46,1.0,F,YES,10.0,0.0,0,0,0,250.0,116.0,71.0,20.35,88.0,94.0,0
3,3,50,1.0,M,YES,20.0,0.0,0,1,0,233.0,158.0,88.0,28.26,68.0,94.0,1
4,4,64,1.0,F,YES,30.0,0.0,0,0,0,241.0,136.5,85.0,26.42,70.0,77.0,0


In [None]:
print(data.isnull().sum())
print(data.isnull().sum().sum())

id                   0
age                  0
education           87
sex                  0
is_smoking           0
cigsPerDay          22
BPMeds              44
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             38
sysBP                0
diaBP                0
BMI                 14
heartRate            1
glucose            304
TenYearCHD           0
dtype: int64
510


In [None]:
#filling NaN values with respect to the analysis


data['education'] = data['education'].fillna(data['education'].mean())
data['cigsPerDay'] = data['cigsPerDay'].fillna(data['cigsPerDay'].mean())
data['BPMeds'] = data['BPMeds'].fillna(data['BPMeds'].mode()[0])
data['totChol'] = data['totChol'].fillna(data['totChol'].mean())
data['BMI'] = data['BMI'].fillna(data['BMI'].mean())
data['heartRate'] = data['heartRate'].fillna(data['heartRate'].mean())
data['glucose'] = data['glucose'].fillna(data['glucose'].mean())

In [None]:
data.isnull().sum()

id                 0
age                0
education          0
sex                0
is_smoking         0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [None]:
data['sex'].replace(['F','M'],[0,1], inplace=True)
data['is_smoking'].replace(['NO','YES'],[0,1], inplace=True)

In [None]:
data = data.drop(['id', 'education'], axis=1)

In [None]:
# Calculate the correlation matrix
correlation_matrix = data.corr()

# Select columns with correlation greater than or equal to 0.15 with respect to "TenYearCHD"
selected_columns = correlation_matrix["TenYearCHD"] >= 0.15

# Filter the correlation matrix based on selected columns
filtered_data = correlation_matrix[selected_columns]
filtered_data

filtered_data = correlation_matrix[selected_columns]
index_list = filtered_data.index.tolist()
index_list

data_seletion = data[index_list]
data_seletion

Unnamed: 0,age,prevalentHyp,sysBP,TenYearCHD
0,64,0,148.0,1
1,36,1,168.0,0
2,46,0,116.0,0
3,50,1,158.0,1
4,64,0,136.5,0
...,...,...,...,...
3385,60,0,123.5,0
3386,46,0,102.0,0
3387,44,1,164.0,1
3388,60,1,167.0,0


In [None]:
X = data.drop("TenYearCHD", axis=1) # feature
y = data['TenYearCHD'].copy() # Label

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating and training the SVM model
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Predicting the Test set results
y_pred = svm_model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

accuracy, classification_rep

(0.8569321533923304,
 '              precision    recall  f1-score   support\n\n           0       0.86      1.00      0.92       581\n           1       0.50      0.01      0.02        97\n\n    accuracy                           0.86       678\n   macro avg       0.68      0.50      0.47       678\nweighted avg       0.81      0.86      0.79       678\n')

In [None]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[580   1]
 [ 96   1]]


Cardio train dataset


In [None]:
import pandas as pd

# Load the dataset
file_path = '/content/cardio_karthik.csv'
# Reload the dataset with the correct delimiter
data = pd.read_csv(file_path, delimiter=';')

# Displaying the first few rows of the dataset again
data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [None]:
# Checking for missing values
missing_values = data.isnull().sum()

# Dropping the 'id' column as it's not useful for prediction
data.drop('id', axis=1, inplace=True)

# Checking the data types to identify categorical variables
data_types = data.dtypes

missing_values, data_types


(id             0
 age            0
 gender         0
 height         0
 weight         0
 ap_hi          0
 ap_lo          0
 cholesterol    0
 gluc           0
 smoke          0
 alco           0
 active         0
 cardio         0
 dtype: int64,
 age              int64
 gender           int64
 height           int64
 weight         float64
 ap_hi            int64
 ap_lo            int64
 cholesterol      int64
 gluc             int64
 smoke            int64
 alco             int64
 active           int64
 cardio           int64
 dtype: object)

In [None]:
from sklearn.preprocessing import StandardScaler

# Separating features and target variable
X = data.drop('cardio', axis=1)
y = data['cardio']

# Applying standard scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Displaying the first few rows of the scaled features
pd.DataFrame(X_scaled, columns=X.columns).head()


Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,-0.436062,1.364055,0.443452,-0.847873,-0.122182,-0.088238,-0.539322,-0.39572,-0.310879,-0.238384,0.494167
1,0.307686,-0.733108,-1.018168,0.749831,0.07261,-0.03518,2.400793,-0.39572,-0.310879,-0.238384,0.494167
2,-0.247997,-0.733108,0.078047,-0.708942,0.007679,-0.141297,2.400793,-0.39572,-0.310879,-0.238384,-2.023607
3,-0.748152,1.364055,0.565254,0.541435,0.137541,0.017879,-0.539322,-0.39572,-0.310879,-0.238384,0.494167
4,-0.808543,-0.733108,-1.018168,-1.264666,-0.187113,-0.194356,-0.539322,-0.39572,-0.310879,-0.238384,-2.023607


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Creating and training the SVM model
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Predicting the Test set results
y_pred = svm_model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

accuracy, classification_rep


(0.7319285714285715,
 '              precision    recall  f1-score   support\n\n           0       0.72      0.76      0.74      6988\n           1       0.75      0.70      0.72      7012\n\n    accuracy                           0.73     14000\n   macro avg       0.73      0.73      0.73     14000\nweighted avg       0.73      0.73      0.73     14000\n')

In [None]:
accuracy_percentage = accuracy * 100
accuracy_percentage

73.19285714285715

In [None]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[5321 1667]
 [2086 4926]]
