# Import Libraries

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder


# Load Dataset

In [0]:
# Read 'Cardio.csv' (path relative to the working directory) into the frame
# that the rest of the notebook works on.
df = pd.read_csv('Cardio.csv')

In [0]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [0]:
# (number of rows, number of columns) of the loaded frame.
df.shape

# Missing values in each column

In [0]:
# Count of missing entries per column.
df.isna().sum()

In [0]:
# Single yes/no answer: is there any missing value anywhere in the frame?
df.isnull().values.any()

In [0]:
# Summary statistics of the frame's columns.
df.describe()

# Number of patients with and without cardiovascular disease

In [0]:
# How many rows fall into each value of the 'cardio' target column.
df['cardio'].value_counts()

In [0]:
# Visualise the class balance of the 'cardio' target.
# The column is passed by keyword: seaborn's first positional argument is
# `data`, so a bare Series there is not interpreted as the x variable.
sns.countplot(x='cardio', data=df)

In [0]:
# Look at the first rows again before deriving new columns.
df.head()

In [0]:
# Add a 'years' column: 'age' divided by 365 and rounded to the nearest whole
# number ('age' is presumably recorded in days -- confirm against the data
# source), then stored with the smallest integer dtype that can hold it.
age_in_years = df['age'].div(365).round(0)
df['years'] = pd.to_numeric(age_in_years, downcast='integer')


In [0]:
# 'age' is superseded by the derived 'years' column, so remove it.
df = df.drop(columns=['age'])

In [0]:
# One-hot encode the categorical 'gender' feature.
# The encoder expects a 2-D input of shape (n_samples, n_features); selecting
# with a list of column names yields a one-column frame of exactly that shape
# (a pandas Series has no .reshape, and (1, -1) would describe a single
# sample with one feature per row instead).
enc = OneHotEncoder()
gender_onehot = enc.fit_transform(df[['gender']])
gender_onehot


In [0]:
# Shape of the 'gender' column on its own (a 1-D Series).
df['gender'].shape

In [0]:
# Check the frame after adding 'years' and removing 'age'.
df.head()

In [0]:
# Row counts per value of 'years', split by 'cardio'.
dark_edge = sns.color_palette('dark', n_colors=1)
sns.countplot(
    data=df,
    x='years',
    hue='cardio',
    palette='colorblind',
    edgecolor=dark_edge,
)

In [0]:
# Row counts per value of 'cholesterol', split by 'cardio'.
dark_edge = sns.color_palette('dark', n_colors=1)
sns.countplot(
    data=df,
    x='cholesterol',
    hue='cardio',
    palette='colorblind',
    edgecolor=dark_edge,
)

In [0]:
# Pairwise correlation between the columns of df, shown as a table.
df.corr()

# Visualise the data

In [0]:
# Correlation matrix as an annotated heatmap, with cell values shown as
# whole-number percentages.
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(df.corr(), annot=True, fmt='.0%', linewidths=0, ax=ax)

In [0]:
# Display the frame as it stands before the 'id' column is removed.
df

In [0]:
# Remove the 'id' column from the frame.
df = df.drop(columns=['id'])

# Split the Data into feature data and target data

In [0]:
# Feature matrix: every column except the label; target vector: 'cardio'.
x = df.drop(columns=['cardio'])
y = df['cardio']

In [0]:
# Hold out 25% of the rows for testing and train on the remaining 75%.
# The fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=1,
)

In [None]:
# Standardize the features.
# The scaler is fitted on the training split only, so no statistics from the
# test split leak into training; the same fitted transformation is then
# applied to both splits. The results are stored back as DataFrames (keeping
# column names and row index) so the models below actually receive the
# scaled values.
from sklearn.preprocessing import StandardScaler

std = StandardScaler()
std.fit(x_train)
x_train = pd.DataFrame(std.transform(x_train), columns=x_train.columns, index=x_train.index)
x_test = pd.DataFrame(std.transform(x_test), columns=x_test.columns, index=x_test.index)

In [0]:
#Libraries used 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report , accuracy_score , roc_auc_score
from sklearn import svm
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV


In [0]:
def base_func(element):
    """Fit a default-configured classifier and report its train/test ROC AUC.

    The training and test data are read from the notebook-level variables
    ``x_train``, ``y_train``, ``x_test`` and ``y_test``.

    Parameters
    ----------
    element : callable
        Estimator class (not an instance); it is called with no arguments to
        build the model.

    Returns
    -------
    pandas.DataFrame
        One-row frame with the columns ``model``, ``train_roc_auc`` and
        ``test_roc_auc`` so results from several calls can be concatenated.
    """
    # Train the model with its default hyper-parameters.
    model = element()
    model.fit(x_train, y_train)

    # Hard class predictions for both splits.
    train_preds = model.predict(x_train)
    test_preds = model.predict(x_test)

    # The metric is ROC AUC computed from the hard labels, so it is named and
    # labelled as such rather than as "accuracy".
    train_auc = roc_auc_score(y_train, train_preds)
    test_auc = roc_auc_score(y_test, test_preds)

    # Prefer the short class name over the "<class '...'>" repr.
    model_name = getattr(element, "__name__", str(element))
    print(model_name)
    print("--------------------------------------------")
    print(f"Training ROC AUC: {train_auc * 100:.4}%")
    print(f"Test ROC AUC : {test_auc * 100:.4}%")

    # Hand the scores back instead of building a frame that is thrown away.
    return pd.DataFrame(
        [[model_name, train_auc, test_auc]],
        columns=["model", "train_roc_auc", "test_roc_auc"],
    )

In [0]:
# The five classifier classes compared as baselines.
algorithms = [
    LogisticRegression,
    KNeighborsClassifier,
    RandomForestClassifier,
    XGBClassifier,
    svm.SVC,
]

# Fit every classifier in turn; base_func prints the scores for each one.
for element in algorithms:
    base_func(element)

In [0]:
def grd_src(classifier, param_grid):
    """Grid-search ``param_grid`` for ``classifier`` and return the best setting.

    The search is fitted on the notebook-level ``x_train`` / ``y_train`` with
    3-fold cross-validation, using all available cores (``n_jobs=-1``).

    Parameters
    ----------
    classifier : estimator instance
        The model whose hyper-parameters are searched.
    param_grid : dict
        Maps parameter names to the list of values to try.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    # Works for any estimator, not only the random forest.
    grid_search = GridSearchCV(classifier, param_grid, cv=3, n_jobs=-1)
    grid_search.fit(x_train, y_train)

    # Report the winning combination.
    print(f"{classifier} Best Parameters")
    print("-----------------------------------")
    print(grid_search.best_params_)
    return grid_search.best_params_

In [0]:
# Grid search for the best parameters of RandomForestClassifier.
# min_samples_split starts at 2: an integer value of 1 is rejected by
# scikit-learn (an int must be at least 2), so every candidate using it
# would fail to fit.
param_grid_rf = {"n_estimators" : [10,15,20,21,22],
                 "criterion" : ["gini" , "entropy"],
                 "max_depth" : [8,9,10,11],
                 "min_samples_split" : [2,3,4,5,6,7]}

rf_params = grd_src(RandomForestClassifier() , param_grid_rf)

In [0]:
# Grid search for the best parameters of XGBClassifier.
# The column-subsampling parameter is spelled `colsample_bytree`; the previous
# `colsample_by_tree` key is not an XGBoost parameter, so it never influenced
# the search. Its documented range is (0, 1], so 0 is not a valid candidate.
# NOTE(review): the candidate values 0.5 / 1.0 are a reviewer's choice --
# adjust to taste.
param_grid_xgb = {"n_estimators" : [120,100,90,80,60,],
                  "learning_rate" : [0.01,0.1,0.2] ,
                  "max_depth" : [2,3,4,5],
                  "colsample_bytree" : [0.5,1.0],
                  "gamma":[0,0.01,0.1,0.2]}

# Keep the result, mirroring rf_params for the random forest search.
xgb_params = grd_src(XGBClassifier() , param_grid_xgb)

In [0]:
#Run models with their best parameters and also print accuracy scores

from sklearn import metrics
def run_model(model, x_train, y_train,x_test, y_test ):
    """Fit ``model``, plot its test-set ROC curve and print evaluation reports.

    Parameters
    ----------
    model : estimator
        Must provide ``fit`` and ``predict_proba``.
    x_train, y_train
        Training features and labels.
    x_test, y_test
        Held-out features and labels used for the ROC curve and reports.

    Returns
    -------
    numpy.ndarray
        Predicted class index for every row of ``x_test`` (argmax over the
        ``predict_proba`` columns).

    Side effects
    ------------
    Draws the ROC curve on the current matplotlib figure, saves that figure
    as 'roc.png', and prints the confusion matrix, the train/test AUC and
    the classification report.
    """
    model.fit(x_train, y_train)

    # Class probabilities for both splits (the parameter is `x_test`; the
    # previous `X_test` was not a name defined in this function).
    train_proba = model.predict_proba(x_train)
    test_proba = model.predict_proba(x_test)

    # Hard labels feed the confusion matrix and classification report.
    test_preds = test_proba.argmax(1)

    # Continuous scores feed the ROC curve / AUC; hard labels would collapse
    # the curve to a single operating point.
    # NOTE(review): column 1 is taken as the positive class, which assumes a
    # binary 0/1 target -- confirm.
    train_scores = train_proba[:, 1]
    test_scores = test_proba[:, 1]

    fpr, tpr, _ = metrics.roc_curve(y_test, test_scores)
    roc_auc = metrics.auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.gcf().savefig('roc.png')

    # evaluate
    train_auc = roc_auc_score(y_train, train_scores)
    test_auc = roc_auc_score(y_test, test_scores)
    report = classification_report(y_test, test_preds)

    print(metrics.confusion_matrix(y_test, test_preds))

    # The predictions are returned as they are: re-thresholding 0/1 labels at
    # the AUC value did nothing useful and turned every prediction into 0
    # when the AUC was exactly 1.

    #print reports of the model accuracy
    print('Model Scores')
    print("------------------------")
    print(f"Training AUC: {(train_auc * 100):.4}%")
    print(f"Test AUC:     {(test_auc * 100):.4}%")
    print("------------------------------------------------------")
    print('Classification Report : \n', report)
    return test_preds

In [0]:
#Random forest with best parameters
#{'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 3, 'n_estimators': 20}
rf_model=RandomForestClassifier(n_estimators=20,
                                  criterion= 'entropy',
                                  max_depth= 10,
                                  min_samples_split= 3)

# 3-fold cross-validated ROC AUC on the full feature matrix / target (x, y);
# the previous names `features` and `target` are not defined in this notebook.
rfc_cv_score = cross_val_score(rf_model, x, y, cv=3, scoring='roc_auc')
print(f"Random forest 3-fold CV ROC AUC: {rfc_cv_score.mean():.4f}")

# The splits are named x_train / x_test (lower case) in this notebook.
run_model(rf_model, x_train, y_train, x_test, y_test)

In [0]:
# XGBoost with the chosen parameters.
# The misspelled `colsample_by_tree=0` keyword is dropped: it is not an
# XGBoost parameter, so the model was effectively using the default column
# sampling already.
# NOTE(review): learning_rate=0.4 is not one of the values in the grid
# searched above -- confirm it is intended.
xgb_model = XGBClassifier(n_estimators = 100
                          , gamma = 0 , learning_rate = 0.4 ,
                          max_depth = 3)


# The splits are named x_train / x_test (lower case) in this notebook.
run_model(xgb_model , x_train , y_train , x_test , y_test)

# 3-fold cross-validated ROC AUC on the full feature matrix / target (x, y);
# the previous names `features` and `target` are not defined in this notebook.
xgb_cv_score = cross_val_score(xgb_model, x, y, cv=3, scoring='roc_auc')
print(f"XGBoost 3-fold CV ROC AUC: {xgb_cv_score.mean():.4f}")