# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder 


# Load Dataset

In [2]:
df = pd.read_csv('../data/cardio.csv')

# Split the Data into feature data and target data

In [3]:
y = df['cardio']
x=df.drop(columns='cardio')

In [4]:
#df.columns

# Modelling 

 Splitting data into 75% training data and 25% test data 

In [5]:
#Split data in 75% training data and 25% test data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.25, random_state=1)


# Feature Standization

In [6]:
#To standardize the features...

from sklearn.preprocessing import StandardScaler

std=StandardScaler()
std.fit(x_train)
std.transform(x_test)

array([[-1.40622524, -1.3630344 , -0.73462899, ..., -0.31243322,
        -0.23931996,  0.49648558],
       [ 0.71822501, -2.03605939, -0.73462899, ..., -0.31243322,
        -0.23931996,  0.49648558],
       [ 0.77762764,  0.15147431, -0.73462899, ..., -0.31243322,
        -0.23931996,  0.49648558],
       ...,
       [ 1.4595615 ,  0.45275626,  1.36123134, ..., -0.31243322,
        -0.23931996,  0.49648558],
       [-0.06077129,  1.09905463,  1.36123134, ...,  3.20068396,
         4.17850639,  0.49648558],
       [-0.2367945 , -1.63961446, -0.73462899, ..., -0.31243322,
        -0.23931996,  0.49648558]])

In [7]:
#Libraries used 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report , accuracy_score , roc_auc_score
from sklearn import svm
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import pickle as pkl

# Training and Fitting the Models 

In [8]:
def base_func(element):
    #train and fit the model
    model = element()
    model.fit(x_train , y_train)
    
    #predict
    train_preds = model.predict(x_train)
    test_preds = model.predict(x_test)
    
    #evaluation
    train_accuracy = roc_auc_score(y_train , train_preds)
    test_accuracy = roc_auc_score(y_test , test_preds)
    
    print(str(element))
    print("--------------------------------------------")
    print(f"Training Accuracy: {(train_accuracy * 100) :.4}%")
    print(f"Test Accuracy : {(test_accuracy * 100) :.4}%")
    
    #Store accuracy in a new DataFrame
    score_logreg = [element , train_accuracy , test_accuracy]
    models = pd.DataFrame([score_logreg])    

# Models 

I used 5 models to find the best model accuracy for better prediction and comparison .
Categorical classifier prediction is 

In [9]:
##Five algorithms used
algorithms = [LogisticRegression , KNeighborsClassifier , RandomForestClassifier , XGBClassifier ,svm.SVC]

#running each model and print accuracy scores
for element in algorithms:
    base_func(element)



<class 'sklearn.linear_model.logistic.LogisticRegression'>
--------------------------------------------
Training Accuracy: 71.19%
Test Accuracy : 71.4%
<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
--------------------------------------------
Training Accuracy: 71.3%
Test Accuracy : 55.71%




<class 'sklearn.ensemble.forest.RandomForestClassifier'>
--------------------------------------------
Training Accuracy: 98.14%
Test Accuracy : 70.59%
<class 'xgboost.sklearn.XGBClassifier'>
--------------------------------------------
Training Accuracy: 73.9%
Test Accuracy : 73.84%




<class 'sklearn.svm.classes.SVC'>
--------------------------------------------
Training Accuracy: 100.0%
Test Accuracy : 49.99%


In [10]:
def grd_src(classifier , param_grid):
    param_grid = param_grid
  
  #Instantiate the tuned random forest model
    grid_search = GridSearchCV(classifier, param_grid, cv=3, n_jobs=-1)
  
  #train the tuned random forest model
    grid_search.fit(x_train , y_train)

  #print best paramets during the grid search
    print((str(classifier) + "Best Parameters"))
    print("-----------------------------------")
    print(grid_search.best_params_)
    return grid_search.best_params_

 splitting data in decision tree can use Gini Index or Entropy .
 Hyperparameter modeling to improve the fit .

In [11]:
##Grid Search for best parameters of RandomForestClassifier
param_grid_rf = {"n_estimators" : [10,15,20,21,22],
                 "criterion" : ["gini" , "entropy"],
                 "max_depth" : [8,9,10,11],
                 "min_samples_split" : [2,3,4,5,6,7]}

rf_params = grd_src(RandomForestClassifier() , param_grid_rf)                 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)Best Parameters
-----------------------------------
{'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 21}


In [12]:
#GridSearch for best parameters of XGBClassifier
param_grid_xgb = {"n_estimators" : [120,100,90,80,60,],
                  "learning_rate" : [0.01,0.1,0.2] , 
                  "max_depth" : [2,3,4,5],
                  "colsample_by_tree" : [0,0.02],
                  "gamma":[0,0.01,0.1,0.2]}

xg_param = grd_src(XGBClassifier() , param_grid_xgb) 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)Best Parameters
-----------------------------------
{'colsample_by_tree': 0, 'gamma': 0.2, 'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 100}


In [13]:
with open('xgboost_model.pkl', 'wb') as pickle_file:
    pkl.dump(xg_param, pickle_file)

In [14]:
with open('rf_model.pkl', 'wb') as pickle_file:
    pkl.dump(rf_params, pickle_file)

In [15]:
with open('data/pickled_diamonds.pkl', 'wb') as pickle_file:
    pickle.dump(my_diamonds, pickle_file)

FileNotFoundError: [Errno 2] No such file or directory: 'data/pickled_diamonds.pkl'

In [None]:
!ls

In [None]:
with open('rf_model.pkl','rb') as pickle_file:
    rf_params = pkl.load(pickle_file)

In [None]:
rf_params

In [None]:
with open('xgboost_model.pkl','rb') as pickle_file:
    xg_param = pkl.load(pickle_file)

In [None]:
#Run models with their best parameters and also print accuracy scores
from sklearn import metrics
def  run_model(model, x_train, y_train,x_test, y_test ):
     model.fit(x_train, y_train)

    # predict
    train_preds = model.predict_proba(x_train).argmax(1)
    test_preds = model.predict_proba(x_test).argmax(1)

    

    fpr, tpr, threshold = metrics.roc_curve(y_test, test_preds)
    roc_auc = metrics.auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.gcf().savefig('roc.png')

    # evaluate
    train_auc = roc_auc_score(y_train, train_preds)
    test_auc = roc_auc_score(y_test, test_preds)
    report = classification_report(y_test, test_preds)

    print(metrics.confusion_matrix(y_test, test_preds))

    test_preds[test_preds>roc_auc]= 1
    test_preds[test_preds<=roc_auc]= 0

    #print reports of the model accuracy
       print('Model Scores')
    print("------------------------")
    print(f"Training AUC: {(train_auc * 100):.4}%")
    print(f"Test AUC:     {(test_auc * 100):.4}%")
    print("------------------------------------------------------")
    print('Classification Report : \n', report)
    return test_preds

In [None]:
#Random forest with best parameters
#{'criterion': 'gini', 'max_depth': 9, 'min_samples_split': 5, 'n_estimators': 20}
rf_model=RandomForestClassifier(n_estimators=20, 
                                  criterion= 'gini', 
                                  max_depth= 9, 
                                  min_samples_split= 5)
rfc_cv_score = cross_val_score(rf_model, x, y, cv=3, scoring='roc_auc')

                               
                               
run_model(rf_model, x_train, y_train, x_test, y_test)

In [None]:
##Xg boost with the best parameters
#{'colsample_by_tree': 0, 'gamma': 0.2, 'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 80}


xgb_model = XGBClassifier(colsample_by_tree = 0 , n_estimators = 80
                          , gamma = 0.2 , learning_rate = 0.2 , 
                          max_depth = 4)


run_model(xgb_model , x_train , y_train , x_test , y_test)
xgb_cv_score = cross_val_score(xgb_model, x, y, cv=3, scoring='roc_auc')