# Cross Validation

<p><a href="/guided-machine-learning/">Back to Index</a></p>


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from scipy import stats
from scipy.stats import norm
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Apply seaborn's default theme to all subsequent matplotlib figures
sns.set()
# Show floats in pandas output with five decimal places
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# Switch to the ggplot style sheet and set a default figure size of 10 x 6 inches
plt.style.use(style='ggplot')
plt.rcParams['figure.figsize'] = (10, 6)
from scipy.stats import norm

## Helper functions

In [None]:
# `display` and `HTML` are part of the public IPython.display API; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML


def large_print(data, size=140):
    """Render ``data`` as enlarged HTML text in the notebook output.

    Parameters
    ----------
    data : object
        Text (or HTML fragment) to show. It is converted with ``str()`` and
        inserted into the markup as-is, without escaping.
    size : int, default 140
        Used directly as the CSS ``line-height`` percentage; the CSS
        ``font-size`` percentage is twice this value.

    Returns
    -------
    None
        The markup is sent to ``display``; nothing is returned.
    """
    font_pct = size * 2
    # str() lets callers pass numbers or other objects without a TypeError
    # from the string concatenation below.
    markup = ('<span style="font-size:' + str(font_pct) + '%; line-height:' + str(size)
              + '%"><p><p>' + str(data) + '</p></p></span>')
    display(HTML(markup))

##### Encoding

In [None]:
## Encoding helper functions
def using_cat_encoding(dset, lstvariable):
  """Add an integer-coded ``<name>_encode_val`` column for every listed column.

  Each listed column is cast to the pandas ``category`` dtype through a
  temporary ``<name>_category_type`` column, its ``cat.codes`` are stored in
  ``<name>_encode_val``, and the temporary column is removed again.
  ``dset`` is modified in place and also returned.
  """
  for col_name in lstvariable:
    scratch_col = col_name + '_category_type'
    codes_col = col_name + '_encode_val'
    # stage the categorical view, read its integer codes, then discard it
    dset[scratch_col] = dset[col_name].astype('category')
    dset[codes_col] = dset[scratch_col].cat.codes
    del dset[scratch_col]
  return dset

def using_index_encoding(dset, variable):
  """Label-encode one column into a new ``<variable>_encode_val`` column.

  A fresh ``LabelEncoder`` is fitted on ``dset[variable]`` and its transformed
  values are stored; ``dset`` is modified in place and returned.
  """
  encoder = LabelEncoder()
  encoded_col = variable + '_encode_val'
  encoded_values = encoder.fit_transform(dset[variable])
  dset[encoded_col] = encoded_values
  return dset

def using_onehot_encoding (dset , lstvariable):
  """One-hot encode the listed columns via ``pd.get_dummies``.

  The column names are also passed as prefixes, so the dummy columns are
  named ``<column>_<value>``. The result of ``pd.get_dummies`` is returned.
  """
  dummy_options = {'columns': lstvariable, 'prefix': lstvariable}
  encoded = pd.get_dummies(dset, **dummy_options)
  return encoded

##### Compare the algorithms

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.calibration import calibration_curve
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import pandas as pd
import pprint 

def run_cross_Validation(X,y,  scoring_columns, modeltype='mnb', n_splits=10, random_state=None):
  """Cross-validate one of the supported classifiers with shuffled K-fold CV.

  Parameters
  ----------
  X : array-like or DataFrame
      Feature matrix handed to ``cross_validate``.
  y : array-like or Series
      Target labels handed to ``cross_validate``.
  scoring_columns : str or list of str
      Passed as ``scoring=`` to ``cross_validate``.
  modeltype : str, default 'mnb'
      One of 'Logistic', 'svm', 'rf', 'nn', 'nb', 'mnb', 'KNN', 'XGB'.
  n_splits : int, default 10
      Number of folds for ``KFold``.
  random_state : int or None, default None
      Seed for the fold shuffling. ``None`` keeps the previous behaviour
      (a different shuffle on every call); pass an int for repeatable folds.

  Returns
  -------
  dict
      The result of ``cross_validate`` (train scores included, because
      ``return_train_score=True``).

  Raises
  ------
  ValueError
      If ``modeltype`` is not one of the supported keys.
  """
  # Factories are lazy so only the requested estimator is constructed.
  # NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
  # it is kept here to preserve the existing model configuration.
  model_factories = {
    'Logistic': lambda: LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial'),
    'svm': lambda: svm.SVC(decision_function_shape="ovo"),
    'rf': lambda: RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0),
    'nn': lambda: MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(150, 10), random_state=1),
    'nb': lambda: GaussianNB(),
    'mnb': lambda: MultinomialNB(),
    'KNN': lambda: KNeighborsClassifier(),
    'XGB': lambda: XGBClassifier(),
  }
  if modeltype not in model_factories:
    # Previously an unknown key fell through every `if` and crashed later
    # with an UnboundLocalError on `model`.
    raise ValueError(
      "Unknown modeltype {0!r}; expected one of {1}".format(modeltype, sorted(model_factories)))
  model = model_factories[modeltype]()

  ## cv can be an integer or a splitter object; a shuffled KFold is used here
  kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
  cv_results = cross_validate(model, X, y, cv=kfold, scoring=scoring_columns, return_train_score=True)
  return cv_results

In [None]:
def  Run_Classification (x_tr, y_tr, x_tst, y_tst, feature_cols, usescaling=False, show_roc=False, modeltype='mnb', bDebugPrint=False):
  """Fit one classifier on the training split and score it on both splits.

  Parameters
  ----------
  x_tr, y_tr : array-like
      Training features and labels; each is wrapped in ``pd.DataFrame``.
  x_tst, y_tst : array-like
      Test features and labels; each is wrapped in ``pd.DataFrame``.
  feature_cols : sequence
      Kept for backward compatibility. The confusion-matrix axes are classes,
      not features, so the heatmap is now labelled with the class labels.
  usescaling : bool, default False
      When true, ``ZScore_Standarization_Scaling`` is applied to the train and
      test features (each independently).
  show_roc : bool, default False
      Plot a ROC curve. Only honoured when ``bDebugPrint`` is also true.
      Uses the second ``predict_proba`` column, so it assumes a binary target
      and an estimator that exposes ``predict_proba``.
  modeltype : str, default 'mnb'
      One of 'Logistic', 'svm', 'rf', 'nn', 'nb', 'mnb', 'KNN', 'XGB'.
  bDebugPrint : bool, default False
      Print scores, the confusion matrix (text and heatmap) and the
      classification report.

  Returns
  -------
  tuple
      ``(modeltype, train_score, test_score)`` where the scores are the values
      of ``model.score`` on the training and test data.

  Raises
  ------
  ValueError
      If ``modeltype`` is not one of the supported keys.
  """
  supported_models = ('Logistic', 'svm', 'rf', 'nn', 'nb', 'mnb', 'KNN', 'XGB')
  # Validate first so a mistyped key fails immediately with a clear message
  # rather than with an UnboundLocalError at model.score().
  if modeltype not in supported_models:
    raise ValueError(
      "Unknown modeltype {0!r}; expected one of {1}".format(modeltype, list(supported_models)))

  # Local imports keep this notebook cell runnable on its own
  from sklearn import metrics
  from sklearn.linear_model import LogisticRegression
  from sklearn import svm
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.neural_network import MLPClassifier
  from sklearn.naive_bayes import GaussianNB
  from sklearn.naive_bayes import MultinomialNB
  from sklearn.neighbors import KNeighborsClassifier
  from xgboost import XGBClassifier

  x_tr = pd.DataFrame(x_tr)
  y_tr = pd.DataFrame(y_tr)
  x_tst = pd.DataFrame(x_tst)
  y_tst = pd.DataFrame(y_tst)

  # Scale the data
  if usescaling:
    # NOTE(review): ZScore_Standarization_Scaling is not defined in the visible
    # part of this file - confirm it exists before enabling usescaling.
    x_tr = ZScore_Standarization_Scaling(x_tr)
    x_tst = ZScore_Standarization_Scaling(x_tst)

  if bDebugPrint:
    print ( "Running Classifications ")

  # Lazy factories: only the requested estimator is constructed
  model_factories = {
    'Logistic': lambda: LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial'),
    'svm': lambda: svm.SVC(decision_function_shape="ovo"),
    'rf': lambda: RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0),
    'nn': lambda: MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(150, 10), random_state=1),
    'nb': lambda: GaussianNB(),
    'mnb': lambda: MultinomialNB(),
    'KNN': lambda: KNeighborsClassifier(),
    'XGB': lambda: XGBClassifier(),
  }
  model = model_factories[modeltype]().fit(x_tr, y_tr)

  tr_score = model.score(x_tr, y_tr)
  tst_score = model.score(x_tst, y_tst)
  y_pred = model.predict(x_tst)

  if bDebugPrint:
    large_print ( "{0}  Model Accuracy for training  :{1}  ,  Testing :{2}".format( modeltype, tr_score ,   tst_score   ))
    large_print('=============show confusion matrix values=============', 60)
    # Show the Confusion Matrix
    cnf_matrix  = confusion_matrix(y_tst, y_pred)
    print ('\n\t\t \n',cnf_matrix )
    large_print('=============Visualize confusion Matrix=============', 60)
    # Rows/columns of the matrix follow the sorted union of true and predicted
    # labels, so use exactly those as the heatmap tick labels.
    class_labels = np.union1d(np.ravel(y_tst), np.ravel(y_pred))
    fig, ax1 = plt.subplots(figsize=(10,5))
    sns.heatmap(pd.DataFrame(cnf_matrix, index=class_labels, columns=class_labels),
                cmap="YlGnBu" , annot=True,  fmt='g', ax=ax1)
    ax1.xaxis.set_label_position("top")
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    large_print('=============Classification report=============\n', 70)
    print(classification_report(y_tst, y_pred))
    if show_roc:
      ## Show ROC Curve (binary targets only: column 1 is taken as the positive class)
      print ( "Show ROC Curve")
      y_pred_proba = model.predict_proba(x_tst)[:, 1]
      fpr, tpr, _ = metrics.roc_curve(y_tst,  y_pred_proba)
      auc = metrics.roc_auc_score(y_tst, y_pred_proba)
      plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
      plt.legend(loc=4)
      plt.show()
      ## An AUC of 1 is a perfect classifier, 0.5 is a worthless one
      print ('\n\t\t AUC Score  between 0.5 - 1 is perfect classifier. Accuracy of the mdoel is', auc)
  return modeltype, tr_score, tst_score

In [None]:
def  Run_Plot_Classification ( X_train, y_train, X_test, y_test ):
  """Draw calibration (reliability) curves and score histograms for four classifiers.

  Logistic regression, Gaussian naive Bayes, a linear SVC and a random forest
  are each fitted on the training data. For every model the predicted
  positive-class score on the test data is plotted as a calibration curve
  (upper panel) and as a histogram (lower panel).

  Parameters
  ----------
  X_train, y_train : array-like
      Training features and labels; each is wrapped in ``pd.DataFrame``.
  X_test, y_test : array-like
      Test features and labels; each is wrapped in ``pd.DataFrame``.

  Notes
  -----
  Column 1 of ``predict_proba`` is used as the positive-class probability,
  so a binary target is assumed. The figure is shown with ``plt.show()``;
  nothing is returned.
  """
  import numpy as np
  import pandas as pd
  import matplotlib.pyplot as plt

  from sklearn.naive_bayes import GaussianNB
  from sklearn.linear_model import LogisticRegression
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.svm import LinearSVC
  from sklearn.calibration import calibration_curve

  # Seeds NumPy's global RNG (a side effect visible to the caller); the
  # estimators below are created without their own random_state.
  np.random.seed(0)

  X_train = pd.DataFrame(X_train)
  y_train = pd.DataFrame(y_train)
  X_test = pd.DataFrame(X_test)
  y_test = pd.DataFrame(y_test)

  # Classifiers to compare, with their legend labels
  classifiers = [
    (LogisticRegression(), 'Logistic'),
    (GaussianNB(), 'Naive Bayes'),
    (LinearSVC(C=1.0), 'Support Vector Classification'),
    (RandomForestClassifier(), 'Random Forest'),
  ]

  # #############################################################################
  # Plot calibration plots

  plt.figure(figsize=(10, 10))
  ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
  ax2 = plt.subplot2grid((3, 1), (2, 0))

  ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
  for clf, name in classifiers:
    clf.fit(X_train, y_train)
    if hasattr(clf, "predict_proba"):
      prob_pos = clf.predict_proba(X_test)[:, 1]
    else:
      # No probabilities available: min-max rescale the decision scores to [0, 1]
      scores = clf.decision_function(X_test)
      score_range = scores.max() - scores.min()
      if score_range:
        prob_pos = (scores - scores.min()) / score_range
      else:
        # All scores identical: avoid the 0/0 division that would yield NaN
        prob_pos = np.zeros_like(scores, dtype=float)
    fraction_of_positives, mean_predicted_value = \
        calibration_curve(y_test, prob_pos, n_bins=10)

    ax1.plot(mean_predicted_value, fraction_of_positives, "s-",
             label="%s" % (name, ))

    ax2.hist(prob_pos, range=(0, 1), bins=10, label=name,
             histtype="step", lw=2)

  ax1.set_ylabel("Fraction of positives")
  ax1.set_ylim([-0.05, 1.05])
  ax1.legend(loc="lower right")
  ax1.set_title('Calibration plots  (reliability curve)')

  ax2.set_xlabel("Mean predicted value")
  ax2.set_ylabel("Count")
  ax2.legend(loc="upper center", ncol=2)

  plt.tight_layout()
  plt.show()

### PreProcessing_of_Data

In [None]:
def describe_features ( dset, featurelist, NumberValues=True):
  """Summarise the selected columns with ``DataFrame.describe``.

  With ``NumberValues`` equal to ``True`` only the min / max / mean / std rows
  are kept, transposed to one row per feature and sorted by ascending ``max``.
  Otherwise the unmodified ``describe()`` table is returned.
  """
  summary = dset[featurelist].describe()
  if not (NumberValues == True):
    return summary
  per_feature = summary.loc[['min','max','mean', 'std']].T
  return per_feature.sort_values('max')

def find_unique_values_in_Colums ( dset, bprintdebug = False):
  """Return the columns that hold fewer than 50 distinct values.

  A column named 'Id' is never reported. When ``bprintdebug`` equals ``True``
  the column counts and each reported column's distinct values are printed.
  """
  verbose = (bprintdebug == True)
  if verbose:
    print('\nAll Feature Count: ', len(dset.columns))

  discrete_feature = []
  for column in dset.columns:
    if column in ['Id']:
      continue
    if len(dset[column].unique()) < 50:
      discrete_feature.append(column)

  if verbose:
    print('\nDiscrete Feature Count: ', len(discrete_feature))
    for column in discrete_feature:
      print ( '\nUnique lables for feature :  {0} \n {1}'.format( column, dset[column].unique() )  )
  return discrete_feature

# What percentage of data is clean?
def calculate_missing_values_percent_by_columns(diff):
  """Print, for every column, the percentage of non-null and of null entries.

  The missing percentage is rounded to three decimals; the clean percentage
  is 100 minus that rounded value. Nothing is returned.
  """
  row_template = '{0: >20} - pecent clean: {1: >7}% - percent missing: {2: >7}%'
  for column in diff.columns:
    missing_share = round(np.mean(diff[column].isnull()) * 100, 3)
    clean_share = 100 - missing_share
    print(row_template.format(column, clean_share, missing_share))
def missing_zero_values_table(df):
  """Tabulate zero and missing counts per column.

  The table has one row per column of ``df`` with the number of zeros, the
  number and percentage of nulls, their combined total and percentage, the
  column dtype and the row count. Only columns with at least one null are
  kept, sorted by descending null percentage and rounded to three decimals.
  """
  n_rows = len(df)
  null_counts = df.isnull().sum()
  zero_counts = (df == 0.00).astype(int).sum(axis=0)

  mz_table = pd.concat(
      [zero_counts, null_counts, 100 * (null_counts / n_rows)],
      axis=1,
      keys=['Zero Values', 'Missing Values', '% of Missing Values'],
  )
  mz_table['Total Zero Missing Values'] = mz_table['Zero Values'] + mz_table['Missing Values']
  mz_table['% Total Zero Missing Values'] = 100 * (mz_table['Total Zero Missing Values'] / n_rows)
  mz_table['Data Type'] = df.dtypes
  mz_table['Data Length'] = n_rows

  # keep only the columns that actually contain nulls
  has_missing = mz_table['Missing Values'] != 0
  ranked = mz_table[has_missing].sort_values('% of Missing Values', ascending=False)
  return ranked.round(3)
def  find_Null_Value_Cnt_by_Feature ( dset):
  """Return the 25 columns with the most nulls as a one-column table.

  The result has a single ``Null Count`` column, is sorted by descending
  count, and its index is named ``Feature``.
  """
  null_counts = dset.isnull().sum()
  top_nulls = null_counts.sort_values(ascending=False).head(25)
  nulls_df = top_nulls.to_frame('Null Count')
  return nulls_df.rename_axis('Feature')

def PreProcessing_Step1 ( dset , bprintdebug = False) :
  """Split the columns into numeric and discrete categorical feature lists.

  Parameters
  ----------
  dset : pandas.DataFrame
      Data set to inspect; it is not modified.
  bprintdebug : bool, default False
      When equal to ``True``, print a sample of the data, its shape and size,
      the null counts and descriptive statistics.

  Returns
  -------
  tuple of (list, list)
      ``num_features`` - names of all numeric columns.
      ``cat_features`` - names of the non-numeric columns, excluding 'ID' and
      'Name', that ``find_unique_values_in_Colums`` reports as discrete.
  """
  num_features = dset.select_dtypes([np.number]).columns.tolist()
  # identifier-like columns are never treated as categorical features
  cat_features = [col for col in dset.select_dtypes( exclude = [np.number] ).columns.tolist()
                  if col not in ('ID', 'Name')]
  if ( bprintdebug == True ):
    print ( '\nSample data\n {0} \n'.format(dset.head(5)) )
    print ( "\nDataFrame shape: {0}\n".format(dset.shape))
    print ( "\nDataFrame size: {0}\n".format(dset.size))
    print ("\n num_features: {0} \n cat_features: {1} \n".format(num_features,cat_features  ) )
    print   ("""
              *********************************
              Percent of Cleanness in the data - Method
              *********************************
             """)
    print ( find_Null_Value_Cnt_by_Feature(dset ) )

    print ("""
    *********************************
    Data Stats are 
    *********************************
          """)
    print ( "\nNumerica feature stats are \n")
    df_sta1 = describe_features(dset , num_features).sort_values(by='max', ascending=False)
    print ( df_sta1 )
    print ("\nCategorica feature stats are \n", describe_features(dset , cat_features, NumberValues=False) )
    print ( """
    *********************************
    Finding Unique values in discrete features
    *********************************
          """)
  discrete_features = set(find_unique_values_in_Colums ( dset, bprintdebug))
  # Keep only categorical columns that are also discrete. The earlier loop
  # compared against the literal string 'cval' and removed items from the
  # list while iterating over it, so it dropped or skipped entries regardless
  # of the data. Building a new list avoids both problems.
  cat_features = [col for col in cat_features if col in discrete_features]
  return num_features, cat_features

## HCV Data ( UCI ML Data Repository )

Data location : https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv

      The data set contains laboratory values of blood donors and Hepatitis C patients and demographic values like age.

      The target attribute for classification is Category: blood donors vs. Hepatitis C, including its progression ('just' Hepatitis C, Fibrosis, Cirrhosis).


      Attribute Information:

      All attributes except Category and Sex are numerical. The laboratory data are the attributes 5-14.
      1) X (Patient ID/No.)
      2) Category (diagnosis) (values: '0=Blood Donor', '0s=suspect Blood Donor', '1=Hepatitis', '2=Fibrosis', '3=Cirrhosis')
      3) Age (in years)
      4) Sex (f,m)
      5) ALB
      6) ALP
      7) ALT
      8) AST
      9) BIL
      10) CHE
      11) CHOL
      12) CREA
      13) GGT
      14) PROT

### Step1 - Load the data

In [None]:
# Column names applied to the CSV in place of its own header row
db_file_Col_Names = ['Patient_ID', 'Category', 'Age', 'Sex', 'ALB', 'ALP', 'ALT', 'AST',
       'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT']

dataurl = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv'
## Read the CSV with the predefined column list:
##   skiprows=1      -> skip the file's own header row
##   na_values='?'   -> additionally treat '?' as a missing value
##   index_col=False -> do not use the first column as the index
df_base = pd.read_csv(dataurl, names = db_file_Col_Names, skiprows=1, na_values='?', index_col=False)

### Step2 - Explore the data

In [None]:
# Print the exploration report and collect the numeric / categorical column names
num_Features,  cat_Features = PreProcessing_Step1(df_base, bprintdebug=True)


Sample data
    Patient_ID       Category  Age Sex  ...    CHOL      CREA      GGT     PROT
0           1  0=Blood Donor   32   m  ... 3.23000 106.00000 12.10000 69.00000
1           2  0=Blood Donor   32   m  ... 4.80000  74.00000 15.60000 76.50000
2           3  0=Blood Donor   32   m  ... 5.20000  86.00000 33.20000 79.30000
3           4  0=Blood Donor   32   m  ... 4.74000  80.00000 33.80000 75.70000
4           5  0=Blood Donor   32   m  ... 4.32000  76.00000 29.90000 68.70000

[5 rows x 14 columns] 


DataFrame shape: (615, 14)


DataFrame size: 8610


 num_features: ['Patient_ID', 'Age', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT'] 
 cat_features: ['Category', 'Sex'] 


              *********************************
              Percent of Cleanness in the data - Method
              *********************************
             
            Null Count
Feature               
ALP                 18
CHOL                10
PROT                 1
ALT 

In [None]:
## Show the distinct values of the two non-numeric columns, Sex and Category
print ( "Unique values in Sex feature are " , df_base['Sex'].unique() )
print ( "Unique values in Category feature are " , df_base['Category'].unique()  )

Unique values in Sex feature are  ['m' 'f']
Unique values in Category feature are  ['0=Blood Donor' '0s=suspect Blood Donor' '1=Hepatitis' '2=Fibrosis'
 '3=Cirrhosis']


### Drop missing values

In [None]:
## Drop every row that has at least one missing value.
## .copy() makes df_HCV an independent frame, so the columns added to it in the
## encoding steps are plain assignments rather than writes to a possible view of df_base.
df_HCV = df_base.dropna(axis = 0, how ='any').copy()


print("\nCleaning NA values in training set \n\tLength before: {0} Length After : {1}".format( len(df_base), len(df_HCV)    )    ) 
print("\n\tNumber of rows with at least 1 NA value: ", ( len(df_base) - len(df_HCV)  )) 



Cleaning NA values in training set 
	Length before: 615 Length After : 589

	Number of rows with at least 1 NA value:  26


### Step3 - Encoding

We need to encode the categorical values (Sex and Category) as numerical values.

In [None]:
# Encode the Sex column as a binary integer column, Gender_index
from sklearn.preprocessing import OrdinalEncoder
# This encoder instance is only referenced by the disabled snippet in the string literal below
ordinalencoder = OrdinalEncoder()
'''
df_HCV['Gender_index'] = ordinalencoder.fit_transform(df_HCV[['Sex']])

'''

# 'm' (after stripping surrounding whitespace) maps to 0, every other value to 1
df_HCV["Gender_index"] = df_HCV["Sex"].apply(lambda x:0 if x.strip() =='m' else 1)
print ( "Unique values in Gender_index feature are " , df_HCV['Gender_index'].unique() )

Unique values in Gender_index feature are  [0 1]


In [None]:
## Encode the Category target column.
## Values look like '0=Blood Donor': the text before '=' is the class code and the rest is the
## class name. n=1 splits on the first '=' only, so exactly two columns are always produced.
df_HCV[['Category_index','Category_Name']] = df_HCV.Category.str.split("=", n=1, expand=True) 
print ( "Unique values in Category_index feature are " , df_HCV['Category_index'].unique() )
## LabelEncoder expects a 1-D array of labels, so pass the Series rather than a one-column DataFrame
df_HCV['Category_index'] = LabelEncoder().fit_transform(df_HCV['Category_index'])
print ( "Unique values in Category_index feature are " , df_HCV['Category_index'].unique() )

Unique values in Category_index feature are  ['0' '0s' '1' '2' '3']
Unique values in Category_index feature are  [0 1 2 3 4]


### Step4 - Feature Engineering

In [None]:
##df_HCV.head() 

# Start from every numeric column of the cleaned, encoded frame
num_features = df_HCV.select_dtypes([np.number]).columns.tolist()
# Define the model inputs (features) and the output to predict (Category_index).
# NOTE: feature_labels is the same list object as num_features, so the removals
# below also change num_features.
feature_labels = num_features

## drop Patient_ID and the target variable from feature_labels
if ( feature_labels.count('Patient_ID') >0  ):
  feature_labels.remove('Patient_ID')
if ( feature_labels.count('Category_index') >0  ):
  feature_labels.remove('Category_index')
target_variable = ['Category_index']


##df_filtered = pd.DataFrame ( df_HCV, columns=list(feature_labels))
##df_New = pd.DataFrame ( dset, columns=list(num_features))

# X holds the selected feature columns, y the encoded target
X = pd.DataFrame ( df_HCV, columns=list(feature_labels))
y = df_HCV['Category_index']
# Last expression of the cell: the notebook displays both objects
X, y


(     Age      ALB       ALP      ALT  ...      CREA       GGT     PROT  Gender_index
 0     32 38.50000  52.50000  7.70000  ... 106.00000  12.10000 69.00000             0
 1     32 38.50000  70.30000 18.00000  ...  74.00000  15.60000 76.50000             0
 2     32 46.90000  74.70000 36.20000  ...  86.00000  33.20000 79.30000             0
 3     32 43.20000  52.00000 30.60000  ...  80.00000  33.80000 75.70000             0
 4     32 39.20000  74.10000 32.60000  ...  76.00000  29.90000 68.70000             0
 ..   ...      ...       ...      ...  ...       ...       ...      ...           ...
 608   58 34.00000  46.40000 15.00000  ...  56.00000  49.70000 80.60000             1
 609   59 39.00000  51.30000 19.60000  ... 136.10000 101.10000 70.50000             1
 610   62 32.00000 416.60000  5.90000  ...  55.70000 650.90000 68.50000             1
 611   64 24.00000 102.80000  2.90000  ...  63.00000  35.90000 71.30000             1
 612   64 29.00000  87.30000  3.50000  ...  66.70000  

### Step5 - Run Classifications

In [None]:
print ( "features {0} \ntarget : {1}".format( feature_labels, target_variable))

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
x_train , x_test , y_train , y_test = train_test_split(X , y, test_size = 0.20, random_state = 42)
print( "Size of \nx_train :{}  y_train :{} \nx_test :{}  y_test :{}".format(  x_train.shape ,  y_train.shape , x_test.shape, y_test.shape )  )

features ['Age', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT', 'Gender_index'] 
target : ['Category_index']
Size of 
x_train :(471, 12)  y_train :(471,) 
x_test :(118, 12)  y_test :(118,)


##### Find score by multiple ML Models

In [None]:
# Model keys understood by Run_Classification
models = ['mnb', 'Logistic' , 'svm', 'rf' , 'nn', 'nb','KNN','XGB']
model_results = []
# Fit every model on the same train/test split and collect (name, train score, test score)
for model_name in models:
  mdl_results =  Run_Classification (x_train , y_train , x_test , y_test ,  X.columns, modeltype= model_name   )
  model_results.append(mdl_results)
##print ( model_results)
# One row per algorithm for side-by-side comparison
df = pd.DataFrame(model_results, columns =['Algorithm', 'Training_Score', 'Testing_Score' ])   
print ("================= Scores of Algorithms ===================== \n\n")
print(df)  


  Algorithm  Training_Score  Testing_Score
0       mnb         0.94268        0.86441
1  Logistic         0.98089        0.92373
2       svm         0.95117        0.88136
3        rf         1.00000        0.88983
4        nn         1.00000        0.91525
5        nb         0.95117        0.88136
6       KNN         0.95329        0.89831
7       XGB         1.00000        0.92373


#### Cross validation

In [None]:
 ## More than one scoring metric can be requested at once
scoring_metrics = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']
models = ['Logistic',  'svm', 'rf' , 'nn', 'nb', 'mnb', 'KNN','XGB']
cv_results = []   # raw cross_validate output per model
cv_stats  = []    # one summary row per model (mean over the folds)
for model_name in models:
  cv =  run_cross_Validation (X,y, scoring_metrics, model_name)
  cv_results.append(cv)
  # Row layout: model name, then test/train mean for each metric, then mean score and fit times
  cv_stats.append( [model_name 
                    , cv['test_accuracy'].mean(),  cv['train_accuracy'].mean()  
                    , cv['test_precision_weighted'].mean(),  cv['train_precision_weighted'].mean()  
                    , cv['test_recall_weighted'].mean(),  cv['train_recall_weighted'].mean()  
                    , cv['test_f1_weighted'].mean(),  cv['train_f1_weighted'].mean()  
                    , cv['score_time'].mean(),  cv['fit_time'].mean()
                                        ]  
                   )


In [None]:
print ("================= Cross validation of Algorithms - Stats ===================== \n\n")
# Column order must match the order in which each cv_stats row was assembled
# (pw = precision_weighted, rw = recall_weighted, fw = f1_weighted)
df_cv = pd.DataFrame(cv_stats, 
                     columns = ['ModelName' , 'tst_Accuracy' , 'train_Accuracy',  'tst_pw' , 'train_pw' , 'tst_rw', 'train_rw' , 'tst_fw', 'train_fw' , 'score_time' , 'fit_time']
                     , index=None)
# Last expression of the cell: the notebook displays up to 20 rows
df_cv.head(20) 





Unnamed: 0,ModelName,tst_Accuracy,train_Accuracy,tst_pw,train_pw,tst_rw,train_rw,tst_fw,train_fw,score_time,fit_time
0,Logistic,0.94231,0.97698,0.95408,0.97647,0.94231,0.97698,0.94177,0.97646,0.00459,0.05641
1,svm,0.9322,0.94775,0.90416,0.92982,0.9322,0.94775,0.91632,0.93671,0.00405,0.00595
2,rf,0.94404,1.0,0.92259,1.0,0.94404,1.0,0.93152,1.0,0.07859,1.79723
3,nn,0.94056,0.9966,0.94774,0.99723,0.94056,0.9966,0.93943,0.99676,0.00573,1.03421
4,nb,0.92709,0.93869,0.93525,0.9416,0.92709,0.93869,0.92935,0.93877,0.00469,0.00298
5,mnb,0.91517,0.92153,0.93016,0.94169,0.91517,0.92153,0.91944,0.92754,0.00329,0.00215
6,KNN,0.92361,0.94605,0.90473,0.93812,0.92361,0.94605,0.90887,0.93777,0.00618,0.00243
7,XGB,0.94912,1.0,0.94309,1.0,0.94912,1.0,0.9417,1.0,0.00468,0.18836
