### K-fold evaluation

In [1]:
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=7,
    n_redundant=3,
    n_classes=2,
    weights=[0.3,0.7],
    random_state = 42
)

In [2]:
from collections import Counter

Counter(y)

Counter({np.int64(1): 699, np.int64(0): 301})

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
print(Counter(y_train), Counter(y_test))

Counter({np.int64(1): 558, np.int64(0): 242}) Counter({np.int64(1): 141, np.int64(0): 59})


In [5]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

model.score(X_test, y_test)

0.81

In [6]:
import numpy as np
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)

X = np.array(X)
y = np.array(y)

scores = []

for train_index, test_index in  kf.split(X):  #pass numpy array not pandas DataFrame
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))

scores

# np.mean(scores) #output = np.float64(0.913)

[0.81, 0.78, 0.825, 0.785, 0.805]

### Using Cross_val_score

In [7]:
from sklearn.model_selection import cross_val_score
import time

start = time.time()
score = cross_val_score(LogisticRegression(), X, y, cv=kf)
end = time.time()

mean = np.mean(score)

print(score, end-start, mean)

[0.81  0.78  0.825 0.785 0.805] 0.03403735160827637 0.8009999999999999


In [8]:
from sklearn.tree import DecisionTreeClassifier

start = time.time()
score = cross_val_score(DecisionTreeClassifier(), X, y, cv=kf)
end = time.time()

mean = np.mean(score)

print(score, end-start, mean)

[0.865 0.88  0.885 0.835 0.885] 0.03964996337890625 0.8699999999999999


In [9]:
from sklearn.ensemble import RandomForestClassifier


start = time.time()
score = cross_val_score(RandomForestClassifier(), X, y, cv=kf)
end = time.time()

mean = np.mean(score)

print(score, end-start, mean)

[0.91  0.93  0.93  0.895 0.94 ] 1.2470118999481201 0.921


In [10]:
from xgboost import XGBClassifier

start = time.time()
score = cross_val_score(XGBClassifier(), X, y, cv=kf, scoring='accuracy')
end = time.time()

mean = np.mean(score)

print(score, end-start, mean)

[0.925 0.94  0.945 0.9   0.96 ] 1.3628361225128174 0.9339999999999999


In [11]:
### Using StratifiedKFold

In [12]:
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=7,
    n_redundant=3,
    n_classes=2,
    weights=[0.3,0.7],
    random_state=42
)



In [13]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

print(Counter(y_train), Counter(y_test))

Counter({np.int64(1): 559, np.int64(0): 241}) Counter({np.int64(1): 140, np.int64(0): 60})


In [14]:
start = time.time()
score = cross_val_score(LogisticRegression(), X, y, cv=skf)
end = time.time()

mean = np.mean(score)

print(score, end-start, mean)

[0.8   0.79  0.84  0.815 0.775] 0.13008451461791992 0.804


In [15]:
start = time.time()
score = cross_val_score(XGBClassifier(), X, y, cv=skf)
end = time.time()

mean = np.mean(score)

print(score, end-start, mean)

[0.925 0.95  0.925 0.94  0.95 ] 1.5359528064727783 0.938


### Using GridSearchCV

In [16]:
from sklearn.model_selection import GridSearchCV

help(GridSearchCV)

Help on class GridSearchCV in module sklearn.model_selection._search:

class GridSearchCV(BaseSearchCV)
 |  GridSearchCV(
 |      estimator,
 |      param_grid,
 |      *,
 |      scoring=None,
 |      n_jobs=None,
 |      refit=True,
 |      cv=None,
 |      verbose=0,
 |      pre_dispatch='2*n_jobs',
 |      error_score=nan,
 |      return_train_score=False
 |  )
 |
 |  Exhaustive search over specified parameter values for an estimator.
 |
 |  Important members are fit, predict.
 |
 |  GridSearchCV implements a "fit" and a "score" method.
 |  It also implements "score_samples", "predict", "predict_proba",
 |  "decision_function", "transform" and "inverse_transform" if they are
 |  implemented in the estimator used.
 |
 |  The parameters of the estimator used to apply these methods are optimized
 |  by cross-validated grid-search over a parameter grid.
 |
 |  Read more in the :ref:`User Guide <grid_search>`.
 |
 |  Parameters
 |  ----------
 |  estimator : estimator object
 |      Thi

In [17]:
import pandas as pd
parameters = {'criterion': ["gini", "entropy", "log_loss"]}
clf = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=parameters)
clf.fit(X, y)

pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.023227,0.00869,0.002333,0.00021,gini,{'criterion': 'gini'},0.865,0.865,0.89,0.86,0.86,0.868,0.011225,2
1,0.022361,0.002818,0.00253,0.000687,entropy,{'criterion': 'entropy'},0.85,0.88,0.87,0.855,0.855,0.862,0.011225,3
2,0.018798,0.00153,0.002192,0.000388,log_loss,{'criterion': 'log_loss'},0.85,0.9,0.885,0.86,0.86,0.871,0.018547,1


### Using RandomizedSearchCV

In [23]:
from sklearn.model_selection import RandomizedSearchCV

parameters = {'criterion': ["gini", "entropy", "log_loss"], 'splitter': ['best', 'random'], 'max_depth': [20, 30]}
clf = RandomizedSearchCV(estimator=DecisionTreeClassifier(), param_distributions=parameters, cv=5, n_iter = 5)
clf.fit(X, y)

pd.DataFrame(clf.cv_results_)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_splitter,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.013367,0.015921,0.002422,0.000326,random,30,entropy,"{'splitter': 'random', 'max_depth': 30, 'crite...",0.855,0.895,0.885,0.84,0.855,0.866,0.020591,2
1,0.022262,0.00606,0.001645,0.000413,best,20,entropy,"{'splitter': 'best', 'max_depth': 20, 'criteri...",0.855,0.885,0.885,0.86,0.85,0.867,0.015033,1
2,0.004163,0.000404,0.001657,0.000276,random,20,log_loss,"{'splitter': 'random', 'max_depth': 20, 'crite...",0.83,0.855,0.905,0.83,0.855,0.855,0.027386,4
3,0.003652,0.000608,0.001498,0.000167,random,30,gini,"{'splitter': 'random', 'max_depth': 30, 'crite...",0.815,0.885,0.865,0.9,0.805,0.854,0.037736,5
4,0.003568,0.000379,0.002074,0.001724,random,20,entropy,"{'splitter': 'random', 'max_depth': 20, 'crite...",0.865,0.88,0.86,0.84,0.86,0.861,0.012806,3


In [21]:
print(f"Best Parameters: {clf.best_params_}")
print(f"Best Cross-Validation Score: {clf.best_score_}")

Best Parameters: {'splitter': 'best', 'max_depth': 30, 'criterion': 'gini'}
Best Cross-Validation Score: 0.8690000000000001
