# Cross Validation

## Decision Trees Regression

In [2]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_boston
from sklearn.datasets import load_iris

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings('ignore')

In [3]:
X, y = load_boston(return_X_y=True)
print("X has %d rows and %d columns"  %(X.shape[0],X.shape[1]))
print("y has %d rows"  %(y.shape[0]))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

print("X_train has %d rows and %d columns"  %(X_train.shape[0],X_train.shape[1]))
print("-----------------------------------")
print("The coefficient of determination for the test data is R2=%.2f"
      %(model.score(X_test, y_test)))
print("The coefficient of determination for the train data is R2=%.2f"
      %(model.score(X_train, y_train)))


X has 506 rows and 13 columns
y has 506 rows
X_train has 404 rows and 13 columns
-----------------------------------
The coefficient of determination for the test data is R2=0.86
The coefficient of determination for the train data is R2=1.00


In [4]:
scores=cross_val_score(model, X_train, y_train, cv=5)
print("Cross validation scores: ", scores)
print("Score stats: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Cross validation scores:  [0.79656559 0.85688753 0.82954233 0.76022679 0.76608302]
Score stats: 0.80 accuracy with a standard deviation of 0.04


## Decision Trees Classification

In [5]:
X, y = load_iris(return_X_y=True)
print("X has %d rows and %d columns"  %(X.shape[0],X.shape[1]))
print("y has %d rows"  %(y.shape[0]))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
print("The (mean) accuracy on the test set is %.2f" %(model.score(X_test, y_test)))
print("The (mean) accuracy on the train data is %.2f" %(model.score(X_train, y_train)))

X has 150 rows and 4 columns
y has 150 rows
The (mean) accuracy on the test set is 0.97
The (mean) accuracy on the train data is 1.00


<b> multi class problem 

In [6]:
set(y) # 3 labels

{0, 1, 2}

In [7]:
scores=cross_val_score(model, X_train, y_train, cv=5) # 5 cv foldings
scores

array([1.        , 0.91666667, 0.875     , 0.91666667, 0.95833333])

In [8]:
print("Five-fold cv results: \n %0.2f mean accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Five-fold cv results: 
 0.93 mean accuracy with a standard deviation of 0.04


In [9]:
y_test

array([1, 0, 1, 0, 2, 1, 2, 1, 2, 0, 1, 1, 1, 0, 1, 2, 2, 2, 1, 0, 1, 0,
       1, 0, 2, 1, 2, 1, 0, 2])

In [10]:
y_pred = cross_val_predict(model, X_test, y_test, cv=5)
y_pred

array([1, 0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 2, 1, 0, 1, 2, 2, 2, 1, 0, 1, 0,
       1, 0, 2, 1, 2, 1, 0, 2])

### Models Comparision (3 regression estimators)

In [11]:
model1 = DecisionTreeRegressor()
model2 = LinearRegression()
model3 = KNeighborsRegressor()

model_pipeline = [model1, model2, model3]
model_names = ['Regression Tree', 'Linear Regression', 'KNN']
scores = {}
i=0
for model in model_pipeline:
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=5))
    scores[model_names[i]] = mean_score
    i = i+1
print(scores)

{'Regression Tree': 0.9182379804438309, 'Linear Regression': 0.9237415139637886, 'KNN': 0.9544323733255992}


In [12]:
print("Comparing the 3 regression scores we find \n")

pd.DataFrame([scores], index=["score"])

Comparing the 3 regression scores we find 



Unnamed: 0,Regression Tree,Linear Regression,KNN
score,0.918238,0.923742,0.954432
