In [2]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
import warnings
warnings.filterwarnings("ignore") 

In [3]:
data = load_digits()

In [4]:
data.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])

In [5]:
 X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.33, random_state=42)

In [6]:
X_train.shape

(1203, 64)

In [7]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_test , y_test)

0.9629629629629629

In [8]:
sv = SVC()
sv.fit(X_train, y_train)
sv.score(X_test , y_test)

0.9865319865319865

In [9]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf.score(X_test , y_test)

0.9747474747474747

## This func use to get score of different algorithm

In [10]:
def get_score(model , X_train, y_train, X_test , y_test):
    model.fit(X_train , y_train)
    return model.score(X_test , y_test)


In [11]:
get_score(lr ,X_train, y_train, X_test , y_test )

0.9629629629629629

In [12]:
get_score(sv ,X_train, y_train, X_test , y_test )

0.9865319865319865

In [13]:
get_score(rf ,X_train, y_train, X_test , y_test )

0.9730639730639731

## Drawback of train_test_split

when we use train_test_split we get 80% training sample and 20% testing sample 
but the model didn't train on the 20% sample which goes in testing so model can't 
predict accurate result for that testing samples

# KFold

In [14]:
from sklearn.model_selection import KFold

In [15]:
fold = KFold(n_splits=3)

In [16]:
 l=[1,2,3,4,5,6,7,8,9] 

In [17]:
fold.split(l)

<generator object _BaseKFold.split at 0x000001F48E454DC0>

In [18]:
for train_index, test_index in fold.split(l):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


# StratifiedFold

In [19]:
from sklearn.model_selection import StratifiedKFold

In [20]:
kf = StratifiedKFold(n_splits=3)

i- o 
1- 0
2- 0 
3- 0
4- 1
5- 1
6- 1
7- 2
8- 2
9- 2

In [21]:
l1 =[0,0,0,1,1,1,2,2,2]

In [22]:
for train_index, test_index in kf.split(l, l1):
    print(train_index, test_index)

[1 2 4 5 7 8] [0 3 6]
[0 2 3 5 6 8] [1 4 7]
[0 1 3 4 6 7] [2 5 8]


## KFold on digits datasets:
    

In [23]:
lr_score=[]
svm_score = []
rf_score =[]

In [24]:
for train_index, test_index in kf.split(data.data, data.target):
#     print(train_index, test_index)
    X_train , X_test= data.data[train_index] , data.data[test_index] 
    y_train , y_test = data.target[train_index] , data.target[test_index] 
    lr_score.append(get_score(lr , X_train , y_train , X_test , y_test))
    svm_score.append(get_score(sv , X_train , y_train , X_test , y_test))
    rf_score.append(get_score(rf , X_train , y_train , X_test , y_test))

In [25]:
lr_score , svm_score , rf_score

([0.9215358931552587, 0.9415692821368948, 0.9165275459098498],
 [0.9649415692821369, 0.9799666110183639, 0.9649415692821369],
 [0.9365609348914858, 0.9565943238731218, 0.9298831385642737])

In [26]:
np.mean(lr_score)


0.9265442404006677

In [27]:
np.mean(svm_score)

0.9699499165275459

In [28]:
np.mean(rf_score)

0.9410127991096271

## Cross_val_score to directly get score 

In [29]:
from sklearn.model_selection import cross_val_score

In [30]:
np.mean(cross_val_score(LogisticRegression() , data.data , data.target , cv =5))

0.9137650882079852

In [49]:
np.mean(cross_val_score(SVC(kernel="sigmoid") , data.data , data.target , cv =5))

0.8803775920767564

In [32]:
np.mean(cross_val_score(RandomForestClassifier() , data.data , data.target , cv =5))

0.9382466728567007