In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the dataset from "iris.csv"; the path is relative to the current working directory.
data = pd.read_csv("iris.csv")

In [None]:
# Split the frame by column position: every column except the last one is a
# feature, and the last column is the target label.
# `to_numpy()` is used instead of `.values` (as the pandas documentation
# recommends) because it always returns a plain NumPy ndarray, which is what the
# integer-array indexing `features[train]` / `label[train]` further down expects.
features = data.iloc[:, :-1].to_numpy()
label = data.iloc[:, -1].to_numpy()

# Cross Validation Technique

### Goal:
1. To get the minimum score threshold (the mean cross-validation score)
2. To understand the best score I can achieve on this dataset
3. To extract the training sample (split) that gives the best score

In [None]:
# Demonstrate the score threshold with LogisticRegression.
# The estimator is created with the library's default settings and is handed to
# cross_val_score in the next cell.
from sklearn.linear_model import LogisticRegression
modelAlgo = LogisticRegression()

In [None]:
from sklearn.model_selection import cross_val_score

# Suppress warnings.
# NOTE(review): this installs a global "ignore" filter, so every warning raised
# after this cell is hidden as well - confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

# Cross-validate the estimator with cv=10 (5 or 10 are the usual choices);
# the result holds one score per fold.
scores = cross_val_score(estimator=modelAlgo, X=features, y=label, cv=10)

scores

array([1.        , 0.93333333, 1.        , 1.        , 0.93333333,
       0.93333333, 0.93333333, 1.        , 1.        , 1.        ])

In [None]:
# What is the minimum score threshold for this dataset?
# It is taken to be the mean of the per-fold cross-validation scores.

mean_score = scores.mean()
print("Minimum Score Threshold is : ", mean_score)
# NOTE(review): "SL" presumably means significance level - confirm.
print("Suggested SL value to commit: ", 1 - mean_score)

Minimum Score Threshold is :  0.9733333333333334


In [None]:
# LogisticRegression: minimum score threshold (mean of the 10 cross-validation scores) = 0.9733333333333334

In [None]:
# What is the optimal score I can achieve for this dataset using LogisticRegression?
# Taken here as the highest score among the individual cross-validation folds.
scores.max()

1.0

In [None]:
# 3. Find the K-Fold splits on which LogisticRegression reaches a perfect test score

# Step 1: Initialize the algorithm
from sklearn.linear_model import LogisticRegression
modelAlgo = LogisticRegression()

# Step 2: Initialize the K-Fold cross-validation splitter
from sklearn.model_selection import KFold

# NOTE(review): cross_val_score(cv=10) on a classifier uses stratified, unshuffled
# folds according to the scikit-learn documentation, so these shuffled KFold
# splits are presumably not the same folds that produced `scores` - confirm.
kfold = KFold(n_splits=10,      # same number of folds as passed to cross_val_score
              shuffle=True,     # shuffle the rows before cutting them into folds
              random_state=1)   # fixed seed so every run produces the same splits

# Step 3: Fit on every split and report the splits whose held-out score is perfect.
# enumerate(..., start=1) numbers the splits 1..n_splits, so a printed split
# number can be used to pick the same split again later.
for counter, (train, test) in enumerate(kfold.split(features), start=1):

    # Extract the training set and testing set for this split
    X_train, X_test = features[train], features[test]
    y_train, y_test = label[train], label[test]

    # Fit the model on the training part only
    modelAlgo.fit(X_train, y_train)

    # Score the held-out part once and reuse the value for both the check and the report
    test_score = modelAlgo.score(X_test, y_test)

    if test_score >= 1.0:
        # The train score is only needed for splits that get reported
        train_score = modelAlgo.score(X_train, y_train)
        print("Test Score {} Train Score {} for Sample Split {}".format(test_score, train_score, counter))

Test Score 1.0 Train Score 0.9777777777777777 for Sample Split 1
Test Score 1.0 Train Score 0.9777777777777777 for Sample Split 4
Test Score 1.0 Train Score 0.9703703703703703 for Sample Split 7
Test Score 1.0 Train Score 0.9703703703703703 for Sample Split 9


In [None]:
# Extract the train/test arrays of one chosen K-Fold split

# Step 1: Initialize the algorithm (the estimator is not fitted in this cell)
from sklearn.linear_model import LogisticRegression
modelAlgo = LogisticRegression()

# Step 2: Build the splitter with n_splits=10, shuffle=True, random_state=1.
# The fixed seed makes the split numbering repeatable from run to run.
from sklearn.model_selection import KFold

kfold = KFold(n_splits=10,
              shuffle=True,
              random_state=1)

# Step 3: Walk the splits until the chosen one, keep its arrays, and stop.
chosen_split = 1  # 1-based number of the split to extract

for counter, (train, test) in enumerate(kfold.split(features), start=1):
    if counter == chosen_split:
        X_train, X_test = features[train], features[test]
        y_train, y_test = label[train], label[test]
        break  # the remaining splits are not needed
    
 
    

In [None]:
# NOTE(review): this only creates the split generator; no train/test indices are
# produced until it is iterated. Looks like leftover exploration - confirm whether
# this cell is still needed.
kfold.split(features)

In [None]:
# Train a fresh LogisticRegression on the extracted training arrays, then report
# its score on the matching held-out arrays.
from sklearn.linear_model import LogisticRegression
finalModel = LogisticRegression().fit(X_train, y_train)  # fit() returns the estimator itself
finalModel.score(X_test, y_test)

1.0

In [None]:
# Another method to extract best sample (Optimized Way --> Dealing with Large Data in less time)
# StratifiedShuffleSplit
# 3. Find the splits on which LogisticRegression reaches a perfect test score

# Step 1: Initialize the algorithm
from sklearn.linear_model import LogisticRegression
modelAlgo = LogisticRegression()

# Step 2: Initialize the StratifiedShuffleSplit cross-validation splitter
from sklearn.model_selection import StratifiedShuffleSplit

ss = StratifiedShuffleSplit(n_splits=10,      # same count as the cv value passed to cross_val_score
                            test_size=0.2,    # hold out 20% of the rows in every split
                            random_state=1)   # fixed seed so every run produces the same splits

# Step 3: Fit on every split and report the splits whose held-out score is perfect.
# The labels are passed to split() because the splitter stratifies on them.
# enumerate(..., start=1) numbers the splits 1..n_splits, so a printed split
# number can be used to pick the same split again later.
for counter, (train, test) in enumerate(ss.split(features, label), start=1):

    # Extract the training set and testing set for this split
    X_train, X_test = features[train], features[test]
    y_train, y_test = label[train], label[test]

    # Fit the model on the training part only
    modelAlgo.fit(X_train, y_train)

    # Score the held-out part once and reuse the value for both the check and the report
    test_score = modelAlgo.score(X_test, y_test)

    if test_score >= 1.0:
        # The train score is only needed for splits that get reported
        train_score = modelAlgo.score(X_train, y_train)
        print("Test Score {} Train Score {} for Sample Split {}".format(test_score, train_score, counter))

Test Score 1.0 Train Score 0.9666666666666667 for Sample Split 3
Test Score 1.0 Train Score 0.975 for Sample Split 7
Test Score 1.0 Train Score 0.9583333333333334 for Sample Split 10


In [None]:
# Extract the train/test arrays of one chosen StratifiedShuffleSplit split

# Step 1: Initialize the algorithm (the estimator is not fitted in this cell)
from sklearn.linear_model import LogisticRegression
modelAlgo = LogisticRegression()

# Step 2: Build the StratifiedShuffleSplit splitter with n_splits=10,
# test_size=0.2, random_state=1. The fixed seed makes the split numbering
# repeatable from run to run.
from sklearn.model_selection import StratifiedShuffleSplit

ss = StratifiedShuffleSplit(n_splits=10,
                            test_size=0.2,
                            random_state=1)

# Step 3: Walk the splits until the chosen one, keep its arrays, and stop.
chosen_split = 7  # 1-based number of the split to extract

for counter, (train, test) in enumerate(ss.split(features, label), start=1):
    if counter == chosen_split:
        X_trainSS, X_testSS = features[train], features[test]
        y_trainSS, y_testSS = label[train], label[test]
        break  # the remaining splits are not needed
    
 
    

In [None]:
# Train a fresh LogisticRegression on the arrays extracted from the
# StratifiedShuffleSplit split, then report its score on the matching held-out arrays.
from sklearn.linear_model import LogisticRegression
finalModel = LogisticRegression().fit(X_trainSS, y_trainSS)  # fit() returns the estimator itself
finalModel.score(X_testSS, y_testSS)

1.0