## Resampling Methods and Accuracy

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Source CSV and the column labels assigned to it (handed to read_csv via names=).
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test','mass', 'pedi', 'age', 'class']
data = pd.read_csv(url, names=names)
print(type(data))

# Work on the underlying NumPy array from here on.
array = data.values
# x: the first eight columns (everything except 'class');
# y: the ninth column ('class') as a 1-D NumPy array
x, y = array[:, :8], array[:, 8]
y

<class 'pandas.core.frame.DataFrame'>


array([1., 0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1.,
       1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0.,
       0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
       1., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1., 1.,
       1., 0., 0., 0., 0.

### Simple Cross Validation

In [2]:
from sklearn.model_selection import KFold # splitter: divides the dataset into n_splits folds
from sklearn.model_selection import cross_val_score # fits and scores the model once per split
from sklearn.linear_model import LogisticRegression # classification model (ml algorithm)

num_folds = 10
seed = 1 # fixed so the shuffled folds, and therefore the scores, are the same on every run
# shuffle=True randomizes which rows land in which fold; random_state pins that shuffle
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
# solver='lbfgs' selects the optimization algorithm used to fit the coefficients
# (it is not a penalty term); max_iter=1000 allows up to 1000 solver iterations
model = LogisticRegression(solver='lbfgs', max_iter=1000)
# one score per fold
results = cross_val_score(model, x, y, cv=kfold)
results.mean() * 100 # mean accuracy across the folds, as a percentage

77.59398496240601

In [3]:
100 * results.std() # spread of the per-fold scores around their mean, in percentage points

4.515113637081425

### Division by Percentage (Hold-out Train/Test Split)

In [4]:
from sklearn.model_selection import train_test_split # division by percentage
from sklearn.linear_model import LogisticRegression

seed = 1
test_size = 0.33
# Split x and y into training and testing subsets; test_size is the fraction
# held out for testing and random_state=seed pins the split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=test_size, random_state=seed
)
# fit() returns the estimator itself, so model ends up as the fitted classifier
model = LogisticRegression(solver='lbfgs', max_iter=1000).fit(x_train, y_train)
results = model.score(x_test, y_test)
print(100 * results) # percentage of accuracy

77.55905511811024


### Cross Validation with Repetition

In [5]:
from sklearn.model_selection import RepeatedKFold # Cross Validation with Repetition
from sklearn.model_selection import cross_val_score # validation method
from sklearn.linear_model import LogisticRegression # math model for classification

num_folds = 10
num_repeated = 5
seed = 1 # fixed so every run draws the same folds in each repetition
# The whole num_folds split is repeated num_repeated times with a different
# randomization each time; random_state pins that randomization.
repeatedkfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeated, random_state=seed)
model = LogisticRegression(solver='lbfgs', max_iter=1000)
# one score per split: num_folds * num_repeated scores in total
results = cross_val_score(model, x, y, cv=repeatedkfold)
results.mean() * 100 # percentage of accuracy

77.49658236500342

### Leave One Out Cross Validation

In [6]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score # validation method
from sklearn.linear_model import LogisticRegression # math model for classification

# LeaveOneOut produces one split per sample, each one testing on a single held-out row.
loocv = LeaveOneOut()
model = LogisticRegression(solver='lbfgs', max_iter=1000)
results = cross_val_score(estimator=model, X=x, y=y, cv=loocv)
# Mean first, then std, both as percentages. Every split is scored on a single
# row, so the std describes the spread of those single-row scores rather than
# fold-to-fold variation as in k-fold.
print(results.mean() * 100, results.std() * 100, sep='\n')

77.60416666666666
41.68944689773287


### Division by Percentage with Repetition (ShuffleSplit: repeated random 33% test subsets)

In [7]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score # validation method
from sklearn.linear_model import LogisticRegression # math model for classification

test_size = 0.33
n_splits = 10
seed = 1 # fixed so every run draws the same random train/test subsets
# NOTE: despite its name, kfold holds a ShuffleSplit here, not a KFold: each of
# the n_splits iterations draws a fresh random test subset of size test_size.
# random_state pins those draws so the scores are reproducible.
kfold = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)
model = LogisticRegression(solver='lbfgs', max_iter=1000)
# one score per random split
results = cross_val_score(model, x, y, cv=kfold)
print(results.mean() * 100)
print(results.std() * 100)

76.88976377952756
1.7255596063630807
