# Simplilearn Cross-Validation (CV) YouTube Tutorial

## Import modules and load dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split, KFold

# Read the census income CSV from the working directory.
df = pd.read_csv('adult.csv')

# Per-column count of values pandas recognises as missing (NaN/None).
# NOTE(review): the preview rows displayed further down contain literal '?'
# entries (e.g. in workclass / occupation / native.country); those are plain
# strings and are therefore not counted here.
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

## Explore data

### Raw data

In [2]:
# Show the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [3]:
# Show the first ten rows of the raw frame.
df.head(n=10)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K
7,74,State-gov,88638,Doctorate,16,Never-married,Prof-specialty,Other-relative,White,Female,0,3683,20,United-States,>50K
8,68,Federal-gov,422013,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Female,0,3683,40,United-States,<=50K
9,41,Private,70037,Some-college,10,Never-married,Craft-repair,Unmarried,White,Male,0,3004,60,?,>50K


### Convert gender and marital status to ints

In [4]:
# Binary-encode `sex`: Male -> 0, Female -> 1.
# NOTE: Series.map() yields NaN for any value that is not a key of the dict,
# so running this cell a second time (on the already-encoded column) would
# overwrite every entry with NaN. Re-load the data before re-running.
sex_codes = {'Male': 0, 'Female': 1}
df['sex'] = df['sex'].map(sex_codes)

# Collapse marital status into a binary flag:
#   0 -> currently not married, 1 -> married (any variant).
not_married = ['Never-married', 'Divorced', 'Separated', 'Widowed']
married = ['Married-civ-spouse', 'Married-spouse-absent',
           'Married-AF-spouse', 'Married']

marital = df['marital.status']
marital = marital.replace(not_married, 0)
marital = marital.replace(married, 1)

# Any label not listed above is still a string at this point, so the cast
# below raises instead of silently passing it through.
df['marital.status'] = marital.astype(int)

### Drop some features

In [5]:
# Categorical columns that are not used as model features.
unused_columns = [
    'workclass',
    'education',
    'occupation',
    'relationship',
    'race',
    'native.country',
]

# Remove them from the frame in place, then preview the remaining columns.
df.drop(columns=unused_columns, inplace=True)

df.head(n=10)

Unnamed: 0,age,fnlwgt,education.num,marital.status,sex,capital.gain,capital.loss,hours.per.week,income
0,90,77053,9,0,1,0,4356,40,<=50K
1,82,132870,9,0,1,0,4356,18,<=50K
2,66,186061,10,0,1,0,4356,40,<=50K
3,54,140359,4,0,1,0,3900,40,<=50K
4,41,264663,10,0,1,0,3900,40,<=50K
5,34,216864,9,0,1,0,3770,45,<=50K
6,38,150601,6,0,0,0,3770,40,<=50K
7,74,88638,16,0,1,0,3683,20,>50K
8,68,422013,9,0,1,0,3683,40,<=50K
9,41,70037,10,0,0,0,3004,60,>50K


## Build Model

In [6]:
# Separate the target (`income`) from the feature columns; `df` holds only
# the features from here on.
y = df['income']
df = df.drop(columns='income')

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# (label, estimator) pairs to compare; both use their default settings.
models = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
]

## Run Cross Validation

### Prediction Models

In [8]:
# label -> (mean accuracy, std of accuracy) across the folds
results = {}
names = []

for name, model in models:
    # 10 consecutive (unshuffled) folds over the training split.
    kfold = KFold(n_splits=10)
    cv_results = cross_val_score(
        model,
        X_train,
        y_train,
        cv=kfold,
        scoring='accuracy',
    )
    names.append(name)
    results[name] = cv_results.mean(), cv_results.std()

# Header row followed by one "label (mean, std)" line per model.
print('name\tresults.mean\t\treults.std')
for label, stats in results.items():
    print(label, stats)

name	results.mean		reults.std
LR (0.7973362404214983, 0.007293950941515901)
KNN (0.7752998340070347, 0.00659626168189453)


### Validation Methods

In [9]:
# (label, splitter) pairs: plain k-fold versus class-stratified k-fold,
# both with 10 splits.
methods = [
    ('KFold', KFold(n_splits=10)),
    ('StratifiedKFold', StratifiedKFold(n_splits=10)),
]

In [10]:
# splitter label -> (mean accuracy, std of accuracy) across the folds
results = {}

for name, method in methods:
    # A fresh default KNN classifier is scored under each splitting scheme.
    model = KNeighborsClassifier()
    cv_results = cross_val_score(
        model,
        X_train,
        y_train,
        cv=method,
        scoring='accuracy',
    )
    results[name] = cv_results.mean(), cv_results.std()

# Header row followed by one "label (mean, std)" line per splitter.
print('name\tresults.mean\t\treults.std')
for label, stats in results.items():
    print(label, stats)

name	results.mean		reults.std
KFold (0.7752998340070347, 0.00659626168189453)
StratifiedKFold (0.7750691391657896, 0.0050817475398429015)


### Number of Splits

In [11]:
# Candidate fold counts to try: 2, 3, ..., 20 (the stop value 21 is exclusive).
n_splits = np.arange(2, 21, 1)
n_splits

array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20])

In [12]:
# fold count -> (mean accuracy, std of accuracy) across the folds
results = dict()

# Score a default KNN classifier with stratified k-fold CV for every candidate
# fold count in `n_splits`.
#
# The loop variable deliberately has its own name: a `for` target stays bound
# after the loop ends, so reusing the name `n_splits` would replace the array
# of candidates with the last scalar and make this cell fail with
# "TypeError: ... object is not iterable" when it is run a second time.
for n_folds in n_splits:
    model = KNeighborsClassifier()
    skFolds = StratifiedKFold(n_splits=n_folds)
    cv_results = cross_val_score(model, X_train, y_train, cv=skFolds, scoring='accuracy')
    results[n_folds] = (cv_results.mean(), cv_results.std())

# Header row followed by one "fold_count (mean, std)" line per setting.
print('name\tresults.mean\t\treults.std')
for key, value in results.items():
    print(key, value)

name	results.mean		reults.std
2 (0.7694640663390664, 0.0006526412776412527)
3 (0.774070974757945, 0.0007745624496180905)
4 (0.7735718673218674, 0.0018218382244632604)
5 (0.7748771375690015, 0.0024210511038378017)
6 (0.7733033127479203, 0.0033502236758280145)
7 (0.7731882268705886, 0.004515157661881555)
8 (0.7741477272727273, 0.0045604165462161525)
9 (0.7748391745333784, 0.005048915545418364)
10 (0.7750691391657896, 0.0050817475398429015)
11 (0.7756449631449631, 0.005898685140254559)
12 (0.7755296921223133, 0.006958207862973347)
13 (0.7760286677307853, 0.005118593286054497)
14 (0.7751843152889242, 0.005239273887706531)
15 (0.7761444905185503, 0.004900079322996503)
16 (0.7754914004914005, 0.005782607611720741)
17 (0.7750313509287259, 0.0060316253237664725)
18 (0.7744934005412438, 0.007135963214655109)
19 (0.7758753085496087, 0.00854818412968389)
20 (0.7751079571778703, 0.007625524148035953)
