# Table of Contents
* [Loading Data](#loading-data)
* [Statistical Summary and Class Breakdown](#statistical-summary)
* [Dropping Rows with Missing Values](#dropping-rows)
* [Data Imputation](#data-imputation)
* [Min-max Scaling](#min-max-scaling)
* [Standardization](#standardization)
* [Robust Scaling](#robust-scaling)
* [Categorical Data](#categorical-data)
* [Feature Engineering](#feature-engineering)
* [Univariate Selection](#univariate-selection)
* [Recursive Feature Elimination](#recursive-feature-elimination)
* [Dimensionality Reduction](#dimensionality-reduction)
* [Hold-out Validation](#hold-out-validation)
* [Cross Validation](#cross-validation)
* [Confusion Matrix](#confusion-matrix)
* [Classification Report](#classification-report)
* [Grid Search](#grid-search)

In [1]:
# Initialization
# IPython magic: draw matplotlib figures inline, below the cell that creates them.
%matplotlib inline
from warnings import filterwarnings
# NOTE: called without a category, this installs a blanket 'ignore' filter, so
# warnings raised by later cells (library deprecation warnings included) are
# hidden from the reader.
filterwarnings('ignore')

## Loading Data <a class="anchor" id="loading-data"></a>

In [None]:
# Read the diabetes CSV into a DataFrame, supplying the column labels
# explicitly through `names`.
from pandas import read_csv
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler

filename = 'pima-indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv(filename, names=names)

In [3]:
# Preview the first five rows (five is the default row count of head())
df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Separate the raw values into a feature matrix and a target vector.
array = df.values
# The target is the last column and every column before it is a feature.
# Slicing relative to the end avoids hard-coding the column count and matches
# the `:-1` / `-1` indexing used by the other cells of this notebook.
X = array[:, :-1]
y = array[:, -1]
# Sanity check: (n_samples, n_features) and (n_samples,)
print(X.shape, y.shape)

(768, 8) (768,)


## Statistical Summary and Class Breakdown <a class="anchor" id="statistical-summary"></a>

In [4]:
# Summarize the dataset: dimensions, descriptive statistics and class balance
from pandas import read_csv

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('pima-indians-diabetes.data.csv', names=names)

# dimension of the data as (rows, columns)
print(df.shape)

# per-column count, mean, std, min, quartiles and max
summary = df.describe()
print(summary)

# number of samples for each distinct value of the 'class' column
class_counts = df.groupby('class').size()
print(class_counts)

(768, 9)
             preg        plas        pres        skin        test        mass  \
count  768.000000  768.000000  768.000000  768.000000  768.000000  768.000000   
mean     3.845052  120.894531   69.105469   20.536458   79.799479   31.992578   
std      3.369578   31.972618   19.355807   15.952218  115.244002    7.884160   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      1.000000   99.000000   62.000000    0.000000    0.000000   27.300000   
50%      3.000000  117.000000   72.000000   23.000000   30.500000   32.000000   
75%      6.000000  140.250000   80.000000   32.000000  127.250000   36.600000   
max     17.000000  199.000000  122.000000   99.000000  846.000000   67.100000   

             pedi         age       class  
count  768.000000  768.000000  768.000000  
mean     0.471876   33.240885    0.348958  
std      0.331329   11.760232    0.476951  
min      0.078000   21.000000    0.000000  
25%      0.243750   24.000000    0.000000

## Dropping Rows with Missing Values <a class="anchor" id="dropping-rows"></a>

In [5]:
# Handle missing values by discarding every sample that contains one
import pandas as pd

samples = {
    'Age': [17, 23, 'x', 38, 54, 67, 32],
    'Height': [160, 172, 150, 165, 163, 158, 175],
    'Weight': [50, 68, 43, 52, 47, 49, 'x'],
}
# 'x' marks a missing entry; swap it for a null marker that dropna() recognizes
df = pd.DataFrame(samples).replace({'x': None})
print(df)
# keep only the rows in which no column is null
df = df.dropna()
print(df)

    Age  Height  Weight
0  17.0     160    50.0
1  23.0     172    68.0
2   NaN     150    43.0
3  38.0     165    52.0
4  54.0     163    47.0
5  67.0     158    49.0
6  32.0     175     NaN
    Age  Height  Weight
0  17.0     160    50.0
1  23.0     172    68.0
3  38.0     165    52.0
4  54.0     163    47.0
5  67.0     158    49.0


## Data Imputation <a class="anchor" id="data-imputation"></a>

In [6]:
# Handling missing values by imputing missing values with a column statistic
import pandas as pd

df = pd.DataFrame({'Age': [17, 23, 'x', 38, 54, 67, 32],
                   'Height': [160, 172, 150, 165, 163, 158, 175],
                   'Weight': [50, 68, 43, 52, 47, 49, 'x']})
# Make every column numeric. The 'x' placeholder cannot be parsed as a number,
# so errors='coerce' turns it into NaN. Converting explicitly guarantees that
# median()/mean() below run on numeric columns instead of relying on pandas to
# infer a dtype after a replace().
df = df.apply(pd.to_numeric, errors='coerce')
# Fill through the DataFrame and assign the result back. The previous form,
# df['Age'].fillna(..., inplace=True), mutates a column selection; pandas
# deprecates that pattern and under Copy-on-Write it leaves `df` unchanged.
df = df.fillna({
    'Age': df['Age'].median(),      # replace NaN with median
    'Weight': df['Weight'].mean(),  # replace NaN with mean
})
print(df)

    Age  Height  Weight
0  17.0     160    50.0
1  23.0     172    68.0
2  35.0     150    43.0
3  38.0     165    52.0
4  54.0     163    47.0
5  67.0     158    49.0
6  32.0     175    51.5


## Min-max Scaling <a class="anchor" id="min-max-scaling"></a>

In [7]:
# Rescale every feature into the [0, 1] interval
from pandas import read_csv
from sklearn.preprocessing import MinMaxScaler

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('pima-indians-diabetes.data.csv', names=names)
array = df.values
# inputs are all columns but the last; the last column is the output
X, y = array[:, :-1], array[:, -1]
# fit the scaler on X and transform X in a single call
scaler = MinMaxScaler(feature_range=(0, 1))
scaledX = scaler.fit_transform(X)
# Preview the first five scaled rows
print(scaledX[:5, :])

[[0.35294118 0.74371859 0.59016393 0.35353535 0.         0.50074516
  0.23441503 0.48333333]
 [0.05882353 0.42713568 0.54098361 0.29292929 0.         0.39642325
  0.11656704 0.16666667]
 [0.47058824 0.91959799 0.52459016 0.         0.         0.34724292
  0.25362938 0.18333333]
 [0.05882353 0.44723618 0.54098361 0.23232323 0.11111111 0.41877794
  0.03800171 0.        ]
 [0.         0.68844221 0.32786885 0.35353535 0.19858156 0.64232489
  0.94363792 0.2       ]]


## Standardization <a class="anchor" id="standardization"></a>

In [8]:
# Standardize the features (0 mean, 1 stdev)
from sklearn.preprocessing import StandardScaler
from pandas import read_csv

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('pima-indians-diabetes.data.csv', names=names)
array = df.values
# input components: every column except the last
X = array[:, :-1]
# output component: the last column
y = array[:, -1]
# fit() returns the scaler itself, so it can be bound directly and then applied
scaler = StandardScaler().fit(X)
scaledX = scaler.transform(X)
# Preview the first five transformed rows
print(scaledX[:5, :])

[[ 0.63994726  0.84832379  0.14964075  0.90726993 -0.69289057  0.20401277
   0.46849198  1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575  0.53090156 -0.69289057 -0.68442195
  -0.36506078 -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 -1.28821221 -0.69289057 -1.10325546
   0.60439732 -0.10558415]
 [-0.84488505 -0.99820778 -0.16054575  0.15453319  0.12330164 -0.49404308
  -0.92076261 -1.04154944]
 [-1.14185152  0.5040552  -1.50468724  0.90726993  0.76583594  1.4097456
   5.4849091  -0.0204964 ]]


## Robust Scaling <a class="anchor" id="robust-scaling"></a>

In [9]:
# Robust scaling (0 median, 1 IQR)
from sklearn.preprocessing import RobustScaler
from pandas import read_csv

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('pima-indians-diabetes.data.csv', names=names)
array = df.values
# the last column is the output component, all preceding columns are inputs
X, y = array[:, :-1], array[:, -1]

scaler = RobustScaler()
scaledX = scaler.fit_transform(X)
# Preview the first five transformed rows
print(scaledX[:5, :])

[[ 0.6         0.75151515  0.          0.375      -0.23968566  0.17204301
   0.66535948  1.23529412]
 [-0.4        -0.77575758 -0.33333333  0.1875     -0.23968566 -0.58064516
  -0.05620915  0.11764706]
 [ 1.          1.6        -0.44444444 -0.71875    -0.23968566 -0.93548387
   0.78300654  0.17647059]
 [-0.4        -0.67878788 -0.33333333  0.          0.49901768 -0.41935484
  -0.5372549  -0.47058824]
 [-0.6         0.48484848 -1.77777778  0.375       1.0805501   1.19354839
   5.00784314  0.23529412]]


## Categorical Data <a class="anchor" id="categorical-data"></a>

In [10]:
# Handling categorical data
import pandas as pd

df0 = pd.DataFrame({'year': [2015, 2017, 2013, 2018, 2020],
                    'make': ['Toyota', 'Honda', 'Perodua', 'Hyundai', 'Toyota'],
                    'engine': [1.5, 1.8, 1.3, 1.6, 1.8],
                    'review': ['moderate', 'good', 'poor', 'moderate', 'good']})
# 'review' is ordinal, so its labels are replaced by their rank
mapping = {'poor': 1, 'moderate': 2, 'good': 3}
df0['review'] = df0['review'].map(mapping)  # encode ordinal data
# map() produces NaN for any label absent from `mapping`; fail loudly rather
# than carrying a silent missing value into later steps
assert df0['review'].notna().all(), "review contains a label missing from mapping"
# The remaining text column ('make') is nominal and is one-hot encoded.
# dtype=int forces 0/1 indicator columns; without it, pandas 2.0 and later
# return True/False booleans instead.
df0 = pd.get_dummies(df0, dtype=int)  # encode nominal data
print(df0)

   year  engine  review  make_Honda  make_Hyundai  make_Perodua  make_Toyota
0  2015     1.5       2           0             0             0            1
1  2017     1.8       3           1             0             0            0
2  2013     1.3       1           0             0             1            0
3  2018     1.6       2           0             1             0            0
4  2020     1.8       3           0             0             0            1


## Feature Engineering <a class="anchor" id="feature-engineering"></a>

In [11]:
# Derive two additional features from existing columns
from pandas import read_csv
from sklearn.preprocessing import MinMaxScaler

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('pima-indians-diabetes.data.csv', names=names)
win_size = 3
# new feature 1: element-wise sum of the 'plas' and 'pres' columns
plas_plus_pres = df['plas'] + df['pres']
# new feature 2: mean of 'mass' over windows of `win_size` consecutive rows;
# the first win_size - 1 rows have no complete window and come out as NaN
mass_rolling_mean = df['mass'].rolling(win_size).mean()
# append both columns, in this order, after the existing ones
df = df.assign(plas_pres=plas_plus_pres, mass_ave=mass_rolling_mean)
df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class,plas_pres,mass_ave
0,6,148,72,35,0,33.6,0.627,50,1,220,
1,1,85,66,29,0,26.6,0.351,31,0,151,
2,8,183,64,0,0,23.3,0.672,32,1,247,27.833333
3,1,89,66,23,94,28.1,0.167,21,0,155,26.0
4,0,137,40,35,168,43.1,2.288,33,1,177,31.5


## Univariate Selection <a class="anchor" id="univariate-selection"></a>

In [12]:
# Feature Selection with Univariate Selection: keep 4 of the input features
from pandas import read_csv
from sklearn.feature_selection import SelectKBest

# load data
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('pima-indians-diabetes.data.csv', names=names)
array = df.values
X, y = array[:, :-1], array[:, -1]

selector = SelectKBest(k=4)
features = selector.fit_transform(X, y)
# boolean mask with one entry per input feature; truthy where it was kept
selected = selector.get_support()
# Show selected features (names[:-1] leaves out the target column label)
print([label for label, kept in zip(names[:-1], selected) if kept])

['preg', 'plas', 'mass', 'age']


## Recursive Feature Elimination <a class="anchor" id="recursive-feature-elimination"></a>

In [13]:
# Feature Selection with RFE (recursive feature elimination)
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv(filename, names=names)
array = df.values
X = array[:, :-1]
y = array[:, -1]
# Seed the tree so that repeated runs of this cell rank the features the same
# way; 42 is the seed the validation cells of this notebook already use.
model = DecisionTreeClassifier(random_state=42)
# n_features_to_select has to be given by keyword: scikit-learn 1.0 and later
# reject the old positional form RFE(model, 3) with a TypeError.
rfe = RFE(model, n_features_to_select=3)
features = rfe.fit_transform(X, y)
# boolean mask with one entry per input feature; truthy where it was kept
selected = rfe.get_support()
# Show selected features (the last entry of `names` is the target, so skip it)
print([names[i] for i in range(len(names)-1) if selected[i]])

['plas', 'mass', 'pedi']


## Dimensionality Reduction <a class="anchor" id="dimensionality-reduction"></a>

In [None]:
# Dimensionality Reduction with PCA: project the inputs onto 3 components
from pandas import read_csv
from sklearn.decomposition import PCA

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('pima-indians-diabetes.data.csv', names=names)
array = df.values
X, y = array[:, :-1], array[:, -1]

pca = PCA(n_components=3)
features = pca.fit_transform(X)
# summarize components via the explained-variance ratio of each one
variance_ratio = pca.explained_variance_ratio_
print("Explained Variance: %s" % variance_ratio)

## Hold-out Validation <a class="anchor" id="hold-out-validation"></a>

In [None]:
# Evaluate a k-nearest-neighbors classifier on a single train/test split
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.neighbors import KNeighborsClassifier

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('pima-indians-diabetes.data.csv', names=names)
array = df.values
X, y = array[:, :-1], array[:, -1]

# hold out 33 % of the samples for testing; the fixed seed pins the split
X_train, X_test, y_train, y_test = split(
    X, y, test_size=0.33, random_state=42
)
# fit() returns the estimator, so it can be bound in the same statement
model = KNeighborsClassifier().fit(X_train, y_train)
# score() is evaluated on the held-out samples only
result = model.score(X_test, y_test)
print(f"Accuracy: {100 * result:.2f} %")

## Cross Validation <a class="anchor" id="cross-validation"></a>

In [None]:
# Evaluate a k-nearest-neighbors classifier with 5-fold cross validation
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('pima-indians-diabetes.data.csv', names=names)
array = df.values
X, y = array[:, :-1], array[:, -1]

# five shuffled folds; the fixed seed pins the fold assignment
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
model = KNeighborsClassifier()
# one score per fold
results = cross_val_score(model, X, y, cv=kfold)
# report the mean score with its standard deviation in parentheses
print(f"Accuracy: {100*results.mean():.2f} % ({100*results.std():.2f})")

## Confusion Matrix <a class="anchor" id="confusion-matrix"></a>

In [None]:
# Plot Confusion Matrix
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.neighbors import KNeighborsClassifier
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# confusion_matrix + ConfusionMatrixDisplay draw the same figure and are
# available in both older and current releases.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('pima-indians-diabetes.data.csv', names=names)
array = df.values
X = array[:, :-1]
y = array[:, -1]
X_train, X_test, y_train, y_test = split(X, y, test_size=0.33, random_state=42)
model = KNeighborsClassifier()
model.fit(X_train, y_train)
# Count predictions on the held-out split only; rows are true labels and
# columns are predicted labels, both ordered like model.classes_.
cm = confusion_matrix(y_test, model.predict(X_test), labels=model.classes_)
disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
disp.plot()

## Classification Report <a class="anchor" id="classification-report"></a>

In [None]:
# Classification report for a k-nearest-neighbors classifier (yellowbrick)
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.neighbors import KNeighborsClassifier
from yellowbrick.classifier import ClassificationReport

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('pima-indians-diabetes.data.csv', names=names)
array = df.values
X, y = array[:, :-1], array[:, -1]

# hold out 33 % of the samples for testing; the fixed seed pins the split
X_train, X_test, y_train, y_test = split(
    X, y, test_size=0.33, random_state=42
)
model = KNeighborsClassifier()
model.fit(X_train, y_train)

# wrap the already-fitted model, score it on the held-out split, then show it
report = ClassificationReport(model)
report.score(X_test, y_test)
report.show()

## Grid Search <a class="anchor" id="grid-search"></a>

In [None]:
# Grid Search for Hyperparameter Tuning of a support vector classifier
from pandas import read_csv
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('pima-indians-diabetes.data.csv', names=names)
array = df.values
X, y = array[:, :-1], array[:, -1]

# candidate values for each hyperparameter (5 x 5 combinations)
params = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1, 1, 10],
}
model = SVC()
# n_jobs=-1 requests all available cores; verbose=2 prints progress per fit
grid = GridSearchCV(model, params, n_jobs=-1, verbose=2)
grid.fit(X, y)
# best mean cross-validated score, then the combination that achieved it
print(grid.best_score_)
print(grid.best_params_)