# Table of Contents
* [Loading Data](#loading-data)
* [Statistical Summary and Class Breakdown](#statistical-summary)
* [Dropping Rows with Missing Values](#dropping-rows)
* [Data Imputation](#data-imputation)
* [Categorical Data](#categorical-data)
* [Min-max Scaling](#min-max-scaling)
* [Standard Scaling](#standard-scaling)
* [Robust Scaling](#robust-scaling)
* [Feature Engineering](#feature-engineering)
* [Univariate Selection](#univariate-selection)
* [Recursive Feature Elimination](#recursive-feature-elimination)
* [Hold-out Validation](#hold-out-validation)
* [Cross Validation](#cross-validation)

In [1]:
# Initialization
# Render matplotlib figures inline in the notebook output
%matplotlib inline
from warnings import filterwarnings
# Suppress all warnings from here on (note: this also hides library deprecation notices)
filterwarnings('ignore')

## Loading Data <a class="anchor" id="loading-data"></a>

In [2]:
# Read the header-less diabetes CSV into a DataFrame with explicit column names
from pandas import read_csv
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('pima-indians-diabetes.data.csv', names=names)
array = df.values
# columns 0..7 hold the features, column 8 holds the target
X, y = array[:, :8], array[:, 8]
# report the dimensions of the dataframe, X and y
print(df.shape, X.shape, y.shape)

(768, 9) (768, 8) (768,)


## Statistical Summary and Class Breakdown <a class="anchor" id="statistical-summary"></a>

In [3]:
# Show per-column descriptive statistics, then the number of samples in each class
from pandas import read_csv
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('pima-indians-diabetes.data.csv', names=names)
# group the rows by target label and count the members of each group
class_counts = df.groupby('class').size()
# statistical summary first, class breakdown second
print(df.describe())
print(class_counts)

             preg        plas        pres        skin        test        mass  \
count  768.000000  768.000000  768.000000  768.000000  768.000000  768.000000   
mean     3.845052  120.894531   69.105469   20.536458   79.799479   31.992578   
std      3.369578   31.972618   19.355807   15.952218  115.244002    7.884160   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      1.000000   99.000000   62.000000    0.000000    0.000000   27.300000   
50%      3.000000  117.000000   72.000000   23.000000   30.500000   32.000000   
75%      6.000000  140.250000   80.000000   32.000000  127.250000   36.600000   
max     17.000000  199.000000  122.000000   99.000000  846.000000   67.100000   

             pedi         age       class  
count  768.000000  768.000000  768.000000  
mean     0.471876   33.240885    0.348958  
std      0.331329   11.760232    0.476951  
min      0.078000   21.000000    0.000000  
25%      0.243750   24.000000    0.000000  
50%   

## Dropping Rows with Missing Values <a class="anchor" id="dropping-rows"></a>

In [4]:
# Handling missing values by dropping data samples with missing values
import pandas as pd
import numpy as np
df = pd.DataFrame({'Age': [17, 23, 'x', 38, 54, 67, 32],
                  'Height': [160, 172, 150, 165, 163, 158, 175],
                  'Weight':[50, 68, 43, 52, 47, 49, 'x']})
# Replace missing values (x) with NaN. The 'x' marker makes 'Age' and 'Weight'
# object columns; recent pandas versions no longer downcast them automatically
# inside replace(), so infer_objects() is called to convert them back to a
# numeric dtype explicitly. 'Height' has no marker and keeps its integer dtype.
df = df.replace({'x': np.nan}).infer_objects()
print(df)
print(df.isnull().sum()) # number of missing values per column
df = df.dropna() # drop rows with NaN
print(df)

    Age  Height  Weight
0  17.0     160    50.0
1  23.0     172    68.0
2   NaN     150    43.0
3  38.0     165    52.0
4  54.0     163    47.0
5  67.0     158    49.0
6  32.0     175     NaN
Age       1
Height    0
Weight    1
dtype: int64
    Age  Height  Weight
0  17.0     160    50.0
1  23.0     172    68.0
3  38.0     165    52.0
4  54.0     163    47.0
5  67.0     158    49.0


## Data Imputation <a class="anchor" id="data-imputation"></a>

In [5]:
# Handling missing values by imputing missing values with statistic
import pandas as pd
import numpy as np
df = pd.DataFrame({'Age': [17, 23, 'x', 38, 54, 67, 32],
                  'Height': [160, 172, 150, 165, 163, 158, 175],
                  'Weight':[50, 68, 43, 52, 47, 49, 'x']})
# Replace missing values (x) with NaN, then restore numeric dtypes explicitly:
# the 'x' marker makes 'Age' and 'Weight' object columns, and recent pandas
# versions no longer downcast them inside replace(). The statistics below
# should be computed on (and written back to) real float columns.
df = df.replace({'x': np.nan}).infer_objects()
df['Age'] = df['Age'].fillna(df['Age'].median()) # replace NaN with median
df['Weight'] = df['Weight'].fillna(df['Weight'].mean()) # replace NaN with mean
print(df)

    Age  Height  Weight
0  17.0     160    50.0
1  23.0     172    68.0
2  35.0     150    43.0
3  38.0     165    52.0
4  54.0     163    47.0
5  67.0     158    49.0
6  32.0     175    51.5


## Categorical Data <a class="anchor" id="categorical-data"></a>

In [6]:
# Handling categorical data:
# ordinal values are mapped to ranked integers, nominal values are one-hot encoded
import pandas as pd
df0 = pd.DataFrame({'year':[2015, 2017, 2013, 2018, 2020], 
                  'make':['Toyota', 'Honda', 'Perodua', 'Hyundai', 'Toyota'],
                  'engine':[1.5, 1.8, 1.3, 1.6, 1.8],
                  'review':['moderate', 'good', 'poor', 'moderate', 'good']})
mapping = {'poor':1, 'moderate':2, 'good':3}
df0['review'] = df0['review'].map(mapping) # encode ordinal data
# Encode nominal data. The column is named explicitly so the encoding does not
# depend on dtype auto-detection, and dtype=int yields 0/1 indicator columns
# (newer pandas versions default to boolean True/False indicators).
df0 = pd.get_dummies(df0, columns=['make'], dtype=int)
print(df0)

   year  engine  review  make_Honda  make_Hyundai  make_Perodua  make_Toyota
0  2015     1.5       2           0             0             0            1
1  2017     1.8       3           1             0             0            0
2  2013     1.3       1           0             0             1            0
3  2018     1.6       2           0             1             0            0
4  2020     1.8       3           0             0             0            1


## Min-max Scaling <a class="anchor" id="min-max-scaling"></a>

In [7]:
# Rescale every feature column into the range [0, 1]
import numpy as np
from pandas import read_csv
from sklearn.preprocessing import MinMaxScaler
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('pima-indians-diabetes.data.csv', names=names)
array = df.values
# the last column is the output; everything before it is input
X, y = array[:, :-1], array[:, -1]
scaler = MinMaxScaler(feature_range=(0, 1))
# learn the per-column min/max from X, then apply the scaling to the same data
scaledX = scaler.fit(X).transform(X)
# per-column extremes of the scaled data
col_min = np.min(scaledX, axis=0)
col_max = np.max(scaledX, axis=0)
print(f'minimum={col_min}, maximum={col_max}')

minimum=[0. 0. 0. 0. 0. 0. 0. 0.], maximum=[1. 1. 1. 1. 1. 1. 1. 1.]


## Standard Scaling <a class="anchor" id="standard-scaling"></a>

In [8]:
# Standardise every feature column to zero mean and unit variance
import numpy as np
from sklearn.preprocessing import StandardScaler
from pandas import read_csv
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('pima-indians-diabetes.data.csv', names=names)
array = df.values
# the last column is the output; everything before it is input
X, y = array[:, :-1], array[:, -1]
scaler = StandardScaler()
scaledX = scaler.fit_transform(X)
# per-column mean and variance of the scaled data
col_mean = np.mean(scaledX, axis=0)
col_var = np.var(scaledX, axis=0)
print(f'mean={col_mean}, variance={col_var}')

mean=[-6.47630098e-17 -9.25185854e-18  1.50342701e-17  1.00613962e-16
 -3.00685403e-17  2.59052039e-16  2.45174251e-16  1.93132547e-16], variance=[1. 1. 1. 1. 1. 1. 1. 1.]


## Robust Scaling <a class="anchor" id="robust-scaling"></a>

In [9]:
# Robust scaling (0 median, 1 IQR)
# numpy is imported here so the cell does not rely on `np` leaking in from
# an earlier cell
import numpy as np
from sklearn.preprocessing import RobustScaler
from pandas import read_csv
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('pima-indians-diabetes.data.csv', names=names)
array = df.values
# separate array into input and output components
X = array[:, :-1]
y = array[:, -1]
scaler = RobustScaler()
scaledX = scaler.fit_transform(X)
# Check median and IQR of all columns (IQR = 75th percentile - 25th percentile)
q3, q1 = np.percentile(scaledX, [75, 25], axis=0)
print(f'median={np.median(scaledX, axis=0)}, IQR={q3-q1}')

median=[0. 0. 0. 0. 0. 0. 0. 0.], IQR=[1. 1. 1. 1. 1. 1. 1. 1.]


## Feature Engineering <a class="anchor" id="feature-engineering"></a>

In [10]:
# Derive two additional features from the existing columns
from pandas import read_csv
from sklearn.preprocessing import MinMaxScaler
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
win_size = 3
df = read_csv('pima-indians-diabetes.data.csv', names=names)
df = df.assign(
    # new feature 1: element-wise sum of the 'plas' and 'pres' columns
    plas_pres=df['plas'] + df['pres'],
    # new feature 2: rolling mean of 'mass' over win_size consecutive rows
    # (the first win_size-1 rows have no full window and are NaN)
    mass_ave=df['mass'].rolling(win_size).mean(),
)
df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class,plas_pres,mass_ave
0,6,148,72,35,0,33.6,0.627,50,1,220,
1,1,85,66,29,0,26.6,0.351,31,0,151,
2,8,183,64,0,0,23.3,0.672,32,1,247,27.833333
3,1,89,66,23,94,28.1,0.167,21,0,155,26.0
4,0,137,40,35,168,43.1,2.288,33,1,177,31.5


## Univariate Selection <a class="anchor" id="univariate-selection"></a>

In [11]:
# Univariate feature selection: keep the k best-scoring features
from pandas import read_csv
from sklearn.feature_selection import SelectKBest
# read the dataset and split it into inputs and target
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('pima-indians-diabetes.data.csv', names=names)
array = df.values
X, y = array[:, :-1], array[:, -1]
selector = SelectKBest(k=4)
features = selector.fit_transform(X, y)
# boolean mask with one entry per input column
selected = selector.get_support()
# Pair each feature name with its mask entry and print the names that were kept;
# zip stops at the end of the mask, so the trailing 'class' name is never paired
print([name for name, keep in zip(names, selected) if keep])

['preg', 'plas', 'mass', 'age']


## Recursive Feature Elimination <a class="anchor" id="recursive-feature-elimination"></a>

In [12]:
# Feature Selection with RFE (recursive feature elimination)
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv(filename, names=names)
array = df.values
# separate array into input and output components
X = array[:, :-1]
y = array[:, -1]
# Fix the seed of the tree: an unseeded decision tree is not deterministic,
# so the set of selected features could change from one run to the next.
model = DecisionTreeClassifier(random_state=42)
rfe = RFE(model, n_features_to_select=4)
features = rfe.fit_transform(X, y)
# boolean mask with one entry per input column
selected = rfe.get_support()
# Show selected features
print([names[i] for i in range(len(names)-1) if selected[i]])

['plas', 'pres', 'mass', 'pedi']


## Hold-out Validation <a class="anchor" id="hold-out-validation"></a>

In [13]:
# Hold-out evaluation: fit on a training split, score on the held-out test split
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.neighbors import KNeighborsClassifier
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('pima-indians-diabetes.data.csv', names=names)
array = df.values
# the last column is the target; everything before it is a feature
X, y = array[:, :-1], array[:, -1]
# keep 33% of the samples for testing; the seed makes the split repeatable
X_train, X_test, y_train, y_test = split(X, y, test_size=0.33, random_state=42)
# fit() returns the estimator itself, so `model` is the fitted classifier
model = KNeighborsClassifier().fit(X_train, y_train)
# mean accuracy on the test split
result = model.score(X_test, y_test)
print(f"Accuracy: {result:.2%}")

Accuracy: 70.08%


## Cross Validation <a class="anchor" id="cross-validation"></a>

In [14]:
# k-fold cross validation: report the mean and spread of the per-fold accuracy
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('pima-indians-diabetes.data.csv', names=names)
array = df.values
# the last column is the target; everything before it is a feature
X, y = array[:, :-1], array[:, -1]
# five shuffled folds with a fixed seed so the partitioning is repeatable
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
model = KNeighborsClassifier()
# one score per fold
results = cross_val_score(model, X, y, cv=kfold)
mean_score, std_score = results.mean(), results.std()
print(f"Accuracy: {mean_score:.2%} ({std_score:.2%})")

Accuracy: 68.75% (1.62%)
