<a href="https://colab.research.google.com/github/wooihaw/ERA3036_T2215/blob/main/Chapter_2/Chapter_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Table of Contents
* [Loading Data](#loading-data)
* [Statistical Summary and Class Breakdown](#statistical-summary)
* [Dropping Rows with Missing Values](#dropping-rows)
* [Data Imputation](#data-imputation)
* [Min-max Scaling](#min-max-scaling)
* [Standardization](#standardization)
* [Robust Scaling](#robust-scaling)
* [Categorical Data](#categorical-data)
* [Feature Engineering](#feature-engineering)
* [Univariate Selection](#univariate-selection)
* [Model-based Selection](#model-selection)
* [Recursive Feature Elimination](#recursive-feature-elimination)
* [Dimensionality Reduction](#dimensionality-reduction)
* [Hold-out Validation](#hold-out-validation)
* [Cross Validation](#cross-validation)
* [Confusion Matrix](#confusion-matrix)
* [Classification Report](#classification-report)
* [Grid Search](#grid-search)

In [None]:
# Initialization
# IPython magic: render matplotlib figures inline, below the cell that creates them
%matplotlib inline
from warnings import filterwarnings
# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation/future warnings raised by the libraries used below.
filterwarnings('ignore')

## Loading Data <a class="anchor" id="loading-data"></a>

In [None]:
# Load data from CSV file
from pandas import read_csv

# Column labels are passed explicitly through `names`, so read_csv treats
# the first line of the file as data rather than as a header.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('../data/pima-indians-diabetes.data.csv', names=names)
array = df.values
# separate data into features (every column but the last) and targets (last column)
X = array[:, :-1]
y = array[:, -1]
# number of (samples, features) and number of targets
print(X.shape, y.shape)

## Statistical Summary and Class Breakdown <a class="anchor" id="statistical-summary"></a>

In [None]:
# Report the dimension, the descriptive statistics and the class balance of the data
from pandas import read_csv

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('../data/pima-indians-diabetes.data.csv', names=names)

# dimension of the data as (rows, columns)
print(df.shape)

# statistical summary of the data, one column of statistics per feature
summary = df.describe()
print(summary)

# class breakdown: number of rows for each value of the 'class' column
class_counts = df.groupby('class').size()
print(class_counts)

## Dropping Rows with Missing Values <a class="anchor" id="dropping-rows"></a>

In [None]:
# Deal with missing values by discarding every sample that contains one
import pandas as pd
import numpy as np

records = {
    'Age': [17, 23, 'x', 38, 54, 67, 32],
    'Height': [160, 172, 150, 165, 163, 158, 175],
    'Weight': [50, 68, 43, 52, 47, 49, 'x'],
}
# missing values are marked with 'x' in the raw records; turn them into NaN
df = pd.DataFrame(records).replace({'x': np.nan})
print(df)

# number of NaN in each column
nan_per_column = df.isnull().sum()
print(nan_per_column)

# keep only the rows without any NaN
df = df.dropna()
print(df)

## Data Imputation <a class="anchor" id="data-imputation"></a>

In [None]:
# Handling missing values by imputing missing values with statistic
import pandas as pd
import numpy as np
df = pd.DataFrame({'Age': [17, 23, 'x', 38, 54, 67, 32],
                  'Height': [160, 172, 150, 165, 163, 158, 175],
                  'Weight':[50, 68, 43, 52, 47, 49, 'x']})
# Replace the missing-value marker (x) with NaN, then let pandas infer numeric
# dtypes for the affected columns so median()/mean() operate on numbers
# regardless of whether replace() downcasts object columns by itself.
df = df.replace({'x': np.nan}).infer_objects()
print(df)
# Assign the filled column back to the frame. Calling
# df['Age'].fillna(..., inplace=True) acts on a temporary Series and leaves
# df untouched when pandas Copy-on-Write is active.
df['Age'] = df['Age'].fillna(df['Age'].median()) # replace NaN with median
df['Weight'] = df['Weight'].fillna(df['Weight'].mean()) # replace NaN with mean
print(df)

## Categorical Data <a class="anchor" id="categorical-data"></a>

In [None]:
# Encode categorical columns: ordinal ones by rank, nominal ones as dummy columns
import pandas as pd

# rank of each review label, from worst to best
mapping = {'poor':1, 'moderate':2, 'good':3}

df0 = pd.DataFrame({
    'year': [2015, 2017, 2013, 2018, 2020],
    'make': ['Toyota', 'Honda', 'Perodua', 'Hyundai', 'Toyota'],
    'engine': [1.5, 1.8, 1.3, 1.6, 1.8],
    'review': ['moderate', 'good', 'poor', 'moderate', 'good'],
})

# ordinal data: swap each review label for its rank
ranked = df0.assign(review=df0['review'].map(mapping))
# nominal data: expand the remaining text column into one indicator column per value
df0 = pd.get_dummies(ranked)
print(df0)

## Min-max Scaling <a class="anchor" id="min-max-scaling"></a>

In [None]:
# Rescale every feature to the range 0..1
import numpy as np
from pandas import read_csv
from sklearn.preprocessing import MinMaxScaler

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('../data/pima-indians-diabetes.data.csv', names=names)
array = df.values

# input components: all columns except the last; output component: the last column
X, y = array[:, :-1], array[:, -1]

# learn the per-column minimum and maximum, then apply the scaling in one step
scaler = MinMaxScaler(feature_range=(0, 1))
scaledX = scaler.fit_transform(X)

# per-column minimum and maximum of the scaled data
print(f'minimum={np.min(scaledX, axis=0)}, maximum={np.max(scaledX, axis=0)}')

## Standardization <a class="anchor" id="standardization"></a>

In [None]:
# Standardize every feature to zero mean and unit standard deviation
import numpy as np
from sklearn.preprocessing import StandardScaler
from pandas import read_csv

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('../data/pima-indians-diabetes.data.csv', names=names)
array = df.values

# input components: all columns except the last; output component: the last column
X, y = array[:, :-1], array[:, -1]

scaler = StandardScaler()
scaledX = scaler.fit_transform(X)

# per-column mean and variance of the standardized data
print(f'mean={np.mean(scaledX, axis=0)}, variance={np.var(scaledX, axis=0)}')

## Robust Scaling <a class="anchor" id="robust-scaling"></a>

In [None]:
# Scale every feature with its median and interquartile range (IQR)
import numpy as np
from sklearn.preprocessing import RobustScaler
from pandas import read_csv

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('../data/pima-indians-diabetes.data.csv', names=names)
array = df.values

# input components: all columns except the last; output component: the last column
X, y = array[:, :-1], array[:, -1]

scaler = RobustScaler()
scaledX = scaler.fit_transform(X)

# per-column 25th and 75th percentiles of the scaled data; their difference is the IQR
q1, q3 = np.percentile(scaledX, [25, 75], axis=0)
print(f'median={np.median(scaledX, axis=0)}, IQR={q3-q1}')

## Feature Engineering <a class="anchor" id="feature-engineering"></a>

In [None]:
# Create 2 new features
from pandas import read_csv

names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('../data/pima-indians-diabetes.data.csv', names=names)
win_size = 3  # number of consecutive rows covered by the rolling window
# new feature 1: element-wise sum of the 'plas' and 'pres' columns
df['plas_pres'] = df['plas'] + df['pres']
# new feature 2: rolling mean of 'mass' over win_size consecutive rows;
# the first win_size - 1 rows have no complete window and are therefore NaN
df['mass_ave'] = df['mass'].rolling(win_size).mean()
# show the first rows, including the two new columns
df.head()

## Univariate Selection <a class="anchor" id="univariate-selection"></a>

In [None]:
# Keep the 4 features that score best in a univariate statistical test
from pandas import read_csv
from sklearn.feature_selection import SelectKBest

# load data
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('../data/pima-indians-diabetes.data.csv', names=names)
array = df.values
X, y = array[:, :-1], array[:, -1]

selector = SelectKBest(k=4)
features = selector.fit_transform(X, y)

# boolean mask with one entry per feature column, True where the feature was kept
selected = selector.get_support()

# Show selected features (names[:-1] leaves out the target column)
print([name for name, keep in zip(names[:-1], selected) if keep])

## Model-based Selection <a class="anchor" id="model-selection"></a>

In [None]:
# Model-based Feature Selection with Random Forest
from pandas import read_csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('../data/pima-indians-diabetes.data.csv', names=names)
array = df.values
X = array[:, :-1]
y = array[:, -1]
# Seed the forest so the feature importances, and hence the selected features,
# are the same on every run (same seed as the other cells of this notebook).
forest = RandomForestClassifier(random_state=42)
# Keep the features whose importance is at least the median importance
selector = SelectFromModel(forest, threshold='median')
features = selector.fit_transform(X, y)
# boolean mask with one entry per feature column, True where the feature was kept
selected = selector.get_support()
# Show selected features
print([names[i] for i in range(len(names)-1) if selected[i]])

## Recursive Feature Elimination <a class="anchor" id="recursive-feature-elimination"></a>

In [None]:
# Feature Selection with RFE
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
filename = '../data/pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv(filename, names=names)
array = df.values
X = array[:, :-1]
y = array[:, -1]
# Seed the tree so the elimination order, and hence the selected features,
# are the same on every run (same seed as the Grid Search cell).
model = DecisionTreeClassifier(random_state=42)
# Recursively drop features until 4 remain
rfe = RFE(model, n_features_to_select=4)
features = rfe.fit_transform(X, y)
# boolean mask with one entry per feature column, True where the feature was kept
selected = rfe.get_support()
# Show selected features
print([names[i] for i in range(len(names)-1) if selected[i]])

## Dimensionality Reduction <a class="anchor" id="dimensionality-reduction"></a>

In [None]:
# Project the features onto their first 3 principal components
from pandas import read_csv
from sklearn.decomposition import PCA

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('../data/pima-indians-diabetes.data.csv', names=names)
array = df.values
X, y = array[:, :-1], array[:, -1]

pca = PCA(n_components=3)
# PCA is unsupervised: only X is used, y is not passed
features = pca.fit_transform(X)

# summarize components: fraction of the total variance explained by each one
print(f"Explained Variance: {pca.explained_variance_ratio_}")

## Hold-out Validation <a class="anchor" id="hold-out-validation"></a>

In [None]:
# Hold-out validation: train on one part of the data, score on the unseen rest
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.neighbors import KNeighborsClassifier

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('../data/pima-indians-diabetes.data.csv', names=names)
array = df.values
X, y = array[:, :-1], array[:, -1]

# hold out 33% of the samples for testing; the seed makes the split repeatable
X_train, X_test, y_train, y_test = split(
    X, y, test_size=0.33, random_state=42
)

model = KNeighborsClassifier()
model.fit(X_train, y_train)

# mean accuracy on the held-out samples
result = model.score(X_test, y_test)
print(f"Accuracy: {result:.2%}")

## Cross Validation <a class="anchor" id="cross-validation"></a>

In [None]:
# Estimate accuracy with 5-fold cross validation
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('../data/pima-indians-diabetes.data.csv', names=names)
array = df.values
X, y = array[:, :-1], array[:, -1]

# 5 shuffled folds; the seed makes the fold assignment repeatable
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
model = KNeighborsClassifier()

# one score per fold
results = cross_val_score(model, X, y, cv=kfold)

# mean score across folds, with the standard deviation in parentheses
print(f"Accuracy: {results.mean():.2%} ({results.std():.2%})")

## Confusion Matrix <a class="anchor" id="confusion-matrix"></a>

In [None]:
# Plot the confusion matrix of a k-nearest-neighbours classifier on held-out data
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('../data/pima-indians-diabetes.data.csv', names=names)
array = df.values
X, y = array[:, :-1], array[:, -1]

# hold out 33% of the samples for testing; the seed makes the split repeatable
X_train, X_test, y_train, y_test = split(
    X, y, test_size=0.33, random_state=42
)

model = KNeighborsClassifier()
model.fit(X_train, y_train)

# predict on the test set and draw true labels against predicted labels
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)

## Classification Report <a class="anchor" id="classification-report"></a>

In [None]:
# Visual classification report for a k-nearest-neighbours classifier
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.neighbors import KNeighborsClassifier
from yellowbrick.classifier import ClassificationReport

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('../data/pima-indians-diabetes.data.csv', names=names)
array = df.values
X, y = array[:, :-1], array[:, -1]

# hold out 33% of the samples for testing; the seed makes the split repeatable
X_train, X_test, y_train, y_test = split(
    X, y, test_size=0.33, random_state=42
)

model = KNeighborsClassifier()
model.fit(X_train, y_train)

# wrap the classifier in the visualizer, fit on the training data,
# score on the test data, then render the figure
report = ClassificationReport(model)
report.fit(X_train, y_train)
report.score(X_test, y_test)
report.show()

## Grid Search <a class="anchor" id="grid-search"></a>

In [None]:
# Tune a decision tree with an exhaustive grid search and compare test accuracy
from pandas import read_csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split as split, KFold

names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = read_csv('../data/pima-indians-diabetes.data.csv', names=names)
X, y = df.values[:, :-1], df.values[:, -1]
X_train, X_test, y_train, y_test = split(X, y, random_state=42)

# baseline: tree with default hyperparameters
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
print(f'Accuracy without tuning: {model.score(X_test, y_test):.2%}')

# search space: 2 split criteria x 19 leaf-node limits (2 to 20 inclusive)
params = {
    'criterion': ['gini', 'entropy'],
    'max_leaf_nodes': range(2, 21),
}
# each candidate is scored with 5-fold cross validation on the training set only
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(model, params, cv=kf, n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)
print(grid.best_params_)

# apply the best hyperparameters to the model and retrain it on the training set
model.set_params(**grid.best_params_)
model.fit(X_train, y_train)
print(f'Accuracy with tuning: {model.score(X_test, y_test):.2%}')