# Table of Contents
* [Naive Bayes](#naive-bayes)
* [Support Vector Machines](#svm)
* [Decision Tree](#decision-tree)
* [Random Forest](#random-forest)
* [Gradient Boosting Tree](#gradient-boosting)
* [Voting Classifier](#voting-classifier)
* [k-Means Clustering](#kmeans-clustering)
* [Principal Component Analysis](#pca)
* [Machine Learning Pipeline](#ml-pipeline)

In [1]:
# Initialization
%matplotlib inline
from warnings import filterwarnings
filterwarnings('ignore')

## Naive Bayes <a class="anchor" id="naive-bayes"></a>

In [2]:
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.naive_bayes import GaussianNB
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('data/pima-indians-diabetes.data.csv', names=names)
array = df.values
X = array[:,:-1]
y = array[:,-1]
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)
gnb = GaussianNB().fit(X_train, y_train)
print(f'Accuracy: {100 * gnb.score(X_test, y_test):.2f} %')

Accuracy: 73.44 %


## Support Vector Machine <a class="anchor" id="svm"></a>

In [3]:
# Support Vector Classifier
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.svm import SVC
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('data/pima-indians-diabetes.data.csv', names=names)
array = df.values
X = array[:,:-1]
y = array[:,-1]
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)
svc = SVC().fit(X_train, y_train)
print(f'Accuracy: {100 * svc.score(X_test, y_test):.2f} %')

Accuracy: 72.92 %


## Decision Tree <a class="anchor" id="decision-tree"></a>

In [4]:
# Decision Tree Classification
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.tree import DecisionTreeClassifier
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('data/pima-indians-diabetes.data.csv', names=names)
array = df.values
X = array[:,:-1]
y = array[:,-1]
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)
dtc = DecisionTreeClassifier(max_leaf_nodes=6, random_state=42).fit(X_train, y_train)
print(f'Accuracy: {100 * dtc.score(X_test, y_test):.2f} %')

Accuracy: 69.27 %


## Random Forest <a class="anchor" id="random-forest"></a>

In [5]:
# Random Forest Classification
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.ensemble import RandomForestClassifier
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('data/pima-indians-diabetes.data.csv', names=names)
array = df.values
X = array[:,:-1]
y = array[:,-1]
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)
rfc = RandomForestClassifier(max_leaf_nodes=6, random_state=42).fit(X_train, y_train)
print(f'Accuracy: {100 * rfc.score(X_test, y_test):.2f} %')

Accuracy: 75.00 %


## Gradient Boosting Tree <a class="anchor" id="gradient-boosting"></a>

In [6]:
# Gradient Boosting Tree Classification
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.ensemble import GradientBoostingClassifier
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('data/pima-indians-diabetes.data.csv', names=names)
array = df.values
X = array[:,:-1]
y = array[:,-1]
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)
gbc = GradientBoostingClassifier(max_depth=2, random_state=42).fit(X_train, y_train)
print(f'Accuracy: {100 * gbc.score(X_test, y_test):.2f} %')

Accuracy: 74.48 %


## Voting Classifier <a class="anchor" id="voting-classifier"></a>

In [7]:
# Voting Classifier
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('data/pima-indians-diabetes.data.csv', names=names)
array = df.values
X = array[:,:-1]
y = array[:,-1]
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)
clf1 = LogisticRegression()
clf2 = GaussianNB()
vtc = VotingClassifier(estimators=[('lgr', clf1), ('gnb', clf2)], voting='soft').fit(X_train, y_train)
print(f'Accuracy: {100 * rfc.score(X_test, y_test):.2f} %')

Accuracy: 75.00 %


## k-Means Clustering <a class="anchor" id="kmeans-clustering"></a>

In [None]:
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
X, y = make_blobs(centers=3, cluster_std=2, random_state=42)
km = KMeans(n_clusters=3).fit(X)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))
fig.suptitle('k-means Clustering')
ax1.scatter(X[:, 0], X[:, 1])
ax1.set_title('Before clustering')
ax2.scatter(X[:, 0], X[:, 1], c=km.labels_)
ax2.set_title('After clustering')

## Principal Component Analysis <a class="anchor" id="pca"></a>

In [None]:
# Dimensionality reduction using PCA
from sklearn.decomposition import PCA
from sklearn.datasets import load_digits

X, y = load_digits(return_X_y=True)

pca = PCA(n_components=24)
X_reduced = pca.fit_transform(X)

print(X.shape, X_reduced.shape)
print(pca.explained_variance_ratio_.sum())

## Machine Learning Pipeline <a class="anchor" id="ml-pipeline"></a>

![image.png](attachment:image.png)

In [None]:
# Without pipeline
from pandas import read_csv
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split as split
from sklearn.neighbors import KNeighborsClassifier
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('data/pima-indians-diabetes.data.csv', names=names)
array = df.values
X = array[:,:-1]
y = array[:,-1]
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)
scl = MinMaxScaler() 
X_train_scl = scl.fit_transform(X_train)
X_test_scl = scl.transform(X_test)

dr = PCA(n_components=5)
X_train_dr = dr.fit_transform(X_train_scl)
X_test_dr = dr.transform(X_test_scl)

clf = KNeighborsClassifier().fit(X_train_dr, y_train)
print(f'Accuracy: {100 * clf.score(X_test_dr, y_test):.2f} %')

![image-2.png](attachment:image-2.png)

In [None]:
# With pipeline
from pandas import read_csv
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split as split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('data/pima-indians-diabetes.data.csv', names=names)
array = df.values
X = array[:,:-1]
y = array[:,-1]
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)

# Make pipeline
pipe = Pipeline([('scl', MinMaxScaler()), ('dr', PCA(n_components=5)), ('clf', KNeighborsClassifier())])

pipe.fit(X_train, y_train)
print(f'Accuracy: {100 * pipe.score(X_test, y_test):.2f} %')