# Topic 1  Overview of Machine Learning and Scikit Learn

In [0]:
import sklearn
from sklearn import datasets

# Topic 2 Classification

## Step 1: Prepare the Data

### Import Data

In [0]:
# Load the Iris dataset from a CSV file into a pandas DataFrame.
# NOTE(review): the URL targets the `master` branch of the pandas repository's
# test data; confirm the file is still hosted at this path.
import pandas as pd
dataset_path = "https://raw.githubusercontent.com/pandas-dev/pandas/master/pandas/tests/data/iris.csv"
                     
# X holds the full table for now, including the 'Name' column used later as the target.
X = pd.read_csv(dataset_path)
# Bare expression: shown as the cell output.
X

### Split the Features and Target

In [0]:
# Discard rows that contain missing values.
X = X.dropna()
# pop() removes the 'Name' column from X in place and returns it, leaving X
# with the feature columns only. Re-running this cell without reloading X
# fails because the column is already gone.
y = X.pop('Name')
y

### Encode the Label

In [0]:
# Convert the string class names in y into integer class codes.
from sklearn.preprocessing import LabelEncoder
# Keep the fitted encoder in `le` so codes can be mapped back to names later.
le = LabelEncoder()
y = le.fit_transform(y)
y

### Scale/Normalize the Features

In [0]:
# Scale the input features with MinMaxScaler.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so test rows influence the scaling statistics.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# fit_transform() output is wrapped back into a DataFrame to keep the column names.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)


### Split and Randomize Training and Testing Dataset

In [0]:
# Shuffle and split the scaled features and encoded labels: 30% is held out
# for testing; random_state fixes the shuffle so the split is repeatable.
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,test_size=0.3,random_state=100)

In [0]:
# Display the training features.
X_train

In [0]:
# Display the training labels.
y_train

## Step 2 Define the Model

In [0]:
# KNN Classifier: create (but do not yet train) a k-nearest-neighbours model
# that uses 5 neighbours.

from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=5)

## Step 3 Train the Model

In [0]:
# Train the classifier on the training split; the fitted estimator is the cell output.
clf.fit(X_train,y_train)

## Step 4 Evaluate the Model

In [0]:
# Score the trained classifier on the held-out test split.
clf.score(X_test,y_test)

## Step 5 Save the Model

In [0]:
# Persist the trained classifier to 'iris.pkl'.
# `sklearn.externals.joblib` was deprecated and later removed from scikit-learn;
# the standalone `joblib` package (installed as a scikit-learn dependency) is
# the supported replacement. The fallback keeps very old installs working.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(clf, 'iris.pkl')

## Step 6 Load the Model for Inference

In [0]:
# Reload the classifier saved in 'iris.pkl' into `clf2`.
# `sklearn.externals.joblib` was deprecated and later removed from scikit-learn;
# import the standalone `joblib` package, falling back only on legacy installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
clf2 = joblib.load('iris.pkl')

In [0]:
import numpy as np
import pandas as pd

# One new flower, given as raw (unscaled) measurements.
# Assumes the values are in the same column order as the training features — TODO confirm.
X_new = np.array([[6.7,3.1,4.7,1.5]])

# The classifier was trained on MinMax-scaled features, so the new sample must
# pass through the same fitted `scaler` before prediction. DataFrames with the
# training column names are used so the feature names match those seen in fit().
X_new_df = pd.DataFrame(X_new, columns=X_scaled.columns)
X_new_scaled = pd.DataFrame(scaler.transform(X_new_df), columns=X_scaled.columns)

# Store the prediction under its own name so the label array `y` is not overwritten.
y_new = clf2.predict(X_new_scaled)

# Integer class code -> species name (spelling fixed: 'setosa').
label = {0:'setosa',1:'versicolor',2:'virginica'}
print('The flower is ',label[y_new[0]])

## Ex: Classifiers

In [0]:
# Logistic Regression classifier: train on the training split, then report
# the score on the test split.
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so creation and training are chained.
clf = LogisticRegression().fit(X_train, y_train)
clf.score(X_test, y_test)

In [0]:
# Support Vector Machine classifier with default settings.
from sklearn.svm import SVC

# Train on the training split (fit() hands back the same estimator).
clf = SVC().fit(X_train, y_train)
# Score on the held-out split.
clf.score(X_test, y_test)

In [0]:
# Gaussian Naive Bayes classifier.
from sklearn.naive_bayes import GaussianNB

# Build and train in one expression; `clf` is the fitted model.
clf = GaussianNB().fit(X_train, y_train)
clf.score(X_test, y_test)

In [0]:
# Decision Tree classifier.
from sklearn.tree import DecisionTreeClassifier

# Train with default hyper-parameters, then score on the test split.
clf = DecisionTreeClassifier().fit(X_train, y_train)
clf.score(X_test, y_test)

In [0]:
# Stochastic Gradient Descent (SGD) classifier.
from sklearn.linear_model import SGDClassifier

# Train with default settings and evaluate on the test split.
clf = SGDClassifier().fit(X_train, y_train)
clf.score(X_test, y_test)

## Ensemble Methods

### Bagging

In [0]:
# Random Forest classifier (bagging ensemble).
from sklearn.ensemble import RandomForestClassifier

# Fit the forest with default settings; fit() returns the estimator.
clf = RandomForestClassifier().fit(X_train, y_train)
clf.score(X_test, y_test)

### Boosting

In [0]:
# Gradient Boosting classifier (boosting ensemble).
from sklearn.ensemble import GradientBoostingClassifier

# Train on the training split and report the test score.
clf = GradientBoostingClassifier().fit(X_train, y_train)
clf.score(X_test, y_test)

In [0]:
# AdaBoost classifier (boosting ensemble).
from sklearn.ensemble import AdaBoostClassifier

# Train on the training split and report the test score.
clf = AdaBoostClassifier().fit(X_train, y_train)
clf.score(X_test, y_test)

### Stacking

In [0]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Three different base models whose predictions are combined by the ensemble.
clf1 = LogisticRegression()
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()

# voting='hard' selects hard voting: the VotingClassifier combines the
# predicted class labels of the base models.
base_models = [('lr', clf1), ('rf', clf2), ('gnb', clf3)]
clf = VotingClassifier(estimators=base_models, voting='hard').fit(X_train, y_train)
clf.score(X_test, y_test)

## Ex: Ensemble Methods

In [0]:
# Load the red-wine quality dataset from the UCI repository.
import pandas as pd
dataset_path = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
                     
# The file is semicolon-separated, hence sep=';'.
# This rebinds X, replacing the Iris features used in the earlier sections.
X = pd.read_csv(dataset_path,sep=';')
X


In [0]:
# Drop incomplete rows, then move the 'quality' column out of X to use as the target.
X = X.dropna()
# pop() mutates X in place; re-running this cell requires reloading X first.
y = X.pop('quality')
y

In [0]:
# Scale the wine features with MinMaxScaler (this rebinds `scaler` and `X_scaled`).
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Wrap the transformed array back into a DataFrame to keep the column names.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)


In [0]:
# Split and randomize the data (30% held out; fixed seed for repeatability).
from sklearn.model_selection import train_test_split

# Split the MinMax-scaled features. The original split the raw `X`, which left
# the X_scaled frame computed in the previous cell unused and trained the
# models on unscaled inputs, unlike every other scaled section in this notebook.
X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,test_size=0.3,random_state=100)

In [0]:
# Decision Tree classifier on the wine-quality data.
from sklearn import tree

# Train with default settings; fit() returns the fitted estimator.
clf = tree.DecisionTreeClassifier().fit(X_train, y_train)
clf.score(X_test, y_test)

In [0]:
# Random Forest classifier on the wine-quality data.
from sklearn.ensemble import RandomForestClassifier

# Train with default settings and report the test score.
clf = RandomForestClassifier().fit(X_train, y_train)
clf.score(X_test, y_test)

In [0]:
# Gradient Boosting classifier on the wine-quality data.
from sklearn.ensemble import GradientBoostingClassifier

# Train with default settings and report the test score.
clf = GradientBoostingClassifier().fit(X_train, y_train)
clf.score(X_test, y_test)

In [0]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Base models combined by hard voting.
clf1 = LogisticRegression()
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()

voters = [('lr', clf1), ('rf', clf2), ('gnb', clf3)]
clf = VotingClassifier(estimators=voters, voting='hard').fit(X_train, y_train)
clf.score(X_test, y_test)

In [0]:
# Compare the model's prediction with the true label for one test row.
i = 50
# Slice i:i+1 (rather than a scalar index) so `x` stays a one-row DataFrame,
# which is the 2-D shape predict() is given.
x = X_test.iloc[i:i+1]
print('The predicted wine quality is ', clf.predict(x)[0])
# y_test is indexed by position with iloc so it lines up with the X_test row.
print('The actual wine quality is ',y_test.iloc[i])

## Confusion Matrix

In [0]:
# Toy example: confusion matrix for a three-class problem with string labels.
from sklearn.metrics import confusion_matrix
y_true = ["cat", "ant", "cat", "cat", "ant", "bird"]
y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"]
# `labels` fixes the class order used for the matrix (ant, bird, cat).
confusion_matrix(y_true, y_pred, labels=["ant", "bird", "cat"])

## Ex: Confusion Matrix

In [0]:
# Prepare the Iris data for the confusion-matrix exercise: load, clean,
# encode the labels and split (features are left unscaled in this section).
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

dataset_path = "https://raw.githubusercontent.com/pandas-dev/pandas/master/pandas/tests/data/iris.csv"

# Read the CSV and drop incomplete rows in one step.
X = pd.read_csv(dataset_path).dropna()

# Move the 'Name' column out of X and turn it into integer class codes.
le = LabelEncoder()
y = le.fit_transform(X.pop('Name'))

# 70/30 split with a fixed seed.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=100)

In [0]:
# KNN classifier: confusion matrix on the test split.
from sklearn import neighbors

# Train with the default neighbour count.
clf = neighbors.KNeighborsClassifier().fit(X_train, y_train)

# Rows of the matrix are true classes, columns are predictions.
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

In [0]:
# Logistic Regression: confusion matrix on the test split.
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression().fit(X_train, y_train)

# Predict the test labels and tabulate them against the truth.
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

In [0]:
# SVM: confusion matrix on the test split.
from sklearn import svm

clf = svm.SVC().fit(X_train, y_train)

# Predict the test labels and tabulate them against the truth.
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

In [0]:
# Gaussian Naive Bayes: confusion matrix on the test split.
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB().fit(X_train, y_train)

# Predict the test labels and tabulate them against the truth.
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

In [0]:
# Decision Tree: confusion matrix on the test split.
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier().fit(X_train, y_train)

# Predict the test labels and tabulate them against the truth.
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

In [0]:
# SGD classifier: confusion matrix on the test split.
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier().fit(X_train, y_train)

# Predict the test labels and tabulate them against the truth.
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

In [0]:
# Recompute the confusion matrix for whichever model is currently bound to `clf`.
# NOTE(review): when cells run top to bottom this repeats the SGD result from
# the previous cell — confirm whether this cell is still needed.
from sklearn.metrics import confusion_matrix

y_pred = clf.predict(X_test)
confusion_matrix(y_test,y_pred)

## Accuracy, Precision, Recall, F1 Scores

In [0]:
# Prepare a two-class version of the Iris data for the metric examples.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

dataset_path = "https://raw.githubusercontent.com/pandas-dev/pandas/master/pandas/tests/data/iris.csv"

# Read the CSV and drop incomplete rows in one step.
X = pd.read_csv(dataset_path).dropna()

# Encode the species names as integer codes ...
le = LabelEncoder()
y = le.fit_transform(X.pop('Name'))
# ... then merge class 2 into class 0 so only labels 0 and 1 remain.
y[y==2]=0

# 70/30 split with a fixed seed (features are left unscaled here).
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=100)

In [0]:
# Load the Wine Quality dataset and turn it into a binary problem.
# NOTE: this cell rebinds X, y and the train/test splits, so when the notebook
# is run top to bottom it replaces the Iris variables from the preceding cell.

import pandas as pd
dataset_path = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# The file is semicolon-separated.
X = pd.read_csv(dataset_path,sep=";")

X = X.dropna()
# pop() removes 'quality' from X in place and returns it as the target.
y = X.pop('quality')

# Binarize the target: scores below 6 become 0, scores of 6 or more become 1.
# The order matters: entries already set to 0 do not match the second mask.
y[y<6]=0
y[y>=6]=1

from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed (features are left unscaled here).
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=100)
y_train

In [0]:
# KNN classifier: accuracy, precision, recall and F1 on the test split.
# The metric import was commented out, so on a fresh kernel this cell and the
# metric cells after it failed with NameError; it is restored here.
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

from sklearn import neighbors
clf = neighbors.KNeighborsClassifier()
clf.fit(X_train,y_train)

# Precision, recall and F1 are called with their default settings here.
y_pred = clf.predict(X_test)
print('Accuracy = ',accuracy_score(y_test,y_pred))
print('Precision = ',precision_score(y_test,y_pred))
print('Recall = ',recall_score(y_test,y_pred))
print('F1 Score = ',f1_score(y_test,y_pred))

In [0]:
# Logistic Regression: accuracy, precision, recall and F1 on the test split.
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression().fit(X_train, y_train)

y_pred = clf.predict(X_test)
# Print each metric in turn; captions match the other metric cells.
for caption, metric in (('Accuracy = ', accuracy_score),
                        ('Precision = ', precision_score),
                        ('Recall = ', recall_score),
                        ('F1 Score = ', f1_score)):
    print(caption, metric(y_test, y_pred))

In [0]:
# SVM: accuracy, precision, recall and F1 on the test split.
from sklearn import svm

clf = svm.SVC().fit(X_train, y_train)

y_pred = clf.predict(X_test)
# Print each metric in turn; captions match the other metric cells.
for caption, metric in (('Accuracy = ', accuracy_score),
                        ('Precision = ', precision_score),
                        ('Recall = ', recall_score),
                        ('F1 Score = ', f1_score)):
    print(caption, metric(y_test, y_pred))

In [0]:
# Decision Tree: accuracy, precision, recall and F1 on the test split.
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier().fit(X_train, y_train)

y_pred = clf.predict(X_test)
# Print each metric in turn; captions match the other metric cells.
for caption, metric in (('Accuracy = ', accuracy_score),
                        ('Precision = ', precision_score),
                        ('Recall = ', recall_score),
                        ('F1 Score = ', f1_score)):
    print(caption, metric(y_test, y_pred))

## Classification Report

In [0]:
# Toy example: text report of per-class metrics for a three-class problem.
from sklearn.metrics import classification_report
y_true = ["cat", "ant", "cat", "cat", "ant", "bird"]
y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"]
# `labels` selects the classes to report and their order.
print(classification_report(y_true, y_pred, labels=["ant", "bird", "cat"]))

## Ex: Classification Report

In [0]:
# Load the Iris dataset (rebinds X for this exercise).
import pandas as pd
dataset_path = "https://raw.githubusercontent.com/pandas-dev/pandas/master/pandas/tests/data/iris.csv"
                     
X = pd.read_csv(dataset_path)

In [0]:
# Drop incomplete rows, then move the 'Name' column out of X as the target.
X = X.dropna()
y = X.pop('Name')

In [0]:
# Encode the class names as integer codes.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)


In [0]:
# Scale the features; the result is wrapped in a DataFrame to keep column names.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [0]:
# 70/30 split of the scaled features with a fixed seed.
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,test_size=0.3,random_state=100)

In [0]:
# KNN classifier: classification report on the test split.
from sklearn import neighbors
from sklearn.metrics import classification_report

# Train with the default neighbour count.
clf = neighbors.KNeighborsClassifier().fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [0]:
# Logistic Regression: classification report on the test split.
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression().fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [0]:
# SVM: classification report on the test split.
from sklearn import svm

clf = svm.SVC().fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [0]:
# Gaussian Naive Bayes: classification report on the test split.
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB().fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [0]:
# Decision Tree: classification report on the test split.
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier().fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


In [0]:
# SGD classifier: classification report on the test split.
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier().fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

## ROC Curve and AUC

In [0]:
# Prepare a two-class, scaled version of the Iris data for the ROC/AUC examples.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

dataset_path = "https://raw.githubusercontent.com/pandas-dev/pandas/master/pandas/tests/data/iris.csv"

# Read the CSV and drop incomplete rows in one step.
X = pd.read_csv(dataset_path).dropna()

# Encode the species names, then merge class 2 into class 0 to get labels {0, 1}.
le = LabelEncoder()
y = le.fit_transform(X.pop('Name'))
y[y==2]=0

# Scale the features and keep the column names.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X)
X_scaled = pd.DataFrame(scaled_values, columns=X.columns)

# 70/30 split with a fixed seed.
X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,test_size=0.3,random_state=100)

In [0]:
# Load the Wine Quality dataset as a scaled binary problem for the ROC/AUC examples.
# NOTE: this cell rebinds X, y, X_scaled and the train/test splits, replacing
# the Iris variables from the preceding cell when run top to bottom.

import pandas as pd
from sklearn.model_selection import train_test_split
# MinMaxScaler is imported here so the cell no longer depends on an earlier
# cell having imported it; the duplicated train_test_split import is dropped.
from sklearn.preprocessing import MinMaxScaler

dataset_path = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# The file is semicolon-separated.
X = pd.read_csv(dataset_path,sep=";")

X = X.dropna()
# pop() removes 'quality' from X in place and returns it as the target.
y = X.pop('quality')

# Binarize the target: scores below 6 become 0, scores of 6 or more become 1.
# The order matters: entries already set to 0 do not match the second mask.
y[y<6]=0
y[y>=6]=1

scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# 70/30 split of the scaled features with a fixed seed.
X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,test_size=0.3,random_state=100)
y_train

In [0]:
# KNN classifier: AUC score and ROC curve points on the test split.
from sklearn import neighbors
from sklearn.metrics import roc_curve,roc_auc_score

clf = neighbors.KNeighborsClassifier().fit(X_train, y_train)

# predict_proba() output is sliced to column 1, which is used as the
# positive-class score.
y_probs = clf.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_probs)
print("AUC = ", auc_score)

# fpr/tpr are the curve coordinates plotted in the next cell.
fpr, tpr, threshold = roc_curve(y_test,y_probs)

In [0]:
# Plot the ROC curve from the fpr/tpr arrays computed in the previous cell.
import matplotlib.pyplot as plt

plt.plot(fpr,tpr,'o-')
# NOTE(review): 'Postive' in the label below is a typo for 'Positive'.
plt.xlabel('False Postive Rate')
plt.ylabel('True Positive Rate')

In [0]:
# Logistic Regression: AUC score and ROC curve points on the test split.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve,roc_auc_score

clf = LogisticRegression().fit(X_train, y_train)

# predict_proba() output is sliced to column 1, which is used as the
# positive-class score.
y_probs = clf.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_probs)
print("AUC = ", auc_score)

# fpr/tpr are the curve coordinates plotted in the next cell.
fpr, tpr, threshold = roc_curve(y_test,y_probs)

In [0]:
# Plot the ROC curve from the fpr/tpr arrays computed in the previous cell.
import matplotlib.pyplot as plt

plt.plot(fpr,tpr,'o-')
# NOTE(review): 'Postive' in the label below is a typo for 'Positive'.
plt.xlabel('False Postive Rate')
plt.ylabel('True Positive Rate')

# Topic 3 Regression

## Load the Data

In [0]:
# Load the Boston housing CSV into a DataFrame (rebinds X for the regression topic).
import pandas as pd
dataset_path = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"                     
X = pd.read_csv(dataset_path)


### Prepare the Data

In [0]:
# Remove rows with missing data.
X = X.dropna()

In [0]:
# Define input and output: pop() removes 'medv' from X in place and returns
# it as the regression target.
y = X.pop('medv')

In [0]:
# Scale the features; wrap the result in a DataFrame to keep the column names.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [0]:
# Split and randomize the data (30% held out, fixed seed).
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,test_size=0.3,random_state=100)

In [0]:
# Display the training features.
X_train

## Define the Model

In [0]:
# Linear Regression model: create the (untrained) estimator as `lm`.

from sklearn import linear_model 

lm = linear_model.LinearRegression()


### Train the Model

In [0]:
# Fit the linear model on the training split; the fitted estimator is the cell output.
lm.fit(X_train,y_train)

## Evaluate the Model

In [0]:
# Report mean squared error and R-squared for the linear model on both splits.
from sklearn.metrics import mean_squared_error, r2_score

# Training split.
yhat = lm.predict(X_train)
mse, rsq = mean_squared_error(y_train, yhat), r2_score(y_train, yhat)
print('Mean Squared Error, Training: ',mse)
print('R-square, Training: ',rsq)

# Test split (yhat, mse and rsq end up holding the test-split values).
yhat = lm.predict(X_test)
mse, rsq = mean_squared_error(y_test, yhat), r2_score(y_test, yhat)
print('Mean Squared Error, Testing: ',mse)
print('R-square, Testing: ',rsq)

In [0]:
# Scatter plot of predicted versus actual target values on the test split.
%matplotlib inline
import matplotlib.pyplot as plt 

yhat = lm.predict(X_test)


plt.scatter(y_test,yhat)
plt.xlabel('Actual Housing Price')
plt.ylabel('Predicted Price')
plt.axis('equal')
plt.axis('square')
# Start both axes at 0; ylim reuses the x upper bound so both axes span the same range.
plt.xlim([0,plt.xlim()[1]])
plt.ylim([0,plt.xlim()[1]])
# Red y = x reference line: points on it are perfect predictions.
plt.plot([0, 100], [0, 100],'r')

## Regularizations

### Ridge Regularization

In [0]:
# Ridge regression with penalty strength alpha=0.01, trained on the training split.
from sklearn.linear_model import Ridge
rr = Ridge(alpha=0.01) 
rr.fit(X_train,y_train)

In [0]:
# Report mean squared error and R-squared for the Ridge model on both splits.
from sklearn.metrics import mean_squared_error, r2_score

# Training split.
yhat = rr.predict(X_train)
mse, rsq = mean_squared_error(y_train, yhat), r2_score(y_train, yhat)
print('Mean Squared Error, Training: ',mse)
print('R-square, Training: ',rsq)

# Test split (yhat, mse and rsq end up holding the test-split values).
yhat = rr.predict(X_test)
mse, rsq = mean_squared_error(y_test, yhat), r2_score(y_test, yhat)
print('Mean Squared Error, Testing: ',mse)
print('R-square, Testing: ',rsq)

### Lasso Regularization

In [0]:
# Lasso regression with penalty strength alpha=0.01, trained on the training split.
# Note: the name `lr` here is the Lasso model, not a logistic regression.
from sklearn.linear_model import Lasso
lr = Lasso(alpha=0.01) 
lr.fit(X_train,y_train)

In [0]:
# Report mean squared error and R-squared for the Lasso model on both splits.
from sklearn.metrics import mean_squared_error, r2_score

# Training split.
yhat = lr.predict(X_train)
mse, rsq = mean_squared_error(y_train, yhat), r2_score(y_train, yhat)
print('Mean Squared Error, Training: ',mse)
print('R-square, Training: ',rsq)

# Test split (yhat, mse and rsq end up holding the test-split values).
yhat = lr.predict(X_test)
mse, rsq = mean_squared_error(y_test, yhat), r2_score(y_test, yhat)
print('Mean Squared Error, Testing: ',mse)
print('R-square, Testing: ',rsq)

### Elastic Net Regularization

In [0]:
# Elastic Net regression: alpha=0.01 penalty strength, l1_ratio=0.5 mixes the
# L1 and L2 penalties equally. Trained on the training split.
from sklearn.linear_model import ElasticNet
er = ElasticNet(alpha=0.01,l1_ratio=0.5)
er.fit(X_train,y_train)

In [0]:
# Report mean squared error and R-squared for the Elastic Net model on both splits.
from sklearn.metrics import mean_squared_error, r2_score

# Training split.
yhat = er.predict(X_train)
mse, rsq = mean_squared_error(y_train, yhat), r2_score(y_train, yhat)
print('Mean Squared Error, Training: ',mse)
print('R-square, Training: ',rsq)

# Test split (yhat, mse and rsq end up holding the test-split values).
yhat = er.predict(X_test)
mse, rsq = mean_squared_error(y_test, yhat), r2_score(y_test, yhat)
print('Mean Squared Error, Testing: ',mse)
print('R-square, Testing: ',rsq)

## Ex: Linear Regression

In [0]:
# Load the Iris dataset for the regression exercise (rebinds X).
import pandas as pd
dataset_path = "https://raw.githubusercontent.com/pandas-dev/pandas/master/pandas/tests/data/iris.csv"
                     
X = pd.read_csv(dataset_path)
X

In [0]:
# Drop incomplete rows and discard the 'Name' column; the popped value is not kept.
X = X.dropna()
X.pop('Name')
X

In [0]:
# Use 'SepalWidth' as the regression target; the remaining columns are the inputs.
y = X.pop('SepalWidth')

In [0]:
# Scale the input features and keep the column names.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)


In [0]:
# Split and randomize the data (30% held out, fixed seed).
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,test_size=0.3,random_state=100)

In [0]:
# Linear Regression model: create the estimator and fit it on the training
# split (rebinds `lm` from the Boston housing section).

from sklearn import linear_model 

lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)

In [0]:
# Report mean squared error and R-squared for the sepal-width model on both splits.
from sklearn.metrics import mean_squared_error, r2_score

# Training split.
yhat = lm.predict(X_train)
mse, rsq = mean_squared_error(y_train, yhat), r2_score(y_train, yhat)
print('Mean Squared Error, Training: ',mse)
print('R-square, Training: ',rsq)

# Test split (yhat, mse and rsq end up holding the test-split values).
yhat = lm.predict(X_test)
mse, rsq = mean_squared_error(y_test, yhat), r2_score(y_test, yhat)
print('Mean Squared Error, Testing: ',mse)
print('R-square, Testing: ',rsq)

In [0]:
# Scatter plot of predicted versus actual sepal width on the test split.
%matplotlib inline
import matplotlib.pyplot as plt 

yhat = lm.predict(X_test)


plt.scatter(y_test,yhat)
plt.xlabel('Actual Sepal Width')
plt.ylabel('Predicted Sepal Width')
plt.axis('equal')
plt.axis('square')
# Start both axes at 0; ylim reuses the x upper bound so both axes span the same range.
plt.xlim([0,plt.xlim()[1]])
plt.ylim([0,plt.xlim()[1]])
# Red y = x reference line: points on it are perfect predictions.
plt.plot([0, 100], [0, 100],'r')

# Topic 4 Clustering

## Load the Data

In [0]:
# Generate a synthetic 2-D dataset of three blobs for the clustering demo.
# `sklearn.datasets.samples_generator` was removed from scikit-learn;
# make_blobs is imported from `sklearn.datasets` directly.
from sklearn.datasets import make_blobs

centers = [[1,1],[1.5,1.5],[2,2]]
# random_state makes the generated points (and every plot and score derived
# from them) reproducible across runs.
X,y = make_blobs(n_samples=100,centers = centers,cluster_std=0.1,random_state=100)

import matplotlib.pyplot as plt
# Plot the raw points without cluster colouring.
plt.scatter(X[:,0],X[:,1])

## Load the Model

In [0]:
# Create a KMeans model with 3 clusters; random_state fixes the initialization.
from sklearn.cluster import KMeans

cluster = KMeans(n_clusters=3,random_state=10)

## Train the Model

In [0]:
# Fit KMeans on the points; clustering is unsupervised, so no labels are passed.
cluster.fit(X)

## Evaluate the Model

In [0]:
# Cluster index assigned to each sample.
cluster.labels_

In [0]:
# Coordinates of the fitted cluster centres.
cluster.cluster_centers_

In [0]:
# Side-by-side comparison: left panel coloured by the generating blob (y),
# right panel coloured by the KMeans cluster assignment.
# Cluster indices are arbitrary, so colours need not match between the panels.
plt.subplot(1,2,1)
plt.scatter(X[:,0],X[:,1],c=y)
plt.title("Actual")
plt.subplot(1,2,2)
plt.scatter(X[:,0],X[:,1],c=cluster.labels_)
plt.title("Prediction")

## Silhouette Analysis

In [0]:
# Compare candidate cluster counts (2, 3, 4) using the silhouette score.
from sklearn.metrics import silhouette_score

for i in range(2,5):
    # Refit KMeans for each candidate count; `cluster` is rebound each
    # iteration and ends as the 4-cluster model.
    cluster = KMeans(n_clusters=i,random_state=10)
    cluster.fit(X)
    s = silhouette_score(X, cluster.labels_)
    print('Cluster: ',i, 'Silhouette Score :',s )

## Ex: KMeans Clustering

In [0]:
# Prepare the Iris data for the KMeans exercise: features in X, encoded species in y.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

dataset_path = "https://raw.githubusercontent.com/pandas-dev/pandas/master/pandas/tests/data/iris.csv"

# Read the CSV and drop incomplete rows in one step.
X = pd.read_csv(dataset_path).dropna()

# Move the 'Name' column out of X and encode it as integer class codes.
le = LabelEncoder()
y = le.fit_transform(X.pop('Name'))

# Display the feature table.
X

In [0]:
# Standardize the features with StandardScaler; the result is wrapped in a
# DataFrame to keep the column names.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [0]:
# Compare candidate cluster counts (2, 3, 4) on the standardized Iris features.
# NOTE: KMeans is not imported in this cell; it relies on the import made in
# the earlier KMeans section.
from sklearn.metrics import silhouette_score

for i in range(2,5):
    cluster = KMeans(n_clusters=i,random_state=10)
    cluster.fit(X_scaled)
    s = silhouette_score(X_scaled, cluster.labels_)
    print('Cluster: ',i, 'Silhouette Score :',s )

In [0]:
# Fit the final KMeans model with 2 clusters on the standardized features.
from sklearn.cluster import KMeans

cluster = KMeans(n_clusters=2,random_state=10)
cluster.fit(X_scaled)

In [0]:
# Plot two of the original (unscaled) feature columns, coloured by the true
# species codes (left) and by the KMeans cluster labels (right).
x1 = 'SepalWidth'
x2 = 'SepalLength'
plt.subplot(1,2,1)
plt.scatter(X[x1],X[x2],c=y)
plt.title("Actual")
plt.subplot(1,2,2)
plt.scatter(X[x1],X[x2],c=cluster.labels_)
plt.title("Prediction")

## Hierarchical Agglomerative Clustering

In [0]:
# Generate a synthetic 2-D dataset of three blobs for the hierarchical clustering demo.
# `sklearn.datasets.samples_generator` was removed from scikit-learn;
# make_blobs is imported from `sklearn.datasets` directly.
from sklearn.datasets import make_blobs

centers = [[1,1],[1.5,1.5],[2,2]]
# random_state makes the generated points (and every plot derived from them)
# reproducible across runs.
X,y = make_blobs(n_samples=100,centers = centers,cluster_std=0.1,random_state=100)

import matplotlib.pyplot as plt
# Plot the raw points without cluster colouring.
plt.scatter(X[:,0],X[:,1])

In [0]:
# Build the hierarchical linkage of the points and draw its dendrogram.
from scipy.cluster.hierarchy import dendrogram, linkage

Z = linkage(X)
# dendrogram() draws the tree on the current matplotlib axes itself and
# returns a dict describing the plot. The original passed that dict to
# plt.plot(), which is not plottable data and raises an error.
d = dendrogram(Z)
plt.show()

In [0]:
# Fit agglomerative (bottom-up hierarchical) clustering with 3 clusters.
from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(n_clusters = 3)
cluster.fit(X)

In [0]:
# Side-by-side comparison: left panel coloured by the generating blob (y),
# right panel coloured by the agglomerative cluster assignment.
plt.subplot(1,2,1)
plt.scatter(X[:,0],X[:,1],c=y)
plt.title("Actual")
plt.subplot(1,2,2)
plt.scatter(X[:,0],X[:,1],c=cluster.labels_)
plt.title("Prediction")

## Ex: Hierarchical Agglomerative Clustering

In [0]:
# Prepare the Iris data for the hierarchical clustering exercise.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

dataset_path = "https://raw.githubusercontent.com/pandas-dev/pandas/master/pandas/tests/data/iris.csv"

# Read the CSV and drop incomplete rows in one step.
X = pd.read_csv(dataset_path).dropna()

# Move the 'Name' column out of X and encode it as integer class codes.
le = LabelEncoder()
y = le.fit_transform(X.pop('Name'))

# Display the encoded labels.
y

In [0]:
# Standardize the features with StandardScaler; the result is wrapped in a
# DataFrame to keep the column names.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [0]:
# Build the hierarchical linkage of the standardized Iris features and draw its dendrogram.
from scipy.cluster.hierarchy import dendrogram, linkage

Z = linkage(X_scaled)
# dendrogram() draws the tree on the current matplotlib axes itself and
# returns a dict describing the plot. The original passed that dict to
# plt.plot(), which is not plottable data and raises an error.
d = dendrogram(Z)
plt.show()

In [0]:
# Compare candidate cluster counts (2, 3, 4) for agglomerative clustering
# using the silhouette score.
# NOTE: AgglomerativeClustering is not imported in this cell; it relies on
# the import made in the earlier hierarchical clustering section.
from sklearn.metrics import silhouette_score

for i in range(2,5):
    cluster = AgglomerativeClustering(n_clusters = i)
    cluster.fit(X_scaled)
    s = silhouette_score(X_scaled, cluster.labels_)
    print('Cluster: ',i, 'Silhouette Score :',s )

In [0]:
# Fit the final agglomerative model with 2 clusters on the standardized features.
from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(n_clusters = 2)
cluster.fit(X_scaled)

In [0]:
# Plot two of the original (unscaled) feature columns, coloured by the true
# species codes (left) and by the agglomerative cluster labels (right).
x1 = 'SepalWidth'
x2 = 'SepalLength'
plt.subplot(1,2,1)
plt.scatter(X[x1],X[x2],c=y)
plt.title("Actual")
plt.subplot(1,2,2)
plt.scatter(X[x1],X[x2],c=cluster.labels_)
plt.title("Prediction")

# Topic 5 Dimension Reduction

In [0]:
# Toy PCA example on six hand-written 2-D points.
import numpy as np

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])

from sklearn import decomposition
# n_components=2 equals the input dimension, so nothing is discarded here.
pca = decomposition.PCA(n_components=2)
# Fit PCA and display the points projected onto the principal components.
pca.fit_transform(X)

In [0]:
# Variance explained by each principal component.
pca.explained_variance_

In [0]:
# The same quantity as a fraction of the total variance.
pca.explained_variance_ratio_

## PCA on Iris Dataset

In [0]:
# Prepare the Iris data for PCA: features in X, encoded species in y.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

dataset_path = "https://raw.githubusercontent.com/pandas-dev/pandas/master/pandas/tests/data/iris.csv"

# Read the CSV and drop incomplete rows in one step.
X = pd.read_csv(dataset_path).dropna()

# Move the 'Name' column out of X and encode it as integer class codes.
le = LabelEncoder()
y = le.fit_transform(X.pop('Name'))

In [0]:
# Standardize the features before PCA; the result is wrapped in a DataFrame
# to keep the column names.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [0]:
# PCA with 4 components.
# NOTE(review): assumes X_scaled has four feature columns, in which case no
# dimensions are discarded — TODO confirm.
from sklearn.decomposition import PCA
pca = PCA(n_components=4)

In [0]:
# Fit PCA on the standardized features; X_t holds the projected data as an array.
X_t= pca.fit_transform(X_scaled)

In [0]:
# Variance explained by each principal component.
pca.explained_variance_

In [0]:
# The same quantity as a fraction of the total variance.
pca.explained_variance_ratio_

## Exercise: PCA on K-Means Clustering

In [0]:
# Cluster the Iris data twice with KMeans: once on the standardized features
# and once on their PCA projection, to compare the results.
from sklearn.cluster import KMeans

# random_state=10 matches the other KMeans models in this notebook and makes
# the cluster assignments reproducible between runs; the original left both
# models unseeded.
cluster = KMeans(n_clusters=2,random_state=10)
cluster.fit(X_scaled)

cluster2 = KMeans(n_clusters=2,random_state=10)
cluster2.fit(X_t)

In [0]:
# 2x2 comparison grid.
# Top row: two original feature columns coloured by the true species codes
# (left) and by the KMeans labels fitted on the standardized features (right).
# Bottom row: the first two PCA components coloured by the true species codes
# (left) and by the KMeans labels fitted on the PCA projection (right).
import matplotlib.pyplot as plt

x1 = 'SepalWidth'
x2 = 'SepalLength'
plt.subplot(2,2,1)
plt.scatter(X[x1],X[x2],c=y)
plt.title("Actual")
plt.subplot(2,2,2)
plt.scatter(X[x1],X[x2],c=cluster.labels_)
plt.title("Prediction")

plt.subplot(2,2,3)
plt.scatter(X_t[:,0],X_t[:,1],c=y)
plt.title('PCA')

plt.subplot(2,2,4)
plt.scatter(X_t[:,0],X_t[:,1],c=cluster2.labels_)
plt.title('PCA Prediction')