In [25]:
# Import the packages
from sklearn import datasets
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as sm

# loading the iris dataset 
iris = datasets.load_iris() 

# X -> features, y -> label 
x = pd.DataFrame(iris.data, columns=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])
y = pd.DataFrame(iris.target, columns=['Target'])

In [50]:
# create a model object with 3 clusters
# http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
# http://scikit-learn.org/stable/modules/clustering.html#k-means
model = KMeans(n_clusters=3)
model.fit(x) # unsupervised: no knowledge of target y
model.labels_ # note coincidence that label is the same as target, could be different. can be manually fixed by np.choose() depending on which labels are switched

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1])

In [56]:
# Performance Metrics
sm.accuracy_score(y, y_pred)

0.8933333333333333

In [57]:
# Confusion Matrix
sm.confusion_matrix(y, y_pred)

#correctly identifed all 0 classes as 0’s
#correctly classified 48 class 1’s but miss-classified 2 class 1’s as class 2
#correctly classified 36 class 2’s but miss-classified 14 class 2’s as class 1

array([[50,  0,  0],
       [ 0, 48,  2],
       [ 0, 14, 36]], dtype=int64)

In [58]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      0.94      0.97        16
           2       0.90      1.00      0.95         9

    accuracy                           0.97        38
   macro avg       0.97      0.98      0.97        38
weighted avg       0.98      0.97      0.97        38



In [110]:
# importing necessary libraries 
from sklearn import datasets 
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
  
# loading the iris dataset 
iris = datasets.load_iris() 

### reduce dimension with PCA ------------------------------------------------
#standardize data
sc = StandardScaler().fit(x)
X_std = sc.transform(x)

#If n_components is not set then all components are stored 
sklearn_pca = PCA().fit(X_std)

#choose 3 components
sklearn_pca = PCA(n_components=3)
x_pca = sklearn_pca.fit_transform(X_std) 

### -------------------------------------------------------------------------

# K-means clustering
kmeans = KMeans(n_clusters=3)
kmeans.fit(x_pca)

y_pred_pca = kmeans.labels_
sm.accuracy_score(y, y_pred_pca)

0.8333333333333334

In [111]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y, y_pred_pca))
print(classification_report(y, y_pred_pca))

[[50  0  0]
 [ 0 39 11]
 [ 0 14 36]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       0.74      0.78      0.76        50
           2       0.77      0.72      0.74        50

    accuracy                           0.83       150
   macro avg       0.83      0.83      0.83       150
weighted avg       0.83      0.83      0.83       150

