# Chapter 9: Unsupervised Learning Techniques

In [1]:
import warnings
warnings.filterwarnings("ignore")

## K-Means

In [2]:
from sklearn.cluster import KMeans

In [3]:
from sklearn.datasets import load_digits

In [4]:
data = load_digits()
X = data['data']
y = data['target']

In [5]:
k = 5
kmeans = KMeans(n_clusters = k)
y_pred = kmeans.fit_predict(X)

In [6]:
kmeans.labels_#注意这里是unsupervised learning， 所以data本来就没有target，这个label是predict的，不一定是对的

array([2, 0, 0, ..., 0, 3, 0], dtype=int32)

In [7]:
kmeans.cluster_centers_ # The centroids that the algorithm found

array([[ 0.00000000e+00,  5.20547945e-01,  6.92602740e+00,
         1.31315068e+01,  9.96164384e+00,  2.78356164e+00,
         1.04109589e-01,  1.38777878e-16,  1.91780822e-02,
         2.98904110e+00,  1.05890411e+01,  1.33123288e+01,
         1.26712329e+01,  6.08493151e+00,  4.16438356e-01,
        -8.32667268e-17,  1.09589041e-02,  2.84931507e+00,
         7.99452055e+00,  9.63013699e+00,  1.26301370e+01,
         6.14794521e+00,  4.02739726e-01,  3.46944695e-17,
         2.73972603e-03,  1.07945205e+00,  4.94794521e+00,
         1.05342466e+01,  1.34164384e+01,  3.77534247e+00,
         1.23287671e-01,  8.67361738e-19,  0.00000000e+00,
         3.97260274e-01,  4.60273973e+00,  1.23150685e+01,
         1.22054795e+01,  1.98630137e+00,  3.01369863e-02,
         0.00000000e+00,  3.46944695e-18,  6.65753425e-01,
         7.25753425e+00,  1.19287671e+01,  1.01013699e+01,
         3.32054795e+00,  5.83561644e-01,  5.47945205e-03,
         8.21917808e-03,  1.02191781e+00,  1.00246575e+0

之后predict也只是给你一个label，也就是分配到的cluster

## Mini-batch K-means

In [8]:
from sklearn.cluster import MiniBatchKMeans

In [9]:
minibatch_kmeans = MiniBatchKMeans(n_clusters=5)
minibatch_kmeans.fit(X)
#Each iteration, instead of using the whole dataset, only use a batch data to train the model

MiniBatchKMeans(n_clusters=5)

In [10]:
from sklearn.metrics import silhouette_score #Calculate silhouette_score
silhouette_score(X, kmeans.labels_)

0.1380278237544065

## Using Clustering for Preprocessing

In [11]:
from sklearn.datasets import load_digits

In [12]:
X_digits, y_digits = load_digits(return_X_y=True)

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits)

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

LogisticRegression()

In [17]:
log_reg.score(X_test, y_test)

0.9466666666666667

In [18]:
from sklearn.pipeline import Pipeline

In [19]:
pipeline = Pipeline(steps = [
    ('kmeans', KMeans(n_clusters = 50)),
    ('log_reg', LogisticRegression())
])

In [20]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('kmeans', KMeans(n_clusters=50)),
                ('log_reg', LogisticRegression())])

In [21]:
pipeline.score(X_test, y_test)

0.9555555555555556

In [22]:
# Now, use CV to find the optimal number of clusters for preprocessing

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
param_grid = dict(kmeans__n_clusters = range(2, 100))

In [25]:
grid_clf = GridSearchCV(pipeline, param_grid, cv = 3, verbose = 2)

In [26]:
grid_clf.fit(X_train, y_train)

Fitting 3 folds for each of 98 candidates, totalling 294 fits
[CV] kmeans__n_clusters=2 ............................................
[CV] ............................. kmeans__n_clusters=2, total=   0.1s
[CV] kmeans__n_clusters=2 ............................................
[CV] ............................. kmeans__n_clusters=2, total=   0.1s
[CV] kmeans__n_clusters=2 ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] ............................. kmeans__n_clusters=2, total=   0.1s
[CV] kmeans__n_clusters=3 ............................................
[CV] ............................. kmeans__n_clusters=3, total=   0.1s
[CV] kmeans__n_clusters=3 ............................................
[CV] ............................. kmeans__n_clusters=3, total=   0.1s
[CV] kmeans__n_clusters=3 ............................................
[CV] ............................. kmeans__n_clusters=3, total=   0.1s
[CV] kmeans__n_clusters=4 ............................................
[CV] ............................. kmeans__n_clusters=4, total=   0.1s
[CV] kmeans__n_clusters=4 ............................................
[CV] ............................. kmeans__n_clusters=4, total=   0.1s
[CV] kmeans__n_clusters=4 ............................................
[CV] ............................. kmeans__n_clusters=4, total=   0.1s
[CV] kmeans__n_clusters=5 ............................................
[CV] .

[CV] ............................ kmeans__n_clusters=22, total=   0.2s
[CV] kmeans__n_clusters=22 ...........................................
[CV] ............................ kmeans__n_clusters=22, total=   0.2s
[CV] kmeans__n_clusters=23 ...........................................
[CV] ............................ kmeans__n_clusters=23, total=   0.2s
[CV] kmeans__n_clusters=23 ...........................................
[CV] ............................ kmeans__n_clusters=23, total=   0.2s
[CV] kmeans__n_clusters=23 ...........................................
[CV] ............................ kmeans__n_clusters=23, total=   0.2s
[CV] kmeans__n_clusters=24 ...........................................
[CV] ............................ kmeans__n_clusters=24, total=   0.2s
[CV] kmeans__n_clusters=24 ...........................................
[CV] ............................ kmeans__n_clusters=24, total=   0.2s
[CV] kmeans__n_clusters=24 ...........................................
[CV] .

[CV] ............................ kmeans__n_clusters=41, total=   0.2s
[CV] kmeans__n_clusters=42 ...........................................
[CV] ............................ kmeans__n_clusters=42, total=   0.3s
[CV] kmeans__n_clusters=42 ...........................................
[CV] ............................ kmeans__n_clusters=42, total=   0.3s
[CV] kmeans__n_clusters=42 ...........................................
[CV] ............................ kmeans__n_clusters=42, total=   0.3s
[CV] kmeans__n_clusters=43 ...........................................
[CV] ............................ kmeans__n_clusters=43, total=   0.3s
[CV] kmeans__n_clusters=43 ...........................................
[CV] ............................ kmeans__n_clusters=43, total=   0.3s
[CV] kmeans__n_clusters=43 ...........................................
[CV] ............................ kmeans__n_clusters=43, total=   0.3s
[CV] kmeans__n_clusters=44 ...........................................
[CV] .

[CV] ............................ kmeans__n_clusters=61, total=   0.3s
[CV] kmeans__n_clusters=61 ...........................................
[CV] ............................ kmeans__n_clusters=61, total=   0.3s
[CV] kmeans__n_clusters=61 ...........................................
[CV] ............................ kmeans__n_clusters=61, total=   0.3s
[CV] kmeans__n_clusters=62 ...........................................
[CV] ............................ kmeans__n_clusters=62, total=   0.3s
[CV] kmeans__n_clusters=62 ...........................................
[CV] ............................ kmeans__n_clusters=62, total=   0.3s
[CV] kmeans__n_clusters=62 ...........................................
[CV] ............................ kmeans__n_clusters=62, total=   0.3s
[CV] kmeans__n_clusters=63 ...........................................
[CV] ............................ kmeans__n_clusters=63, total=   0.3s
[CV] kmeans__n_clusters=63 ...........................................
[CV] .

[CV] ............................ kmeans__n_clusters=80, total=   0.4s
[CV] kmeans__n_clusters=80 ...........................................
[CV] ............................ kmeans__n_clusters=80, total=   0.4s
[CV] kmeans__n_clusters=81 ...........................................
[CV] ............................ kmeans__n_clusters=81, total=   0.4s
[CV] kmeans__n_clusters=81 ...........................................
[CV] ............................ kmeans__n_clusters=81, total=   0.4s
[CV] kmeans__n_clusters=81 ...........................................
[CV] ............................ kmeans__n_clusters=81, total=   0.4s
[CV] kmeans__n_clusters=82 ...........................................
[CV] ............................ kmeans__n_clusters=82, total=   0.4s
[CV] kmeans__n_clusters=82 ...........................................
[CV] ............................ kmeans__n_clusters=82, total=   0.4s
[CV] kmeans__n_clusters=82 ...........................................
[CV] .

[CV] ............................ kmeans__n_clusters=99, total=   0.5s


[Parallel(n_jobs=1)]: Done 294 out of 294 | elapsed:  1.4min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('kmeans', KMeans(n_clusters=50)),
                                       ('log_reg', LogisticRegression())]),
             param_grid={'kmeans__n_clusters': range(2, 100)}, verbose=2)

In [27]:
grid_clf.best_params_

{'kmeans__n_clusters': 51}

In [28]:
grid_clf.score(X_test, y_test)
#哈哈哈搜索了best parameter以后确实高了

0.9511111111111111

## Using Clustering for Semi-Supervised Learning 

In [29]:
import numpy as np

In [30]:
n_labeled = 50
log_reg = LogisticRegression()
log_reg.fit(X_train[:n_labeled], y_train[:n_labeled])

LogisticRegression()

In [31]:
log_reg.score(X_test, y_test)

0.8044444444444444

In [32]:
k=50
kmeans = KMeans(n_clusters=k)
X_digits_dist = kmeans.fit_transform(X_train)

In [33]:
representative_digit_idx = np.argmin(X_digits_dist, axis = 0)
X_representative_digits = X_train[representative_digit_idx]

In [34]:
#有了representative的digit以后，手动label他们（因为你现在是semi supervised）
#之后就相当于你的dataset离supervised learning又近了一步，所以准确率会提高很多

## DBSCAN

In [53]:
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons

In [65]:
X, y = make_moons(n_samples = 1000, noise=0.05)
dbscan = DBSCAN(eps=0.2, min_samples=5)
dbscan.fit(X)

DBSCAN(eps=0.2)

In [66]:
#dbscan.labels_
# Get the label of each instance

In [67]:
len(dbscan.core_sample_indices_)
#Get the number of core instances

1000

In [68]:
# dbscan.core_sample_indices_
# Get the index of core instances

In [69]:
dbscan.components_
# Get the 具体数值 of each core instances

array([[ 0.0635868 ,  0.32993941],
       [ 0.72035855,  0.71167742],
       [-1.08102071,  0.23073722],
       ...,
       [-1.03050607,  0.40768679],
       [ 1.29503227, -0.61086176],
       [ 0.45585924, -0.32180326]])

调整eps的数值来控制Model的表现，比如这里0.05就太小了，调整到0.2比较合适

`DBSCAN` class does not have a `predict()` method

In [70]:
from sklearn.neighbors import KNeighborsClassifier

In [71]:
knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(dbscan.components_, dbscan.labels_[dbscan.core_sample_indices_])

KNeighborsClassifier(n_neighbors=50)

In [72]:
#Now we are able to predict new instances


In [73]:
X_new = np.array([[-0.5, 0], [0, 0.5], [1, -0.1], [2, 1]])

In [74]:
knn.predict(X_new)

array([1, 0, 1, 0])

In [75]:
knn.predict_proba(X_new)

array([[0.12, 0.88],
       [1.  , 0.  ],
       [0.2 , 0.8 ],
       [1.  , 0.  ]])

## Gaussian Mixtures

In [76]:
from sklearn.mixture import GaussianMixture

In [77]:
gm = GaussianMixture(n_components=3, n_init=10)
gm.fit(X)
#n_components应该说的是cluster的数量
#n_init是说要run多少次，因为和KMEANS一样，会converge to poor solution

GaussianMixture(n_components=3, n_init=10)

In [78]:
gm.weights_
#每个cluster对应不同的weight，加起来是1

array([0.60275042, 0.19559614, 0.20165343])

In [79]:
gm.means_

array([[ 0.50945883,  0.2407422 ],
       [ 1.76298738, -0.03600732],
       [-0.75394844,  0.55537708]])

In [80]:
gm.covariances_

array([[[ 0.17830223, -0.11096681],
        [-0.11096681,  0.29096811]],

       [[ 0.04507744,  0.05738972],
        [ 0.05738972,  0.08756595]],

       [[ 0.05213784,  0.06119756],
        [ 0.06119756,  0.0885645 ]]])

In [81]:
gm.converged_

True

In [82]:
gm.n_iter_

16

In [84]:
# gm.predict(X)
# gm.predict_proba(X)

### Anomaly Detection Using Gaussian Mixtures

In [85]:
densities = gm.score_samples(X)
density_threshold = np.percentile(densities, 4)
anomalies = X[densities < density_threshold]

In [87]:
anomalies
#Find the anomaly we want, under the threshold of 4%

array([[-0.9834215 ,  0.03422252],
       [ 1.20355647, -0.51272969],
       [ 1.98979515, -0.06112515],
       [ 1.91785366, -0.10352719],
       [-0.12047189,  1.0866952 ],
       [ 1.90209622,  0.39532016],
       [ 1.23818497, -0.49762548],
       [-0.21443643,  0.89816682],
       [ 1.25773426, -0.46113999],
       [ 2.06664259,  0.08380427],
       [ 1.21882337, -0.42179839],
       [-0.93372901, -0.07212753],
       [ 1.26917976, -0.44565073],
       [ 1.86727705,  0.41586047],
       [-1.00563163,  0.01814774],
       [ 1.22684328, -0.41464992],
       [-0.91503868,  0.07297719],
       [-0.27531984,  0.87615251],
       [ 1.97604814,  0.4899886 ],
       [-0.27394591,  0.84915056],
       [ 1.27047612, -0.45924474],
       [-0.96757147, -0.01168791],
       [ 1.28022402, -0.40455712],
       [ 1.26204747, -0.43757209],
       [-0.9685556 ,  0.00312452],
       [ 0.1460889 , -0.30505833],
       [ 1.24778652, -0.47959412],
       [ 1.23554651, -0.4825744 ],
       [-0.9716579 ,

### Selecting the Number of Clusters

In [88]:
gm.bic(X)

2759.9528755202514

In [90]:
gm.aic(X)

2676.521035777555

### Bayesian Gaussian Mixture Models

In [91]:
from sklearn.mixture import BayesianGaussianMixture

In [92]:
bgm = BayesianGaussianMixture(n_components=10, n_init=10)
bgm.fit(X)

BayesianGaussianMixture(n_components=10, n_init=10)

In [93]:
np.round(bgm.weights_, 2)
#自动把不必要的cluster的系数设置成0!
#所以只要你设置的n_components > actual # of cluster 就好了

array([0.15, 0.12, 0.15, 0.  , 0.16, 0.11, 0.11, 0.18, 0.  , 0.  ])