# Chapter 9: Unsupervised Learning Techniques

In [1]:
import warnings
warnings.filterwarnings("ignore")

## K-Means

In [2]:
from sklearn.cluster import KMeans

In [3]:
from sklearn.datasets import load_digits

In [4]:
data = load_digits()
X = data['data']
y = data['target']

In [5]:
k = 5
kmeans = KMeans(n_clusters = k)
y_pred = kmeans.fit_predict(X)

In [6]:
kmeans.labels_#注意这里是unsupervised learning， 所以data本来就没有target，这个label是predict的，不一定是对的

array([2, 0, 0, ..., 0, 3, 0], dtype=int32)

In [7]:
kmeans.cluster_centers_ # The centroids that the algorithm found

array([[ 0.00000000e+00,  5.20547945e-01,  6.92602740e+00,
         1.31315068e+01,  9.96164384e+00,  2.78356164e+00,
         1.04109589e-01,  1.38777878e-16,  1.91780822e-02,
         2.98904110e+00,  1.05890411e+01,  1.33123288e+01,
         1.26712329e+01,  6.08493151e+00,  4.16438356e-01,
        -8.32667268e-17,  1.09589041e-02,  2.84931507e+00,
         7.99452055e+00,  9.63013699e+00,  1.26301370e+01,
         6.14794521e+00,  4.02739726e-01,  3.46944695e-17,
         2.73972603e-03,  1.07945205e+00,  4.94794521e+00,
         1.05342466e+01,  1.34164384e+01,  3.77534247e+00,
         1.23287671e-01,  8.67361738e-19,  0.00000000e+00,
         3.97260274e-01,  4.60273973e+00,  1.23150685e+01,
         1.22054795e+01,  1.98630137e+00,  3.01369863e-02,
         0.00000000e+00,  3.46944695e-18,  6.65753425e-01,
         7.25753425e+00,  1.19287671e+01,  1.01013699e+01,
         3.32054795e+00,  5.83561644e-01,  5.47945205e-03,
         8.21917808e-03,  1.02191781e+00,  1.00246575e+0

之后predict也只是给你一个label，也就是分配到的cluster

## Mini-batch K-means

In [8]:
from sklearn.cluster import MiniBatchKMeans

In [9]:
minibatch_kmeans = MiniBatchKMeans(n_clusters=5)
minibatch_kmeans.fit(X)
#Each iteration, instead of using the whole dataset, only use a batch data to train the model

MiniBatchKMeans(n_clusters=5)

In [10]:
from sklearn.metrics import silhouette_score #Calculate silhouette_score
silhouette_score(X, kmeans.labels_)

0.1380278237544065

## Using Clustering for Preprocessing

In [11]:
from sklearn.datasets import load_digits

In [12]:
X_digits, y_digits = load_digits(return_X_y=True)

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits)

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

LogisticRegression()

In [17]:
log_reg.score(X_test, y_test)

0.9466666666666667

In [18]:
from sklearn.pipeline import Pipeline

In [19]:
pipeline = Pipeline(steps = [
    ('kmeans', KMeans(n_clusters = 50)),
    ('log_reg', LogisticRegression())
])

In [20]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('kmeans', KMeans(n_clusters=50)),
                ('log_reg', LogisticRegression())])

In [21]:
pipeline.score(X_test, y_test)

0.9555555555555556

In [22]:
# Now, use CV to find the optimal number of clusters for preprocessing

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
param_grid = dict(kmeans__n_clusters = range(2, 100))

In [25]:
grid_clf = GridSearchCV(pipeline, param_grid, cv = 3, verbose = 2)

In [26]:
grid_clf.fit(X_train, y_train)

Fitting 3 folds for each of 98 candidates, totalling 294 fits
[CV] kmeans__n_clusters=2 ............................................
[CV] ............................. kmeans__n_clusters=2, total=   0.1s
[CV] kmeans__n_clusters=2 ............................................
[CV] ............................. kmeans__n_clusters=2, total=   0.1s
[CV] kmeans__n_clusters=2 ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] ............................. kmeans__n_clusters=2, total=   0.1s
[CV] kmeans__n_clusters=3 ............................................
[CV] ............................. kmeans__n_clusters=3, total=   0.1s
[CV] kmeans__n_clusters=3 ............................................
[CV] ............................. kmeans__n_clusters=3, total=   0.1s
[CV] kmeans__n_clusters=3 ............................................
[CV] ............................. kmeans__n_clusters=3, total=   0.1s
[CV] kmeans__n_clusters=4 ............................................
[CV] ............................. kmeans__n_clusters=4, total=   0.1s
[CV] kmeans__n_clusters=4 ............................................
[CV] ............................. kmeans__n_clusters=4, total=   0.1s
[CV] kmeans__n_clusters=4 ............................................
[CV] ............................. kmeans__n_clusters=4, total=   0.1s
[CV] kmeans__n_clusters=5 ............................................
[CV] .

[CV] ............................ kmeans__n_clusters=22, total=   0.2s
[CV] kmeans__n_clusters=22 ...........................................
[CV] ............................ kmeans__n_clusters=22, total=   0.2s
[CV] kmeans__n_clusters=23 ...........................................
[CV] ............................ kmeans__n_clusters=23, total=   0.2s
[CV] kmeans__n_clusters=23 ...........................................
[CV] ............................ kmeans__n_clusters=23, total=   0.2s
[CV] kmeans__n_clusters=23 ...........................................
[CV] ............................ kmeans__n_clusters=23, total=   0.2s
[CV] kmeans__n_clusters=24 ...........................................
[CV] ............................ kmeans__n_clusters=24, total=   0.2s
[CV] kmeans__n_clusters=24 ...........................................
[CV] ............................ kmeans__n_clusters=24, total=   0.2s
[CV] kmeans__n_clusters=24 ...........................................
[CV] .

[CV] ............................ kmeans__n_clusters=41, total=   0.2s
[CV] kmeans__n_clusters=42 ...........................................
[CV] ............................ kmeans__n_clusters=42, total=   0.3s
[CV] kmeans__n_clusters=42 ...........................................
[CV] ............................ kmeans__n_clusters=42, total=   0.3s
[CV] kmeans__n_clusters=42 ...........................................
[CV] ............................ kmeans__n_clusters=42, total=   0.3s
[CV] kmeans__n_clusters=43 ...........................................
[CV] ............................ kmeans__n_clusters=43, total=   0.3s
[CV] kmeans__n_clusters=43 ...........................................
[CV] ............................ kmeans__n_clusters=43, total=   0.3s
[CV] kmeans__n_clusters=43 ...........................................
[CV] ............................ kmeans__n_clusters=43, total=   0.3s
[CV] kmeans__n_clusters=44 ...........................................
[CV] .

[CV] ............................ kmeans__n_clusters=61, total=   0.3s
[CV] kmeans__n_clusters=61 ...........................................
[CV] ............................ kmeans__n_clusters=61, total=   0.3s
[CV] kmeans__n_clusters=61 ...........................................
[CV] ............................ kmeans__n_clusters=61, total=   0.3s
[CV] kmeans__n_clusters=62 ...........................................
[CV] ............................ kmeans__n_clusters=62, total=   0.3s
[CV] kmeans__n_clusters=62 ...........................................
[CV] ............................ kmeans__n_clusters=62, total=   0.3s
[CV] kmeans__n_clusters=62 ...........................................
[CV] ............................ kmeans__n_clusters=62, total=   0.3s
[CV] kmeans__n_clusters=63 ...........................................
[CV] ............................ kmeans__n_clusters=63, total=   0.3s
[CV] kmeans__n_clusters=63 ...........................................
[CV] .

[CV] ............................ kmeans__n_clusters=80, total=   0.4s
[CV] kmeans__n_clusters=80 ...........................................
[CV] ............................ kmeans__n_clusters=80, total=   0.4s
[CV] kmeans__n_clusters=81 ...........................................
[CV] ............................ kmeans__n_clusters=81, total=   0.4s
[CV] kmeans__n_clusters=81 ...........................................
[CV] ............................ kmeans__n_clusters=81, total=   0.4s
[CV] kmeans__n_clusters=81 ...........................................
[CV] ............................ kmeans__n_clusters=81, total=   0.4s
[CV] kmeans__n_clusters=82 ...........................................
[CV] ............................ kmeans__n_clusters=82, total=   0.4s
[CV] kmeans__n_clusters=82 ...........................................
[CV] ............................ kmeans__n_clusters=82, total=   0.4s
[CV] kmeans__n_clusters=82 ...........................................
[CV] .

[CV] ............................ kmeans__n_clusters=99, total=   0.5s


[Parallel(n_jobs=1)]: Done 294 out of 294 | elapsed:  1.4min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('kmeans', KMeans(n_clusters=50)),
                                       ('log_reg', LogisticRegression())]),
             param_grid={'kmeans__n_clusters': range(2, 100)}, verbose=2)

In [27]:
grid_clf.best_params_

{'kmeans__n_clusters': 51}

In [28]:
grid_clf.score(X_test, y_test)
#哈哈哈搜索了best parameter以后确实高了

0.9511111111111111

## Using Clustering for Semi-Supervised Learning 

In [29]:
import numpy as np

In [30]:
n_labeled = 50
log_reg = LogisticRegression()
log_reg.fit(X_train[:n_labeled], y_train[:n_labeled])

LogisticRegression()

In [31]:
log_reg.score(X_test, y_test)

0.8044444444444444

In [32]:
k=50
kmeans = KMeans(n_clusters=k)
X_digits_dist = kmeans.fit_transform(X_train)

In [33]:
representative_digit_idx = np.argmin(X_digits_dist, axis = 0)
X_representative_digits = X_train[representative_digit_idx]

In [34]:
#有了representative的digit以后，手动label他们（因为你现在是semi supervised）
#之后就相当于你的dataset离supervised learning又近了一步，所以准确率会提高很多

## DBSCAN

In [35]:
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons

In [36]:
X, y = make_moons(n_samples = 1000, noise=0.05)
dbscan = DBSCAN(eps=0.05, min_samples=5)
dbscan.fit(X)

DBSCAN(eps=0.05)

In [41]:
#dbscan.labels_
# Get the label of each instance

In [42]:
len(dbscan.core_sample_indices_)
#Get the number of core instances

785