## Unsupervised learning

### Preparing data

In [2]:
from sklearn.datasets import fetch_openml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Fetch the OpenML "mnist_784" dataset as NumPy arrays rather than a DataFrame.
mnist = fetch_openml('mnist_784', version=1, as_frame=False, parser='auto')

# Store the targets back on the bunch as unsigned 8-bit integers.
mnist["target"] = mnist["target"].astype(np.uint8)
X, y = mnist["data"], mnist["target"]

# Quick look at the feature matrix and the label vector.
print(X, y, sep="\n")

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[5 0 4 ... 4 5 6]


# K-Means

### Silhouette score (wskaznik sylwetkowy)
- mierzy, jak dobrze dany punkt pasuje do swojego klastra:  
  $s = \frac{b - a}{\max(a, b)}$, wartosc z przedzialu $[-1, 1]$
- im blizej 1, tym lepiej; im blizej -1, tym gorzej
- $a$ - srednia odleglosc do wszystkich punktow W TYM SAMYM klastrze (im mniej, tym lepiej)
- $b$ - srednia odleglosc do wszystkich punktow W NAJBLIZSZYM SASIEDNIM klastrze (im wiecej, tym lepiej)


In [6]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Candidate cluster counts and the silhouette score obtained for each one.
k_number = [8, 9, 10, 11, 12]
silh_list = []

for k in k_number:
    # Fit K-Means on the whole dataset (10 restarts, fixed seed) and take
    # the cluster label assigned to every sample.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    y_pred = kmeans.fit(X).labels_

    # Mean silhouette coefficient over all samples for this clustering.
    silhouette_sc = silhouette_score(X, y_pred)
    silh_list.append(silhouette_sc)
    print("Silhouette score for k = ", k, " : ", silhouette_sc)

Silhouette score for k =  8  :  0.07337977998298922
Silhouette score for k =  9  :  0.05681625379289227
Silhouette score for k =  10  :  0.0586915389505002
Silhouette score for k =  11  :  0.05835878745275728
Silhouette score for k =  12  :  0.05817356340885259


### Mimo ze wiadomo, ze cyfr (klas) jest 10, KMeans nie dal najlepszego silhouette score dla k=10 - nieco lepszy (choc tez bardzo maly) wynik wyszedl dla k=8

In [8]:
import pickle

# Show the collected silhouette scores, then persist them for later use.
print(silh_list)

with open('kmeans_sil.pkl', 'wb') as out_file:
    pickle.dump(silh_list, out_file)

[np.float64(0.07337977998298922), np.float64(0.05681625379289227), np.float64(0.0586915389505002), np.float64(0.05835878745275728), np.float64(0.05817356340885259)]


In [20]:
from sklearn.metrics import confusion_matrix

# Fit K-Means with k=10 and keep the labels returned by fit_predict instead
# of discarding them and running a second full pass with predict(X).
kmeans10 = KMeans(n_init=10, n_clusters=10, random_state=42)
y_pred10 = kmeans10.fit_predict(X)

# Rows are the true digits (y); columns are the K-Means cluster indices.
conf_matrix = confusion_matrix(y, y_pred10)

In [19]:
# For every row of the confusion matrix take the column index holding the
# largest count, then keep each index once. np.unique already returns its
# result sorted in ascending order, so no separate sort is needed.
max_ind_list = np.unique(np.argmax(conf_matrix, axis=1))

print(max_ind_list)


[0 1 2 3 5 6 8 9]


Lista `[0 1 2 3 5 6 8 9]` pokazuje, ze klastry K-Means o indeksach 4 i 7 nie byly glownymi reprezentantami zadnej z 10 prawdziwych cyfr.


In [21]:
# Persist the sorted, de-duplicated argmax indices.
with open('kmeans_argmax.pkl', 'wb') as out_file:
    pickle.dump(max_ind_list, out_file)

# DBSCAN

In [4]:
from sklearn.cluster import DBSCAN

# Collect the Euclidean distances from each of the first `n_subset` samples
# to every sample in X, then sort them in ascending order.
#
# NOTE(review): a pair whose two members both lie inside the first
# `n_subset` samples is visited twice (once from each side), so its distance
# appears twice in the list. This matches the original nested loop.
# NOTE(review): assumes X has a signed-integer or float dtype; with an
# unsigned dtype the subtraction below would wrap around - TODO confirm.
n_subset = 300
all_distances = []

for i in range(n_subset):
    # One vectorised pass per reference sample replaces the per-pair Python
    # loop: row j of (X - X[i]) is the difference vector to sample j.
    dists = np.linalg.norm(X - X[i], axis=1)

    # Skip (near-)zero distances, i.e. the sample compared with itself and
    # any exact duplicates of it.
    keep = ~(dists < 1e-11)
    all_distances.extend(dists[keep])

all_distances.sort()

In [None]:
# Take the first ten entries of the distance list, show them and persist them.
all_distances_10 = all_distances[0:10]
print(all_distances_10)

with open('dist.pkl', 'wb') as out_file:
    pickle.dump(all_distances_10, out_file)

[np.float64(279.26152617215286), np.float64(304.37641170103836), np.float64(317.5893575043093), np.float64(328.7658741414626), np.float64(333.4546445920344), np.float64(352.89800226127664), np.float64(355.1774204534967), np.float64(358.07401469528617), np.float64(359.64287842247063), np.float64(360.42474942767177)]


In [7]:
# s is the arithmetic mean of the first three stored distances.
smallest_three = [all_distances_10[idx] for idx in range(3)]
s = (smallest_three[0] + smallest_three[1] + smallest_three[2]) / 3.0
print(s)


300.40909845916684


In [15]:
# Candidate eps values: start at s and advance in steps of 4% of s for as
# long as the value does not exceed s + 10% of s. A manual loop is used
# because the built-in range() only accepts integer arguments.
eps_list = []

step = 0.04 * s
stop = s + 0.1 * s

candidate = s
while candidate <= stop:
    eps_list.append(candidate)
    candidate += step

print(eps_list)

[np.float64(300.40909845916684), np.float64(312.4254623975335), np.float64(324.4418263359002)]


In [16]:
# Run DBSCAN once per candidate eps and record how many distinct labels
# each run produces.
# NOTE: this counts every distinct value in labels_, so DBSCAN's noise
# label (-1) is included in the count whenever noise points are present.
label_count_list = []

for eps in eps_list:
    db_scan = DBSCAN(eps=eps, n_jobs=-1).fit(X)
    n_labels = len(np.unique(db_scan.labels_))
    label_count_list.append(n_labels)
    print("For eps=", eps, " label count : ", n_labels)
    print()



For eps= 300.40909845916684  label count :  4

For eps= 312.4254623975335  label count :  7

For eps= 324.4418263359002  label count :  22



In [17]:
# Persist the per-eps label counts.
with open('dbscan_len.pkl', 'wb') as out_file:
    pickle.dump(label_count_list, out_file)