<a href="https://colab.research.google.com/github/wallisonferreira/machine-learning-pipelines/blob/main/K_means_cluster_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KMeans Cluster Pipeline

---
Clustering of books based on their text content

## Import libraries

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import sklearn.metrics.cluster as metrics
import pickle

## Clustering pipeline

---
1. Import document embeddings
2. Run KMeans clustering
3. Apply PCA
4. Calculate the required metrics on PCA and non-PCA data
5. Return the results
- nobs_100,
- nobs_10,
- pca_explained,
- cs_100,
- cs_10,
- vms_100,
- vms_10

In [None]:
# Local import so this cell can be run on its own.
from collections import Counter

# Import document embeddings.
# NOTE: unpickling can execute arbitrary code -- only load a "documents.p"
# that comes from a trusted source.
# The `with` block guarantees the file handle is closed after loading.
with open("documents.p", "rb") as embeddings_file:
    data = pickle.load(embeddings_file)

# Run KMeans on the original vectors.
model_kmeans = KMeans(n_clusters=10, random_state=2, tol=0.05, max_iter=50)
clusters_data = model_kmeans.fit_predict(data['vectors'])

# Apply PCA to the data vectors, keeping 10 components.
pca_algo = PCA(n_components=10, random_state=2)
data_pca = pca_algo.fit_transform(data['vectors'])

# Run KMeans (same settings) on the PCA-reduced data.
model_kmeans_pca = KMeans(n_clusters=10, random_state=2, tol=0.05, max_iter=50)
clusters_data_pca = model_kmeans_pca.fit_predict(data_pca)

# Calculate the required metrics.

# Number of observations in each cluster (original data).
# A single Counter pass replaces one full scan of the labels per cluster;
# clusters that received no points still report 0.
label_counts = Counter(clusters_data.tolist())
nobs_100 = [label_counts[i] for i in range(10)]

# Number of observations in each cluster (PCA data).
label_counts_pca = Counter(clusters_data_pca.tolist())
nobs_10 = [label_counts_pca[i] for i in range(10)]

# Cumulative share of variance explained by the 10 fitted principal
# components (sum of the per-component ratios).
pca_explained = sum(pca_algo.explained_variance_ratio_)

# Completeness score against the true groups (original data).
cs_100 = metrics.completeness_score(data['group'], clusters_data)

# Completeness score against the true groups (PCA-transformed data).
cs_10 = metrics.completeness_score(data['group'], clusters_data_pca)

# V-measure score against the true groups (original data).
vms_100 = metrics.v_measure_score(data['group'], clusters_data)

# V-measure score against the true groups (PCA-transformed data).
vms_10 = metrics.v_measure_score(data['group'], clusters_data_pca)

# Report the results as a dictionary.
print({
    "nobs_100": nobs_100,
    "nobs_10": nobs_10,
    "pca_explained": pca_explained,
    "cs_100": cs_100,
    "cs_10": cs_10,
    "vms_100": vms_100,
    "vms_10": vms_10,
})



{'nobs_100': [5, 1450, 1500, 1394, 969, 1672, 1398, 587, 1201, 1138], 'nobs_10': [568, 1251, 2222, 620, 1140, 1105, 744, 1179, 1308, 1177], 'pca_explained': 0.15671531856060028, 'cs_100': 0.6402464914637147, 'cs_10': 0.4897132047941561, 'vms_100': 0.6450009670318155, 'vms_10': 0.5004541694561083}


In [None]:
# Absolute variance (not the ratio) captured by the first principal component.
pca_algo.explained_variance_[0]

0.70997953

In [None]:
# Fraction of the total variance explained by each fitted principal component.
pca_algo.explained_variance_ratio_

array([0.0270308 , 0.01800703, 0.01658055, 0.01485487, 0.01414905,
       0.01401547, 0.01336578, 0.01327023, 0.01292253, 0.01251902],
      dtype=float32)

Solution

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import sklearn.metrics.cluster as metrics
import pickle

# Load the pickled document data from "documents.p" in the working directory.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
# The `with` block guarantees the file handle is closed after loading.
with open("documents.p", "rb") as documents_file:
    data = pickle.load(documents_file)

# `data` looks like: {'id': range(0, 11314), 'group': [2, 3, 3, 3, 1,...

# Keys -- id; vectors: vectors with size 100; group: document groups

# cluster_articles performs clustering on `data` and
# returns a dict containing information about the clustered data

def _cluster_sizes(labels, n_clusters):
    """Return the number of labels assigned to each cluster id 0..n_clusters-1."""
    from collections import Counter  # local import keeps the block self-contained

    # One pass over the labels instead of one full scan per cluster.
    # Cluster ids that never occur are reported as 0.
    counts = Counter(int(label) for label in labels)
    return [counts[cluster_id] for cluster_id in range(n_clusters)]


def cluster_articles(data, *, n_clusters=10, n_components=10, random_state=2):
    """Cluster document vectors with KMeans, with and without PCA reduction.

    KMeans is fitted once on ``data['vectors']`` and once on the PCA
    projection of the same vectors; both labelings are then compared with
    the true labels in ``data['group']``.

    Args:
        data: Mapping with at least the keys ``'vectors'`` (the document
            vectors to cluster) and ``'group'`` (the true group of each
            document).
        n_clusters: Number of KMeans clusters used for both fits.
        n_components: Number of principal components kept by PCA.
        random_state: Seed passed to KMeans and PCA.

    Returns:
        A dict with the keys below. The ``_100`` / ``_10`` suffixes are fixed
        names; they do not change when ``n_components`` does.

        * ``nobs_100``: observations per cluster on the original vectors.
        * ``nobs_10``: observations per cluster on the PCA-reduced vectors.
        * ``pca_explained``: variance ratio explained by the first
          principal component.
        * ``cs_100`` / ``cs_10``: completeness score of the original / PCA
          clustering against ``data['group']``.
        * ``vms_100`` / ``vms_10``: V-measure score of the original / PCA
          clustering against ``data['group']``.

    Side effects:
        Rebinds the module-level name ``nobs`` to the ``nobs_100`` list.
    """
    vectors = data['vectors']
    true_groups = data['group']

    # Run KMeans on the original vectors.
    model_kmeans = KMeans(
        n_clusters=n_clusters, random_state=random_state, tol=0.05, max_iter=50
    )
    clusters_data = model_kmeans.fit_predict(vectors)

    # Apply PCA to the data vectors.
    pca_algo = PCA(n_components=n_components, random_state=random_state)
    data_pca = pca_algo.fit_transform(vectors)

    # Run KMeans with the same settings on the PCA-reduced data.
    model_kmeans_pca = KMeans(
        n_clusters=n_clusters, random_state=random_state, tol=0.05, max_iter=50
    )
    clusters_data_pca = model_kmeans_pca.fit_predict(data_pca)

    # nobs_100: number of observations in each cluster.
    nobs_100 = _cluster_sizes(clusters_data, n_clusters)

    # NOTE(review): kept only for backward compatibility -- this exposes the
    # counts as a module-level `nobs`. It looks like a debugging leftover;
    # confirm nothing reads it and then remove it.
    global nobs
    nobs = nobs_100

    # nobs_10: number of observations in each cluster after applying PCA.
    nobs_10 = _cluster_sizes(clusters_data_pca, n_clusters)

    # pca_explained: variance ratio explained by the first principal component.
    pca_explained = pca_algo.explained_variance_ratio_[0]

    # cs_100: completeness of the cluster labelling given the true values
    # from data['group'].
    cs_100 = metrics.completeness_score(true_groups, clusters_data)

    # cs_10: same completeness metric after dimensionality reduction.
    cs_10 = metrics.completeness_score(true_groups, clusters_data_pca)

    # vms_100: V-measure of the cluster labelling given the true values
    # from data['group'].
    vms_100 = metrics.v_measure_score(true_groups, clusters_data)

    # vms_10: same V-measure for the clustering of the PCA-reduced data.
    vms_10 = metrics.v_measure_score(true_groups, clusters_data_pca)

    return {
        'nobs_100': nobs_100,
        'nobs_10': nobs_10,
        'pca_explained': pca_explained,
        'cs_100': cs_100,
        'cs_10': cs_10,
        'vms_100': vms_100,
        'vms_10': vms_10,
    }

# Run the pipeline on the loaded data; as the cell's last expression, the
# returned dict is displayed as the cell output.
cluster_articles(data)


{'nobs_100': [5, 1450, 1500, 1394, 969, 1672, 1398, 587, 1201, 1138],
 'nobs_10': [568, 1251, 2222, 620, 1140, 1105, 744, 1179, 1308, 1177],
 'pca_explained': 0.027030801,
 'cs_100': 0.6402464914637147,
 'cs_10': 0.4897132047941561,
 'vms_100': 0.6450009670318155,
 'vms_10': 0.5004541694561083}