In [122]:

import os

%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA

import altair as alt

In [143]:
def show_clusters(data, clusters, centroids=None):
    """
    Reduce a data set to 2 dimensions using principal component analysis
    (PCA) and draw a scatter plot of the points coloured by cluster.

    Parameters
    ----------
    data : DataFrame or 2d array
        Scaled data, one row per observation.

    clusters : list, pandas Series or 1d array
        Cluster label for each row of ``data``. Labels are matched to rows
        by position; a pandas index on the input is ignored.

    centroids : DataFrame or 2d array, optional
        Coordinates of the cluster centroids, one row per cluster. When the
        number of columns equals the number of features in ``data`` the
        centroids are projected with the same PCA that was fitted on the
        data. Otherwise they must have exactly 2 columns and are used
        as-is as (pca1, pca2) coordinates.

    Returns
    -------
    plot
        A 2d principal components Altair scatter plot coloured by cluster.
        When ``centroids`` is given, the result is the point chart layered
        with a second chart that draws the centroids with larger markers.

    Raises
    ------
    ValueError
        If ``data``, ``clusters`` or ``centroids`` cannot be converted to
        a pandas object, if ``data`` and ``clusters`` differ in length, or
        if ``centroids`` has neither 2 columns nor as many columns as
        ``data``.

    Examples
    --------
    >>> from sklearn.datasets import make_blobs
    >>> X, y = make_blobs(n_samples=10, centers=3, n_features=3, random_state=10)
    >>> plot = show_clusters(X, y)
    >>> plot = show_clusters(X, y, centroids=[[-10, 0], [3, 3], [10, -3]])
    """
    # ---- validate every input up front so bad arguments fail before the PCA fit
    try:
        data = pd.DataFrame(data)
    except ValueError as err:
        raise ValueError("data should be a pandas dataframe or a numpy2darray.") from err

    try:
        clusters = pd.Series(clusters)
    except ValueError as err:
        raise ValueError("clusters should be a list of numbers, a pandas series or a numpy1darray.") from err

    if data.shape[0] != clusters.shape[0]:
        raise ValueError("data should have the same number of rows as clusters")

    # `is not None` rather than `!= None`: the latter is element-wise on arrays.
    if centroids is not None:
        try:
            centroids = pd.DataFrame(centroids)
        except ValueError as err:
            raise ValueError("centroids should be a pandas dataframe or a numpy2darray.") from err
        if centroids.shape[1] != data.shape[1] and centroids.shape[1] != 2:
            raise ValueError(
                "centroids should have either 2 columns or the same number of columns as data"
            )

    # ---- project the observations onto the first two principal components
    # Plain arrays are used for fitting and transforming so that differing
    # column labels on `data` and `centroids` cannot interfere.
    pca = PCA(n_components=2)
    principal_comp = pca.fit_transform(data.values)
    pca_df = pd.DataFrame(data=principal_comp, columns=["pca1", "pca2"])
    # Assign by position (.values): assigning a Series would align on its
    # index and silently produce NaN labels for a non-default index.
    pca_df["cluster"] = clusters.apply(str).values

    plot = alt.Chart(pca_df).mark_point(size=20).encode(
        alt.X('pca1'),
        alt.Y('pca2'),
        color='cluster')

    if centroids is None:
        return plot

    # ---- overlay the centroids
    if centroids.shape[1] == data.shape[1]:
        # Centroids live in the original feature space: project them with
        # the PCA fitted above so they land among their clusters.
        centroid_coords = pca.transform(centroids.values)
    else:
        # Exactly two columns (checked above): already plot coordinates.
        centroid_coords = centroids.values

    centroids_df = pd.DataFrame(data=centroid_coords, columns=["pca1", "pca2"])
    # Centroid i is labelled "i", i.e. rows are assumed to be ordered by cluster number.
    centroids_df["cluster"] = [str(i) for i in range(centroids_df.shape[0])]

    plot_centroid = alt.Chart(centroids_df).mark_point(size=100).encode(
        alt.X('pca1'),
        alt.Y('pca2'),
        color='cluster')

    return plot + plot_centroid

In [144]:
# Display the sample feature matrix.
# NOTE(review): in file order X is not assigned until the make_blobs call in the
# following cell, so this cell would fail on a fresh Restart & Run All.
X

array([[  5.47978152,  -8.09438738,   3.42933163],
       [  7.21011381,  -9.93526223,   1.68331509],
       [ -6.70504468,   4.86660241,  -5.81073135],
       [ -3.89693732,   4.71499606,  -6.09214584],
       [  5.89605379,  -9.8189236 ,   3.02662264],
       [  6.23776355,   0.38779993,  -4.6101094 ],
       [  5.68880169, -10.16769266,   4.04138663],
       [ -5.99138753,   4.60218872,  -6.8863314 ],
       [  4.34182348,   1.86890246,  -3.5075443 ],
       [  3.85740612,   2.15046855,  -5.31360021]])

In [145]:
# Build a small seeded sample: 10 points, 3 features, 3 blob centres.
X, y = make_blobs(n_samples=10, centers=3, n_features=3, random_state=10)
# Three centroids with 2 columns each, i.e. given directly as (pca1, pca2) plot coordinates.
centroids = np.array([[-10,0], [3,3], [10,-3]])    
c= show_clusters(X, y, centroids)
# Last expression of the cell: renders the returned chart.
c

[alt.Chart(...), alt.Chart(...)]

In [103]:
# NOTE(review): neither `a` nor `b` is assigned at top level in the visible cells;
# this looks like leftover scratch work -- confirm and remove.
a+b

In [28]:
def test_show_clusters():
    """Check that show_clusters builds a pca1/pca2 point chart, with and without centroids."""
    X, y = make_blobs(n_samples=10, centers=3, n_features=3, random_state=10)
    a = show_clusters(X, y)

    assert a.encoding.x.shorthand == 'pca1', a.encoding.x.shorthand
    assert a.encoding.y.shorthand == 'pca2', a.encoding.y.shorthand

    # Altair stores the mark as a bare string when mark_point() gets no
    # options, and as a MarkDef object when options such as size= are passed;
    # compare the mark type in either form.
    mark_type = a.mark if isinstance(a.mark, str) else a.mark.type
    assert mark_type == 'point', a.mark

    # With centroids the result is the point chart layered with the centroid chart.
    layered = show_clusters(X, y, [[-10, 0], [3, 3], [10, -3]])
    assert len(layered.layer) == 2, layered

# Run the test right away so a failing assertion surfaces in the notebook.
test_show_clusters()
# NOTE(review): the literal below looks like leftover scratch input -- confirm and remove.
21312

21312

In [17]:
# Inspect the x-axis encoding of a chart.
# NOTE(review): in the visible cells `a` is only assigned inside test_show_clusters,
# so it is not defined at top level on a fresh kernel -- confirm where it comes from.
a.encoding.x

X({
  field: 'pca1',
  type: 'quantitative'
})