First we are going to use their funciton to read in the data

In [1]:
import numpy as np
from scipy.spatial import distance
import random

In [2]:
# Copy the functions given by the PSET to save time and practice building upon functions written by other people
def read_data(infile):
    '''
    NOT MY FUNCTION
    read_data(infile)
    Read Lestrade's input file, w05-data.tbl, or a file in that format.
    Return:
       ctype[0..N-1] : cell types 0..Q-1 for each cell i
       data[i,g]     : array of count data; rows = cells i; cols = genes g
       N             : number of cells (rows of the data file)
       G             : number of genes (cols of the data file, after the first)
       Q             : number of cell types
    '''
    ctype = []
    data  = []
    with open(infile) as f:
        for line in f:
            if line[0] == '#': continue   # skip comment lines
            fields = line.split()
            ctype.append(int(fields[1]))
            data.append( [int(fields[2]), int(fields[3])])  # assumes exactly 2 genes!!
    ctype = np.array(ctype)
    data  = np.array(data)
    N, G  = np.shape(data)
    Q     = np.max(ctype) + 1
    return ctype, data, N, G, Q

def initialize_at_true():
    '''
    NOT MY FUNCTION
    initialize_at_true():
    Returns the true mu centroids, and the true proportions;
    don't say I never gave you anything.
       mu[q,g]  : array of means for mixture q, gene g
       qp[q]    : mixture coefficient for mixture q
    '''
    qp = np.array([ 0.1, 0.2, 0.4, 0.2, 0.1 ])
    mu = np.array([[   30., 2000. ],
                   [ 2000., 2000. ],
                   [  300.,  300. ],
                   [   30.,   30. ],
                   [ 2000,    30. ]])
    return mu, qp


def visualize_data(data, mu, C, outpng):
    '''
    NOT MY FUNCTION
    visualize_data():

    This might give you a starting point that saves some matplotlib
    machinations; you can certainly spiff this up from here.

    Input:
       data[i,g] : count data for each cell i, for each gene g
       mu[q,g]   : array of mean counts for mixture q, gene g
       C[i]      : assignment of cell i to a cluster 0..Q-1
       outpng    : save figure to PNG file (must end in .png; example 'foo.png')

    '''
    N, G  = np.shape(data)
    Q, G2 = np.shape(mu)
    assert G == G2
    assert len(C) == N

    # We can assign colors to up to Q=10 components. If you want more, add more.
    colormap = ['xkcd:orange', 'xkcd:olive',     'xkcd:azure',    'xkcd:rose', 'xkcd:mustard', 
                'xkcd:peach',  'xkcd:turquoise', 'xkcd:lavender', 'xkcd:rust', 'xkcd:red']

    fig, ax = plt.subplots()
    for i in range(N):
        edgecolor = colormap[ C[i]]
        fillcolor = 'w'
        shape     = 'o'
        ax.loglog( data[i,0], data[i,1], marker=shape, mec=edgecolor, mfc=fillcolor, mew=1.5)

    for q in range(Q):
        ax.loglog(mu[q,0], mu[q,1], '*k', ms=10)

    ax.set_xlabel('caraway (counts)')
    ax.set_ylabel('kiwi (counts)')

    fig.savefig(outpng)

In [3]:
test = read_data("w05-data.tbl")[1]

In [71]:
def hard_k_means_cluster(k, data, centroid_choice, n):
    """Function to impliment k-means clustering with k clusters on a 2d numpy array of data, formatted such that 
    each row is a single c-dimensional data point, where z is the number of columns"""
    num_rows = data.shape[0]
    num_cols = data.shape[1]
    
    # Initiate centroids in the way specified by the user
    if centroid_choice == "random":
        min_max_list = []
        centroids = np.ones((k, num_cols))
        # Get the min and max of data dimension
        for c in range(num_cols):
            min_max_list.append((data[:,c].min(), data[:,c].max()))
            
        # Create k random centroids with dimension values within the range of each data dimension
        for r in range(k):
            for c in range(num_cols):
                centroids[r,c] = random.uniform(min_max_list[c][0],min_max_list[c][1])   
    
    # Update the clusters n times
    new_centroids = centroids
    for iteration in range(n):
        ### Update the clusters###
        
        # Initiate a an array where each index holds the new cluster identity of the data at the correspoinding row
        # index
        new_clusters = np.ones(num_rows, dtype=int)
        # Iterate over each row of the data
        for r in range(num_rows):
            # Get the identity closest centroid to data vector r
            closest_centroid = np.argmax([distance.euclidean(i,data[r,:]) for i in centroids], )
            # Add the centroid indentity to the array new_clusters
            new_clusters[r] = closest_centroid
            
        ### Update the centroid values###
        
        clustered_data = [[] for i in range(k)]
        # Create a list containing a list of cooredinates for the kh
        for idx in range(num_rows):
            clustered_data[new_clusters[idx]].append(data[idx,:])        
        
        for idx, cluster in enumerate(clustered_data):
            if len(cluster) == 0: continue
            else: 
                centroids[idx,:] = np.mean(cluster, axis=0)
            
    return clustered_data, centroids
    
test_2, test_3 = hard_k_means_cluster(5, test, "random", 100)    

In [72]:
print(test_3)

[[2379.5        2182.89411765]
 [2959.16326531 2625.51020408]
 [ 346.99640719  643.01077844]
 [1471.16666667   43.58333333]
 [3040.625       470.7625    ]]


In [73]:
test_2 

[[],
 [array([2262, 2758]),
  array([2096, 3277]),
  array([2327, 1989]),
  array([3211, 5266]),
  array([2384, 2325]),
  array([2450, 3110]),
  array([2862, 2482]),
  array([2613, 2868]),
  array([2765, 2708]),
  array([2646, 2353]),
  array([2372, 2159]),
  array([2347, 1529]),
  array([2005, 4599]),
  array([2942, 1953]),
  array([2143, 3088]),
  array([2624, 3857]),
  array([4577, 3643]),
  array([6139, 2371]),
  array([2954, 1954]),
  array([3126, 2383]),
  array([2251, 1582]),
  array([3125, 2452]),
  array([2546, 1674]),
  array([2272, 2043]),
  array([3012, 3878]),
  array([4066, 2230]),
  array([3408, 1786]),
  array([3561, 2287]),
  array([2420, 2438]),
  array([3654, 2745]),
  array([3664, 2082]),
  array([2483, 1938]),
  array([3325, 2763]),
  array([2160, 2746]),
  array([4863, 1906]),
  array([3379, 2128]),
  array([2492, 3857]),
  array([5204, 1859]),
  array([2303, 1862]),
  array([2595, 2864]),
  array([2554, 4099]),
  array([4270, 3904]),
  array([2565, 2414]),
  arra

In [None]:
 new_cluster_idxs = [np.where(new_clusters == cluster) for cluster in range(k)]
        # Update the new centroids
        # Initiate an array to hold the new 
#         new_centroids = np.ones(k, num_cols)
                