In [155]:
import multiprocessing as mp
from functools import partial
import pandas as pd
import numpy as np
from scipy.spatial import distance as dist

In [169]:
def min_distance(d, centroids):
    """
    Squared Euclidean distance from point ``d`` to its closest centroid.

    ``centroids`` is indexed as a 2-D array with one centroid per row
    (the reduction runs over ``axis=1``); ``d`` is broadcast against
    those rows.  No square root is taken, so the value returned is the
    *squared* distance.
    """
    offsets = centroids - d
    squared_norms = np.square(offsets).sum(axis=1)
    return squared_norms.min()

def cost_p(data, centroids):
    """
    Sampling probability of every observation, proportional to its cost.

    Each row of ``data`` is handed to a worker process, which evaluates
    ``min_distance`` (squared distance to the nearest row of
    ``centroids``).  The per-row costs are then divided by their total.
    """
    nearest_cost = partial(min_distance, centroids=centroids)

    with mp.Pool(processes=mp.cpu_count()) as workers:
        costs = workers.map(nearest_cost, data)

    # Pool.map hands back a plain list; make it an array before normalising.
    costs = np.asarray(costs)
    return costs / np.sum(costs)


def random_choice(x, a, p):
    """
    Thin wrapper around ``np.random.choice`` that takes the size first.

    With the sample size ``x`` in first position, ``a`` and ``p`` can be
    bound through ``functools.partial`` and the remaining one-argument
    callable passed to ``map``.
    """
    # seed() without an argument re-seeds the global generator with fresh
    # entropy on every call -- presumably so that pool workers do not all
    # replay the same inherited random state.
    np.random.seed()
    drawn = np.random.choice(a, x, p=p)
    return drawn


def sample_p(data, distribution, l):

    """
    Draw ``l`` rows of ``data`` to serve as new centers.

    Row indices are sampled according to ``distribution`` (one entry per
    row index); every draw is a separate size-one ``random_choice`` task
    executed on the process pool.
    """

    draw_one = partial(random_choice, a=len(distribution), p=distribution)
    # One task per requested center, each asking for a sample of size 1.
    sizes = np.repeat(1, l)

    with mp.Pool(processes=mp.cpu_count()) as workers:
        picks = workers.map(draw_one, sizes)

    # ``picks`` is a list of length-1 index arrays, so the fancy-indexed
    # result carries a singleton middle axis that is squeezed away here.
    chosen_rows = data[picks, :]
    return np.squeeze(chosen_rows, axis=(1,))


def min_index_p(d, centroids):

    """
    Row index of the centroid closest to point ``d``.

    Closeness is measured by squared Euclidean distance; when several
    centroids tie, the lowest index is returned (``argmin`` semantics).
    """

    deltas = centroids - d
    squared_norms = np.square(deltas).sum(axis=1)
    return squared_norms.argmin()

def get_weight_p(data, centroids):

    '''
        Function to return the weight of each centroid, i.e. the fraction
        of observations whose nearest centroid it is.
        Input: data, an array of data (one observation per row).
               centroids, an array of candidate centroids (one per row).
        Output: an array with one weight per row of centroids; the weights
                are the assignment counts divided by their total.
    '''

    partial_min = partial(min_index_p, centroids = centroids)
    with mp.Pool(processes = mp.cpu_count()) as pool:
        # Index of the nearest centroid for every observation.
        min_index = pool.map(partial_min, data)

    # Tally all assignments in a single pass. minlength keeps a zero entry
    # for centroids that attract no observation, so the result always has
    # one slot per centroid.
    count = np.bincount(np.asarray(min_index, dtype=np.intp),
                        minlength=centroids.shape[0])
    return count/np.sum(count)



def cdist_kmeans_pp(data, k, weights):
    '''
        Weighted k-means++ style seeding.
        Input: data, an array of points (one per row). k, the number of
               centers to pick. weights, one multiplier per row of data,
               applied to that row's squared distance to its nearest
               already-chosen center.
        Output: an array with k rows, each one a row of data.
    '''
    n_points = data.shape[0]

    # The first center is drawn uniformly; weights only affect later draws.
    centers = data[np.random.choice(n_points, 1), :]

    for _ in range(k - 1):
        squared = np.square(dist.cdist(data, centers))
        weighted_cost = squared.min(axis=1) * weights
        prob = weighted_cost / weighted_cost.sum()
        pick = np.random.choice(n_points, 1, p=prob)
        centers = np.vstack([centers, data[pick, :]])

    return centers

def kmeans_II_p(data, k, l):
    '''
        k-means|| style initialisation built on the pool-based helpers.
        Input: data, an array of data (one observation per row).
               k, the number of clusters. l, oversampling factor
               (number of candidate centers added per round).
        Output: an array with k rows of initial cluster centers.
    '''

    # Start from a single uniformly chosen observation.
    seed_row = np.random.choice(range(data.shape[0]), 1)
    C = data[seed_row, :]

    # Cost of the data with respect to that single starting center.
    initial_cost = np.sum(np.min(dist.cdist(data, C) ** 2, axis=1))

    # The number of oversampling rounds is fixed up front from the log of
    # the initial cost.
    # NOTE(review): an initial cost of 0 makes np.log return -inf and the
    # int() conversion fail -- confirm such input cannot occur.
    n_rounds = int(round(np.log(initial_cost)))

    for _ in range(n_rounds):
        # Sampling distribution proportional to each point's current cost.
        p = cost_p(data, C)
        # Append l freshly sampled candidates to the running center set.
        C = np.concatenate((C, sample_p(data, p, l)), axis=0)

    # Reduce the candidate set to k centers, weighting every candidate by
    # the share of observations closest to it.
    weights = get_weight_p(data, C)
    return cdist_kmeans_pp(C, k, weights)

In [147]:
# Fetch the Spambase file from the UCI archive; header=None tells pandas not
# to treat the first line as column names.
spambase = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data", header=None)
# Plain NumPy view of the whole table (every column is kept).
# NOTE(review): the (20, 58) result further down suggests the class-label
# column is clustered along with the features -- confirm this is intended.
df = np.array(spambase)
# Arguments passed to kmeans_II_p: k = number of clusters,
# l = oversampling factor.
k = 20
l = 10

In [170]:
kmeans_II_p(df, k, l).shape

(20, 58)

In [29]:
 x = np.array([[[0], [1], [2]]])

In [31]:
np.squeeze(x)

array([0, 1, 2])

In [37]:
def randomSample(x, a, p):
    """
    Draw ``x`` values from ``a`` with probabilities ``p``.

    Same argument order as ``np.random.choice`` except that the sample
    size comes first.
    """
    # Re-seed the global generator with fresh entropy before every draw.
    np.random.seed()
    drawn = np.random.choice(a, x, p=p)
    return drawn


In [163]:
def sample_new_p(data, dist, l):
    """
    Draw ``l`` rows of ``data`` as new centers, using ``Pool.starmap``.

    Parameters
    ----------
    data : array with one observation per row.
    dist : sampling probabilities, one per row index of ``data``.
        (Inside this function the name hides the module-level ``dist``
        alias for ``scipy.spatial.distance``.)
    l : number of rows to draw.

    Returns
    -------
    Array holding the ``l`` sampled rows of ``data``.
    """
    # starmap unpacks every element of the iterable as the positional
    # arguments of ONE call, so each draw needs its own (size, a, p) tuple.
    tasks = [(1, len(dist), dist)] * l

    with mp.Pool(processes = mp.cpu_count()) as pool:
        index = pool.starmap(randomSample, tasks)

    # Each task returns a length-1 index array, so the fancy-indexed result
    # has a singleton middle axis that is squeezed away.
    return np.squeeze(data[index,:],axis=(1,))

In [76]:
 # Scratch check of Pool.starmap: each inner list is unpacked as the
 # positional arguments of np.random.choice, i.e. 3 draws from `a` and
 # 4 draws from `b`. Both `a` and `b` come from a cell that appears further
 # down in this file, so that cell has to be run first.
 with mp.Pool(processes = mp.cpu_count()) as pool:
        #partial_rc = partial(randomSample, a = len(distribution), p=distribution)
        index = pool.starmap(np.random.choice, [[a,3],[b,4]])

In [77]:
index

[array([2, 9, 1]), array([165, 140, 130, 196])]

In [66]:
# Sample populations for the starmap experiment: the integers 1..9 and
# 100..199 (np.arange excludes the stop value).
a = np.arange(1,10)
b = np.arange(100,200)

In [46]:
a.shape

(9,)

In [47]:
np.repeat(1/9,9)

array([0.11111111, 0.11111111, 0.11111111, 0.11111111, 0.11111111,
       0.11111111, 0.11111111, 0.11111111, 0.11111111])