In [1]:
import pandas as pd
import numpy as np
from line_profiler import LineProfiler

%load_ext Cython
%load_ext line_profiler

In [2]:
from sklearn.base import ClusterMixin, BaseEstimator
from scipy.spatial.distance import cdist

class KMeans(BaseEstimator, ClusterMixin): 
    
    def __init__(self, k=2, metric='euclidean', max_iter=1000, random_state=None,  eps = 1e-4):
        """
        Инициализация метода
        :k - количество кластеров
        :metric - функция расстояния между объектами
        :max_iter - максиальное количество итераций
        :random_state - seed для инициализации генератора случайных чисел
        """
        
        self.k = k
        self.metric = metric
        self.max_iter = max_iter
        self.random_state = random_state
        self.eps = eps
    
    def set_centroids(self, X, y=None):
        self.centroids = X[np.random.choice(X.shape[0], self.k, replace=False), :]
        self.centroids = np.array(self.centroids)
              
    def modify_centroids(self, X, y=None):
        for step in range (0, self.max_iter):
          
            # Все расстояния
            distances = cdist(self.centroids, X, self.metric)

            # Ближайшие центроиды к точкам
            self.labels = np.argmin(distances, axis=0)

            new_centroids = np.array([(np.mean(X[self.labels == i],axis=0)) for i in range(0,self.k)])
            if np.mean((new_centroids - self.centroids) ** 2) <= self.eps:
                break
            self.centroids = np.copy(new_centroids)
        
    def fit(self, X, y=None):
        """
        Процедура обучения k-means
        """
        
        # Инициализация генератора случайных чисел
        np.random.seed(self.random_state)
        
        # Массив с метками кластеров для каждого объекта из X
        self.labels = np.empty(X.shape[0], dtype=int)
        
        # Массив с центройдами кластеров
        self.centroids = np.empty((self.k, X.shape[1]))
        
        ## Your Code Here
        
        self.set_centroids(X)
        self.modify_centroids(X)
        
   
        return self

    def predict(self, X, y=None):
        """
        Процедура предсказания кластера
        
        Возвращает метку ближайшего кластера для каждого объекта
        """
        # Все расстояния
        distances = cdist(self.centroids, X, self.metric)

        # Ближайшие центроиды к точкам
        self.labels = np.argmin(distances, axis=0)
        return self.labels
    

      

In [3]:
df_sns = pd.read_csv('snsdata.csv', sep=',')
df_sns.head()
df_sns = df_sns.drop(['gradyear', 'gender', 'age', 'friends'], axis=1)
df_sns = df_sns.dropna()
X = df_sns.iloc[:, 0:].values
X_train = (X - X.mean(axis=0))/X.std(axis = 0)
np.where(np.isinf(X_train))

(array([], dtype=int64), array([], dtype=int64))

In [4]:
def profile_print(model_km, X):
    profiler = LineProfiler()
    profiler.add_function(model_km.set_centroids)
    profiler.add_function(model_km.modify_centroids)
    profiler.add_function(model_km.fit)
    profiler.runcall(model_km.fit, X)
    profiler.print_stats()

In [11]:
model_km = KMeans(k=9)
profile_print(model_km, X_train)

Timer unit: 1e-06 s

Total time: 0.001908 s
File: <ipython-input-2-cdbd03e42497>
Function: set_centroids at line 21

Line #      Hits         Time  Per Hit   % Time  Line Contents
    21                                               def set_centroids(self, X, y=None):
    22         1         1883   1883.0     98.7          self.centroids = X[np.random.choice(X.shape[0], self.k, replace=False), :]
    23         1           25     25.0      1.3          self.centroids = np.array(self.centroids)

Total time: 1.62989 s
File: <ipython-input-2-cdbd03e42497>
Function: modify_centroids at line 25

Line #      Hits         Time  Per Hit   % Time  Line Contents
    25                                               def modify_centroids(self, X, y=None):
    26        26          137      5.3      0.0          for step in range (0, self.max_iter):
    27                                                     
    28                                                       # Все расстояния
    29       

(Все вычисления через numpy; из циклов остался только цикл по итерациям и цикл по меткам кластеров, где нужно вычислять индексы в зависимости от значения переменной в цикле).

In [6]:
%%cython -a
import numpy as np
cimport numpy as np
from libc.math cimport sqrt
import cython

cdef class KMeansCython: 
    cdef public:
        int k, max_iter, random_state
        double eps
        np.int_t[:] labels;
        double [:,:] centroids
        double [:,:] distances
        
    def __cinit__(self, int k=2, int max_iter=1000, int random_state = 0, double eps = 1e-4):
        """
        Инициализация метода
        :k - количество кластеров
        :metric - функция расстояния между объектами
        :max_iter - максиальное количество итераций
        :random_state - seed для инициализации генератора случайных чисел
        """
        
        self.k = k
        self.max_iter = max_iter
        self.random_state = random_state
        self.eps = eps
    
    cdef set_centroids(self, X, y=None):
        self.centroids = X[np.random.choice(X.shape[0], self.k, replace=False), :]
        #print type(self.centroids)
        self.centroids = np.array(self.centroids)
        #print type(self.centroids)
              
    cdef modify_centroids(self, X, y=None):
       
        for step in range (0, self.max_iter):
          
            # Все расстояния
            distances = self.pairwise_distances(np.asarray(X), np.asarray(self.centroids))
            #print type(distances)
            # Ближайшие центроиды к точкам
            self.labels = np.asarray(np.argmin(distances, axis=1))
            #print type(self.labels)
            new_centroids = self.centroids_cython(np.asarray(X), np.asarray(self.labels), self.k)
            #print type(self.centroids)
            #print type(new_centroids)
            if np.mean((new_centroids - self.centroids) ** 2) <= self.eps:
                break
            self.centroids = np.copy(new_centroids)
        
    def fit(self, X, y=None):
        """
        Процедура обучения k-means
        """
        
        # Инициализация генератора случайных чисел
        np.random.seed(self.random_state)
        
        # Массив с метками кластеров для каждого объекта из X
        self.labels = np.empty(X.shape[0], dtype=int)
        
        # Массив с центройдами кластеров
        self.centroids = np.empty((self.k, X.shape[1]))
        
        ## Your Code Here
        
        self.set_centroids(X)
        self.modify_centroids(X)
        #print("modifying finished")
        self.labels = np.asarray(self.labels)
        self.centroids = np.asarray(self.centroids)
   
        return self

    def predict(self, X, y=None):
        """
        Процедура предсказания кластера
        
        Возвращает метку ближайшего кластера для каждого объекта
        """
        # Calculate distances
        distances = self.pairwise_distances(X, self.centroids)

        # Closest centoid to a point
        self.labels = np.argmin(distances, axis=1)
        return self.labels
    
    @cython.boundscheck(False)
    @cython.cdivision(True)
    cpdef pairwise_distances(self, np.ndarray[np.float64_t, ndim=2]  X, np.ndarray[np.float64_t, ndim=2] centroids):
        cdef int dimension, points_number, centroids_number;
        points_number = X.shape[0]
        centroids_number = centroids.shape[0]
        dimension = centroids.shape[1]
        cdef np.ndarray[np.float64_t, ndim=2]  distances = np.empty((points_number, centroids_number), dtype=np.float)
        cdef np.float64_t distance;
        cdef np.float64_t diff;
        for i in range(points_number):
            for j in range(centroids_number):
                distance = 0
                for step in range(dimension):
                    diff = X[i, step] - centroids[j, step]
                    distance = distance + diff * diff
                distances[i, j] = sqrt(distance)
        return np.asarray(distances)
    
    @cython.boundscheck(False)
    @cython.cdivision(True)
    cpdef centroids_cython(self, np.ndarray[np.float64_t, ndim=2]  X, np.ndarray[np.int_t, ndim=1]  labels, np.int_t clusters_number):
        cdef int points_number, dimension;
        points_number = labels.shape[0]
        dimension = X.shape[1]
    
        cdef np.int_t[:] cluster_counters = np.zeros((clusters_number), dtype=np.int)
        cdef np.ndarray[np.float64_t, ndim=2]  cluster_sums = np.zeros((clusters_number, dimension), dtype=np.float)

        for point in range(points_number):
            for cluster in range(clusters_number):
                if labels[point] == cluster:
                    cluster_counters[cluster] += 1;
                    for j in range(dimension):
                        cluster_sums[cluster, j] += X[point, j]
                  
        for cluster in range(clusters_number):
            for j in range(dimension):
                cluster_sums[cluster, j] /= cluster_counters[cluster]
        return np.asarray(cluster_sums)   

In [12]:
model_km_cython = KMeansCython(k=9)

In [13]:
%%timeit
model_km_cython.fit(np.asarray(X_train))

1 loop, best of 3: 1.9 s per loop


In [14]:
model_km.labels

array([7, 0, 7, ..., 7, 7, 7])

In [15]:
np.asarray(model_km_cython.labels)

array([8, 3, 8, ..., 8, 8, 8])

In [None]:
%%cython -a
import numpy as np
cimport numpy as np
from libc.math cimport sqrt
import cython

@cython.boundscheck(False)
@cython.cdivision(True)
cpdef pairwise_distances(np.ndarray[np.float64_t, ndim=2]  X, np.ndarray[np.float64_t, ndim=2] centroids):
    cdef int dimension, points_number, centroids_number;
    points_number = X.shape[0]
    centroids_number = centroids.shape[0]
    dimension = centroids.shape[1]
    cdef np.ndarray[np.float64_t, ndim=2]  distances = np.empty((points_number, centroids_number), dtype=np.float)
    cdef np.float64_t distance;
    cdef np.float64_t diff;
    for i in range(points_number):
        for j in range(centroids_number):
            distance = 0
            for step in range(dimension):
                diff = X[i, step] - centroids[j, step]
                distance = distance + diff * diff
            distances[i, j] = sqrt(distance)
    return distances

In [None]:
pairwise_distances(X_train, model_km.centroids)

In [None]:
%%cython -a
import numpy as np
cimport numpy as np
from libc.math cimport sqrt
import cython

@cython.boundscheck(False)
@cython.cdivision(True)
cpdef centroids_cython(np.ndarray[np.float64_t, ndim=2]  X, np.ndarray[np.int_t, ndim=1]  labels, np.int_t clusters_number):
    cdef int points_number, dimension;
    points_number = labels.shape[0]
    dimension = X.shape[1]
    
    cdef np.int_t[:] cluster_counters = np.zeros((clusters_number), dtype=np.int)
    cdef np.ndarray[np.float64_t, ndim=2]  cluster_sums = np.zeros((clusters_number, dimension), dtype=np.float)

    for point in range(points_number):
        for cluster in range(clusters_number):
            if labels[point] == cluster:
                cluster_counters[cluster] += 1;
                for j in range(dimension):
                    cluster_sums[cluster, j] += X[point, j]
                
    for cluster in range(clusters_number):
        for j in range(dimension):
            cluster_sums[cluster, j] /= cluster_counters[cluster]
    return cluster_sums    
            

In [None]:
centroids_cython(X_train, model_km.labels, 9)