In [1]:
import pickle as pkl
from sklearn.cluster import KMeans

In [4]:
with open('../data/tdidf_vector.pkl', 'rb') as f_open:
    tfidf_matrix = pkl.load(f_open)

clustering = KMeans(n_clusters=8).fit(tfidf_matrix.toarray())
clustering.cluster_centers_

array([[-2.43945489e-19, -3.81639165e-17,  1.06399663e-03, ...,
         1.63066035e-03, -5.42101086e-19,  1.16992416e-03],
       [-5.42101086e-20,  5.21147374e-02,  1.64742497e-03, ...,
         1.30104261e-18,  1.30852805e-03,  2.16840434e-19],
       [-2.71050543e-20,  7.56049139e-02,  1.15891901e-03, ...,
         1.30104261e-18,  1.94072966e-03,  2.16840434e-19],
       ...,
       [-4.33680869e-19,  3.46944695e-18,  6.50521303e-19, ...,
        -2.38524478e-18, -4.87890978e-19,  1.50699994e-03],
       [ 2.56622653e-03,  8.45788707e-02,  1.24207921e-03, ...,
         2.16840434e-19,  1.23878294e-03,  4.33680869e-19],
       [-2.71050543e-19,  3.18733794e-03,  4.33680869e-19, ...,
         3.50524669e-03, -3.79470760e-19,  5.42101086e-19]])

In [5]:
clustering.labels_

array([5, 0, 0, 0, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 5, 5, 5,
       5, 5, 0, 3, 0, 0, 5, 3, 0, 0, 0, 0, 4, 5, 5, 5, 5, 5, 5, 0, 5, 0,
       0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 0, 3, 3, 3, 3, 4, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 1, 3, 3, 5, 3, 3, 3, 3, 3, 3, 4,
       0, 1, 3, 3, 3, 1, 3, 3, 0, 3, 3, 3, 3, 1, 1, 3, 3, 3, 3, 3, 1, 3,
       3, 3, 3, 3, 4, 4, 0, 3, 3, 4, 3, 3, 1, 3, 3, 3, 0, 1, 5, 5, 0, 5,
       5, 5, 3, 3, 0, 3, 3, 0, 1, 0, 5, 4, 4, 0, 1, 4, 4, 4, 0, 1, 4, 1,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 4, 4, 1, 0, 1, 1, 4, 1, 3,
       3, 0, 0, 0, 0, 4, 5, 5, 4, 4, 4, 4, 5, 4, 5, 0, 5, 0, 5, 5, 4, 5,
       5, 3, 5, 3, 3, 5, 5, 5, 5, 5, 5, 0, 4, 5, 3, 4, 5, 5, 5, 0, 4, 1,
       0, 0, 5, 0, 0, 5, 0, 5, 5, 0, 4, 0, 4, 5, 5, 5, 5, 5, 4, 4, 0, 4,
       5, 0, 5, 5, 5, 5, 5, 5, 0, 0, 4, 0, 5, 5, 5, 5, 0, 4, 4, 4, 4, 4,
       4, 4, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5,
       4, 0, 7, 7, 7, 0, 7, 0, 0, 0, 5, 7, 7, 7, 0,

In [8]:
import numpy as np
from numpy.linalg import norm
from random  import randrange
tfidf_matrix.toarray().shape

(589, 8266)

In [34]:
MIN_DIST = 1e10

class KMeans():
    """
    Agglomerative Clustering
    Recursively merges the pair of clusters that minimally increases
    a given linkage distance.
    """
    def __init__(self, n_clusters=8, iters=300):
        self._n_clusters = n_clusters
        self._iters = iters
        self._dist_mat = None
        
    def distance_matrix(self, X):
        """
        Computer matrix of exponential cosine distance between each data point.
        """
        self._dist_mat = np.zeros((X.shape[0], X.shape[0]))
#       calculating cosine distances
        for i in range(X.shape[0]):
            for j in range(X.shape[0]):
                if i!=j:
                    self._dist_mat[i][j] = self.cos_sim_dist(X[i],X[j])
        np.fill_diagonal(self._dist_mat, MIN_DIST)
#         print(self._dist_mat[545][545])
#         print(self._dist_mat.shape)
        
    @staticmethod
    def cos_sim_dist(X,Y):
        """
        Return exponential cosine similarity between two vectors.
        """
        return np.exp(-1 * ((X @ Y.T)/(norm(X)*norm(Y))))
    
    def init_centroids(self, X):
        """
        Create random centroids.
        """
        np.random.RandomState(randrange(0, 1e4))
        self._centroids = X[np.random.choice(X.shape[0], self._n_clusters, replace=False)]
        
    def closest_centroid(self, X):
        """
        Return index of centroid closest to the given X vector.
        """
        min_dist_list = [self.cos_sim_dist(X, Y) for Y in self._centroids]
        return min_dist_list.index(min(min_dist_list))   
            
    
    def fit(self, X):
        """
        Method to call to fit data.
        """
        X_arr = X.toarray()
        self.init_centroids(X_arr)
        self.distance_matrix(X_arr)
        doc_labels = np.zeros((X_arr.shape[0], 1))
        for i in range(self._iters):
            centroid_points = [[] for _ in range(self._n_clusters)]
            print(f"Iteration number {i}", end="\r")
            for index, arr in enumerate(X_arr):
#                 assign closest centroid
                doc_labels[index] = self.closest_centroid(arr)
                for num in doc_labels[index]:
                    centroid_points[int(num)] = index
                
    #             compute new centroids
            new_centroids = np.zeros((self._n_clusters, X_arr.shape[1]))
            for ind in range(self._n_clusters):
                new_centroids[ind, :] = np.mean(np.take(X_arr, centroid_points[ind], axis=0), axis=0)
            
#             break if no change detected
            if np.all(self._centroids == new_centroids):
                break
            self._centroids = new_centroids
        self._results = [[] for _ in range(self._n_clusters)]
        for i, ele in enumerate(doc_labels):
            for num in ele:
                print(num)
                self._results[int(num)].append(i)
        print(self._results)
        self.save()
    
    def save(self):
        """
        Save the results in a sorted manner to ../clusters/agglomerative.txt
        """
        sorted_results = sorted(self._results, key= lambda x: min(x))
        sorted_results = [sorted(x) for x in sorted_results]
#         print(sorted_results)
        with open("../clusters/kmeans.txt", 'w') as f_open:
            for result in sorted_results:
                f_open.write(','.join([str(x) for x in result]))
                f_open.write('\n')

In [35]:
kmeans = KMeans()
kmeans.fit(tfidf_matrix)

0.0ration number 299
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0

ValueError: min() arg is an empty sequence