In [11]:
import pickle as pkl
from sklearn.cluster import KMeans

In [12]:
with open('../data/tdidf_vector.pkl', 'rb') as f_open:
    tfidf_matrix = pkl.load(f_open)

clustering = KMeans(n_clusters=8).fit(tfidf_matrix.toarray())
clustering.cluster_centers_

array([[-8.13151629e-20,  3.46944695e-18,  2.16840434e-19, ...,
         1.30104261e-18, -2.71050543e-19,  4.20615590e-03],
       [ 2.02187545e-03,  9.90770502e-02,  9.78607865e-04, ...,
        -1.08420217e-18,  8.12873487e-04,  5.42101086e-19],
       [-4.33680869e-19,  2.10154150e-03,  6.50521303e-19, ...,
         2.31115166e-03, -4.87890978e-19,  7.58941521e-19],
       ...,
       [-5.14996032e-19, -1.38777878e-17,  7.58941521e-19, ...,
        -3.03576608e-18, -5.42101086e-19,  1.08775935e-03],
       [-8.13151629e-20,  3.46944695e-18,  2.16840434e-19, ...,
         2.62943833e-03, -2.71050543e-19,  2.16840434e-19],
       [-4.87890978e-19,  3.46944695e-18,  1.46057719e-03, ...,
         1.23448457e-03, -5.42101086e-19,  8.67361738e-19]])

In [13]:
clustering.labels_

array([5, 6, 6, 6, 5, 5, 5, 5, 7, 6, 6, 7, 7, 7, 6, 6, 6, 7, 7, 7, 7, 5,
       5, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       5, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 4, 7, 7, 6, 7, 0, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5,
       5, 5, 4, 4, 4, 4, 4, 4, 5, 5, 5, 7, 0, 6, 0, 0, 0, 7, 7, 0, 0, 0,
       0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 0, 0, 4,
       7, 4, 4, 4, 4, 4, 5, 5, 0, 7, 4, 4, 5, 4, 5, 4, 5, 4, 5, 5, 4, 4,
       5, 7, 6, 0, 6, 5, 4, 5, 5, 5, 7, 5, 0, 5, 7, 4, 5, 5, 4, 5, 5, 6,
       0, 4, 5, 5, 5, 5, 4, 5, 5, 5, 0, 5, 4, 5, 5, 5, 5, 5, 5, 4, 4, 2,
       2, 5, 5, 5, 5, 4, 4, 5, 4, 4, 4, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 5, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 6,
       4, 5, 2, 2, 2, 2, 2, 5, 7, 7, 5, 2, 2, 2, 5,

In [14]:
import numpy as np
from numpy.linalg import norm
from random  import randrange
tfidf_matrix.toarray().shape

(589, 8266)

In [15]:
MIN_DIST = 1e10

class KMeans():
    """
    Agglomerative Clustering
    Recursively merges the pair of clusters that minimally increases
    a given linkage distance.
    """
    def __init__(self, n_clusters=8, iters=300, path="../clusters/kmeans.txt"):
        self._n_clusters = n_clusters
        self._iters = iters
        self._path = path
        self._dist_mat = None
        
    def distance_matrix(self, X):
        """
        Computer matrix of exponential cosine distance between each data point.
        """
        self._dist_mat = np.zeros((X.shape[0], X.shape[0]))
#       calculating cosine distances
        for i in range(X.shape[0]):
            for j in range(X.shape[0]):
                if i!=j:
                    self._dist_mat[i][j] = self.cos_sim_dist(X[i],X[j])
        np.fill_diagonal(self._dist_mat, MIN_DIST)
#         print(self._dist_mat[545][545])
#         print(self._dist_mat.shape)
        
    @staticmethod
    def cos_sim_dist(X,Y):
        """
        Return exponential cosine similarity between two vectors.
        """
        return np.exp(-1 * ((X @ Y.T)/(norm(X)*norm(Y))))
    
    def init_centroids(self, X):
        """
        Create random centroids.
        """
        np.random.RandomState(randrange(0, 1e4))
        self._centroids = X[np.random.choice(X.shape[0], self._n_clusters, replace=False)]
        
    def closest_centroid(self, X):
        """
        Return index of centroid closest to the given X vector.
        """
        min_dist_list = [self.cos_sim_dist(X, Y) for Y in self._centroids]
        return min_dist_list.index(min(min_dist_list))   
            
    
    def fit(self, X_arr):
        """
        Method to call to fit data.
        """
        self.init_centroids(X_arr)
        self.distance_matrix(X_arr)
        doc_labels = np.zeros((X_arr.shape[0], 1))
        for i in range(self._iters):
            centroid_points = [[] for _ in range(self._n_clusters)]
            print(f"Iteration number {i}", end="\r")
            for index, arr in enumerate(X_arr):
#                 assign closest centroid
                doc_labels[index] = self.closest_centroid(arr)
                for num in doc_labels[index]:
                    centroid_points[int(num)].append(index)
                
    #             compute new centroids
            new_centroids = np.zeros((self._n_clusters, X_arr.shape[1]))
            for ind in range(self._n_clusters):
                new_centroids[ind, :] = np.mean(np.take(X_arr, centroid_points[ind], axis=0), axis=0)
            
#             break if no change detected
            if np.all(self._centroids == new_centroids):
                break
            self._centroids = new_centroids
        self._results = [[] for _ in range(self._n_clusters)]
        for i, ele in enumerate(doc_labels):
            for num in ele:
                print(num)
                self._results[int(num)].append(i)
        print(self._results)
        self.save()
    
    def save(self):
        """
        Save the results in a sorted manner to ../clusters/agglomerative.txt
        """
        sorted_results = sorted(self._results, key= lambda x: min(x))
        sorted_results = [sorted(x) for x in sorted_results]
#         print(sorted_results)
        with open(self._path, 'w') as f_open:
            for result in sorted_results:
                f_open.write(','.join([str(x) for x in result]))
                f_open.write('\n')

In [16]:
kmeans = KMeans()
kmeans.fit(tfidf_matrix.toarray())

1.0ration number 9
1.0
1.0
1.0
6.0
1.0
7.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
3.0
1.0
1.0
4.0
4.0
1.0
1.0
1.0
1.0
5.0
5.0
7.0
2.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
4.0
1.0
1.0
1.0
1.0
1.0
1.0
6.0
1.0
1.0
5.0
2.0
4.0
2.0
2.0
2.0
3.0
2.0
2.0
2.0
2.0
4.0
3.0
2.0
2.0
2.0
2.0
1.0
2.0
7.0
2.0
2.0
6.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
4.0
2.0
0.0
2.0
2.0
2.0
4.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
0.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
4.0
4.0
2.0
2.0
4.0
4.0
2.0
2.0
2.0
2.0
2.0
2.0
6.0
7.0
6.0
6.0
4.0
6.0
6.0
6.0
6.0
2.0
4.0
6.0
4.0
3.0
3.0
4.0
3.0
4.0
1.0
4.0
4.0
3.0
4.0
5.0
5.0
4.0
4.0
4.0
4.0
1.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
0.0
0.0
4.0
0.0
0.0
4.0
7.0
7.0
0.0
6.0
6.0
3.0
4.0
4.0
6.0
4.0
4.0
6.0
4.0
4.0
4.0
4.0
4.0
6.0
4.0
3.0
6.0
6.0
6.0
4.0
6.0
4.0
4.0
4.0
4.0
1.0
7.0
1.0
4.0
4.0
4.0
6.0
4.0
4.0
4.0
4.0
6.0
4.0
4.0
6.0
4.0
6.0
4.0
6.0
4.0
3.0
3.0
3.0
4.0
5.0
6.0
5.0
6.0
3.0
4.0
6.0
7.0
4.0
6.0
6.0
6.0
6.0
6.0
4.0
4.0
4.0
5.0
4.0
6.0
6.0
4.0
4