In [3]:
import numpy as np
from collections import defaultdict

In [1]:
def get_data(data_path):
    """Read a sparse feature file into per-document dicts and a label list.

    Every non-blank line is split on the ``<fff>`` separator. The first field
    is parsed as the (float) label and the last field as whitespace-separated
    ``index:value`` pairs. Blank or whitespace-only lines are skipped instead
    of crashing on ``float('')``.

    Parameters
    ----------
    data_path : str or path-like
        Path of the text file to read.

    Returns
    -------
    X : list of defaultdict
        One mapping per document from int feature index to float value.
        Looking up an index that was not in the file yields 0.
    Y : list of float
        Label of each document, in file order.

    Raises
    ------
    ValueError
        If a label, index or value cannot be parsed as a number.
    IndexError
        If a feature token does not contain a ``:`` separator.
    """
    with open(data_path, 'r') as f:
        lines = f.read().splitlines()

    X = []
    Y = []
    for line in lines:    # one doc per line
        if not line.strip():
            # Tolerate empty lines (e.g. trailing newlines at end of file).
            continue

        # Split once and reuse the fields for both features and label.
        fields = line.split('<fff>')

        single_dict = defaultdict(int)
        for pair in fields[-1].split():
            parts = pair.split(':')
            single_dict[int(parts[0])] = float(parts[1])

        X.append(single_dict)
        Y.append(float(fields[0]))
    return X, Y

In [4]:
# Load the training split (feature dicts and labels).
# NOTE(review): hard-coded absolute Windows path; the notebook only runs on a
# machine with this exact directory layout.
train_path = "D:\\movedFromC\\123\\20192\\PRJ2\\Project2\\words_tfidfs.txt"
X_train, Y_train = get_data(train_path)

In [35]:
# Quick check of the container type that get_data returned for the labels.
type(Y_train)

list

In [5]:
# Load the test split (feature dicts and labels).
# NOTE(review): hard-coded absolute Windows path; the notebook only runs on a
# machine with this exact directory layout.
test_path = "D:\\movedFromC\\123\\20192\\PRJ2\\Project2\\words_tfidfs_test.txt"
X_test, Y_test = get_data(test_path)

In [6]:
import scipy.sparse as sp

def convert_to_csr(X_dict, num_of_features):
    """Turn a list of ``{feature_index: value}`` dicts into a CSR matrix.

    Parameters
    ----------
    X_dict : sequence of dict
        One mapping per row; keys are column indices, values are the entries.
    num_of_features : int
        Number of columns of the resulting matrix.

    Returns
    -------
    scipy.sparse CSR matrix
        Matrix of shape ``(len(X_dict), num_of_features)`` with float64 data.
    """
    # ``np.float`` was only an alias of the builtin float and has been removed
    # from NumPy (1.24+); use the explicit 64-bit dtype instead.
    X_mat = sp.dok_matrix((len(X_dict), num_of_features), dtype=np.float64)

    # DOK supports cheap item assignment; convert to CSR once at the end.
    for row, features in enumerate(X_dict):
        for col, value in features.items():
            X_mat[row, col] = value

    return X_mat.tocsr()

In [7]:
# Number of matrix columns is hard-coded.
# NOTE(review): presumably the vocabulary size of the feature files; confirm.
train_num_features = 10272
X_train_mat = convert_to_csr(X_train, train_num_features)

In [41]:
# Must use the same column count as the training matrix so that the fitted
# model can be applied to the test rows.
test_num_features = 10272
X_test_mat = convert_to_csr(X_test, test_num_features)

In [42]:
# Confirm the test matrix dimensions: (number of documents, number of features).
X_test_mat.shape

(7532, 10272)

In [46]:
from sklearn.cluster import KMeans
# Configure k-means first, then fit it on the training matrix. ``fit`` returns
# the estimator itself, so ``kmeans`` ends up being the fitted model.
# NOTE(review): 20 clusters presumably mirrors the number of classes; confirm.
kmeans = KMeans(
    n_clusters=20,
    init='k-means++',
    n_init=5,             # best of 5 random initialisations
    tol=1e-3,
    random_state=2018,    # fixed seed for reproducible clustering
)
kmeans.fit(X_train_mat)

In [30]:
# Row indices of the training documents that the fitted model put in cluster 0.
np.where(kmeans.labels_ == 0)

(array([   45,   783,   795,   805,   861,   872,   873,   876,   883,
          899,   915,   959,  1030,  1049,  1127,  1208,  1266,  1328,
         1333,  1389,  1392,  1404,  1436,  1458,  1459,  1466,  1534,
         1565,  1631,  1692,  1693,  1803,  1826,  1851,  2055,  2140,
         2168,  2523,  2524,  2542,  2662,  2784,  2834,  2905,  2975,
         3021,  3073,  3086,  3121,  3336,  3337,  3474,  3738,  3941,
         4128,  4228,  4744,  4751,  4752,  4761,  4923,  4929,  4930,
         5049,  5050,  5433,  5555,  5832,  5843,  7009,  7012,  7016,
         7017,  7108,  7115,  7147,  7186,  7261,  7373,  7392,  7395,
         7436,  7454,  7493,  8434,  8479,  8582,  8866,  9084,  9222,
         9477, 10082, 10233, 10537, 10950, 10962, 10990, 11066, 11107,
        11112, 11117], dtype=int64),)

In [39]:
def compute_purity(data_set, labels_set, model=None):
    """Compute clustering purity of a fitted model on a labelled data set.

    Each sample is assigned to a cluster with ``model.predict``. For every
    cluster the size of its most frequent true label is taken, and the sum of
    those sizes is divided by the number of samples.

    The majority count is taken over the labels actually present in each
    cluster, so the result does not depend on the labels being encoded as
    ``0 .. n_clusters - 1``.

    Parameters
    ----------
    data_set : array-like or sparse matrix
        Samples to evaluate; must expose ``shape[0]`` and be accepted by
        ``model.predict``.
    labels_set : sequence
        True label of each sample, aligned with the rows of ``data_set``.
    model : object with a ``predict`` method, optional
        Fitted clustering model. Defaults to the notebook-level ``kmeans``.

    Returns
    -------
    float
        Purity in the range [0, 1].

    Raises
    ------
    ValueError
        If ``data_set`` is empty or its row count differs from
        ``len(labels_set)``.
    """
    if model is None:
        # Backward-compatible default: the model fitted in the notebook.
        model = kmeans

    n_samples = data_set.shape[0]
    if n_samples == 0:
        raise ValueError("cannot compute purity of an empty data set")
    if len(labels_set) != n_samples:
        raise ValueError(
            "data_set has %d rows but labels_set has %d entries"
            % (n_samples, len(labels_set))
        )

    assignments = np.asarray(model.predict(data_set))
    true_labels = np.asarray(labels_set)

    majority_total = 0
    # Only clusters that received samples matter; empty ones contribute 0.
    for cluster_id in np.unique(assignments):
        member_labels = true_labels[assignments == cluster_id]
        _, label_counts = np.unique(member_labels, return_counts=True)
        majority_total += int(label_counts.max())

    return majority_total / n_samples

In [43]:
# Evaluate the currently fitted model on both splits and report the scores.
train_purity = compute_purity(X_train_mat, Y_train)
print("purity of training set: ", train_purity)
test_purity = compute_purity(X_test_mat, Y_test)
print("purity of tesing set: ", test_purity)
print("-------------------")

purity of training set:  0.38456779211596254
purity of tesing set:  0.3462559745087626
-------------------


In [47]:
# Same report as the previous cell.
# NOTE(review): the execution counts suggest this ran after the KMeans cell was
# re-executed, which would explain the different scores; confirm.
train_purity = compute_purity(X_train_mat, Y_train)
print("purity of training set: ", train_purity)
test_purity = compute_purity(X_test_mat, Y_test)
print("purity of tesing set: ", test_purity)
print("-------------------")

purity of training set:  0.42389959342407635
purity of tesing set:  0.36537440254912373
-------------------
