In [3]:
# read data inputs
import numpy as np
docword = np.load("science2k-doc-word.npy")
worddoc = np.load("science2k-word-doc.npy")
with open("science2k-titles.txt") as t:
    docs = t.read().splitlines()
with open("science2k-vocab.txt") as v:
    words = v.read().splitlines()

In [4]:
# part a
from sklearn.cluster import KMeans
import sys
import math

minScore = sys.float_info.max
bestK = 1
best_labels = []
best_centers = []

for i in range(2, 21):
    print(i)
    l_labels = []
    l_centers = []
    l_cstds = []
    for j in range(10):
        kmeans = KMeans(n_clusters=i).fit(docword)
        cstd = 0.0
        for k in range(len(docword)):
            cstd += np.linalg.norm(docword[k] - kmeans.cluster_centers_[kmeans.labels_[k]])
        l_labels.append(kmeans.labels_)
        l_centers.append(kmeans.cluster_centers_)
        l_cstds.append(cstd)
    l_labels = np.array(l_labels)
    l_centers = np.array(l_centers)
    
    # choose the best clustering
    best_cl = l_cstds.index(min(l_cstds))
    labels = l_labels[best_cl]
    centers = l_centers[best_cl]
    
    # initialize score
    score = 1
    
    # firstly consider similarity within same clusters, smaller is better
    sim = l_cstds[best_cl] / len(docword)
    score *= sim
    print("similarity: " + str(sim))
    
    # secondly consider number of clusters, smaller is better
    score *= np.cbrt(i)
    
    # thirdly consider dissimilarity among centers, larger is better
    dissim = np.mean(centers.std(0))
    print("dissimilarity: " + str(dissim))
    score /= dissim
    print(score)
    if score < minScore:
        minScore = score
        bestK = i
        best_labels = labels
        best_centers = centers

print("Best k is: " + str(bestK) + " for doc-word clustering")

2
similarity: 154.15486577
dissimilarity: 0.225426913446
861.578404097
3
similarity: 152.456488381
dissimilarity: 0.35234881157
624.041568009
4
similarity: 151.847383692
dissimilarity: 0.40291756488
598.243208093
5
similarity: 151.267874054
dissimilarity: 0.446592182051
579.196046265
6
similarity: 150.82953424
dissimilarity: 0.474670435761
577.401565436
7
similarity: 150.679353181
dissimilarity: 0.7769653752
370.980795927
8
similarity: 150.14961263
dissimilarity: 0.562186624514
534.162877887
9
similarity: 150.0709669
dissimilarity: 0.580514953587
537.729801154
10
similarity: 149.820465014
dissimilarity: 0.628030525075
513.953373626
11
similarity: 149.423995704
dissimilarity: 0.899294091948
369.529828422
12
similarity: 149.086376776
dissimilarity: 0.914590237879
373.197289447
13
similarity: 148.925294443
dissimilarity: 0.93940898633
372.759060004
14
similarity: 149.034289968
dissimilarity: 0.987413717009
363.772383223
15
similarity: 148.921866858
dissimilarity: 1.33451931446
275.2098843

In [5]:
# get top 10 titles of each cluster
for i in range(bestK):
    print("Cluster " + str(i+1) + ": ")
    dists = []
    titles = []
    count = 0
    for j in range(len(docword)):
        if labels[j] == i:
            dists.append(np.linalg.norm(docword[j] - centers[i]))
            titles.append(docs[j])
            count += 1
    ranks = [i[0] for i in sorted(enumerate(dists), key=lambda x:x[1])]
    for j in range(len(dists)):
        if ranks[j] < 10:
            print(titles[j])
    print(count)

Cluster 1: 
"Economic Incentives for Rain Forest Conservation across Scales"
1
Cluster 2: 
"Polarization of Chemoattractant Receptor Signaling during Neutrophil Chemotaxis"
"Mitochondrial FtsZ in a Chromophyte Alga"
"Translocation of Helicobacter pylori CagA into Gastric Epithelial Cells by Type IV Secretion"
"Requirement of the Inositol Trisphosphate Receptor for Activation of Store-Operated <latex>$Ca^{2+}$</latex> Channels"
"Distinct Roles of CONSTANS Target Genes in Reproductive Development of Arabidopsis"
"Proliferation, but Not Growth, Blocked by Conditional Deletion of 40S Ribosomal Protein S6"
"Perception of Brassinosteroids by the Extracellular Domain of the Receptor Kinase BRI1"
"Central Role for the Lens in Cave Fish Eye Degeneration"
"New Insights into an Old Modification"
"Active Remodeling of Somatic Nuclei in Egg Cytoplasm by the Nucleosomal ATPase ISWI"
187
Cluster 3: 
"General Acid-Base Catalysis in the Mechanism of a Hepatitis Delta Virus Ribozyme"
"The Catalytic Path

In [6]:
# part b
minScore = sys.float_info.max
bestK = 1
best_labels = []
best_centers = []

for i in range(2, 20):
    print(i)
    l_labels = []
    l_centers = []
    l_cstds = []
    for j in range(10):
        kmeans = KMeans(n_clusters=i).fit(worddoc)
        cstd = 0.0
        for k in range(len(docword)):
            cstd += np.linalg.norm(worddoc[k] - kmeans.cluster_centers_[kmeans.labels_[k]])
        l_labels.append(kmeans.labels_)
        l_centers.append(kmeans.cluster_centers_)
        l_cstds.append(cstd)
    l_labels = np.array(l_labels)
    l_centers = np.array(l_centers)
    
    # choose the best clustering
    best_cl = l_cstds.index(min(l_cstds))
    labels = l_labels[best_cl]
    centers = l_centers[best_cl]
    
    # initialize score
    score = 1
    
    # firstly consider similarity within same clusters, smaller is better
    sim = l_cstds[best_cl] / len(worddoc)
    score *= sim
    print("similarity: " + str(sim))
    
    # secondly consider number of clusters, smaller is better
    score *= np.cbrt(i)
    
    # thirdly consider dissimilarity among centers, larger is better
    dissim = np.mean(centers.std(0))
    print("dissimilarity: " + str(dissim))
    score /= dissim
    print(score)
    if score < minScore:
        minScore = score
        bestK = i
        best_labels = labels
        best_centers = centers

print("Best k is: " + str(bestK) + " for word-doc clustering")

2
similarity: 28.3388836446
dissimilarity: 0.383219874967
93.1704182551
3
similarity: 28.0272656525
dissimilarity: 0.530947875415
76.1323544474
4
similarity: 27.8749498219
dissimilarity: 0.619416563878
71.4361340191
5
similarity: 27.7828162479
dissimilarity: 0.685139256646
69.3405713569
6
similarity: 27.6996332566
dissimilarity: 0.794939208164
63.3175134494
7
similarity: 27.6478819417
dissimilarity: 1.27665852541
41.427284157
8
similarity: 27.5921063927
dissimilarity: 1.27297832833
43.3504731048
9
similarity: 27.560277788
dissimilarity: 0.904111345816
63.4077741099
10
similarity: 27.4757690498
dissimilarity: 1.11890559584
52.9041504449
11
similarity: 27.4666689807
dissimilarity: 2.03203260215
30.0611933601
12
similarity: 27.3810691698
dissimilarity: 1.51233541328
41.4504607639
13
similarity: 27.3667083914
dissimilarity: 1.73898407985
37.0033811552
14
similarity: 27.2850284759
dissimilarity: 1.7034576764
38.6043053608
15
similarity: 27.2755723466
dissimilarity: 1.82469991171
36.86488141

In [8]:
# get top 10 words of each cluster
for i in range(bestK):
    print("Cluster " + str(i+1) + ": ")
    dists = []
    terms = []
    count = 0
    for j in range(len(worddoc)):
        if labels[j] == i:
            dists.append(np.linalg.norm(worddoc[j] - centers[i]))
            terms.append(words[j])
            count += 1
    ranks = [i[0] for i in sorted(enumerate(dists), key=lambda x:x[1])]
    for j in range(len(dists)):
        if ranks[j] < 10:
            print(terms[j])
    print(count)

Cluster 1: 
hour
polymerase
serum
gel
deletion
endogenous
biochem
tagged
washed
electrophoresis
110
Cluster 2: 
make
people
need
came
dont
director
look
put
going
thats
49
Cluster 3: 
cores
scenarios
geology
geologic
anomaly
dating
warmer
tropics
basins
proxy
191
Cluster 4: 
concluded
campaign
planned
equipment
prize
sort
thanks
gone
intended
save
591
Cluster 5: 
thank
digestion
stimulating
glycine
elicit
clonal
digested
cdnas
stably
inhibiting
722
Cluster 6: 
produced
1
Cluster 7: 
likely
1
Cluster 8: 
expression
binding
response
activation
receptor
membrane
expressed
receptors
signaling
mediated
10
Cluster 9: 
direct
cellular
functions
phosphorylation
inhibition
induction
induce
inhibitor
regulate
activate
53
Cluster 10: 
film
interface
applications
oriented
schematic
scanning
deposited
films
diffraction
perpendicular
71
Cluster 11: 
fig
region
larger
observations
pattern
increases
relatively
significantly
estimated
decrease
116
