In [636]:
# Importing external libraries
import numpy as np
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt

# Close every matplotlib figure that may still be open in this kernel, so the
# plots created below start from a clean pyplot state.
plt.close("all")

In [637]:
# Input files, given as bare file names (resolved against the current working
# directory). The two .npy files are read with loadNumpyData and the two .txt
# files with loadTextData in the loading cell further down.
doc_word_file = "science2k-doc-word.npy"
word_doc_file = "science2k-word-doc.npy"
vocab_file = "science2k-vocab.txt"
title_file = "science2k-titles.txt"

In [638]:
def loadNumpyData(filename):
    """Read the array stored in ``filename`` with ``np.load`` and return it."""
    return np.load(filename)

In [639]:
def loadTextData(filename):
    """Read a text file and return its lines as a list of strings.

    The content is split on the newline character only, so any other
    whitespace inside a line is preserved.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of str
        One entry per line, without the line terminator. The empty string
        produced by a final trailing newline is not included; an empty file
        yields an empty list.
    """
    # The context manager guarantees the handle is closed, even on error.
    with open(filename, "r") as text_file:
        lines = text_file.read().split("\n")

    # str.split always returns at least one element. Drop the last one only
    # when it is the empty remainder after a trailing newline; unconditionally
    # dropping it would lose the final line of a file with no trailing newline.
    if lines[-1] == "":
        lines.pop()
    return lines

In [640]:
def runKMeans(question, data, max_k=20, random_state=None):
    """Fit K-means for k = 1..max_k and save a plot of inertia against k.

    Parameters
    ----------
    question : str
        Label inserted into the output file name,
        ``"kmeans_plot_" + question + ".png"``.
    data : array-like
        Passed unchanged to ``KMeans.fit``.
    max_k : int, optional
        Largest number of clusters to try. Defaults to 20, the range the
        function originally hard-coded.
    random_state : int or None, optional
        Forwarded to ``KMeans``. Pass an integer to make the curve
        reproducible between runs; ``None`` keeps scikit-learn's default.

    Returns
    -------
    None
        The figure is written to the current working directory and closed.

    Raises
    ------
    ValueError
        If ``max_k`` is smaller than 1.
    """
    if max_k < 1:
        raise ValueError("max_k must be at least 1")

    ks = list(range(1, max_k + 1))

    # inertia_ is the sum of squared distances of samples to their closest
    # cluster center; one fit per candidate k.
    sum_squared_distances = [
        KMeans(n_clusters=k, init='k-means++', random_state=random_state)
        .fit(data)
        .inertia_
        for k in ks
    ]

    # Plot figure
    fig, ax = plt.subplots(figsize=(10, 10), dpi=85, facecolor='w')
    try:
        ax.plot(ks, sum_squared_distances, marker='o')
        ax.set_xticks(ks)
        ax.set_xlabel('k')
        ax.set_ylabel('Sum of Squared Distances')
        ax.set_title("Sum of Squared Distances for K-means Clustering")
        fig.savefig("kmeans_plot_" + question + ".png")
    finally:
        # Release the figure even if saving fails, so repeated calls do not
        # accumulate open figures in the kernel.
        plt.close(fig)

In [641]:
def findTopTen(data, n, ref, top_n=10, random_state=None):
    """Cluster ``data`` with K-means and list each cluster's closest members.

    For every cluster, only the samples assigned to that cluster are
    considered; they are ordered by their distance to the cluster's own
    centroid and the nearest ``top_n`` are reported through ``ref``.

    Parameters
    ----------
    data : array-like
        Passed unchanged to ``KMeans.fit`` and ``KMeans.transform``.
    n : int
        Number of clusters.
    ref : sequence
        Indexed by sample position to obtain the value reported for that
        sample (assumes ``ref[i]`` describes row ``i`` of ``data`` - verify
        against the caller).
    top_n : int, optional
        Maximum number of members reported per cluster. Defaults to 10.
    random_state : int or None, optional
        Forwarded to ``KMeans``. Pass an integer for reproducible clusters;
        ``None`` keeps scikit-learn's default.

    Returns
    -------
    dict
        Maps the 1-based cluster number to a list of at most ``top_n``
        entries of ``ref``, nearest first. Clusters with fewer members
        produce shorter lists.
    """
    model = KMeans(n_clusters=n, init='k-means++', random_state=random_state).fit(data)
    # labels_ holds the training-set assignment produced by fit, so a second
    # predict pass over the same data is not needed.
    labels = model.labels_
    # Column c of transform's output is each sample's distance to centroid c.
    distances = model.transform(data)

    top10 = {}
    for cluster in range(n):
        # Restrict to this cluster's members first, then sort only those;
        # this avoids scanning a Python list once per sample.
        members = np.flatnonzero(labels == cluster)
        nearest_first = members[np.argsort(distances[members, cluster])]
        top10[cluster + 1] = [ref[ind] for ind in nearest_first[:top_n]]

    return top10

In [642]:
# Load the two arrays and the two lists of text lines from disk.
# titles is later passed as the label list for doc_word, and vocab as the
# label list for word_doc (see the findTopTen calls below) - presumably one
# label per row of the matching array; TODO confirm.
doc_word = loadNumpyData(doc_word_file)
word_doc = loadNumpyData(word_doc_file)
titles = loadTextData(title_file)
vocab = loadTextData(vocab_file)

In [643]:
# Analysis for finding value of k on doc_word.
# Writes the inertia-versus-k plot to "kmeans_plot_a.png"; nothing is returned.
runKMeans("a", doc_word)

In [644]:
# Finding documents closest to the centroids.
# NOTE(review): k=8 is presumably read off kmeans_plot_a.png - confirm.
# KMeans is not seeded here, so the clusters can differ between runs.
findTopTen(doc_word, 8, titles)

{1: ['"Temperatures without Fevers?"',
  '"The Global Spread of Malaria in a Future, Warmer World"',
  '"Infectious History"'],
 2: ['"Information Technology Takes a Different Tack"',
  '"Science Survives in Breakthrough States"',
  '"Vaccine Studies Stymied by Shortage of Animals"',
  '"The Violence of the Lambs"',
  '"Flushing out Nasty Viruses in the Balkans"',
  '"For \'Father\' of Abortion Drug, Vindication at Last"',
  '"New Brain Cells Prompt New Theory of Depression"',
  '"On a Slippery Slope to Mediocrity?"',
  '"Plants Join the Genome Sequencing Bandwagon"',
  '"In Europe, Hooligans Are Prime Subjects for Research"'],
 3: ['"Suppression of Mutations in Mitochondrial DNA by tRNAs Imported from the Cytoplasm"',
  '"Distinct Classes of Yeast Promoters Revealed by Differential TAF Recruitment"',
  '"Efficient Initiation of HCV RNA Replication in Cell Culture"',
  '"T Cell-Independent Rescue of B Lymphocytes from Peripheral Immune Tolerance"',
  '"Reduced Food Intake and Body Weig

In [None]:
# Analysis for finding value of k on word_doc.
# Writes the inertia-versus-k plot to "kmeans_plot_b.png"; nothing is returned.
runKMeans("b", word_doc)

In [None]:
# Finding terms closest to the centroids.
# NOTE(review): k=7 is presumably read off kmeans_plot_b.png - confirm.
# KMeans is not seeded here, so the clusters can differ between runs.
findTopTen(word_doc, 7, vocab)