In [1]:
import pickle
import collections
 
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint

In [2]:
tokens = pickle.load(open('../Preprocessing/tokens.pkl', 'rb'))
data = pickle.load(open('../Preprocessing/topic-corpus.pkl', 'rb'))

In [3]:
def dummy_fun(tokens):
    return tokens

In [4]:
def cluster_texts(texts, clusters=3):
    """ Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
    vectorizer = TfidfVectorizer(analyzer='word',
                            tokenizer=dummy_fun,
                            preprocessor=dummy_fun,
                            max_df=0.9,
                            min_df=0.1,
                            token_pattern=None) 
    X = vectorizer.fit_transform(tokens)
    features = vectorizer.get_feature_names()
    km_model = KMeans(n_clusters=clusters)
    km_model.fit(X)
 
    clustering = collections.defaultdict(list)
 
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
 
    return clustering, X, features

In [5]:
clusters, X, features = cluster_texts(data, 7)

In [6]:
import pandas as pd
items = [[key, i] for key, value in dict(clusters).items() for i in value]
df2 = pd.DataFrame(items, columns=['Cluster', 'Index'])
df2.set_index('Index', inplace=True)
df2 = df2.sort_index()
data = data.reset_index()

In [7]:
conc = pd.concat([data, df2], axis=1)
conc.to_csv('/Users/ajda/Desktop/clusters.csv', index=False)

In [8]:
print("Number of clusters found: {}".format(len(clusters)))

Number of clusters found: 7


In [32]:
from orangecontrib.text.stats import false_discovery_rate, hypergeom_p_values
import numpy as np

filter_p_value = 0.01
filter_fdr_value = 1

def getKey(item):
    return item[1]

for i in range(len(clusters)):
    mask = []
    for cl, ind in items:
        if cl == i:
            mask.append(True)
        else:
            mask.append(False)

    data = X.toarray()
    selected_data = np.compress(mask, data, axis=0)

    p_values = hypergeom_p_values(data, selected_data)
    fdr_values = false_discovery_rate(p_values)
    
    fp = lambda score: "%0.5f" % score if score > 10e-3 else "%0.1e" % score
    fpt = lambda score: "%0.9f" % score if score > 10e-3 else "%0.5e" % score
    
    print("Enrichment of cluster {}".format(i+1))
    
    result = []
    for word, pval, fval in zip(features, p_values, fdr_values):
        if pval <= filter_p_value and fval <= filter_fdr_value:
            result.append((word, fp(pval), fpt(fval)))
    result = sorted(result, key=getKey)
    for w, p, f in result:
        print("    ", w, p, f)
    print()


Enrichment of cluster 1

Enrichment of cluster 2
     result 3.8e-04 0.044656648
     framework 4.2e-03 0.116328771
     literature 4.2e-03 0.116328771
     study 4.9e-03 0.116328771
     finding 5.6e-03 0.116328771
     relationship 5.9e-03 0.116328771

Enrichment of cluster 3
     using 2.1e-04 0.025062105
     purpose 3.0e-03 0.127171465
     analysis 3.2e-03 0.127171465
     value 5.7e-03 0.145137617
     methodology 6.5e-03 0.145137617
     finding 8.5e-03 0.145137617
     experience 9.7e-03 0.145137617
     design 9.8e-03 0.145137617

Enrichment of cluster 4

Enrichment of cluster 5
     use 1.2e-03 0.139366921
     first 5.0e-03 0.296032836

Enrichment of cluster 6
     aim 1.9e-04 0.022460243

Enrichment of cluster 7
     business 4.5e-03 0.428868803

