### Text Clustering Presented with FoamTree V5

In [148]:
from __future__ import print_function

import json
import os
import re
import sys
import time
import webbrowser

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from scipy.cluster.hierarchy import ward, dendrogram
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import MDS
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import pairwise_distances
# Keep handles on the current standard streams; they are restored after
# reload(sys) below (presumably because reloading replaces the notebook's
# streams -- TODO confirm).
stdout = sys.stdout
stdin = sys.stdin
stderr = sys.stderr

# Python 2 only: force UTF-8 as the default codec for implicit str/unicode
# conversions. reload(sys) is what makes sys.setdefaultencoding reachable
# again. Neither name exists on Python 3, where the default is already UTF-8,
# so the whole step is skipped there instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- Python 2 builtin
    sys.setdefaultencoding("utf-8")
    sys.stdout = stdout
    sys.stdin = stdin
    sys.stderr = stderr

In [149]:
# Wall-clock reference used by the last cell to report the total run time.
start = time.time()

# Folder that receives the generated clustering output.
savePath = 'C:/Users/liuxi/Desktop/TextClusteringPresentedWithFoamTree'

# Source dataset (CSV); it sits directly inside the output folder.
inputPath = savePath + '/publications.csv'

### Specify parameters for clustering

In [150]:
# Number of K-Means initialisations (passed to KMeans as n_init).
num_init = 100

# Maximum number of iterations per K-Means run (passed as max_iter).
num_iter = 300

# Convergence tolerance (passed as tol).
tolerance = 0.0001

# Read the answer as plain text and convert it explicitly. Python 2's input()
# evaluates whatever is typed as an expression, and on Python 3 it returns a
# string that can never pass a numeric range check.
try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input      # Python 3

try:
    # Number of clusters for K-Means; at most 20.
    num_clusters = int(_read_line('Number of clusters (range 2 to 20) ----- '))
except ValueError:
    num_clusters = None

if num_clusters is None or not 2 <= num_clusters <= 20:
    print(' Warning !! Invalid INPUT !!! Please restart the program !!')
    sys.exit()

# Distance metric used for the pairwise distance matrix.
# options: cityblock, cosine, euclidean, l1, l2, manhattan
distCal = 'euclidean'

Number of clusters (range 2 to 20) ----- 2


### Read in Ghent library documents

In [151]:
# Load the first 1000 catalogue rows, restricted to the columns used below.
catalog_file = inputPath
wanted_columns = ['type', 'author', 'title', 'language']
catalog_entries = pd.read_csv(catalog_file, usecols=wanted_columns, nrows=1000)

# Keep only the English-language entries; copy so later column assignments
# do not touch the full catalogue frame.
is_english = catalog_entries['language'] == 'eng'
english_catalog_entries = catalog_entries.loc[is_english].copy()

#### Merge multiple features into one column using pd.DataFrame()

In [152]:
# Candidate text columns, keyed by the index number typed at the prompts.
feature_dict = {0: english_catalog_entries.title,
                1: english_catalog_entries.type,
                2: english_catalog_entries.language,
                3: english_catalog_entries.author}


def _ask_int(prompt):
    """Prompt on stdin and return the reply as an int, or None if it is not one.

    The reply is read as text and parsed with int(), so nothing the user types
    is evaluated as code (which Python 2's input() would do).
    """
    try:
        reader = raw_input  # Python 2
    except NameError:
        reader = input      # Python 3
    try:
        return int(reader(prompt))
    except ValueError:
        return None


def _abort_invalid_input():
    """Print the invalid-input warning and stop execution via sys.exit()."""
    print(' Warning !! Invalid INPUT !!! Please restart the program !!')
    sys.exit()


n = _ask_int('How many features are used for clustering (range 1 to 4) -')
print(' ')
print('type    ---------------------------------------------1\n')
print('language---------------------------------------------2\n')
print('author  ---------------------------------------------3\n')

# `chosen` lists the feature_dict keys in the order their text is joined:
# any user-picked features first, the title (key 0) last; all four in key
# order when n == 4.
if n == 1:
    chosen = [0]
elif n == 2:
    print('feature title has been chosen, choose the other feature - \n')
    i = _ask_int('feature index number - ')
    chosen = [i, 0]
elif n == 3:
    print('feature title has been chosen, choose the other two features - \n')
    i = _ask_int('feature index number - ')
    j = _ask_int('feature index number - ')
    chosen = [i, j, 0]
elif n == 4:
    chosen = [0, 1, 2, 3]
else:
    _abort_invalid_input()

# User-picked indices must be distinct and come from the printed menu (1-3);
# the title is always included already.
if n in (2, 3):
    picked = chosen[:-1]
    if any(key not in (1, 2, 3) for key in picked) or len(set(picked)) != len(picked):
        _abort_invalid_input()

columns = [str(position) for position in range(len(chosen))]
df = pd.DataFrame({col: feature_dict[key] for col, key in zip(columns, chosen)})
# Missing cells become empty strings so ' '.join never meets a float NaN.
df['multi-feature'] = df[columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

How many features are used for clustering (range 1 to 4) -2
 
type    ---------------------------------------------1

language---------------------------------------------2

author  ---------------------------------------------3

feature title has been chosen, choose the other feature - 

feature index number - 1


#### Define stopwords and stemmer for cleaning text

In [153]:
# Snowball stemmer and NLTK stop-word list for English, used when cleaning text.
stemmer = SnowballStemmer('english')
stopwords = nltk.corpus.stopwords.words('english')

# Lower-cased NLTK word list; tokens that are not in it are treated as
# unusual or misspelt words.
english_vocab = {w.lower() for w in nltk.corpus.words.words()}

#### Define a function for generating tokens

In [154]:
def tokenize_and_stem(text):
    """Split ``text`` into word tokens and return the stems of the ones kept.

    The text is split into sentences with ``nltk.sent_tokenize`` and each
    sentence into tokens with ``nltk.word_tokenize``. A token is kept when it
    contains at least one ASCII letter, is longer than three characters and
    appears in the module-level ``english_vocab``. Kept tokens are stemmed
    with the module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Raw text of one document.

    Returns
    -------
    list
        Stems of the kept tokens, in order of appearance.
    """
    stems = []
    for sent in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sent):
            if len(token) <= 3 or not re.search('[a-zA-Z]', token):
                continue
            # english_vocab holds lower-cased words only, so compare in lower
            # case; otherwise capitalised tokens are dropped whenever the
            # caller has not lower-cased the text beforehand.
            if token.lower() in english_vocab:
                stems.append(stemmer.stem(token))
    return stems

#### Specify the parameters for TF-IDF calculation

In [155]:
# TF-IDF settings gathered in one place: the custom tokenizer/stemmer,
# uni- to tri-grams, the built-in English stop-word list, lower-casing,
# IDF weighting and a cap on the vocabulary size.
tfidf_options = dict(
    tokenizer=tokenize_and_stem,
    ngram_range=(1, 3),
    stop_words='english',
    lowercase=True,
    use_idf=True,
    max_features=200000,
)
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)

#### `tfidf_matrix` is displayed as `(X, Y) Z` entries: X is the index of the document, Y is the index of the term in the vocabulary, and Z is the TF-IDF score of that term in that document.

In [156]:
# Fit the vectorizer on the merged text column and transform that same column
# into the TF-IDF matrix used by every later step.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['multi-feature'] ) 

#### Terms: the list of terms in the vocabulary, sorted alphabetically. Given an index, it returns the corresponding term.

In [157]:
# Vocabulary terms in column order of tfidf_matrix. Newer scikit-learn
# releases provide get_feature_names_out() and no longer ship
# get_feature_names(); older releases only have the latter, so use whichever
# the installed version offers. list() keeps `terms` a plain list either way.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = list(tfidf_vectorizer.get_feature_names_out())
else:
    terms = tfidf_vectorizer.get_feature_names()

#### Specify which K-Means to use for clustering

In [158]:
# Configure K-Means from the parameters chosen earlier, fit it on the TF-IDF
# matrix and keep one cluster id per document as a plain Python list.
km = KMeans(n_clusters=num_clusters,
            n_init=num_init,
            max_iter=num_iter,
            tol=tolerance)
clusters = km.fit(tfidf_matrix).labels_.tolist()

In [159]:
# Score the clustering that the rest of the notebook actually uses. Fitting a
# second, unseeded KMeans here produced labels that could differ from `km`,
# so the printed silhouette did not describe the reported clusters (and the
# expensive multi-init fit was paid twice).
kmeans_model = km
labels = kmeans_model.labels_
x = silhouette_score(tfidf_matrix, labels)
print("For n_clusters =", num_clusters,
      "The average silhouette_score is :", x)

For n_clusters = 2 The average silhouette_score is : 0.0140537055928


In [160]:
# Attach each document's cluster id as a new column ...
english_catalog_entries['cluster_index'] = clusters
# ... and build a second view of the same rows, indexed by that cluster id.
e2 = english_catalog_entries.set_index('cluster_index')

In [161]:
# For every cluster centre, the term indices ordered from the largest
# centroid weight to the smallest. (The two discarded `.shape` expressions
# that preceded this line each recomputed the full argsort and displayed
# nothing, so they were dropped.)
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

In [162]:
# Most relevant words per cluster for the FoamTree labels. myInput is a flat
# list holding exactly `terms_per_cluster` entries for cluster 0, then for
# cluster 1, and so on; the HTML cell reads it in chunks of that size.
terms_per_cluster = 10
myInput = []
counter = 0  # number of real (non-padding) words collected

for i in range(num_clusters):
    picked = []
    # Walk this cluster's terms from most to least relevant and stop once it
    # has its own quota. A single quota shared by all clusters let the first
    # cluster use it up, leaving the others labelled with cluster 0's terms.
    for ind in order_centroids[i, :]:
        term = terms[ind]
        if term in english_vocab and len(term) > 4:
            picked.append(term)
            counter = counter + 1
            if len(picked) == terms_per_cluster:
                break
    # Pad short clusters with empty strings so every cluster's chunk starts
    # at a multiple of terms_per_cluster.
    picked.extend([''] * (terms_per_cluster - len(picked)))
    myInput.extend(picked)

#### Generate groups and labels for the FoamTree representation, which can be viewed in a browser

In [163]:
filename = os.path.join(savePath, 'Text_Clustering.html')

# Build the FoamTree data model as plain Python objects and let json.dumps do
# the quoting, so titles containing quotes, backslashes or line breaks can no
# longer break the generated JavaScript.
cluster_groups = []
for j in range(num_clusters):
    # myInput stores ten terms per cluster; the first five form the label.
    # Slicing (instead of deleting from myInput) keeps this cell re-runnable.
    label_terms = myInput[10 * j:10 * j + 5]
    # A boolean mask always yields a Series of titles. e2.ix[j]['title']
    # returned a bare string for a cluster with a single document, which was
    # then iterated character by character, and .ix no longer exists in
    # current pandas.
    titles = e2.loc[e2.index == j, 'title']
    cluster_groups.append({
        'label': ' '.join(label_terms),
        'groups': [{'label': '%s' % (k,)} for k in titles],
    })

# "</" is escaped so that no title can close the surrounding <script> element.
message2 = 'groups: ' + json.dumps(cluster_groups).replace('</', '<\\/')

# Static page around the data: message1 ends inside `dataObject: {` and
# message3 closes it again.
message1 = """
<!DOCTYPE html>
<html>
  <head>
    <title>FoamTree Quick Start</title>
    <meta charset="utf-8" />
  </head>

  <body>
    <div id="visualization" style="width: 1800px; height: 1200px"></div>
    <script src="C:/Users/liuxi/Downloads/textClustering/JavaScript/carrotsearch.foamtree.js"></script>
    <script>
      window.addEventListener("load", function() {
        var foamtree = new CarrotSearchFoamTree({
          id: "visualization",
          dataObject: {"""
message3 = """
          }
        });
      });
    </script>
  </body>
</html>

"""

with open(filename, 'w') as f:
    f.write(message1 + message2 + message3)

#### options for 'metric': cityblock, cosine, euclidean, l1, l2, manhattan

In [164]:
# pairwise_distances already returns dissimilarities (zero on the diagonal),
# which is what MDS(dissimilarity="precomputed") expects. Subtracting them
# from 1 turned them into similarity-like values that can be negative for
# metrics not bounded by 1, and put 1 on the diagonal.
dist = pairwise_distances(tfidf_matrix, metric=distCal)

# NOTE(review): random_state=None means the layout differs between runs;
# pass a fixed seed if a reproducible figure is needed.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=None)

# Fit the data from dist, and return the embedded 2-D coordinates.
pos = mds.fit_transform(dist)

# Column 0 / column 1 of the embedding are the x / y plot coordinates. They
# must stay in document order: sorting xs in place (as was done before)
# detaches every x from its y, its cluster label and its title.
xs, ys = pos[:, 0], pos[:, 1]

In [165]:
# Colour per cluster id: a five-colour palette cycled over the 20 supported
# cluster ids (20 is the maximum accepted at the cluster-count prompt).
_palette = ['r', 'g', 'k', 'm', 'b']
cluster_colors = {idx: _palette[idx % len(_palette)] for idx in range(20)}

# Legend name per cluster id, numbered from 1. Ids 10-19 were previously
# named 'Cluster 1' .. 'Cluster 10' again, duplicating the first ten names.
cluster_names = {idx: 'Cluster %d' % (idx + 1) for idx in range(20)}

In [166]:
# Data frame holding the MDS coordinates plus each document's cluster id and title.
df = pd.DataFrame(dict(x=xs, y=ys, label=english_catalog_entries.cluster_index, title=english_catalog_entries.title))

# One group of points per cluster id.
groups = df.groupby('label')

fig, ax = plt.subplots(figsize=(10, 10))
ax.margins(0.05)
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
            label=cluster_names[name], color=cluster_colors[name], mec='none')

# Axis and legend settings do not depend on the cluster, so they are applied
# once instead of on every loop iteration. Booleans replace the 'on' strings.
ax.set_aspect('auto')
ax.tick_params(axis='x', which='both', bottom=True, top=True, labelbottom=True)
# The former `top` argument on the y-axis call was dropped: it is an x-axis
# setting and presumably had no effect there -- TODO confirm.
ax.tick_params(axis='y', which='both', left=True, labelleft=True)
ax.legend(numpoints=1)

# NOTE(review): the figure is closed without being shown or saved; call
# plt.show() or fig.savefig(...) before this line if the plot is wanted.
plt.close(fig)

In [167]:
# Leaf labels: the first 20 characters of every title. Built in one pass
# instead of re-creating the list on every iteration.
short_titles = [t[:20] for t in english_catalog_entries.title]

# Ward linkage computed from the pairwise matrix built earlier.
linkage_matrix = ward(dist)

fig, ax = plt.subplots(figsize=(10, 10))
# Draw onto the axes created above by passing ax=ax. The dendrogram's return
# value is kept in its own name so `ax` is no longer overwritten with it.
dendro = dendrogram(linkage_matrix,
                    ax=ax,
                    orientation="left",
                    labels=short_titles,
                    show_leaf_counts=True,
                    get_leaves=True,
                    p=10,
                    truncate_mode='lastp',
                    distance_sort='descending',
                    count_sort=True,
                    show_contracted=True)
ax.tick_params(axis='x', which='both', bottom=True, top=True, labelbottom=True)

# NOTE(review): the figure is closed without being shown or saved; call
# plt.show() or fig.savefig(...) before this line if the plot is wanted.
plt.close(fig)

In [168]:
# Open the generated FoamTree page in a new browser tab, then report the
# time elapsed since `start` was recorded.
filename = os.path.join(savePath, 'Text_Clustering.html')
webbrowser.open_new_tab(filename)
print('Calculation finished !!!')
elapsed = time.time() - start
print('It took', elapsed, 'seconds.')

Calculation finished !!!
It took 28.4719998837 seconds.
