In [1]:
# In this workshop we perform document clustering using sklearn

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# We are using the subnews dataset that we used last week. 
# The "Class" labels here are only used for sanity check of the clusters found later.
# Remember, in actual use of document clustering, the documents DON'T come with labeled classes.
# It's unsupervised learning.

import pandas as pd
news=pd.read_table('r8-train-all-terms.txt',header=None,names = ["Class", "Text"])
subnews=news[(news.Class=="trade")| (news.Class=='crude')|(news.Class=='money-fx') ]
subnews.head()

Unnamed: 0,Class,Text
15,trade,brazil anti inflation plan limps to anniversar...
43,crude,diamond shamrock dia cuts crude prices diamond...
55,crude,opec may have to meet to firm prices analysts ...
76,crude,texaco canada cuts crude prices canadian cts b...
77,crude,texaco canada txc lowers crude postings texaco...


In [2]:
# Let's use the similar preprocessing we used last week.
# The output of each document is a list of tokens.

import nltk
from nltk.corpus import stopwords
mystopwords=stopwords.words("English") + ['one', 'become', 'get', 'make', 'take']
WNlemma = nltk.WordNetLemmatizer()

def pre_process(text):
    tokens = nltk.word_tokenize(text)
    tokens=[ WNlemma.lemmatize(t.lower()) for t in tokens]
    tokens=[ t for t in tokens if t not in mystopwords]
    tokens = [ t for t in tokens if len(t) >= 3 ]
    text_after_process=" ".join(tokens)
    return(text_after_process)

# Apply preprocessing to every document in the training set.
text = subnews['Text']
toks = text.apply(pre_process)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline


In [4]:
# Create tfidf matrix
vectorizer = TfidfVectorizer(max_df=0.7, max_features=2500,
                             min_df=3, stop_words=mystopwords,
                             use_idf=True)
X = vectorizer.fit_transform(toks)
X.shape

(710, 2500)

In [5]:
# Use SVD to reduce dimensions
svd = TruncatedSVD(300)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X_lsa = lsa.fit_transform(X)

#set to False to perform inplace row normalization

In [6]:
# Check how much variance is explained
explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))


Explained variance of the SVD step: 85%


In [12]:
# Now the actual clustering
from sklearn.cluster import KMeans
#random_state=4321
km3 = KMeans(n_clusters=3, init='k-means++', max_iter=1000, n_init=1)
%time km3.fit(X_lsa)

#‘k-means++’ : selects initial cluster centers for k-mean clustering in a smart way to speed up convergence.
#n_init : int, default: 10. Number of time the k-means algorithm will be run with different centroid seeds.
#Maximum number of iterations of the k-means algorithm for a single run.

Wall time: 35.1 ms


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=1000,
    n_clusters=3, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=4321, tol=0.0001, verbose=0)

In [13]:
from sklearn import metrics

labels = subnews['Class']

# Silhouette: more similar within clusters, more distant between clusters
# The higher the better (-1 to 1)

print("Silhouette Coefficient for 3 clusters: %0.3f"
      % metrics.silhouette_score(X_lsa, km3.labels_))

Silhouette Coefficient for 3 clusters: 0.050


In [14]:
# We still need to see the more representative words for each cluster to understand them.

def print_terms(cm, num):
    original_space_centroids = svd.inverse_transform(cm.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(num):
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()

print_terms(km3, 3)

Cluster 0: oil crude price opec barrel dlrs mln bpd company ecuador
Cluster 1: stg mln money bank england market revised assistance shortage forecast
Cluster 2: trade billion japan bank exchange dollar currency rate japanese would


In [15]:
# Let's map the cluster label to the categories to see where is the confusion

dict = {0: 'crude', 1: 'money-fx', 2: 'trade'}
cluster_labels = [ dict[c] for c in km3.labels_]

In [16]:
import numpy as np
print(metrics.confusion_matrix(cluster_labels, labels))
print(np.mean(cluster_labels == labels) )
print(metrics.classification_report(cluster_labels, labels))

[[246   0   0]
 [  1  52   0]
 [  6 154 251]]
0.7732394366197183
             precision    recall  f1-score   support

      crude       0.97      1.00      0.99       246
   money-fx       0.25      0.98      0.40        53
      trade       1.00      0.61      0.76       411

avg / total       0.93      0.77      0.81       710

