In [1]:
%load_ext autoreload
%autoreload 2

# Short Text Clustering using CNN

The following work is mainly inspired by:
- 2014 - Convolutional Neural Networks for Sentence Classification
- 2014 - A Convolutional Neural Network for Modelling Sentences
- 2017 - Self-Taught Convolutional Neural Networks for Short Text Clustering

<br>
Word embedding from fasttext are distributed under the license:
- Creative Commons Attribution-Share-Alike License 3.0 (free commercial use)

#### Load modules

In [None]:
import os
import numpy as np
import pandas as pd
from pprint import pprint
from tqdm import tqdm_notebook
from webcolors import name_to_rgb
from matplotlib import pyplot as plt
from bokeh.io import output_notebook, reset_output

# scikit-learn
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer, normalize
from sklearn.feature_extraction.text import TfidfVectorizer

# keras
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding

# self-made function
from utils.target import laplacian_eigenmaps, binarize
from utils.metrics import map_label, cluster_quality
from utils.cnn import create_reference_model, behead
from utils.variable import COLORNAMES
from utils.embedding import Embeddor
from utils import visu

%matplotlib inline
output_notebook()

In [3]:
EMBDIR = "./embeddings/"
DATADIR = "./data/short_texts/"

print('\nShort text datasets')
pprint(os.listdir(DATADIR))

print('\nEmbedding')
pprint(os.listdir(EMBDIR))


Short text datasets
['Biomedical.txt',
 'Biomedical_gnd.txt',
 'SearchSnippets.txt',
 'SearchSnippets_gnd.txt',
 'StackOverflow.txt',
 'StackOverflow_gnd.txt']

Embedding
['glove100K.100d.vec']


# Fetching the data

#### Loading pre-trained Fasttext word embedding

Pre-trained word vectors for English languages, trained on Wikipedia using fastText. These vectors of dimension `300` were obtained using the skip-gram model described in `Bojanowski et al.(2016)` with default parameters.

We extracted the `100 000` most frequent words.

# Word embedding

In [4]:
# parameters
n_emb = int(1 * 1e5)
# emb_name = 'fasttext.wiki.en.vec'
emb_name = 'glove100k.100d.vec'
print("Embedding: %s" % emb_name)
src_emb = os.path.join(EMBDIR, emb_name)

# embeddor
embeddor = Embeddor(notebook_display=True)
emb_mat = embeddor.load_emb(src_emb, n_emb)
print("Embedding shape: {}".format(emb_mat.shape))

Embedding: glove100k.100d.vec


Widget Javascript not detected.  It may not be installed or enabled properly.



Embedding shape: (100001, 100)


In [5]:
embeddor.most_similar('napoleon')

[('napoleon', 0.99999976),
 ('bonaparte', 0.87043333),
 ('napoléon', 0.66390216),
 ('napoleonic', 0.65167707),
 ('augustus', 0.64356035),
 ('caesar', 0.64281428),
 ('1812', 0.63069737),
 ('tsar', 0.6110847),
 ('xiv', 0.61083198),
 ('emperor', 0.60819435)]

# Load text data

#### Loading data

In [6]:
#DATA = "Biomedical"
#DATA = "SearchSnippets"
DATA = "StackOverflow"

text_path = os.path.join(DATADIR, '%s.txt' % DATA)
with open(text_path, encoding="utf-8") as f:
    data = f.readlines()

label_path = os.path.join(DATADIR, '%s_gnd.txt' % DATA)
with open(label_path, encoding="utf-8") as f:
    target = f.readlines()
target = [int(label.rstrip('\n')) for label in target]
    
print("Total: %s short texts" % format(len(data), ","))

Total: 20,000 short texts


#### Random samples

In [7]:
n = len(data)
sample = 5000
print("Loading %s samples" % format(sample, ","))
index_sample = np.random.randint(0, n, sample)

data = [data[idx] for idx in index_sample]
target = [target[idx] for idx in index_sample]

Loading 5,000 samples


#### Tokenize

In [8]:

tokenizer = Tokenizer(char_level=False)
tokenizer.fit_on_texts(data)
sequences_full = tokenizer.texts_to_sequences(data)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
MAX_NB_WORDS = len(word_index)

seq_lens = [len(s) for s in sequences_full]
print("Average length: %d" % np.mean(seq_lens))
print("Max length: %d" % max(seq_lens))
MAX_SEQUENCE_LENGTH = max(seq_lens)

X = pad_sequences(sequences_full, maxlen=MAX_SEQUENCE_LENGTH)
y = target
tfidf = tokenizer.sequences_to_matrix(sequences_full, mode='tfidf')

Found 5115 unique tokens.
Average length: 8
Max length: 34


# Getting sequences and targets

In [9]:
# creating embedding matrix
EMBEDDING_DIM = embeddor.emb_dim
N_CLASSES = 21

# prepare embedding matrix
nb_words_in_matrix = 0
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding = embeddor.get_emb(word)
    if embedding is not None:
        embedding_matrix[i] = embedding
        nb_words_in_matrix = nb_words_in_matrix + 1
        
print("added %d words in the embedding matrix" % nb_words_in_matrix)

added 3481 words in the embedding matrix


# Target vectors 

In [10]:
# dictionary containing all target vectors
Y = dict()

# dicitonary containing all binarized vectors
B = dict()

#### Average embeddings (AE)

In [11]:
%%time
denom = 1 + np.sum(tfidf, axis=1)[:, None]
normed_tfidf = tfidf/denom
average_embeddings = np.dot(normed_tfidf, embedding_matrix)
Y['ae'] = average_embeddings
B['ae'] = binarize(Y['ae'])
print("Shape of average embedding: ", Y['ae'].shape)

Shape of average embedding:  (5000, 100)
CPU times: user 624 ms, sys: 502 ms, total: 1.13 s
Wall time: 1.72 s


#### Latent Semantic Analysis (LSA)

In [12]:
%%time
lsa = make_pipeline(
    TruncatedSVD(100),
    Normalizer(copy=False),
) 
lsa_vectors = lsa.fit_transform(tfidf)
Y['lsa'] = lsa_vectors
B['lsa'] = binarize(Y['lsa'])
print("Shape of latent semantic vectors: ", Y['lsa'].shape)

Shape of latent semantic vectors:  (5000, 100)
CPU times: user 4.43 s, sys: 547 ms, total: 4.98 s
Wall time: 3.58 s


#### Laplacian Eigenmaps (LE) [memory intensive]

In [13]:
%%time
laplacian_vectors = laplacian_eigenmaps(lsa_vectors, n_neighbors=15, subdim=15, n_jobs=-1)
Y['le'] = laplacian_vectors
B['le'] = binarize(Y['le'])
print(Y['le'].shape)

Fitting nearest neighbors
Creation of heat kernel affinity matrix
Spectral embedding
(5000, 15)
CPU times: user 26.7 s, sys: 2.62 s, total: 29.3 s
Wall time: 22.6 s


# CNN for deep feature learning 

#### Embedding layer

In [14]:
embedding_matrix_copy = embedding_matrix.copy()

# Embedding layer
pretrained_embedding_layer = Embedding(
    input_dim=MAX_NB_WORDS+1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
)

#### Deep feature learning

In [15]:
nb_epoch = 2 
D = dict()
for method in ['lsa', 'ae', 'le']:
    print("Method: {}".format(method))
    cnn_model = create_reference_model(
        input_dim=MAX_SEQUENCE_LENGTH,
        output_dim=B[method].shape[1],
        embedding_layer=pretrained_embedding_layer,
    )

    cnn_model.fit(X, B[method], validation_split=0.2, epochs=nb_epoch, batch_size=100, verbose=2)

    beheaded_cnn = behead(cnn_model)
    D[method] = beheaded_cnn.predict(X)
    print("Sample shape: {}".format(D[method].shape))
    print()

Method: lsa
Train on 4000 samples, validate on 1000 samples
Epoch 1/2
4s - loss: 0.6900 - mean_absolute_error: 0.4969 - val_loss: 0.6844 - val_mean_absolute_error: 0.4944
Epoch 2/2
4s - loss: 0.6837 - mean_absolute_error: 0.4926 - val_loss: 0.6799 - val_mean_absolute_error: 0.4912
Sample shape: (5000, 100)

Method: ae
Train on 4000 samples, validate on 1000 samples
Epoch 1/2
4s - loss: 0.6054 - mean_absolute_error: 0.4323 - val_loss: 0.5218 - val_mean_absolute_error: 0.3533
Epoch 2/2
4s - loss: 0.5290 - mean_absolute_error: 0.3519 - val_loss: 0.5069 - val_mean_absolute_error: 0.3464
Sample shape: (5000, 100)

Method: le
Train on 4000 samples, validate on 1000 samples
Epoch 1/2
5s - loss: 0.6737 - mean_absolute_error: 0.4816 - val_loss: 0.6546 - val_mean_absolute_error: 0.4707
Epoch 2/2
3s - loss: 0.6500 - mean_absolute_error: 0.4673 - val_loss: 0.6344 - val_mean_absolute_error: 0.4605
Sample shape: (5000, 100)



# Tools for evaluating cluster quality

## K-means 

In [16]:
true_labels = y
n_clusters = len(np.unique(y))
print("Number of classes: %d" % n_clusters)
km = KMeans(n_clusters=n_clusters, n_jobs=-1)
result = dict()
pred = dict()

Number of classes: 20


#### Deep feature from Average Embedding

In [17]:
method = 'ae'
deep_features = normalize(D[method], norm='l2')
print('Shape: {}'.format(deep_features.shape)) 
km.fit(deep_features)
y_pred = km.labels_
result['deep_ae'] = cluster_quality(true_labels, y_pred)
pred['deep_ae'] = y_pred

Shape: (5000, 100)
Homogeneity: 0.054
Completeness: 0.061
V-measure: 0.058
NMI: 0.058
Rand score: 0.012
Accuracy: 0.119


#### Deep feature from LSA

In [18]:
method = 'lsa'
deep_features = normalize(D[method], norm='l2')
print('Shape: {}'.format(deep_features.shape)) 
km.fit(deep_features)
y_pred = km.labels_
result['deep_lsa'] = cluster_quality(true_labels, y_pred)
pred['deep_lsa'] = y_pred

Shape: (5000, 100)
Homogeneity: 0.353
Completeness: 0.374
V-measure: 0.363
NMI: 0.363
Rand score: 0.190
Accuracy: 0.408


#### Deep feature from Laplacian Eigenvalues

In [19]:
method = 'le'
deep_features = normalize(D[method], norm='l2')
print('Shape: {}'.format(deep_features.shape)) 
km.fit(deep_features)
y_pred = km.labels_
result['deep_le'] = cluster_quality(true_labels, y_pred)
pred['deep_le'] = y_pred

Shape: (5000, 100)
Homogeneity: 0.424
Completeness: 0.458
V-measure: 0.440
NMI: 0.441
Rand score: 0.232
Accuracy: 0.478


#### LSA

In [20]:
lsa_features = Y['lsa']
print('Shape: {}'.format(lsa_features.shape)) 
km.fit(lsa_features)
y_pred = km.labels_
result['lsa'] = cluster_quality(true_labels, y_pred)
pred['lsa'] = y_pred

Shape: (5000, 100)
Homogeneity: 0.684
Completeness: 0.770
V-measure: 0.724
NMI: 0.726
Rand score: 0.327
Accuracy: 0.726


#### Laplacian Eigenvalues aka Spectral clustering 

In [21]:
laplacian_features = normalize(Y['le'], norm='l2')
print('Shape: {}'.format(laplacian_features.shape)) 
km.fit(laplacian_features)
y_pred = km.labels_
result['le'] = cluster_quality(true_labels, y_pred)
pred['le'] = y_pred

Shape: (5000, 15)
Homogeneity: 0.554
Completeness: 0.560
V-measure: 0.557
NMI: 0.557
Rand score: 0.491
Accuracy: 0.687


# Cluster visualization

#### Random sampling because t-SNE is slow

In [22]:
# random sample because t-sne is slow
tsne_sample = 1000
idx_sample = np.random.randint(low=0, high=sample, size=(tsne_sample,))

In [23]:
# mapping color to sample
labels = np.unique(true_labels)
colormap = {label: color for label, color in zip(labels, COLORNAMES[:20])}
colors = [colormap[true_labels[idx]] for idx in idx_sample]

## t-SNE 

<br>
Remarks:
- Using Laplacian Eigenvalues, we obtain very dense clusters
- Using LSA as target, results are also quite good
- LSA is very competitive for accuracy

In [24]:
# T-SNE model
tsne = TSNE(n_components=2, 
            perplexity=30, 
            early_exaggeration=4.,
            learning_rate=1000,
            init="pca", 
            metric="euclidean")

#### Deep feature representation from Laplacian Eigenvalues

In [25]:
%%time
rand_deep_features = deep_features[idx_sample]
tsne_deep = tsne.fit_transform(rand_deep_features)

CPU times: user 29.9 s, sys: 3.24 s, total: 33.2 s
Wall time: 35.4 s


In [26]:
visu.cluster2d(*tsne_deep.T, colors=colors, height=600, width=600, size=7)

####  LSA

In [27]:
%%time
tsne_lsa = tsne.fit_transform(lsa_features[idx_sample])

CPU times: user 28 s, sys: 3.28 s, total: 31.3 s
Wall time: 35.1 s


In [None]:
# matplotlib visualization
visu.cluster2d(*tsne_lsa.T, colors=colors, height=600, width=600, size=7)

#### Laplacian eigenvalues

In [None]:
%%time
tsne_laplacian = tsne.fit_transform(laplacian_features[idx_sample])

In [None]:
visu.cluster2d(*tsne_laplacian.T, colors=colors, height=600, width=600, size=7)