In [1]:
%load_ext autoreload
%autoreload 2

# Short Text Clustering using CNN

The following work is mainly inspired by:
- 2014 - Convolutional Neural Networks for Sentence Classification
- 2014 - A Convolutional Neural Network for Modelling Sentences
- 2017 - Self-Taught Convolutional Neural Networks for Short Text Clustering

<br>
Word embeddings from fastText are distributed under the following license:
- Creative Commons Attribution-Share-Alike License 3.0 (free commercial use)

#### Load modules

In [2]:
import os
import numpy as np
import pandas as pd
from pprint import pprint
from tqdm import tqdm_notebook
from webcolors import name_to_rgb
from matplotlib import pyplot as plt

# scikit-learn
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer, normalize
from sklearn.feature_extraction.text import TfidfVectorizer

# keras
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Flatten, Reshape, merge
from keras.layers import Dense, Conv1D, Dropout, GlobalMaxPooling1D

# self-made function
from utils.target import laplacian_eigenmaps, binarize
from utils.metrics import map_label, cluster_quality
from utils.embedding import Embeddor
from utils.variable import COLOR_NAMES

%matplotlib inline

Using TensorFlow backend.


In [3]:
# Root folders of the pre-trained embeddings and of the short-text corpora
EMBDIR = "./embeddings/"
DATADIR = "./data/short_texts/"

# Print a heading followed by the content of each folder
for heading, folder in (('\nShort text datasets', DATADIR),
                        ('\nEmbedding', EMBDIR)):
    print(heading)
    pprint(os.listdir(folder))


Short text datasets
['Biomedical.txt',
 'Biomedical_gnd.txt',
 'SearchSnippets.txt',
 'SearchSnippets_gnd.txt',
 'StackOverflow.txt',
 'StackOverflow_gnd.txt']

Embedding
['glove100K.100d.vec', 'wiki.en.vec', 'wiki.en.vec.crdownload']


# Fetching the data

#### Loading pre-trained Fasttext word embedding

Pre-trained word vectors for English, trained on Wikipedia using fastText. These `300`-dimensional vectors were obtained using the skip-gram model described in `Bojanowski et al. (2016)` with default parameters.

We extracted the `100 000` most frequent words.

Note: the cell below currently loads a 100-dimensional GloVe file instead; the fastText file name is left commented out.

# Word embedding

In [4]:
# parameters
n_emb = int(1 * 1e5)  # vocabulary size handed to the loader (100,000)
# emb_name = 'fasttext.wiki.en.vec'
# The name must match the on-disk spelling exactly (capital "K", as shown by
# the directory listing); the lowercase spelling is only resolved on
# case-insensitive file systems.
emb_name = 'glove100K.100d.vec'
print("Embedding: %s" % emb_name)
src_emb = os.path.join(EMBDIR, emb_name)

# embeddor: loads the vectors and exposes lookup helpers used further down
embeddor = Embeddor(notebook_display=True)
emb_mat = embeddor.load_emb(src_emb, n_emb)
print("Embedding shape: {}".format(emb_mat.shape))

Embedding: glove100k.100d.vec



Embedding shape: (100001, 100)


In [5]:
# Sanity check of the loaded vectors: query the nearest neighbours of a known word
embeddor.most_similar('napoleon')

[('napoleon', 0.99999976),
 ('bonaparte', 0.87043333),
 ('napoléon', 0.66390216),
 ('napoleonic', 0.65167707),
 ('augustus', 0.64356035),
 ('caesar', 0.64281428),
 ('1812', 0.63069737),
 ('tsar', 0.6110847),
 ('xiv', 0.61083198),
 ('emperor', 0.60819435)]

# Load text data

#### Loading data

In [6]:
# Corpus to load; the other available corpora are "Biomedical" and "SearchSnippets"
DATA = "StackOverflow"

# One short text per line (trailing newlines are kept)
text_path = os.path.join(DATADIR, '%s.txt' % DATA)
with open(text_path, encoding="utf-8") as f:
    data = list(f)

# One integer label per line, in the same order as the texts
label_path = os.path.join(DATADIR, '%s_gnd.txt' % DATA)
with open(label_path, encoding="utf-8") as f:
    target = [int(line.rstrip('\n')) for line in f]

n_texts = format(len(data), ",")
print("Total: %s short texts" % n_texts)

Total: 20,000 short texts


#### Random samples

In [7]:
# Draw a reproducible random subset of the corpus to keep the experiment small.
n = len(data)
sample = min(3000, n)  # never ask for more texts than are available
print("Loading %s samples" % format(sample, ","))

# Sample WITHOUT replacement: np.random.randint could pick the same text
# several times, and such duplicates could then end up in both the training
# and the validation part of the later fit. The fixed seed makes the subset
# identical from one run to the next.
index_sample = np.random.RandomState(0).choice(n, size=sample, replace=False)

# NOTE: `data` and `target` are overwritten in place, so re-running this cell
# alone re-samples the already reduced corpus; re-run the loading cell first.
data = [data[idx] for idx in index_sample]
target = [target[idx] for idx in index_sample]

Loading 3,000 samples


#### Tokenize

In [8]:

# Fit a word-level tokenizer on the texts and encode them as index sequences
tokenizer = Tokenizer(char_level=False)
tokenizer.fit_on_texts(data)
sequences_full = tokenizer.texts_to_sequences(data)

# Vocabulary learned by the tokenizer
word_index = tokenizer.word_index
MAX_NB_WORDS = len(word_index)
print('Found %s unique tokens.' % MAX_NB_WORDS)

# Sequence-length statistics; the longest sequence sets the padding length
seq_lens = list(map(len, sequences_full))
MAX_SEQUENCE_LENGTH = max(seq_lens)
print("Average length: %d" % np.mean(seq_lens))
print("Max length: %d" % MAX_SEQUENCE_LENGTH)

# Padded index sequences (model input), labels, and TF-IDF matrix of the texts
y = target
X = pad_sequences(sequences_full, maxlen=MAX_SEQUENCE_LENGTH)
tfidf = tokenizer.sequences_to_matrix(sequences_full, mode='tfidf')

Found 3943 unique tokens.
Average length: 8
Max length: 34


# Getting sequences and targets

In [9]:
# Embedding dimension reported by the embeddor, and hard-coded class constant
EMBEDDING_DIM = embeddor.emb_dim
N_CLASSES = 21

# Build the embedding matrix: row i receives the pre-trained vector of the
# word whose tokenizer index is i; rows stay all-zero for words for which
# get_emb returns None
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
nb_words_in_matrix = 0
for token, row in word_index.items():
    vector = embeddor.get_emb(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector
    nb_words_in_matrix += 1

print("added %d words in the embedding matrix" % nb_words_in_matrix)

added 2841 words in the embedding matrix


# Target vectors 

In [10]:

# Registry of the candidate target vectors, keyed by reduction name
Y = {}

#### Average embeddings (AE)

In [11]:
%%time
# TF-IDF weighted average of the word vectors of each text; the "+ 1"
# presumably guards against a division by zero for all-zero TF-IDF rows
denom = 1 + tfidf.sum(axis=1, keepdims=True)
normed_tfidf = tfidf / denom
average_embeddings = normed_tfidf.dot(embedding_matrix)
Y["ae"] = average_embeddings
print("Shape of average embedding: ", average_embeddings.shape)

Shape of average embedding:  (3000, 100)
CPU times: user 200 ms, sys: 69.7 ms, total: 270 ms
Wall time: 261 ms


#### Latent Semantic Analysis (LSA)

In [12]:
%%time
# LSA: project the TF-IDF matrix onto 100 latent dimensions, then normalise
# each row. TruncatedSVD uses a randomized solver by default, so the seed is
# pinned to make the vectors (and everything derived from them) reproducible.
lsa = make_pipeline(
    TruncatedSVD(n_components=100, random_state=0),
    Normalizer(copy=False),
)
lsa_vectors = lsa.fit_transform(tfidf)
Y["lsa"] = lsa_vectors
print("Shape of latent semantic vectors: ", Y['lsa'].shape)

Shape of latent semantic vectors:  (3000, 100)
CPU times: user 1.85 s, sys: 169 ms, total: 2.02 s
Wall time: 1.24 s


#### Laplacian Eigenmaps (LE) [memory intensive]

In [13]:
%%time
# Eigenvectors of the text-similarity graph built on top of the LSA vectors
laplacian_vectors = laplacian_eigenmaps(
    lsa_vectors,
    n_neighbors=15,
    subdim=15,
    n_jobs=-1,
)
Y["le"] = laplacian_vectors
print(laplacian_vectors.shape)

Fitting nearest neighbors
Creation of heat kernel affinity matrix
Spectral embedding
(3000, 15)
CPU times: user 7.84 s, sys: 673 ms, total: 8.51 s
Wall time: 5.8 s


#### Binarize target vector

In [14]:
# Pick the reduction that supplies the training targets and binarize it
reduction_name = "le"
B = binarize(Y[reduction_name])

# Width of the binary codes = size of the last layer of the CNN
TARGET_DIM = B.shape[1]

# Show the overall shape, then the binarized target vector of the first text
for shown in (B.shape, B[0]):
    print(shown)

(3000, 15)
[ 1.  1.  1.  1.  0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  1.]


# CNN for deep feature representation learning 

In [15]:

# Build the CNN that maps padded word-index sequences to the binary codes.
embedding_matrix_copy = embedding_matrix.copy()  # NOTE(review): not used in the visible cells
trainable_embedding = False  # set to True to fine-tune the word vectors

# Embedding layer initialised with the pre-trained vectors. Whether it is
# fine-tuned is decided on the layer itself rather than through
# `model.layers[1]`, so it no longer depends on the layer's position inside
# the model.
pretrained_embedding_layer = Embedding(
    input_dim=MAX_NB_WORDS+1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=trainable_embedding,
)

# Input
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = pretrained_embedding_layer(sequence_input)

# 1st Layer: 100 filters of width 5, max-pooled into one vector per text
x = Conv1D(100, 5, activation='tanh', padding='same')(embedded_sequences)
x = GlobalMaxPooling1D()(x)

# Output: one sigmoid unit per component of the binarized target vector
x = Dropout(0.5)(x)
predictions = Dense(TARGET_DIM, activation='sigmoid')(x)
model = Model(sequence_input, predictions)

# Loss and Optimizer
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['mae'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 34)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 34, 100)           394400    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 34, 100)           50100     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 15)                1515      
Total params: 446,015
Trainable params: 51,615
Non-trainable params: 394,400
_________________________________________________________________

In [16]:
%%time
# Train the CNN on the binarized targets, holding out 20% of the texts for validation
nb_epoch = 1
model.fit(
    X,
    B,
    validation_split=0.2,
    epochs=nb_epoch,
    batch_size=100,
    verbose=2,
)

Train on 2400 samples, validate on 600 samples
Epoch 1/1
2s - loss: 0.6861 - mean_absolute_error: 0.4918 - val_loss: 0.6736 - val_mean_absolute_error: 0.4844
CPU times: user 6.38 s, sys: 519 ms, total: 6.9 s
Wall time: 3.81 s


#### Deep feature representations h

In [17]:
# Truncate the trained network at its penultimate layer. The tensors are
# bound to descriptive names instead of `input`/`output`: assigning to `input`
# would shadow the Python builtin for the rest of the notebook session.
model_input = model.input
penultimate_output = model.layers[-2].output
beheaded_model = Model(model_input, penultimate_output)

# Deep feature representation of every text
H = beheaded_model.predict(X)
print("Sample shape: {}".format(H.shape))

Sample shape: (3000, 100)


# Tools for evaluating cluster quality

#### K-means 

In [None]:
# Shared K-means setup: one cluster per distinct ground-truth label.
true_labels = y
n_clusters = len(np.unique(y))
print("Number of classes: %d" % n_clusters)

# K-means starts from random centroids; pinning the seed makes the cluster
# assignments, and therefore the quality scores, reproducible across runs.
# NOTE(review): `n_jobs` was removed from KMeans in scikit-learn 1.0 - drop it
# when the notebook's environment is upgraded.
km = KMeans(n_clusters=n_clusters, n_jobs=-1, random_state=0)

# Per-representation evaluation results and predicted cluster labels
result = dict()
pred = dict()

Number of classes: 20


#### K-means on deep feature representation

In [None]:
# Cluster the L2-normalised deep features and score them against the true labels
deep_features = normalize(H, norm='l2')
print('Shape: {}'.format(deep_features.shape))
pred['deep'] = km.fit(deep_features).labels_
result['deep'] = cluster_quality(true_labels, pred['deep'])

Shape: (3000, 100)


#### K-means on LSA representation

In [None]:
# Cluster the LSA vectors directly and score them against the true labels
lsa_features = Y['lsa']
print('Shape: {}'.format(lsa_features.shape))
pred['lsa'] = km.fit(lsa_features).labels_
result['lsa'] = cluster_quality(true_labels, pred['lsa'])

#### K-means on Laplacian eigenvectors (a.k.a. spectral clustering)

In [None]:
# Cluster the L2-normalised Laplacian vectors and score them against the true labels
laplacian_features = normalize(Y['le'], norm='l2')
print('Shape: {}'.format(laplacian_features.shape))
pred['eigen'] = km.fit(laplacian_features).labels_
result['eigen'] = cluster_quality(true_labels, pred['eigen'])

# Cluster visualization

In [None]:
# Random subset of the texts, because t-SNE is slow.
# Indices are drawn without replacement (no text plotted twice), bounded by
# the actual number of labelled texts, and seeded for reproducible figures.
n_points = len(true_labels)
tsne_sample = min(1000, n_points)
index_sample = np.random.RandomState(0).choice(
    n_points, size=tsne_sample, replace=False)

# Map every class label to a named colour
labels = np.unique(y)
if len(labels) > len(COLOR_NAMES):
    # zip() would silently drop the extra labels and the lookup below would
    # then fail with an unhelpful KeyError.
    raise ValueError("COLOR_NAMES has %d colours but there are %d classes"
                     % (len(COLOR_NAMES), len(labels)))
colormap = dict(zip(labels, COLOR_NAMES))

# Per-text colours as RGB triples rescaled from 0-255 to 0-1.
# The comprehension variables deliberately avoid `x` and `y`, which already
# name the Keras tensor and the label list in this notebook.
colors = [colormap[label] for label in true_labels]
color_rgb = [name_to_rgb(name) for name in colors]
color_rgb_norm = [(r/255., g/255., b/255.) for r, g, b in color_rgb]
color_sample = [color_rgb_norm[idx] for idx in index_sample]

# plot function
def plot_tsne(T, color):
    """Scatter-plot the first two columns of `T` on a new 10x10 inch figure.

    `color` is forwarded to matplotlib as the colour specification of the
    points. Nothing is returned; the figure is left as the current figure.
    """
    plt.figure(figsize=(10, 10))
    xs, ys = T[:, 0], T[:, 1]
    plt.scatter(xs, ys, c=color, s=30)

#### t-SNE 

<br>
Remarks:
- Using Laplacian eigenvectors, we obtain very dense clusters
- Using LSA as target, results are also quite good
- LSA is very competitive for accuracy

In [None]:
# T-SNE model (2-D projection, used for visualisation only).
# random_state pins the stochastic optimisation so that the figures can be
# reproduced from a fresh kernel.
tsne = TSNE(n_components=2,
            perplexity=30,
            early_exaggeration=4.,
            learning_rate=1000,
            init="pca",
            metric="euclidean",
            random_state=0)

#### t-SNE on deep feature representation

In [None]:
%%time
# Restrict the deep features to the sampled texts, then embed them in 2-D
rand_deep_features = deep_features[index_sample]
tsne_deep = tsne.fit_transform(rand_deep_features)

In [None]:
# 2-D view of the deep features, one colour per ground-truth class
plot_tsne(tsne_deep, color=color_sample)

#### t-SNE on LSA vectors

In [None]:
%%time
# Embed the LSA vectors of the same sampled texts in 2-D
tsne_lsa = tsne.fit_transform(lsa_features[index_sample])

In [None]:
# matplotlib visualization of the LSA projection, one colour per ground-truth class
plot_tsne(tsne_lsa, color=color_sample)

#### t-SNE on Laplacian eigenvectors

In [None]:
%%time
# Embed the normalised Laplacian vectors of the same sampled texts in 2-D
tsne_laplacian = tsne.fit_transform(laplacian_features[index_sample])

In [None]:
# 2-D view of the Laplacian vectors, one colour per ground-truth class
plot_tsne(tsne_laplacian, color=color_sample)