# Doc2Vec test

In [None]:
import logging
import math
import os
import re
import sys
from importlib import reload
from pprint import pprint
from time import time

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import normalize

import gensim

# Put the parent directory on the import path; presumably this is what lets
# the project package imported further down resolve -- TODO confirm.
module_path = os.path.abspath(os.pardir)

if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)

from event_detection import data_fetchers, event_detector, preprocessing

# Reload the logging module before configuring it; presumably this discards
# handlers installed earlier in the kernel session so that the basicConfig
# call below takes effect -- TODO confirm.
reload(logging)
# Log INFO and above to stdout as "<time> : <level> : <message>".
logging.basicConfig(stream=sys.stdout, format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Word2Vec

In [None]:
# Document source for the 'dec-jan' dataset. It is iterated later in the
# notebook, where each yielded document's `.text` is read.
fetcher = data_fetchers.CzechFullTexts(dataset='dec-jan')

In [None]:
class Preprocessor:
    """Re-iterable stream of token lists, one list per document.

    Every pass over an instance walks ``documents`` again and tokenizes each
    document's ``text`` attribute with ``gensim.utils.simple_preprocess``.
    """

    def __init__(self, documents):
        # Any iterable of objects exposing a `.text` attribute.
        self.documents = documents

    def __iter__(self):
        yield from (
            gensim.utils.simple_preprocess(document.text)
            for document in self.documents
        )

In [None]:
word2vec_path = '../event_detection/gensim/word2vec'
documents = Preprocessor(fetcher)

if os.path.exists(word2vec_path):
    word2vec_model = gensim.models.Word2Vec.load(word2vec_path)
else:
    %time word2vec_model = gensim.models.Word2Vec(documents, size=100, negative=5, hs=0, min_count=2, window=5, iter=5)
    word2vec_model.save(word2vec_path)

In [None]:
# Show the ten nearest neighbours of a few probe words.
for probe in ('charlie', 'terorista', 'vánoce'):
    print(probe)
    pprint(word2vec_model.most_similar(probe, topn=10))

### Direct clustering

In [None]:
from collections import defaultdict

clusters = defaultdict(list)

# Group word indices by the embedding dimension holding the largest value
# of their vector (one argmax per row of syn0).
for position, peak_dimension in enumerate(np.argmax(word2vec_model.syn0, axis=1)):
    clusters[peak_dimension].append(position)

In [None]:
# Number of distinct keys collected in `clusters`.
n_clusters = len(clusters)
print('Clusters:', n_clusters)

In [None]:
# Preview every cluster by printing up to ten of its words.
for i, cluster in clusters.items():
    # A shuffled prefix samples WITHOUT replacement, so the preview never
    # repeats a word (randint-based sampling could draw the same index twice).
    word_indices = np.random.permutation(cluster)[:10]

    print('----- {} -----'.format(i))

    for word_ix in word_indices:
        print(word2vec_model.index2word[int(word_ix)])

### K-Means

In [None]:
# Cluster the word vectors into 15 groups with mini-batch k-means.
n_clusters = 15
# copy=True: syn0 itself is left untouched; rows of the copy get unit l2 norm.
word_vectors = normalize(word2vec_model.syn0, norm='l2', copy=True)

# random_state is fixed, so repeated runs use the same seed.
clusterer = MiniBatchKMeans(n_clusters=n_clusters, n_init=10, random_state=1)
# %time is an IPython magic that reports how long the statement takes.
%time labels = clusterer.fit_predict(word_vectors)

In [None]:
# One bucket of word indices per k-means label.
clusters = [[] for _ in range(n_clusters)]

for position, assigned in enumerate(labels):
    clusters[assigned].append(position)

for cluster_no, members in enumerate(clusters):
    print('Cluster {} of {} words'.format(cluster_no, len(members)))

In [None]:
# Preview every k-means cluster by printing up to ten of its words.
for i, cluster in enumerate(clusters):
    # A shuffled prefix samples WITHOUT replacement (no repeated words) and
    # is also safe for an empty cluster, where randint(low=0, high=0) raises.
    word_indices = np.random.permutation(cluster)[:10]

    print('----- {} -----'.format(i))

    for word_ix in word_indices:
        print(word2vec_model.index2word[int(word_ix)])

## Doc2Vec

Copied from the project to play around with different tags:

In [None]:
class DocumentTagger:
    """Re-iterable stream of gensim ``TaggedDocument`` objects.

    Each document from ``documents`` is turned into a ``TaggedDocument``
    whose words are the lower-cased tokens of ``doc.text`` and whose tags
    are ``[doc.date, doc.category]``.
    """

    def __init__(self, documents):
        # Any iterable of objects exposing `.text`, `.date` and `.category`.
        self.documents = documents
        # Tokens are separated by runs of non-word characters.
        self.splitter = re.compile(r'\W+')

    def _tokenize(self, text):
        """Return the lower-cased word tokens of ``text``.

        ``re.split`` produces an empty string whenever the text starts or
        ends with a separator (e.g. a trailing full stop); those empty
        tokens are dropped so they are not fed to the model as a "word".
        """
        return [token for token in self.splitter.split(text.lower()) if token]

    def __iter__(self):
        for doc in self.documents:
            tags = [doc.date, doc.category]
            words = self._tokenize(doc.text)

            yield gensim.models.doc2vec.TaggedDocument(words, tags)

In [None]:
# Re-create the fetcher, this time with names=True and dates=True; presumably
# this makes the yielded documents carry `.name` / `.date` -- TODO confirm
# against data_fetchers.
fetcher = data_fetchers.CzechFullTexts(dataset='dec-jan', names=True, dates=True)
doc_tagger = DocumentTagger(fetcher)
logging.info('Document iterators prepared')

In [None]:
doc2vec_path = '../event_detection/gensim/doc2vec'

# Reuse a previously saved model when one exists; otherwise train and save.
# NOTE(review): the DocumentTagger in this notebook tags documents with
# (date, category), while the cells below index `docvecs` by integer document
# position -- confirm the model on disk was trained with per-document tags.
if os.path.exists(doc2vec_path):
    doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_path)
else:
    # %time is an IPython magic that reports how long the statement takes.
    %time doc2vec_model = gensim.models.Doc2Vec(doc_tagger, dm=1, dm_mean=1, size=100, negative=5, hs=0, min_count=2, window=5, iter=5)
    doc2vec_model.save(doc2vec_path)

In [None]:
# Materialise the fetcher so documents can be looked up by position.
documents = list(fetcher)

In [None]:
# Pick a random document vector and show the document closest to it.
doc_id = np.random.randint(doc2vec_model.docvecs.count)

print('----- DOCUMENT -----')
print(doc_id)
picked = documents[doc_id]
print(picked.name)
print(picked.text)

sims = doc2vec_model.docvecs.most_similar(doc_id, topn=1)

print('----- MOST SIMILAR -----')
print(sims)

# The first element of the top hit is used as a position in `documents`.
nearest = documents[sims[0][0]]
print(nearest.name)
print(nearest.text)

## Notes
* Finds clickbait well: listicle-style articles ("10 things you have never heard about!") tend to come out as similar to each other
* The document set contains a lot of duplicated articles

## TODO
* Compare these:
    1. classical event detection
    2. event detection with pre-clustering
    3. clustering-based event detection
    4. clustering-based event detection with pre-clustering
    5. doc2vec-similarity-based event detection
    6. doc2vec + pre-clustering
    7. doc2vec + cluster-based
    8. doc2vec + pre-clustering + cluster-based
* Try different doc2vec settings (concat, DBOW)

In [None]:
# Rank all documents against a random one and show both ends of the ranking.
doc_id = np.random.randint(doc2vec_model.docvecs.count)

print(doc_id)
print(documents[doc_id].name)

sims = doc2vec_model.docvecs.most_similar(doc_id, topn=len(documents))

rule = '-' * 10
for heading, ranked in ((' MOST SIMILAR ', sims[:10]), (' LEAST SIMILAR ', sims[-10:])):
    print(rule + heading + rule)
    for sim in ranked:
        print(documents[sim[0]].name, '\t', sim)

In [None]:
# Vectors learnt during training, looked up for positions 0..len(documents)-1.
document_positions = list(range(len(documents)))
document_vectors = doc2vec_model.docvecs[document_positions]

# Scale every row to unit l2 norm; copy=False asks for an in-place update.
normalize(document_vectors, norm='l2', copy=False)

In [None]:
# Cluster the document vectors into 15 groups with mini-batch k-means.
n_clusters = 15

# random_state is fixed, so repeated runs use the same seed.
clusterer = MiniBatchKMeans(n_clusters=n_clusters, n_init=10, random_state=1)
# %time is an IPython magic that reports how long the statement takes.
%time labels = clusterer.fit_predict(document_vectors)

In [None]:
# One bucket of document positions per k-means label.
clusters = [[] for _ in range(n_clusters)]

for position, assigned in enumerate(labels):
    clusters[assigned].append(position)

for cluster_no, members in enumerate(clusters):
    print('Cluster {} of {} documents'.format(cluster_no, len(members)))

In [None]:
# Preview every cluster by printing date and name of up to ten documents.
for i, cluster in enumerate(clusters):
    # A shuffled prefix samples WITHOUT replacement (no repeated documents)
    # and is also safe for an empty cluster, where randint(0, 0) raises.
    doc_indices = np.random.permutation(cluster)[:10]

    print('----- {} -----'.format(i))

    for doc_ix in doc_indices:
        document = documents[int(doc_ix)]
        print(document.date, document.name)

In [None]:
import warnings

import numpy as np
import scipy.sparse as sp

from sklearn.cluster import KMeans
from sklearn.cluster.k_means_ import (
    _init_centroids,
    _labels_inertia,
    _tolerance,
    _validate_center_shape,
)
from sklearn.utils import (
    check_array,
    check_random_state,
    as_float_array,
)
from sklearn.cluster import _k_means
from sklearn.preprocessing import normalize
from sklearn.externals.joblib import Parallel, delayed
from sklearn.utils.extmath import row_norms, squared_norm


def _spherical_kmeans_single_lloyd(X, n_clusters, max_iter=300,
                                   init='k-means++', verbose=False,
                                   x_squared_norms=None,
                                   random_state=None, tol=1e-4,
                                   precompute_distances=True):
    '''
    Run one spherical k-means pass (a single initialisation) over X.

    Modified from sklearn.cluster.k_means_.k_means_single_lloyd: after every
    mean update the centers are l2-normalized.

    Parameters
    ----------
    X : dense array or scipy sparse matrix
        Samples to cluster, one per row.
    n_clusters : int
        Number of centers.
    max_iter : int
        Upper bound on the number of iterations.
    init
        Forwarded to sklearn's ``_init_centroids``.
    verbose : bool
        Print progress messages when true.
    x_squared_norms
        Forwarded to ``_init_centroids`` and ``_labels_inertia``.
    random_state
        Anything accepted by ``check_random_state``.
    tol : float
        Iteration stops once the squared norm of the center shift is
        less than or equal to this value.
    precompute_distances
        Forwarded to ``_labels_inertia``.

    Returns
    -------
    (best_labels, best_inertia, best_centers, n_iter)
        ``n_iter`` is the number of iterations that were run.
    '''
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # labels assignment
        # TODO: _labels_inertia should be done with cosine distance.
        #       Since ||a - b||^2 = 2(1 - cos(a,b)) when a,b are unit
        #       normalized, this doesn't really matter.
        labels, inertia = \
            _labels_inertia(X, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

        # computation of the means
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, labels, n_clusters,
                                               distances)
        else:
            centers = _k_means._centers_dense(X, labels, n_clusters, distances)

        # l2-normalize centers (this is the main contribution here)
        centers = normalize(centers)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        # Remember the labels/centers of the lowest-inertia iteration so far.
        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        # Converged once the centers (after normalization) stop moving.
        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1


def spherical_k_means(X, n_clusters, init='k-means++', n_init=10,
            max_iter=300, verbose=False, tol=1e-4, random_state=None,
            copy_x=True, n_jobs=1, algorithm="auto", return_n_iter=False):
    """Cluster X with spherical k-means, keeping the best of n_init runs.

    Modified from sklearn.cluster.k_means_.k_means.

    Parameters
    ----------
    X : array-like or sparse matrix
        Samples to cluster, one per row.
    n_clusters : int
        Number of centers.
    init : 'k-means++', 'random' or array-like
        Initialisation passed to every single run. When an array is given
        it is validated against X and only one run is performed.
    n_init : int
        Number of independent runs; must be positive.
    max_iter : int
        Iteration cap of a single run; must be positive.
    verbose : bool
        Forwarded to the single runs.
    tol : float
        Converted with sklearn's ``_tolerance(X, tol)`` before use.
    random_state
        Anything accepted by ``check_random_state``.
    copy_x : bool
        Forwarded to ``as_float_array`` as its ``copy`` argument.
    n_jobs : int
        1 runs the initialisations sequentially; any other value runs them
        through joblib ``Parallel``.
    algorithm
        Accepted for signature compatibility; not used by this function.
    return_n_iter : bool
        Also return the iteration count of the best run.

    Returns
    -------
    (best_centers, best_labels, best_inertia) and, when ``return_n_iter``
    is true, additionally ``best_n_iter``.

    Raises
    ------
    ValueError
        If ``n_init`` or ``max_iter`` is not positive.
    """
    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)
    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % max_iter)

    # np.inf rather than the np.infty alias, which NumPy 2.0 removed.
    best_inertia = np.inf
    X = as_float_array(X, copy=copy_x)
    tol = _tolerance(X, tol)

    if hasattr(init, '__array__'):
        init = check_array(init, dtype=X.dtype.type, copy=True)
        _validate_center_shape(X, n_clusters, init)

        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d'
                % n_init, RuntimeWarning, stacklevel=2)
            n_init = 1

    # precompute squared norms of data points
    x_squared_norms = row_norms(X, squared=True)

    if n_jobs == 1:
        # For a single thread, less memory is needed if we just store one set
        # of the best results (as opposed to one set per run per thread).
        for it in range(n_init):
            # run a k-means once
            labels, inertia, centers, n_iter_ = _spherical_kmeans_single_lloyd(
                X, n_clusters, max_iter=max_iter, init=init, verbose=verbose,
                tol=tol, x_squared_norms=x_squared_norms,
                random_state=random_state)

            # determine if these results are the best so far
            # (best_inertia starts at +inf, so no None check is needed)
            if inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_
    else:
        # parallelisation of k-means runs
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(_spherical_kmeans_single_lloyd)(X, n_clusters,
                                   max_iter=max_iter, init=init,
                                   verbose=verbose, tol=tol,
                                   x_squared_norms=x_squared_norms,
                                   # Change seed to ensure variety
                                   random_state=seed)
            for seed in seeds)

        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_n_iter = n_iters[best]

    if return_n_iter:
        return best_centers, best_labels, best_inertia, best_n_iter
    else:
        return best_centers, best_labels, best_inertia


class SphericalKMeans(KMeans):
    """Spherical K-Means clustering.

    Modification of sklearn.cluster.KMeans where cluster centers are
    normalized (projected onto the sphere) in each iteration.

    Parameters
    ----------
    n_clusters : int, optional, default: 8
        The number of clusters to form as well as the number of
        centroids to generate.
    max_iter : int, default: 300
        Maximum number of iterations of the k-means algorithm for a
        single run.
    n_init : int, default: 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final result is the best output of the
        n_init runs in terms of inertia.
    init : {'k-means++', 'random' or an ndarray}
        Method for initialization, defaults to 'k-means++':
        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.
        'random': choose k observations (rows) at random from data for
        the initial centroids.
        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.
    tol : float, default: 1e-4
        Tolerance used to declare convergence.
        NOTE(review): the wording inherited from KMeans spoke of inertia; in
        this file the single-run loop compares the squared center shift with
        the tolerance -- confirm which description is intended.
    n_jobs : int
        The number of jobs to use for the computation. This works by computing
        each of the n_init runs in parallel.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.
    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.
    verbose : int, default 0
        Verbosity mode.
    copy_x : boolean, default True
        Forwarded to spherical_k_means.
        NOTE(review): the description inherited from KMeans talked about
        centering the data; in this file copy_x only reaches
        ``as_float_array(X, copy=copy_x)`` -- confirm before relying on the
        original data being left unmodified.

    Attributes
    ----------
    cluster_centers_ : array, [n_clusters, n_features]
        Coordinates of cluster centers
    labels_ :
        Labels of each point
    inertia_ : float
        Sum of distances of samples to their closest cluster center.
    n_iter_ : int
        Number of iterations run by the best initialisation.
    """
    def __init__(self, n_clusters=8, init='k-means++', n_init=10,
                 max_iter=300, tol=1e-4, n_jobs=1,
                 verbose=0, random_state=None, copy_x=True):
        self.n_clusters = n_clusters
        self.init = init
        self.max_iter = max_iter
        self.tol = tol
        self.n_init = n_init
        self.verbose = verbose
        self.random_state = random_state
        self.copy_x = copy_x
        self.n_jobs = n_jobs


    def fit(self, X, y=None):
        """Compute spherical k-means clustering.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
        y : ignored

        Returns
        -------
        self
        """
        random_state = check_random_state(self.random_state)
        X = self._check_fit_data(X)

        # TODO: add check that all data is unit-normalized

        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            spherical_k_means(
                X, n_clusters=self.n_clusters, init=self.init,
                n_init=self.n_init, max_iter=self.max_iter, verbose=self.verbose,
                tol=self.tol, random_state=random_state, copy_x=self.copy_x,
                n_jobs=self.n_jobs,
                return_n_iter=True)

        return self

In [None]:
n_sphere_clusters = 15

sphere_clusterer = SphericalKMeans(n_clusters=n_clusters, n_init=10, random_state=1)
%time sphere_labels = sphere_clusterer.fit_predict(document_vectors)

In [None]:
# One bucket of document positions per spherical k-means label.
sphere_clusters = [[] for _ in range(n_sphere_clusters)]

for position, assigned in enumerate(sphere_labels):
    sphere_clusters[assigned].append(position)

for cluster_no, members in enumerate(sphere_clusters):
    print('Cluster {} of {} documents'.format(cluster_no, len(members)))

In [None]:
# Preview every spherical cluster by printing up to ten document names.
for i, cluster in enumerate(sphere_clusters):
    # A shuffled prefix yields at most ten DISTINCT documents. The previous
    # randint-based draw always took ten samples with replacement (repeating
    # documents in small clusters) and raised on an empty cluster.
    doc_indices = np.random.permutation(cluster)[:10]

    print('----- {} -----'.format(i))

    for doc_ix in doc_indices:
        print(documents[int(doc_ix)].name)

## Clustering of features by trajectory (useless)

In [None]:
# CountVectorizer is used below but is not among the notebook's imports.
from sklearn.feature_extraction.text import CountVectorizer

t = time()
# NOTE: this rebinds `documents` to whatever fetch_czech_corpus_dec_jan
# returns; it is passed to CountVectorizer.fit_transform below.
documents, relative_days = data_fetchers.fetch_czech_corpus_dec_jan()

stream_length = max(relative_days) + 1  # Zero-based, hence the + 1.
logging.info('Read input in %fs.', time() - t)
logging.info('Stream length: %d', stream_length)

t = time()
# binary=True records presence/absence of a term instead of its count.
vectorizer = CountVectorizer(min_df=30, max_df=0.9, binary=True, stop_words=event_detector.CZECH_STOPWORDS)
bow_matrix = vectorizer.fit_transform(documents)
# Inverse of the vocabulary mapping: column index -> term.
id2word = {v: k for k, v in vectorizer.vocabulary_.items()}
logging.info('Done in %fs.', time() - t)
logging.info('BOW: %s, %s, storing %d elements', str(bow_matrix.shape), str(bow_matrix.dtype),
                     bow_matrix.getnnz())

In [None]:
# Build one trajectory per BOW feature from the document-day assignment, then
# run the project's spectral analysis on them.
# NOTE(review): `dps` / `dp` are presumably the dominant power spectrum and
# dominant period of each trajectory -- confirm in event_detector.
trajectories = event_detector.construct_feature_trajectories(bow_matrix, relative_days)
dps, dp = event_detector.spectral_analysis(trajectories)

In [None]:
_, n_days = trajectories.shape
DPS_BOUNDARY = 0.03

# Keep only features whose dps exceeds the boundary, then split them by
# whether dp is above half the number of days (aperiodic) or not (periodic).
half_span = math.floor(n_days / 2)
above_boundary = dps > DPS_BOUNDARY

aperiodic_feature_indices = np.where(above_boundary & (dp > half_span))[0]
periodic_feature_indices = np.where(above_boundary & (dp <= half_span))[0]

In [None]:
# Pull the rows of both feature groups out of the trajectory matrix.
aperiodic_trajectories, periodic_trajectories = (
    trajectories[aperiodic_feature_indices],
    trajectories[periodic_feature_indices],
)

aperiodic_shape = str(aperiodic_trajectories.shape)
periodic_shape = str(periodic_trajectories.shape)
logging.info('Aperiodic trajectories: %s', aperiodic_shape)
logging.info('Periodic trajectories: %s', periodic_shape)

In [None]:
import gensim

# normalize(..., copy=False) only *tries* to work in place; for inputs it has
# to convert first (e.g. an integer dtype) it returns a normalized copy and
# leaves the argument untouched. Rebinding the result is correct either way.
aperiodic_trajectories = normalize(aperiodic_trajectories, norm='l1', copy=False)
periodic_trajectories = normalize(periodic_trajectories, norm='l1', copy=False)


def _pairwise_jsd(rows):
    """Return the square matrix of Jensen-Shannon divergences between rows."""
    n_rows = rows.shape[0]
    pairwise = np.zeros((n_rows, n_rows), dtype=float)

    for i in range(n_rows):
        for j in range(n_rows):
            pairwise[i, j] = event_detector.jensen_shannon_divergence(rows[i], rows[j])

    return pairwise


aperiodic_pairwise = _pairwise_jsd(aperiodic_trajectories)
periodic_pairwise = _pairwise_jsd(periodic_trajectories)

In [None]:
# DBSCAN is used below and therefore has to be imported as well.
from sklearn.cluster import AffinityPropagation, DBSCAN, KMeans

# aperiodic_clusterer = KMeans(n_clusters=10, n_init=10) #DBSCAN(metric='precomputed', algorithm='auto')
# periodic_clusterer = KMeans(n_clusters=10, n_init=10) #DBSCAN(metric='precomputed', algorithm='auto')

# metric='precomputed': the input to fit_predict is a distance matrix.
aperiodic_clusterer = DBSCAN(metric='precomputed')
periodic_clusterer = DBSCAN(metric='precomputed')

# The element-wise square root of the pairwise divergences is used as the
# distance.
aperiodic_labels = aperiodic_clusterer.fit_predict(np.sqrt(aperiodic_pairwise))
periodic_labels = periodic_clusterer.fit_predict(np.sqrt(periodic_pairwise))

# Label -1 marks noise and is not counted as a cluster.
aperiodic_n_clusters = len(set(aperiodic_labels)) - (1 if -1 in aperiodic_labels else 0)
periodic_n_clusters = len(set(periodic_labels)) - (1 if -1 in periodic_labels else 0)

logging.info('Aperiodic clusters: %d', aperiodic_n_clusters)
logging.info('Periodic clusters: %d', periodic_n_clusters)

In [None]:
# One bucket of feature positions per DBSCAN cluster. Noise points carry the
# label -1; using that as a list index would silently append them to the LAST
# cluster, so they are skipped.
aperiodic_clusters = [[] for _ in range(aperiodic_n_clusters)]

for feature_ix, label in enumerate(aperiodic_labels):
    if label >= 0:
        aperiodic_clusters[label].append(feature_ix)

periodic_clusters = [[] for _ in range(periodic_n_clusters)]

for feature_ix, label in enumerate(periodic_labels):
    if label >= 0:
        periodic_clusters[label].append(feature_ix)

In [None]:
# Draw all trajectories of a cluster into one figure, one figure per cluster.
# NOTE(review): `plt` is not imported anywhere in the visible notebook --
# presumably matplotlib.pyplot; confirm and import it before running.
for cluster in aperiodic_clusters:
    for word in cluster:
        # `word` is a row position in aperiodic_trajectories.
        trajectory = aperiodic_trajectories[word]
        plt.plot(trajectory)

    plt.show()

In [None]:
# Draw all trajectories of a cluster into one figure, one figure per cluster.
# NOTE(review): `plt` is not imported anywhere in the visible notebook --
# presumably matplotlib.pyplot; confirm and import it before running.
for cluster in periodic_clusters:
    for word in cluster:
        # `word` is a row position in periodic_trajectories.
        trajectory = periodic_trajectories[word]
        plt.plot(trajectory)
        
    plt.show()