In [None]:
import logging
import math
import os
import pickle
import re
import sys
from importlib import reload
from pprint import pprint
from time import time

import numpy as np
from scipy.optimize import curve_fit
from scipy.stats import cauchy, norm

from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.mixture as gmm
from sklearn.preprocessing import normalize

import gensim

# Put the parent of the current working directory on sys.path so that the
# `event_detection` package imported below can be resolved (presumably the
# notebook lives one level below the repository root — verify).
module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path)

# Project-local modules used throughout the notebook.
from event_detection import data_fetchers, event_detector, plotting, preprocessing

# Reload `logging` before configuring it: a common notebook idiom so that
# basicConfig() is not ignored when the kernel already installed handlers.
# All log records are sent to stdout at INFO level.
reload(logging)
logging.basicConfig(stream=sys.stdout, format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Document source for the 'dec-jan' dataset. The wrappers defined in later cells
# iterate over it and read each document's `.text`; `fetch_relative_days()` is
# called on it when the bag-of-words cache has to be rebuilt.
fetcher = data_fetchers.CzechFullTexts(dataset='dec-jan', names=True, dates=True)

# Load Word2Vec

In [None]:
class Preprocessor:
    """Re-iterable adapter that turns a document collection into token lists.

    Every pass over an instance walks ``documents`` again and yields the result
    of ``gensim.utils.simple_preprocess`` applied to each document's ``text``
    attribute, so the object can be handed to consumers that iterate more
    than once.
    """

    def __init__(self, documents):
        """Store the iterable of documents (objects exposing ``.text``)."""
        self.documents = documents

    def __iter__(self):
        """Lazily yield one list of tokens per document, in source order."""
        yield from (
            gensim.utils.simple_preprocess(document.text)
            for document in self.documents
        )


# Location of the persisted Word2Vec model, relative to the working directory.
word2vec_path = '../event_detection/gensim/word2vec'
# Re-iterable token stream over the fetched documents; consumed by the training branch below.
documents = Preprocessor(fetcher)

# Reuse the saved model when it exists; otherwise train a new one and persist it
# so that later runs of this cell take the fast path.
if os.path.exists(word2vec_path):
    print('Loading Word2Vec')
    %time word2vec_model = gensim.models.Word2Vec.load(word2vec_path)
else:
    print('Training Word2Vec')
    # NOTE(review): `size=` and `iter=` are the pre-4.0 gensim keyword names —
    # confirm the gensim version this notebook is pinned to.
    %time word2vec_model = gensim.models.Word2Vec(documents, size=100, negative=5, hs=0, min_count=2, window=5, iter=5)
    word2vec_model.save(word2vec_path)

# Load Doc2Vec

In [None]:
class DocumentTagger:
    """Re-iterable adapter producing gensim ``TaggedDocument`` objects.

    Each document is tokenized with ``gensim.utils.simple_preprocess`` and
    tagged with its zero-based position in ``documents``.
    """

    def __init__(self, documents):
        """Store the iterable of documents (objects exposing ``.text``)."""
        self.documents = documents

    def __iter__(self):
        """Yield one ``TaggedDocument(words, [position])`` per document."""
        for position, document in enumerate(self.documents):
            tokens = gensim.utils.simple_preprocess(document.text)
            yield gensim.models.doc2vec.TaggedDocument(tokens, [position])


# Location of the persisted Doc2Vec model, relative to the working directory.
doc2vec_path = '../event_detection/gensim/doc2vec'
# Re-iterable stream of TaggedDocument objects; consumed by the training branch below.
doc_tagger = DocumentTagger(fetcher)

# Reuse the saved model when it exists; otherwise train a new one and persist it
# so that later runs of this cell take the fast path.
if os.path.exists(doc2vec_path):
    print('Loading Doc2Vec')
    %time doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_path)
else:
    print('Training Doc2Vec')
    # NOTE(review): `size=` and `iter=` are the pre-4.0 gensim keyword names —
    # confirm the gensim version this notebook is pinned to.
    %time doc2vec_model = gensim.models.Doc2Vec(doc_tagger, dm=1, dm_mean=1, size=100, negative=5, hs=0, min_count=2, window=5, iter=5)
    doc2vec_model.save(doc2vec_path)

# Prepare event detection

In [None]:
# NOTE(review): logging was already configured in the first cell, and
# logging.basicConfig() does nothing once the root logger has handlers, so this
# call is most likely a no-op.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Cache locations for the three artefacts the detectors need.
PICKLE_PATH = '../event_detection/pickle'
ID2WORD_PATH = os.path.join(PICKLE_PATH, 'vectorizer_dec_jan_full_nolimit.pickle')
BOW_MATRIX_PATH = os.path.join(PICKLE_PATH, 'term_document_dec_jan_full_nolimit.pickle')
RELATIVE_DAYS_PATH = os.path.join(PICKLE_PATH, 'relative_days_dec_jan_full_nolimit.pickle')

# Fast path: all three cached files exist, so deserialize them.
# NOTE(review): pickle.load executes arbitrary code from the file — only safe
# because these files are expected to be written by the else-branch below.
if os.path.exists(ID2WORD_PATH) and os.path.exists(BOW_MATRIX_PATH) and os.path.exists(RELATIVE_DAYS_PATH):
    with open(ID2WORD_PATH, mode='rb') as f:
        id2word = pickle.load(f)

    with open(BOW_MATRIX_PATH, mode='rb') as f:
        bow_matrix = pickle.load(f)

    with open(RELATIVE_DAYS_PATH, mode='rb') as f:
        relative_days = pickle.load(f)

    stream_length = max(relative_days) + 1  # Zero-based, hence the + 1.

    logging.info('Deserialized id2word, bag of words matrix and relative days')
    logging.info('BOW: %s, %s, storing %d elements', str(bow_matrix.shape), str(bow_matrix.dtype),
                 bow_matrix.getnnz())
    logging.info('Stream length: %d', stream_length)
else:
    # Slow path: rebuild everything from the fetcher and cache it for next time.
    if not os.path.exists(PICKLE_PATH):
        os.makedirs(PICKLE_PATH)

    t = time()
    relative_days = fetcher.fetch_relative_days()

    stream_length = max(relative_days) + 1  # Zero-based, hence the + 1.
    logging.info('Read input in %fs.', time() - t)
    logging.info('Stream length: %d', stream_length)
    
    documents = Preprocessor(fetcher)

    t = time()
    # The documents are already token lists, so both the preprocessor and the
    # tokenizer are identity functions. binary=True records presence, not counts;
    # min_df=2 drops terms that occur in fewer than two documents.
    vectorizer = CountVectorizer(min_df=2, binary=True, tokenizer=lambda doc: doc, preprocessor=lambda doc: doc)
    bow_matrix = vectorizer.fit_transform(documents)
    # Invert the vocabulary (word -> column) into column -> word.
    id2word = {v: k for k, v in vectorizer.vocabulary_.items()}

    with open(ID2WORD_PATH, mode='wb') as f:
        pickle.dump(id2word, f)

    with open(BOW_MATRIX_PATH, mode='wb') as f:
        pickle.dump(bow_matrix, f)

    with open(RELATIVE_DAYS_PATH, mode='wb') as f:
        pickle.dump(relative_days, f)

    logging.info('Created and serialized id2word, bag of words matrix and relative days in %fs.', time() - t)
    logging.info('BOW: %s, %s, storing %d elements', str(bow_matrix.shape), str(bow_matrix.dtype),
                 bow_matrix.getnnz())

# Per-feature trajectories and their spectral statistics; `dps` and `dp` are what
# detect_events() later thresholds on.
trajectories = event_detector.construct_feature_trajectories(bow_matrix, relative_days)
dps, dp = event_detector.spectral_analysis(trajectories)

In [None]:
# Sanity check: display the shape of the Doc2Vec model's `syn0` matrix
# (presumably the word-embedding weights — verify against the gensim version in use).
doc2vec_model.syn0.shape

# Individual event detectors

## 1. Vanilla

In [None]:
def unsupervised_greedy_event_detection(global_indices, bow_matrix, feature_trajectories, dps, dp):
    """Greedily group features into events, yielding one event at a time.

    Features are visited in ascending DPS order. Each still-unassigned feature
    seeds an event; the event then keeps absorbing whichever remaining feature
    gives the lowest cost, for as long as that cost is strictly lower than the
    event's current cost.

    :param global_indices: array that is fancy-indexed with the local member
        positions of each event to produce the yielded value
    :param bow_matrix: matrix handed to ``event_detector.precompute_df_overlaps``
    :param feature_trajectories: sequence of feature trajectories; its length is
        the number of features examined
    :param dps: array of per-feature DPS values (indexed by local position)
    :param dp: passed through to ``event_detector.precompute_divergences``
    :return: generator of ``global_indices[event]`` arrays
    """
    def event_cost_of(members):
        # A zero denominator is allowed to produce inf silently.
        with np.errstate(divide='ignore'):
            similarity = event_detector.set_similarity(members, divergences)
            overlap = event_detector.set_df_overlap(members, overlaps)
            return similarity / (overlap * np.sum(dps[members]))

    def cheapest_extension(members, candidates):
        # min() keeps the first of several equally cheap candidates, exactly
        # like a strict "<" scan would.
        costs = [event_cost_of(members + [candidate]) for candidate in candidates]
        best = min(range(len(costs)), key=costs.__getitem__)
        return candidates[best], costs[best]

    n_features = len(feature_trajectories)
    logging.info('Examining %d features.', n_features)

    # Local feature positions, lowest DPS first.
    remaining = sorted(range(n_features), key=dps.__getitem__)

    started = time()
    divergences = event_detector.precompute_divergences(feature_trajectories, dp)
    logging.info('Precomputed information divergences in %fs.', time() - started)

    started = time()
    overlaps = event_detector.precompute_df_overlaps(bow_matrix)
    logging.info('Precomputed document overlaps in %fs.', time() - started)

    started = time()
    n_events = 0

    while remaining:
        seed = remaining.pop(0)
        members = [seed]
        current_cost = 1 / dps[seed]

        while remaining:
            candidate, candidate_cost = cheapest_extension(members, remaining)

            if not candidate_cost < current_cost:
                break

            members.append(candidate)
            remaining.remove(candidate)
            current_cost = candidate_cost

        yield global_indices[members]
        n_events += 1

    logging.info('Detected %d events in %fs.', n_events, time() - started)

## 2. Vanilla, cluster-based

In [None]:
def unsupervised_greedy_event_detection_clusters(global_indices, bow_matrix, feature_trajectories, dps, dp):
    """Detect events by clustering features with DBSCAN on a precomputed matrix.

    The pairwise matrix is the precomputed trajectory divergence divided
    element-wise by the document overlap (overlaps below 1e-8 are replaced by
    1e-12, infinite quotients by 1e15). Every non-noise DBSCAN cluster becomes
    one event.

    :param global_indices: array mapping a local feature position to the value
        stored in the yielded events
    :param bow_matrix: matrix handed to ``event_detector.precompute_df_overlaps``
    :param feature_trajectories: sequence of feature trajectories
    :param dps: unused here; kept for a uniform detector signature
    :param dp: passed through to ``event_detector.precompute_divergences``
    :return: generator of events, each a list of ``global_indices`` entries;
        nothing is yielded when DBSCAN finds no cluster
    """
    logging.info('Examining %d features.', len(feature_trajectories))

    total_start = time()
    stage_start = time()
    distances = event_detector.precompute_divergences(feature_trajectories, dp)
    logging.info('Precomputed information divergences in %fs.', time() - stage_start)

    stage_start = time()
    doc_overlaps = event_detector.precompute_df_overlaps(bow_matrix).A.astype(float)
    # Keep the division below finite where two features share no documents.
    doc_overlaps = np.where(doc_overlaps < 1e-8, 1e-12, doc_overlaps)
    logging.info('Precomputed document overlaps in %fs.', time() - stage_start)

    # In-place to avoid holding a second dense matrix.
    distances /= doc_overlaps
    del doc_overlaps

    logging.info('Created similarity matrix in %fs.', time() - total_start)

    from sklearn.cluster import DBSCAN

    stage_start = time()
    distances[np.isinf(distances)] = 1e15
    clusterer = DBSCAN(metric='precomputed', min_samples=2, eps=0.25)

    labels = clusterer.fit_predict(distances)
    logging.info('Performed clustering in %fs.', time() - stage_start)

    # Label -1 marks noise and does not count as a cluster.
    distinct_labels = set(labels)
    n_clusters = len(distinct_labels) - int(-1 in distinct_labels)

    if not n_clusters:  # TODO: Temp fix.
        logging.warning('Found 0 clusters.')
        return []

    logging.info('Found %d clusters.', n_clusters)
    events = [[] for _ in range(n_clusters)]

    for position, label in enumerate(labels):
        if label < 0:
            continue
        events[label].append(global_indices[position])

    logging.info('Detected %d events in %fs.', len(events), time() - total_start)
    logging.info('Total features covered: %d.', sum(map(len, events)))

    yield from events

## 3. Word2Vec

In [None]:
from scipy.stats import entropy

def event_detection_word2vec(global_indices, bow_matrix, feature_trajectories, dps, dp, id2word):
    """Greedy event detection that weighs candidates by Word2Vec similarity.

    Features are visited in ascending DPS order. Each still-unassigned feature
    seeds an event, which then keeps absorbing the cheapest remaining feature
    while that strictly lowers the event's cost. The cost of adding a feature is
    the Jensen-Shannon divergence between the event's mean trajectory and the
    candidate's trajectory, divided by ``exp(n_similarity) * sum(DPS)``, where
    the similarity comes from the notebook-level ``word2vec_model``.

    :param global_indices: array mapping local feature positions to the keys of
        ``id2word``; also fancy-indexed to build each yielded event
    :param bow_matrix: unused here; kept for a uniform detector signature
    :param feature_trajectories: array of feature trajectories, one row per feature
    :param dps: array of per-feature DPS values (indexed by local position)
    :param dp: unused here; kept for a uniform detector signature
    :param id2word: mapping from global feature index to the word itself
    :return: generator of ``global_indices[event]`` arrays
    """
    def extension_cost(members, candidate):
        # A zero denominator is allowed to produce inf silently.
        with np.errstate(divide='ignore'):
            merged_trajectory = np.mean(feature_trajectories[members], axis=0)
            divergence = event_detector.jensen_shannon_divergence(
                merged_trajectory, feature_trajectories[candidate])

            member_words = [id2word[global_indices[position]] for position in members]
            candidate_word = id2word[global_indices[candidate]]
            semantic_weight = math.exp(word2vec_model.n_similarity(member_words, [candidate_word]))

            burstiness = np.sum(dps[members + [candidate]])

            return divergence / (semantic_weight * burstiness)

    def cheapest_extension(members, candidates):
        # min() keeps the first of several equally cheap candidates, exactly
        # like a strict "<" scan would.
        costs = [extension_cost(members, candidate) for candidate in candidates]
        best = min(range(len(costs)), key=costs.__getitem__)
        return candidates[best], costs[best]

    n_features = len(feature_trajectories)
    logging.info('Examining %d features.', n_features)

    # Local feature positions, lowest DPS first.
    remaining = sorted(range(n_features), key=dps.__getitem__)

    started = time()
    n_events = 0

    while remaining:
        seed = remaining.pop(0)
        members = [seed]
        current_cost = 1 / dps[seed]

        while remaining:
            candidate, candidate_cost = cheapest_extension(members, remaining)

            if not candidate_cost < current_cost:
                break

            members.append(candidate)
            remaining.remove(candidate)
            current_cost = candidate_cost

        yield global_indices[members]
        n_events += 1

    logging.info('Detected %d events in %fs.', n_events, time() - started)

## 4. Word2Vec, cluster-based

In [None]:
def event_detection_word2vec_clusters(global_indices, bow_matrix, feature_trajectories, dps, dp, id2word):
    """Detect events by clustering features with DBSCAN, using Word2Vec similarity.

    The pairwise matrix handed to DBSCAN is the precomputed trajectory
    divergence divided element-wise by ``exp(similarity)`` of the two words,
    where the similarity comes from the notebook-level ``word2vec_model``.
    Every non-noise cluster becomes one event.

    :param global_indices: array mapping local feature positions to the keys of
        ``id2word``; its entries are what the yielded events contain
    :param bow_matrix: only its number of columns is used (number of features)
    :param feature_trajectories: sequence of feature trajectories
    :param dps: unused here; kept for a uniform detector signature
    :param dp: passed through to ``event_detector.precompute_divergences``
    :param id2word: mapping from global feature index to the word itself
    :return: generator of events, each a list of ``global_indices`` entries;
        nothing is yielded when DBSCAN finds no cluster
    """
    logging.info('Examining %d features.', len(feature_trajectories))

    t0 = time()
    t = time()
    divergences = event_detector.precompute_divergences(feature_trajectories, dp)
    logging.info('Precomputed information divergences in %fs.', time() - t)

    # Restart the stage timer so the similarity log does not include the
    # divergence computation above.
    t = time()
    n_features = bow_matrix.shape[1]
    overlaps = np.zeros((n_features, n_features), dtype=float)
    logging.info('Overlaps: %s', str(overlaps.shape))

    words = [id2word[global_indices[i]] for i in range(n_features)]

    # Word similarity is symmetric, so compute each pair once and mirror it.
    # The diagonal deliberately stays 0.
    for i in range(n_features):
        for j in range(i + 1, n_features):
            weight = math.exp(word2vec_model.similarity(words[i], words[j]))
            overlaps[i, j] = weight
            overlaps[j, i] = weight

    logging.info('Precomputed word similarities in %fs.', time() - t)

    # The zero diagonal makes x/0 and 0/0 expected here; silence the warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        np.divide(divergences, overlaps, out=divergences)
    del overlaps

    # NOTE(review): 0/0 on the diagonal becomes NaN and is mapped to 1000.0,
    # which is above eps below, so a feature is not its own neighbour. Confirm
    # this is intended; a distance matrix would normally have a zero diagonal.
    divergences[np.isnan(divergences)] = 1000.0

    logging.info('Created similarity matrix in %fs.', time() - t0)

    from sklearn.cluster import DBSCAN

    t = time()
    clusterer = DBSCAN(metric='precomputed', min_samples=2, eps=100.0)

    labels = clusterer.fit_predict(divergences)
    logging.info('Performed clustering in %fs.', time() - t)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

    if n_clusters == 0:  # TODO: Temp fix.
        logging.warning('Found 0 clusters.')
        return

    logging.info('Found %d clusters.', n_clusters)
    events = [[] for _ in range(n_clusters)]

    for feature_ix, label in np.ndenumerate(labels):
        # DBSCAN labels noise as -1; without this guard those features would be
        # appended to events[-1], i.e. silently merged into the last cluster.
        if label >= 0:
            events[label].append(global_indices[feature_ix[0]])

    logging.info('Detected %d events in %fs.', len(events), time() - t0)
    logging.info('Total features covered: %d.', sum(len(event) for event in events))

    yield from events

## 5. Word vectors from Doc2Vec

In [None]:
from scipy.stats import entropy

def event_detection_word_doc2vec(global_indices, bow_matrix, feature_trajectories, dps, dp, id2word):
    """Greedy event detection that weighs candidates by Doc2Vec word similarity.

    Features are visited in ascending DPS order. Each still-unassigned feature
    seeds an event, which then keeps absorbing the cheapest remaining feature
    while that strictly lowers the event's cost. The cost of adding a feature is
    the Jensen-Shannon divergence between the event's mean trajectory and the
    candidate's trajectory, divided by ``exp(n_similarity) * sum(DPS)``, where
    the similarity comes from the notebook-level ``doc2vec_model``.

    :param global_indices: array mapping local feature positions to the keys of
        ``id2word``; also fancy-indexed to build each yielded event
    :param bow_matrix: unused here; kept for a uniform detector signature
    :param feature_trajectories: array of feature trajectories, one row per feature
    :param dps: array of per-feature DPS values (indexed by local position)
    :param dp: unused here; kept for a uniform detector signature
    :param id2word: mapping from global feature index to the word itself
    :return: generator of ``global_indices[event]`` arrays
    """
    def extension_cost(members, candidate):
        # A zero denominator is allowed to produce inf silently.
        with np.errstate(divide='ignore'):
            merged_trajectory = np.mean(feature_trajectories[members], axis=0)
            divergence = event_detector.jensen_shannon_divergence(
                merged_trajectory, feature_trajectories[candidate])

            member_words = [id2word[global_indices[position]] for position in members]
            candidate_word = id2word[global_indices[candidate]]
            semantic_weight = math.exp(doc2vec_model.n_similarity(member_words, [candidate_word]))

            burstiness = np.sum(dps[members + [candidate]])

            return divergence / (semantic_weight * burstiness)

    def cheapest_extension(members, candidates):
        # min() keeps the first of several equally cheap candidates, exactly
        # like a strict "<" scan would.
        costs = [extension_cost(members, candidate) for candidate in candidates]
        best = min(range(len(costs)), key=costs.__getitem__)
        return candidates[best], costs[best]

    n_features = len(feature_trajectories)
    logging.info('Examining %d features.', n_features)

    # Local feature positions, lowest DPS first.
    remaining = sorted(range(n_features), key=dps.__getitem__)

    started = time()
    n_events = 0

    while remaining:
        seed = remaining.pop(0)
        members = [seed]
        current_cost = 1 / dps[seed]

        while remaining:
            candidate, candidate_cost = cheapest_extension(members, remaining)

            if not candidate_cost < current_cost:
                break

            members.append(candidate)
            remaining.remove(candidate)
            current_cost = candidate_cost

        yield global_indices[members]
        n_events += 1

    logging.info('Detected %d events in %fs.', n_events, time() - started)

## 6. Word vectors from Doc2Vec, cluster-based

In [None]:
def event_detection_word_doc2vec_clusters(global_indices, bow_matrix, feature_trajectories, dps, dp, id2word):
    """Detect events by clustering features with HDBSCAN, using Doc2Vec word similarity.

    The pairwise matrix handed to HDBSCAN is the precomputed trajectory
    divergence divided element-wise by ``exp(similarity)`` of the two words,
    where the similarity comes from the notebook-level ``doc2vec_model``.
    Every non-noise cluster becomes one event.

    :param global_indices: array mapping local feature positions to the keys of
        ``id2word``; its entries are what the yielded events contain
    :param bow_matrix: only its number of columns is used (number of features)
    :param feature_trajectories: sequence of feature trajectories
    :param dps: unused here; kept for a uniform detector signature
    :param dp: passed through to ``event_detector.precompute_divergences``
    :param id2word: mapping from global feature index to the word itself
    :return: generator of events, each a list of ``global_indices`` entries;
        nothing is yielded when HDBSCAN finds no cluster
    """
    logging.info('Examining %d features.', len(feature_trajectories))

    t0 = time()
    t = time()
    divergences = event_detector.precompute_divergences(feature_trajectories, dp)
    logging.info('Precomputed information divergences in %fs.', time() - t)

    # Restart the stage timer so the similarity log does not include the
    # divergence computation above.
    t = time()
    n_features = bow_matrix.shape[1]
    overlaps = np.zeros((n_features, n_features), dtype=float)
    logging.info('Overlaps: %s', str(overlaps.shape))

    words = [id2word[global_indices[i]] for i in range(n_features)]

    # Word similarity is symmetric, so compute each pair once and mirror it.
    # The diagonal deliberately stays 0.
    for i in range(n_features):
        for j in range(i + 1, n_features):
            weight = math.exp(doc2vec_model.similarity(words[i], words[j]))
            overlaps[i, j] = weight
            overlaps[j, i] = weight

    logging.info('Precomputed word similarities in %fs.', time() - t)

    # The zero diagonal makes x/0 and 0/0 expected here; silence the warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        np.divide(divergences, overlaps, out=divergences)
    del overlaps

    # 0/0 on the diagonal becomes NaN; map it to a small positive distance.
    # NOTE(review): a non-zero divergence on the diagonal would give inf instead
    # and is not handled here — confirm precompute_divergences has a zero diagonal.
    divergences[np.isnan(divergences)] = 1e-3

    logging.info('Created similarity matrix in %fs.', time() - t0)

    from hdbscan import HDBSCAN

    t = time()
    clusterer = HDBSCAN(metric='precomputed', min_samples=2, min_cluster_size=3)

    labels = clusterer.fit_predict(divergences)
    logging.info('Performed clustering in %fs.', time() - t)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

    if n_clusters == 0:  # TODO: Temp fix.
        logging.warning('Found 0 clusters.')
        return

    logging.info('Found %d clusters.', n_clusters)
    events = [[] for _ in range(n_clusters)]

    for feature_ix, label in np.ndenumerate(labels):
        # Label -1 marks noise; such features belong to no event.
        if label >= 0:
            events[label].append(global_indices[feature_ix[0]])

    logging.info('Detected %d events in %fs.', len(events), time() - t0)
    logging.info('Total features covered: %d.', sum(len(event) for event in events))

    yield from events

# Choose algorithm

In [None]:
# Name of the detector to run; must be one of the keys handled below.
method = 'vanilla_clusters'

# Bind `function` to the selected detector. detect_events() calls it with five
# positional arguments, so the detectors that additionally need `id2word` are
# wrapped in lambdas that supply the notebook-level `id2word` at call time.
if method == 'vanilla':
    function = unsupervised_greedy_event_detection
elif method == 'vanilla_clusters':
    function = unsupervised_greedy_event_detection_clusters
elif method == 'word2vec':
    function = lambda global_indices, bow_matrix, feature_trajectories, dps, dp: \
        event_detection_word2vec(global_indices, bow_matrix, feature_trajectories, dps, dp, id2word)
elif method == 'word2vec_clusters':
    function = lambda global_indices, bow_matrix, feature_trajectories, dps, dp: \
        event_detection_word2vec_clusters(global_indices, bow_matrix, feature_trajectories, dps, dp, id2word)
elif method == 'word_doc2vec':
    function = lambda global_indices, bow_matrix, feature_trajectories, dps, dp: \
        event_detection_word_doc2vec(global_indices, bow_matrix, feature_trajectories, dps, dp, id2word)
elif method == 'word_doc2vec_clusters':
    function = lambda global_indices, bow_matrix, feature_trajectories, dps, dp: \
        event_detection_word_doc2vec_clusters(global_indices, bow_matrix, feature_trajectories, dps, dp, id2word)
else:
    raise ValueError('Unknown method')
    
# Features whose DPS does not exceed this threshold are ignored by detect_events().
DPS_BOUNDARY = 0.03

def detect_events(bow_matrix, feature_trajectories, dps, dp, aperiodic):
    """Select the eligible features and run the configured detector on them.

    A feature is eligible when its DPS exceeds ``DPS_BOUNDARY`` and its
    dominant period is above half the stream length (``aperiodic=True``) or at
    most half the stream length (``aperiodic=False``).

    :param bow_matrix: matrix with one column per feature
    :param feature_trajectories: 2-D array, one row per feature and one column per day
    :param dps: array of per-feature DPS values
    :param dp: array of per-feature dominant periods
    :param aperiodic: whether to detect aperiodic (True) or periodic (False) events
    :return: whatever the notebook-level ``function`` returns for the selected
        features, or an empty list when no feature is eligible
    """
    _n_features, n_days = feature_trajectories.shape
    half_stream = math.floor(n_days / 2)

    bursty = dps > DPS_BOUNDARY
    period_matches = dp > half_stream if aperiodic else dp <= half_stream
    feature_indices = np.nonzero(bursty & period_matches)[0]

    if feature_indices.size == 0:
        logging.warning('No features to detect events from.')
        return []

    event_kind = 'aperiodic' if aperiodic else 'periodic'
    logging.info('Detecting %s events from %d features.', event_kind, len(feature_indices))

    # Restrict every input to the selected features; the detector receives the
    # selected global indices so it can map local positions back.
    return function(
        feature_indices,
        bow_matrix[:, feature_indices],
        feature_trajectories[feature_indices, :],
        dps[feature_indices],
        dp[feature_indices],
    )

# Perform event detection

In [None]:
# Aperiodic events
# detect_events() returns either an empty list or whatever the selected detector
# returns (the detectors in this notebook are generators), so the result is
# handed straight to the plotting routine. Plots go to './<method>_aperiodic'.
aperiodic_events = detect_events(bow_matrix, trajectories, dps, dp, aperiodic=True)
plotting.plot_events(trajectories, aperiodic_events, id2word, dps, dp, dirname=('./' + method + '_aperiodic'))
logging.info('Aperiodic done')

# Periodic events
# Same pipeline for the periodic features; plots go to './<method>_periodic'.
periodic_events = detect_events(bow_matrix, trajectories, dps, dp, aperiodic=False)
plotting.plot_events(trajectories, periodic_events, id2word, dps, dp, dirname=('./' + method + '_periodic'))
logging.info('Periodic done')