In [None]:
import logging
import math
import os
import sys
from imp import reload
from time import time

import numpy as np

from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import normalize

import gensim


module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path)

from event_detection import data_fetchers, event_detector, plotting

reload(logging)
logging.basicConfig(stream=sys.stdout, format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


%matplotlib inline

In [None]:
# Wall-clock start of the whole run.
total_time = time()

# --- Load the corpus: document texts plus a per-document relative day index ---
t = time()
documents, relative_days = data_fetchers.fetch_czech_corpus_dec_jan()

# Day indices are zero-based, so the stream covers (largest index + 1) days.
stream_length = 1 + max(relative_days)
print('Read input in %fs.' % (time() - t))
print('Stream length: %d' % stream_length)

# --- Binary bag-of-words over the whole corpus ---
t = time()
vectorizer = CountVectorizer(
    min_df=30,
    max_df=100000,
    binary=True,
    stop_words=event_detector.CZECH_STOPWORDS,
)
bow_matrix = vectorizer.fit_transform(documents).tocsr()

# vocabulary_ maps word -> column index; invert it to look words up by column.
id2word = {index: word for word, index in vectorizer.vocabulary_.items()}
print('Created bag of words in %fs.' % (time() - t))
print('BOW:', bow_matrix.shape)

In [None]:
# Re-weight the bag-of-words matrix with TF-IDF; %time reports how long the transform takes.
# NOTE(review): this rebinds `bow_matrix` to the transform of itself, so running this cell again
# without first re-running the cell that builds `bow_matrix` applies the transformer to values
# that were already transformed.
%time bow_matrix = TfidfTransformer().fit_transform(bow_matrix)

In [None]:
# Number of latent topics; also used in the LSI cache file name and the LsiModel constructor.
NUM_TOPICS = 100

# Expose the matrix to gensim as a corpus. documents_columns=False tells gensim that documents
# are laid out along rows. If the TfidfTransformer cell has run, `bow_matrix` holds TF-IDF
# weights at this point rather than raw binary counts.
corpus = gensim.matutils.Sparse2Corpus(bow_matrix, documents_columns=False)
# Build a gensim Dictionary from that corpus, supplying the column-index -> word mapping.
dictionary = gensim.corpora.Dictionary.from_corpus(corpus, id2word=id2word)

In [None]:
LSI_PATH = ('./dec_jan_%d_topics_tfidf.lsi' % NUM_TOPICS)

if os.path.exists(LSI_PATH):
    lsi = gensim.models.LsiModel.load(LSI_PATH)
    print('Loaded %d LSI topics from file' % NUM_TOPICS)
else:
    %time lsi = gensim.models.LsiModel(corpus, id2word=dictionary, num_topics=NUM_TOPICS, onepass=False, power_iters=5)
    lsi.save(LSI_PATH)
    print('Generated LSI model for %d topics and saved to file' % NUM_TOPICS)

In [None]:
# Project every document through the LSI model into a dense array, transpose it so the last
# axis has one entry per singular value, and divide each of those columns by its singular value.
lsi_gensim = gensim.matutils.corpus2dense(lsi[corpus], len(lsi.projection.s)).T / lsi.projection.s

# L2-normalise the vectors (sklearn's `normalize` works per row by default). Bind the returned
# array instead of relying on copy=False mutating the input in place: scikit-learn may still
# hand back a copy. Assigning the result also keeps the cell from echoing the whole dense
# matrix as its output.
lsi_gensim = normalize(lsi_gensim, norm='l2', copy=False)

In [None]:
# Inspect 10 of the LSI topics; as the last expression, the return value of `print_topics`
# is shown as the cell output.
lsi.print_topics(10)

## KMeans

In [None]:
NUM_CLUSTERS = 15

kmeans = MiniBatchKMeans(n_clusters=NUM_CLUSTERS, n_init=10, random_state=1)

%time kmeans.fit(lsi_gensim)

clusters = [[] for _ in range(NUM_CLUSTERS)]

for doc, label in np.ndenumerate(kmeans.labels_):
    clusters[label].append(doc[0])

In [None]:
# One line per cluster: its index followed by the number of documents assigned to it.
for cluster in range(len(clusters)):
    docs = clusters[cluster]
    print(cluster, len(docs))

In [None]:
def _detect_and_plot(kind, indices, X, trajectories, dps, dp, inverse, cluster_num):
    """Run greedy event detection on one group of features and plot the detected events.

    :param kind: 'aperiodic' or 'periodic'; used in the printed messages and the output directory name
    :param indices: 1-D array of feature (column) indices belonging to this group
    :param X: document-by-feature matrix; only the columns in ``indices`` are passed to the detector
    :param trajectories: array with one row per feature, as returned by
        ``event_detector.construct_feature_trajectories``
    :param dps: per-feature values returned first by ``event_detector.spectral_analysis``
    :param dp: per-feature values returned second by ``event_detector.spectral_analysis``
    :param inverse: mapping from feature index to word, forwarded to the plotting routine
    :param cluster_num: cluster identifier appended to the output directory name
    """
    if len(indices) == 0:
        print('No high power {} features detected'.format(kind))
        return

    events = event_detector.unsupervised_greedy_event_detection(
        indices, X[:, indices], trajectories[indices, :], dps[indices], dp[indices])
    plotting.plot_events(trajectories, events, inverse, dps, dp,
                         dirname='./{}_{}'.format(kind, cluster_num))
    print('{} done'.format(kind.capitalize()))


def detect_events(X, D, length, inverse, cluster_num, dps_boundary=0.1):
    """Detect and plot aperiodic and periodic events for one set of documents.

    Feature trajectories are built from ``X`` and ``D`` and analysed with
    ``event_detector.spectral_analysis``, which yields ``dps`` and ``dp`` per feature
    (presumably dominant power spectrum and dominant period -- confirm against
    ``event_detector``). Features with ``dps > dps_boundary`` are split by comparing ``dp``
    against ``ceil(length / 2)``: larger values are treated as aperiodic, the rest as periodic.
    Each non-empty group goes through greedy event detection and is plotted into
    ``./aperiodic_<cluster_num>`` or ``./periodic_<cluster_num>``.

    :param X: document-by-feature matrix supporting ``X[:, indices]`` column selection
    :param D: per-document data forwarded to ``construct_feature_trajectories``
        (the caller in this notebook passes the documents' relative days)
    :param length: stream length; half of it (rounded up) is the aperiodic/periodic boundary
    :param inverse: mapping from feature index to word, forwarded to ``plotting.plot_events``
    :param cluster_num: identifier used in the output directory names
    :param dps_boundary: minimum ``dps`` a feature needs to be considered at all (default 0.1)
    :return: None; results are plotted and progress is printed
    """
    trajectories = event_detector.construct_feature_trajectories(X, D)
    dps, dp = event_detector.spectral_analysis(trajectories)

    # Both groups share the same power filter and period boundary, so compute them once.
    half_length = math.ceil(length / 2)
    high_power = dps > dps_boundary

    # Aperiodic events
    aperiodic_indices = np.where(high_power & (dp > half_length))[0]
    _detect_and_plot('aperiodic', aperiodic_indices, X, trajectories, dps, dp, inverse, cluster_num)

    # Periodic events
    periodic_indices = np.where(high_power & (dp <= half_length))[0]
    _detect_and_plot('periodic', periodic_indices, X, trajectories, dps, dp, inverse, cluster_num)

In [None]:
# Run event detection separately on the documents of every cluster.
for cluster, docs_indices in enumerate(clusters):
    print('---------- Cluster {:d} ----------'.format(cluster))

    # An empty cluster has no days to take max() of; skip it instead of aborting the whole loop.
    if not docs_indices:
        print('Cluster is empty, skipping')
        continue

    cluster_docs = [documents[doc_id] for doc_id in docs_indices]
    cluster_days = [relative_days[doc_id] for doc_id in docs_indices]

    cluster_stream_len = max(cluster_days) + 1  # Zero-based, hence the + 1.

    t = time()
    # NOTE(review): this rebinds the notebook-level `vectorizer` that was fitted on the full corpus.
    vectorizer = CountVectorizer(min_df=30, max_df=0.9, binary=True, stop_words=event_detector.CZECH_STOPWORDS)
    try:
        X = vectorizer.fit_transform(cluster_docs)
    except ValueError as err:
        # CountVectorizer raises ValueError when min_df/max_df pruning cannot leave any terms
        # (e.g. a cluster with too few documents for min_df=30); move on to the next cluster.
        print('Skipping cluster {:d}: {}'.format(cluster, err))
        continue

    # Column index -> word, used to label the plotted events.
    inverse = {v: k for k, v in vectorizer.vocabulary_.items()}
    detect_events(X, cluster_days, cluster_stream_len, inverse, cluster)