In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import numpy as np
from scipy.stats import norm
# from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, models, similarities
from seismic.observations import ObservationDAO, ObservationDAOError
from seismic.detector import SaxDetect, StaLtaDetect, DetectorError
from seismic.sax import Paa, Sax
from seismic.detector.utils import make_series

base_dir = "../sample/_vsm/"
obs_list = os.listdir(base_dir)
bandpass = (5, 10)
alphabet = "abcde"
paa_int = 50
rows = 3
cols = 3
obs = {}
for f in obs_list:
    obs[f] = ObservationDAO(os.path.join(base_dir, f))
series = {}
for n, o in obs.items():  # Use SaxDetect series property to get a series
    series[n] = make_series(o.stream[0].data, o.stats.sampling_rate)

## Detect Events using SaxDetect and recalculate SAX strings for events only

In [2]:
sax_str = {}
for n, o in obs.items():
    # Assuming one event per obs as this is known for our test daat
    o.bandpass(*bandpass)
    det = SaxDetect(o.stream[0].data, o.stats.sampling_rate)
    evt = det.slice(*det.detect(alphabet, paa_int).__next__())
    p = Paa(evt)
    s = Sax(p(50))
    sax_str[n] = "".join([i for i in s(alphabet)])

## Generate bags of words

In [3]:
raw_corpus = []
docs_in_corpus = []
w = 10  # Length of words
for name in sorted(sax_str.keys()):
    s = sax_str[name]
    bow = []
    for i in range(0, len(s) - w): # -1
        bow.append(s[i:i+w])
    raw_corpus.append(bow)
    docs_in_corpus.append(name)
docs_in_corpus

['cal.z',
 'cao.z',
 'cda.z',
 'cdv.z',
 'cmn.z',
 'cps.z',
 'cva.z',
 'cvl.z',
 'cvy.z',
 'elk.z',
 'knb.z',
 'lac.z',
 'mnv.z']

## TF-IDF

In [7]:
dictionary = corpora.Dictionary(raw_corpus)
corpus = [dictionary.doc2bow(t) for t in raw_corpus]
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

for i in range(len(raw_corpus)):
    print("{}\n{}".format(docs_in_corpus[i], 40*"="))
    query = dictionary.doc2bow(raw_corpus[2])
    vec_qry = lsi[query]
    index = similarities.MatrixSimilarity(lsi[corpus])
    sims = index[vec_qry]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    for s in sims:
        print("  {}\t{:.5f}".format(docs_in_corpus[s[0]], s[1]))

cal.z
  cda.z	1.00000
  cps.z	0.99989
  cvy.z	0.99954
  cdv.z	0.99668
  cao.z	0.99546
  cva.z	0.99050
  elk.z	0.98876
  mnv.z	0.98644
  lac.z	0.93044
  cmn.z	0.89007
  cvl.z	0.63125
  cal.z	0.36955
  knb.z	0.32980
cao.z
  cda.z	1.00000
  cps.z	0.99989
  cvy.z	0.99954
  cdv.z	0.99668
  cao.z	0.99546
  cva.z	0.99050
  elk.z	0.98876
  mnv.z	0.98644
  lac.z	0.93044
  cmn.z	0.89007
  cvl.z	0.63125
  cal.z	0.36955
  knb.z	0.32980
cda.z
  cda.z	1.00000
  cps.z	0.99989
  cvy.z	0.99954
  cdv.z	0.99668
  cao.z	0.99546
  cva.z	0.99050
  elk.z	0.98876
  mnv.z	0.98644
  lac.z	0.93044
  cmn.z	0.89007
  cvl.z	0.63125
  cal.z	0.36955
  knb.z	0.32980
cdv.z
  cda.z	1.00000
  cps.z	0.99989
  cvy.z	0.99954
  cdv.z	0.99668
  cao.z	0.99546
  cva.z	0.99050
  elk.z	0.98876
  mnv.z	0.98644
  lac.z	0.93044
  cmn.z	0.89007
  cvl.z	0.63125
  cal.z	0.36955
  knb.z	0.32980
cmn.z
  cda.z	1.00000
  cps.z	0.99989
  cvy.z	0.99954
  cdv.z	0.99668
  cao.z	0.99546
  cva.z	0.99050
  elk.z	0.98876
  mnv.z	0.98644
  lac.z	0.