In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from patternly.detection import AnomalyDetection
from patternly._utils import UnionFind, DirectedGraph
from zedsuite.zutil import Llk

In [5]:
%%time

# Prepare data
quantized_time_series = pd.read_csv(
    "./data/example1.dat", sep=" ", header=None, low_memory=False
).dropna(how="all", axis=1)

# Fit detection pipeline to training data
pipeline = AnomalyDetection(anomaly_sensitivity=2, n_clusters=5, reduce_clusters=True, quantize=False, eps=0.1, verbose=True)
pipeline = pipeline.fit(quantized_time_series)

Calculating distance matrix...
Clustering distance matrix...
Generating cluster PFSA 1/5...
Generating cluster PFSA 2/5...
Generating cluster PFSA 3/5...
Generating cluster PFSA 4/5...
Generating cluster PFSA 5/5...
Attempting to reduce clusters...
Reduced clusters from 5 to 4.
Clustering distance matrix...
Generating cluster PFSA 1/4...
Generating cluster PFSA 2/4...
Generating cluster PFSA 3/4...
Generating cluster PFSA 4/4...
Attempting to reduce clusters...
Reduced clusters from 4 to 2.
Clustering distance matrix...
Generating cluster PFSA 1/2...
Generating cluster PFSA 2/2...
Attempting to reduce clusters...
Calculating cluster PFSA means and stds...
Model fit.
CPU times: user 8.69 s, sys: 24.4 ms, total: 8.72 s
Wall time: 3.6 s


In [29]:
# Cross-evaluate the sequences of every cluster against every cluster PFSA.
# Row i of both matrices describes the sequences currently assigned to cluster i:
#   all_cluster_likelihoods[i, j] - share of cluster i's sequences whose best-scoring PFSA is j
#   all_ranked_likelihoods[i]     - PFSA indices ordered from largest to smallest share
all_cluster_likelihoods = np.empty(shape=(pipeline.n_clusters, pipeline.n_clusters), dtype=np.float32)
all_ranked_likelihoods = np.empty(shape=(pipeline.n_clusters, pipeline.n_clusters), dtype=np.int32)

for i in range(pipeline.n_clusters):
    # The rows of cluster i do not depend on which PFSA is being scored, so
    # select them once per cluster instead of once per (cluster, PFSA) pair.
    cluster_data = pipeline.quantized_data[pipeline.quantized_data["cluster"] == i].drop(columns=["cluster"])

    # One array of Llk results per cluster PFSA, each with one entry per sequence.
    cluster_llks = [
        np.asarray(Llk(data=cluster_data, pfsafile=pfsafile).run(), dtype=np.float32)
        for pfsafile in pipeline.cluster_PFSA_files
    ]

    # which cluster PFSA each sequence most likely maps back to
    # (argmin: the PFSA with the smallest Llk value is taken as the best match)
    closest_matches = np.argmin(cluster_llks, axis=0)

    # the likelihoods of the sequences generated by the current PFSA mapping back to each cluster PFSA:
    # count how many sequences picked each PFSA index, then normalise by the cluster size.
    # NOTE(review): pipeline.cluster_counts[i] is presumably the number of rows with
    # cluster == i (i.e. cluster_data.shape[0]) — confirm.
    cluster_likelihoods = np.count_nonzero(
        (closest_matches.reshape(-1, 1) == np.arange(pipeline.n_clusters).reshape(1, -1)),
        axis=0
    ) / pipeline.cluster_counts[i]

    # list of cluster PFSAs sorted in descending order of likelihood
    ranked_likelihoods = np.argsort(cluster_likelihoods)[::-1]
    all_cluster_likelihoods[i] = cluster_likelihoods
    all_ranked_likelihoods[i] = ranked_likelihoods

In [30]:
# Dump the likelihood matrix, the per-row ranking and the cluster sizes,
# one object after another, each starting on its own line.
print(
    all_cluster_likelihoods,
    all_ranked_likelihoods,
    pipeline.cluster_counts,
    sep="\n",
)

[[0.15       0.4        0.35       0.1        0.        ]
 [0.8        0.13333334 0.06666667 0.         0.        ]
 [1.         0.         0.         0.         0.        ]
 [0.         0.5714286  0.14285715 0.2857143  0.        ]
 [0.         0.         0.         0.         1.        ]]
[[1 2 0 3 4]
 [0 1 2 4 3]
 [0 4 3 2 1]
 [1 3 2 4 0]
 [4 3 2 1 0]]
[20, 15, 8, 7, 5]


In [38]:
# NOTE(review): disabled experiment, kept for reference. For every row whose
# top-ranked PFSA is not the row's own cluster, it would add 0.1 to the entry at
# [top-ranked index][row index].
# for i in range(len(all_cluster_likelihoods)):
#     if all_ranked_likelihoods[i][0] != i:
#         all_cluster_likelihoods[all_ranked_likelihoods[i][0]][i] += 0.1
# print(all_cluster_likelihoods)

# First graph: 5 nodes, filled from the raw likelihood matrix with threshold=0.
# NOTE(review): presumably an edge i -> j is added when entry [i, j] passes the
# threshold — confirm against DirectedGraph.from_matrix.
graph = DirectedGraph(5)
graph.from_matrix(all_cluster_likelihoods, threshold=0)
print(graph.find_scc())
# The next two lines are bare expressions in the middle of the cell; with the
# default notebook settings only a cell's last expression is displayed, so
# their values are not shown.
len(set(graph.low_links))
graph.graph

# Second graph: `graph` is rebound to a fresh 5-node instance, this time filled
# from the boolean mask of entries that are >= 0.1.
graph = DirectedGraph(5)
graph.from_matrix(all_cluster_likelihoods >= 0.1)
# Last expression of the cell, so this is the value that gets displayed.
graph.graph
# graph.find_scc()

2


defaultdict(set, {0: {0, 1, 2, 3}, 1: {0, 1}, 2: {0}, 3: {1, 2, 3}, 4: {4}})

In [32]:
# Group clusters with a union-find structure: each cluster is joined with the
# PFSA its sequences match best and, when the share is large enough, with the
# runner-up as well. The roots are printed after every cluster is processed.
graph = UnionFind(pipeline.n_clusters)
for i in range(pipeline.n_clusters):
    best_match = all_ranked_likelihoods[i, 0]
    second_best_match = all_ranked_likelihoods[i, 1]

    # Join with the top-ranked PFSA unless that is the cluster itself.
    if best_match != i:
        graph.union(
            i,
            best_match,
            ranks=(all_cluster_likelihoods[i] + all_cluster_likelihoods[best_match]),
        )

    # Join with the runner-up only when its share is above twice 1 / n_clusters.
    if (
        second_best_match != i
        and all_cluster_likelihoods[i, second_best_match] > 2 * (1 / pipeline.n_clusters)
    ):
        graph.union(
            i,
            second_best_match,
            ranks=(all_cluster_likelihoods[i] + all_cluster_likelihoods[second_best_match]),
        )

    print(graph.roots)

# Final state after compress_all(), followed by the distinct roots.
print(f"\n{graph.compress_all().roots}: {graph.n_components} components")
print(set(graph.roots))

[0 0 2 3 4]
[0 0 2 3 4]
[0 0 0 3 4]
[0 0 0 0 4]
[0 0 0 0 4]

[0 0 0 0 4]: 2 components
{0, 4}


In [4]:
%%time

predictions = pd.DataFrame(pipeline.predict())
anomalies = predictions[predictions[0] == True]
print(anomalies.shape[0])
anomalies

0
CPU times: user 143 ms, sys: 1.01 ms, total: 144 ms
Wall time: 144 ms


Unnamed: 0,0


In [5]:
# Show the rendered PFSA image of every cluster, then print the PFSA text dump.
from IPython.display import Image, display
# NOTE(review): HTML is not used in this cell — confirm whether another cell needs it.
from IPython.core.display import HTML
for i, file in enumerate(pipeline.cluster_PFSA_pngs):
    print(f"Cluster {i} PFSA")
    # Each entry has ".png" appended here to form the image URL.
    display(Image(url=f"{file}.png", width=300))
    
pipeline.print_PFSAs()

Cluster 0 PFSA


Cluster 1 PFSA


Cluster 0 PFSA:
    %ANN_ERR: 11.1166
    %MRG_EPS: 0.1
    %SYN_STR: 
    %SYM_FRQ: 0.42765 0.57235 
    %PITILDE: size(2)
    #PITILDE
    0.599018 0.400982 
    0.299581 0.700419 
    %CONNX: size(2)
    #CONNX
    0.599018 0.400982 
    0.299581 0.7004

Cluster 1 PFSA:
    %ANN_ERR: 11.1159
    %MRG_EPS: 0.1
    %SYN_STR: 
    %SYM_FRQ: 0.427375 0.572625 
    %PITILDE: size(2)
    #PITILDE
    0.603012 0.396988 
    0.296256 0.703744 
    %CONNX: size(2)
    #CONNX
    0.603012 0.396988 
    0.296256 0.7037

