In [101]:
#!/usr/bin/env python
# coding: utf-8

# In[157]:


import scipy.io as sio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
import copy
from scipy.interpolate import interp1d
import time
import seaborn as sns
from datetime import date
import os
import sys
from sklearn.manifold import MDS
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder   
from sklearn.metrics.pairwise import manhattan_distances, euclidean_distances
from sklearn.metrics import silhouette_score
from sklearn.neighbors import KernelDensity
from collections import Counter
from itertools import combinations
from matplotlib.ticker import FormatStrFormatter
from sklearn.preprocessing import minmax_scale
from tqdm import tqdm
import sys
from copy import deepcopy
sys.path.append("../xstream")
from xstream import *
import os


class Parameters():
    """Run configuration read from a JSON file.

    The file must hold a JSON object with the keys ``dataset_name``,
    ``has_label`` and ``top_features``; each value is copied onto the
    instance under the same attribute name.

    :param json_file_name: path of the JSON configuration file
    :raises KeyError: if one of the three expected keys is missing
    """

    def __init__(self,
                 json_file_name,
                 ):
        # json is not imported explicitly at the top of this file, so it is
        # imported here to keep the class usable on its own.
        import json

        with open(json_file_name, 'r') as openfile:
            json_ = json.load(openfile)
        self.dataset_name = json_["dataset_name"]
        self.has_label = json_["has_label"]
        self.top_features = json_["top_features"]


#encode the categorical features to numbers with LabelEncoder
def encode_catgorical_column(data,column):
    """Label-encode one column of ``data`` in place.

    :param data: container supporting ``data[column]`` read and assignment
    :param column: key of the column to encode
    :return: the fitted ``LabelEncoder`` (needed to map codes back to labels)
    """
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(data[column])
    # Overwrites the original column with its integer codes
    data[column] = encoded_values
    return encoder


""" Definition of Plot """
class Plot:
    """A plot identified by ``id`` that carries a numeric ``value``."""

    def __init__(self, id):
        # The value starts at zero until update_value() is called
        self.id = id
        self.value = 0.0

    def get_id(self):
        """Return the unique identifier of the plot."""
        return self.id

    def get_value(self):
        """Return the total influence currently stored for the plot."""
        return self.value

    def update_value(self, value):
        """Replace the stored value with ``value``."""
        self.value = value

def get_topk_prediction(scores, top_k):
    """Return the positions of the ``top_k`` largest entries of ``scores``.

    The positions are not sorted by score (``np.argpartition`` only
    guarantees that they are the ``top_k`` largest).

    :param scores: 1-D sequence or array of scores
    :param top_k: number of positions wanted; values larger than
        ``len(scores)`` are clamped, values ``<= 0`` give an empty result
    :return: integer array of at most ``top_k`` positions
    """
    scores = np.asarray(scores)
    n_scores = len(scores)
    # Without this guard ``[-0:]`` would slice the WHOLE array for top_k == 0
    if top_k <= 0 or n_scores == 0:
        return np.empty(0, dtype=np.intp)
    # argpartition raises when kth is out of bounds, so clamp to the length
    top_k = min(top_k, n_scores)
    ind = np.argpartition(scores, -top_k)[-top_k:]
    return ind


def load_all_result(path):
    """
    Load a comma-separated result file into a DataFrame.

    The file must contain a column named "index", which becomes the row index.
    :param path: location of the file to read
    :return: the parsed DataFrame
    """
    frame = pd.read_csv(path, sep=",", index_col="index")
    return frame


def get_inference(all_result, X, feature_names=None):
    """
    Select the per-feature explanation columns from the result table.

    For every feature ``f`` the column ``f + "_ex"`` is taken from
    ``all_result``.
    :param all_result: DataFrame holding one ``<feature>_ex`` column per feature
    :param X: feature DataFrame; its columns are the default feature names and
        its column count must match the number of features
    :param feature_names: optional feature names (defaults to ``X.columns``)
    :return: new DataFrame with the ``*_ex`` columns and an index named "index"
    :raises AssertionError: if the number of features differs from the number
        of columns of ``X`` or an expected ``*_ex`` column is missing
    """
    if feature_names is None:
        feature_names = list(X.columns)

    explain_ = [str(feat) + "_ex" for feat in feature_names]

    assert len(explain_) == len(X.columns), \
        "got %d feature names for %d columns in X" % (len(explain_), len(X.columns))
    missing = [col for col in explain_ if col not in all_result.columns]
    assert not missing, "explanation columns missing from all_result: %s" % missing

    # rename_axis returns a new frame with its own index object. Assigning
    # ``.index.names`` on the column slice could also rename the index of
    # ``all_result``, because the slice may share that index object.
    outlier_explain = all_result[explain_].rename_axis("index")
    return outlier_explain

def get_anomaly_score_data(all_result):
    """
    Extract the "anomaly_scores" column from the result table.

    :param all_result: DataFrame that must contain an "anomaly_scores" column
    :return: that column, indexed like ``all_result``
    """
    score_column = "anomaly_scores"
    assert score_column in all_result.columns
    return all_result[score_column]

# Get scatter plot outlier scores and figure
def get_scores(original_data, feature_X, feature_Y, make_plot=True, get_log = False):
    """Rank all rows by the anomaly score of a model trained on two features.

    A fresh ``XStream`` model is fitted on the two selected columns only,
    because scoring a feature subset requires retraining.

    :param original_data: 2-D array indexed as ``original_data[:, cols]``
    :param feature_X: column position of the first feature
    :param feature_Y: column position of the second feature
    :param make_plot: accepted but not used by this function
    :param get_log: accepted but not used by this function
    :return: list of ``(row position, min-max scaled score)`` tuples sorted by
        score, highest first
    """
    pair_data = original_data[:, (feature_X, feature_Y)]

    detector = XStream()
    detector.fit(pair_data)

    # Rescale the model output to [0, 1] so scores are comparable across pairs
    normalised = minmax_scale(detector.predict_proba(pair_data))

    ranked = [(row, normalised[row]) for row in range(pair_data.shape[0])]
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked

def findPairs(n):
    """Return the distinct 2-combinations of ``n`` in first-seen order."""
    # dict keys keep insertion order and drop repeated pairs
    unique_pairs = dict.fromkeys(combinations(n, 2))
    return list(unique_pairs)


def print_format(ls):
    """Render ``ls`` as a compact token, e.g. ``[2, 3]`` becomes ``"2_3"``.

    Takes ``str(ls)``, removes spaces, turns commas into underscores and
    drops the first and last character (the brackets of a list).
    """
    compact = str(ls).replace(" ", "").replace(",", "_")
    return compact[1:-1]

#load data, feature importances, y and etc.
dataset_name = "pkdd1998"
has_label = True
topk = 70
top_features = 10
all_result = load_all_result("../data/%s/concatenate_result.txt" % dataset_name)
original_data = load_all_result("../data/%s/generated_synthetic.txt"% dataset_name)

# With has_label set, the last column is excluded from the features
# (presumably it is the label column - confirm against the data file).
feature_names = list(original_data.columns)
if has_label:
    feature_names = feature_names[0:-1]

X = original_data[feature_names]
data_index = list(X.index)

# A column is treated as categorical when its value in the FIRST row is a str
cat_dim_lst = [col for col, ival in zip(X.columns, X.iloc[0]) if isinstance(ival, str)]

explanation_value = get_inference(all_result= all_result,
                                    X= X,
                                    feature_names=feature_names)
scores = get_anomaly_score_data(all_result = all_result)
anomaly_scores = np.array(scores)
scores_index = list(scores.index)

anomaly_index = [int(item) for item in scores_index]
# Set lookup keeps this linear; testing membership against the list re-scanned
# every scored row for every data row.
scored_rows = set(scores_index)
normal_index = [int(item) for item in data_index if item not in scored_rows]

In [102]:
explanation_value

Unnamed: 0_level_0,type_ex,operation_ex,amount_ex,balance_ex,bank_ex,k_symbol_ex
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
162,0.017209,0.060122,0.054827,0.040790,0.030677,0.037336
442,0.018331,0.059855,0.055882,0.041678,0.029908,0.036211
226,0.017806,0.060225,0.055729,0.041430,0.030056,0.036145
1028,0.017932,0.059831,0.055426,0.041261,0.030128,0.036651
1020,0.017932,0.059831,0.055426,0.041261,0.030128,0.036651
...,...,...,...,...,...,...
340,0.017796,0.060153,0.055655,0.041285,0.052002,0.036234
1002,0.017775,0.059616,0.055445,0.041103,0.052033,0.036670
281,0.017743,0.060431,0.055574,0.041320,0.030011,0.059243
364,0.017796,0.060153,0.055655,0.041285,0.052002,0.036234


In [103]:
#transform data with labelencoder
# Work on a deep copy so the original frame X keeps its string categories
X_encoded = X.copy(deep=True)
encoders = []
for feat in cat_dim_lst:
    # encoders[i] belongs to cat_dim_lst[i]
    encoders.append(encode_catgorical_column(X_encoded, feat))
# From here on X_encoded is a plain array addressed by column position
X_encoded = np.array(X_encoded)

In [104]:
#find clusters
# Each entry names a column of all_result that holds a cluster label per row.

clusters = ["2 clusters","3 clusters"] #,"4 clusters","5 clusters","6 clusters","7 clusters","8 clusters","9 clusters"]
# Parse the whole leading number: taking only the first character would turn
# "10 clusters" into 1.
cluster_int_index = [int(col.split()[0]) for col in clusters if col.endswith("clusters")]
# (column name, cluster label) -> index labels of the rows in that cluster
cluster_indices = {}
for cluster in clusters:
    for c in all_result[cluster].unique():
        # Store the label as a plain int: str() of this key is later used as
        # an archive entry name and parsed back with int(), which fails on
        # NumPy 2 scalar reprs such as "np.int64(0)".
        cluster_indices[cluster, int(c)] = list(all_result[all_result[cluster] == c].index)

In [105]:
# For every cluster: column positions of the features with the highest mean
# explanation value, most important first, cut to top_features entries.
cluster_features = {}

for cluster, cluster_val in cluster_indices.items():
    mean_importance = np.array(explanation_value.loc[cluster_val]).mean(axis=0)
    features_list = np.argsort(mean_importance)[::-1][:top_features]
    cluster_features[cluster] = features_list

In [106]:
cluster_features

{('2 clusters', 0): array([1, 2, 5, 3, 4, 0]),
 ('2 clusters', 1): array([1, 2, 4, 3, 5, 0]),
 ('3 clusters', 2): array([1, 2, 3, 5, 4, 0]),
 ('3 clusters', 0): array([1, 2, 4, 3, 5, 0]),
 ('3 clusters', 1): array([1, 5, 2, 3, 4, 0])}

In [107]:
# All feature pairs (candidate scatter plots) per cluster
cluster_feature_pairs = {
    cluster_key: findPairs(ranked_features)
    for cluster_key, ranked_features in cluster_features.items()
}

# One ranking per candidate plot, in the same order as cluster_feature_pairs.
# Every pair retrains a model, so this is slow (the recorded run took ~21 min).
clusters_rank_plot = {}
#from tqdm import tqdm
for feature_pairs in tqdm(cluster_feature_pairs.keys()):
    clusters_rank_plot[feature_pairs] = [
        get_scores(X_encoded, pair[0], pair[1])
        for pair in cluster_feature_pairs[feature_pairs]
    ]

  0%|                                                     | 0/5 [00:00<?, ?it/s]

Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...


 20%|████████▊                                   | 1/5 [04:17<17:08, 257.01s/it]

Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...


 40%|█████████████████▌                          | 2/5 [08:36<12:55, 258.38s/it]

Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...


 60%|██████████████████████████▍                 | 3/5 [13:11<08:52, 266.11s/it]

Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...


 80%|███████████████████████████████████▏        | 4/5 [17:14<04:17, 257.05s/it]

Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...
Starting training...


100%|████████████████████████████████████████████| 5/5 [21:25<00:00, 257.06s/it]


In [108]:
#from copy import deepcopy
# np.savez needs string entry names, so the tuple keys are stored as str(key)
clusters_rank_plot2 = {}
for i in clusters_rank_plot.keys():
    clusters_rank_plot2[str(i)] = deepcopy(clusters_rank_plot[i])

# exist_ok avoids the check-then-create race of a separate exists() test
os.makedirs("../assets/%s" % dataset_name, exist_ok=True)

xstream_npz_path = '../assets/%s/xstream%s.npz' % (dataset_name, print_format(cluster_int_index))
# allow_pickle is deliberately NOT passed to savez: on NumPy releases whose
# savez has no such parameter, every keyword is saved as an array, so the
# archive would gain a stray entry named "allow_pickle".
np.savez(xstream_npz_path, **clusters_rank_plot2)
df_xstreams_val = np.load(xstream_npz_path, allow_pickle=True)

#encoder mapping
# column name -> {category label: integer code}, used to label plot ticks
encoder_mapping = {}
for i,le in enumerate(encoders):
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    encoder_mapping[cat_dim_lst[i]] = le_name_mapping
print(encoder_mapping)

{'type': {'CREDIT': 0, 'ISSUE': 1}, 'operation': {'TRANSFER FROM ACCOUNT': 0, 'TRANSFER TO ACCOUNT': 1}, 'bank': {'AB': 0, 'CD': 1, 'EF': 2, 'GH': 3, 'IJ': 4, 'KL': 5, 'MN': 6, 'OP': 7, 'QR': 8, 'ST': 9, 'UV': 10, 'WX': 11, 'YZ': 12}, 'k_symbol': {'DUCHOD': 0, 'POJISTNE': 1, 'SIPO': 2, 'UVER': 3, 'unknown': 4}}


In [119]:
def str_to_tuple(val):
    """Parse ``"('<name>', <int>)"`` back into the tuple ``(name, int)``.

    This is the inverse of ``str()`` applied to a ``(str, int)`` key.

    :param val: string to parse
    :return: ``(name, number)``, or ``None`` when ``val`` contains no comma
        (i.e. it is not a stringified tuple)
    :raises ValueError: if the part after the first comma holds no integer
    """
    sep = val.split(",")
    if len(sep) <= 1:
        return None

    name = sep[0].replace("('", "").replace("'", "")

    # NumPy >= 2 renders integer scalars as e.g. "np.int64(0)", so keep only
    # what follows the last "(" before removing the decoration characters.
    number = sep[1].rsplit("(", 1)[-1]
    for decoration in (" ", "'", ")"):
        number = number.replace(decoration, "")
    return (name, int(number))

# Rebuild the tuple-keyed dictionary from the loaded archive; entries whose
# name does not parse as a tuple (str_to_tuple returns None) are skipped.
df_xstreams = {}
for key in df_xstreams_val.keys():
    tup = str_to_tuple(key)
    if tup is None:
        continue
    df_xstreams[tup] = df_xstreams_val[key]




def assign_scores(cluster_rank, outliers):
    """Assign every outlier to the plot in which it scores highest.

    :param cluster_rank: sequence of plots; each plot is a sequence of
        ``(row id, score)`` entries
    :param outliers: ids of the rows to consider; other rows are ignored
    :return: dict ``row id -> (plot position, score)`` holding, per outlier,
        the position in ``cluster_rank`` of its best plot and the score there.
        On equal scores the later plot wins.
    """
    # A set makes the membership test constant-time; with a list every entry
    # of every plot re-scanned all outliers.
    try:
        outlier_lookup = set(outliers)
    except TypeError:
        # Unhashable ids: fall back to the container as given
        outlier_lookup = outliers

    plot_best = {}
    for plot_n, ranking in enumerate(cluster_rank):
        for outlier in ranking:
            idx = outlier[0]
            score = outlier[1]
            if idx not in outlier_lookup:
                continue
            # "<=" (not "<") is what lets a later plot win ties
            if idx not in plot_best or plot_best[idx][1] <= score:
                plot_best[idx] = (plot_n, score)
    return plot_best


def LookOut(budget,cluster_rank_plot,sorted_plots,best_graphs,outliers):
    """Restrict the explanation to the first ``budget`` plots.

    :param budget: number of plots that may be shown
    :param cluster_rank_plot: per-plot rankings, addressable by plot id
    :param sorted_plots: ``(plot id, ...)`` pairs, best plot first
    :param best_graphs: plot ids, best first
    :param outliers: ids of the rows to explain
    :return: ``(plot_max, budget_best_graphs)`` where ``plot_max`` maps a plot
        id to ``[summed score, [row ids assigned to it]]`` and
        ``budget_best_graphs`` is ``best_graphs[0:budget]``
    """
    budget_best_graphs = best_graphs[0:budget]

    # Rankings of the first `budget` plots, in sorted_plots order
    budget_plots = []
    remaining = budget
    for plot_id, _ in sorted_plots:
        if remaining <= 0:
            break
        budget_plots.append(cluster_rank_plot[plot_id])
        remaining -= 1

    plot_max = {}
    for anomaly_id, (position, score) in assign_scores(budget_plots,outliers).items():
        # `position` indexes budget_plots, which follows sorted_plots order
        plot_id = sorted_plots[position][0]
        if plot_id in plot_max:
            plot_max[plot_id][0] += score
            plot_max[plot_id][1] += [anomaly_id]
        else:
            plot_max[plot_id] = [score,[anomaly_id]]
    return plot_max,budget_best_graphs


def plot(budget,cluster_rank_plot,sorted_plots,best_graphs,cluster,outliers,X_index):
    """Draw one scatter plot per plot selected within ``budget``.

    Point colours: black = row is not in ``outliers``; red = outlier assigned
    to this plot; cyan = outlier assigned to another plot.

    Reads the module-level ``cluster_feature_pairs``, ``feature_names``,
    ``cat_dim_lst``, ``encoder_mapping`` and ``X_encoded``.

    :param budget: number of plots that may be shown
    :param cluster_rank_plot: per-plot rankings of the cluster
    :param sorted_plots: ``(plot id, ...)`` pairs, best plot first
    :param best_graphs: plot ids, best first
    :param cluster: key of the cluster in ``cluster_feature_pairs``
    :param outliers: ids of the rows belonging to the cluster
    :param X_index: row ids, one per row of ``X_encoded``
    :return: list of matplotlib figures (the caller is expected to close them)
    """
    explained_by_plot, budget_graphs = LookOut(budget,cluster_rank_plot,sorted_plots,best_graphs,outliers)
    newPal = {0 :'black' , 1 :'red', 2 :'cyan'}
    # Sets keep the per-point colour lookup linear in the number of rows
    outlier_ids = set(outliers)

    figure_list = []
    for plot_id in budget_graphs:
        # A selected plot can end up with no outlier assigned to it (score ties
        # are resolved towards another selected plot); treat that as
        # "explains nothing" instead of raising KeyError.
        explained_ids = set(explained_by_plot.get(plot_id, [0.0, []])[1])
        x,y = cluster_feature_pairs[cluster][plot_id]
        colors = [newPal[0] if i not in outlier_ids
                  else newPal[1] if i in explained_ids
                  else newPal[2]
                  for i in X_index]

        fig = plt.figure(figsize = (4,3),dpi = 400)
        ax = fig.add_subplot(111)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        ax.set_xlabel(feature_names[x])
        if feature_names[x] in cat_dim_lst:
            # Categorical axis: show the original labels at their integer codes
            x_mapping = encoder_mapping[feature_names[x]]
            ax.set_xticks(list(x_mapping.values()))
            ax.set_xticklabels(list(x_mapping.keys()),rotation=20)

        ax.set_ylabel(feature_names[y])
        if feature_names[y] in cat_dim_lst:
            y_mapping = encoder_mapping[feature_names[y]]
            ax.set_yticks(list(y_mapping.values()))
            ax.set_yticklabels(list(y_mapping.keys()),rotation=20)

        ax.scatter(X_encoded[:,x],X_encoded[:,y],c=colors,edgecolor='black',linewidth=0.3,alpha=0.5)
        figure_list.append(fig)
    return figure_list




# In[199]:


# cluster key -> (per-plot rankings, plots sorted by summed score, plot ids best first)
cluster_plot_scores = {}

for cluster_ix in df_xstreams.keys():
    cluster_rank = df_xstreams[cluster_ix]
    outliers = cluster_indices[cluster_ix]

    # plot position -> [summed score, [row ids best explained by that plot]]
    plot_scores = {}
    for anomaly_id, (plot_id, score) in assign_scores(cluster_rank,outliers).items():
        if plot_id in plot_scores:
            plot_scores[plot_id][0] += score
            plot_scores[plot_id][1].append(anomaly_id)
        else:
            plot_scores[plot_id] = [score,[anomaly_id]]

    sorted_plots = sorted(plot_scores.items(), key=lambda item: item[1][0],reverse=True)
    best_graphs = [entry[0] for entry in sorted_plots]

    cluster_plot_scores[cluster_ix]= (cluster_rank,sorted_plots,best_graphs)

# In[ ]:






In [122]:

# cluster key -> (per-plot rankings, plots sorted by summed score, plot ids best first)
cluster_plot_scores = {}

for cluster_ix in df_xstreams.keys():
    # plot position -> [summed score, [row ids best explained by that plot]]
    plot_scores = {}
    cluster_rank = df_xstreams[cluster_ix]
    outliers = cluster_indices[cluster_ix]
    for key,value in assign_scores(cluster_rank,outliers).items():
        anomaly_id = key
        plot_id = value[0]
        score = value[1]
        if plot_id not in plot_scores.keys():
            plot_scores[plot_id] = [score,[anomaly_id]]
        else:
            plot_scores[plot_id][0] += score
            plot_scores[plot_id][1] += [anomaly_id]
    sorted_plots = sorted(plot_scores.items(), key=lambda item: item[1][0],reverse=True)
    best_graphs = [plot[0] for plot in sorted_plots]
    cluster_plot_scores[cluster_ix]= (cluster_rank,sorted_plots,best_graphs)

# Save the figures for every cluster and every budget from 1 to 5
for key in cluster_plot_scores.keys():
    # key[0] looks like "2 clusters"; parse the whole leading number, since
    # the first character alone would turn "10 clusters" into 1.
    cluster_id = int(key[0].split()[0])
    sub_cluster_id = key[1]
    rank_for_key, plots_for_key, graphs_for_key = cluster_plot_scores[key]
    for budget in range(1,6):
        figures = plot(budget,rank_for_key,plots_for_key,graphs_for_key,key, outliers = cluster_indices[key], X_index = data_index)
        for i,figure in enumerate(figures):
            # File name: lookout-<k>-<cluster no., 1-based>-<budget>-<plot no., 1-based>.png
            fname = "../assets/"+dataset_name+"/{0}-{1}-{2}-{3}-{4}.png".format("lookout",cluster_id,sub_cluster_id+1,budget, i+1)
            figure.savefig(fname,bbox_inches = 'tight')
            # Close each figure after saving so open figures do not pile up
            plt.close(figure)


In [112]:
cluster_id

2

In [113]:
plot_scores

{7: [1.0, [1043.0]],
 3: [8.0, [1049.0, 1052.0, 1044.0, 1040.0, 1047.0, 1048.0, 1058.0, 1059.0]],
 5: [3.980896873842468, [1000.0, 1017.0, 1018.0, 1019.0]],
 6: [1.7084342468895377, [281.0, 226.0]],
 12: [2.959335783855708, [903.0, 162.0, 442.0]],
 13: [2.0, [1020.0, 1028.0]],
 4: [12.0,
  [1021.0,
   1024.0,
   1026.0,
   1027.0,
   1029.0,
   1032.0,
   1033.0,
   1034.0,
   1035.0,
   1036.0,
   1037.0,
   1038.0]]}

In [114]:
cluster_rank

array([[[1.04300000e+03, 1.00000000e+00],
        [1.04900000e+03, 1.00000000e+00],
        [1.05200000e+03, 1.00000000e+00],
        ...,
        [1.03700000e+03, 0.00000000e+00],
        [1.03800000e+03, 0.00000000e+00],
        [1.03900000e+03, 0.00000000e+00]],

       [[6.10000000e+01, 1.00000000e+00],
        [7.20000000e+01, 1.00000000e+00],
        [2.14000000e+02, 1.00000000e+00],
        ...,
        [9.95000000e+02, 0.00000000e+00],
        [9.96000000e+02, 0.00000000e+00],
        [9.98000000e+02, 0.00000000e+00]],

       [[1.62000000e+02, 1.00000000e+00],
        [4.42000000e+02, 9.29318106e-01],
        [9.03000000e+02, 7.90062301e-01],
        ...,
        [6.48000000e+02, 0.00000000e+00],
        [6.84000000e+02, 0.00000000e+00],
        [7.13000000e+02, 0.00000000e+00]],

       ...,

       [[1.62000000e+02, 1.00000000e+00],
        [4.42000000e+02, 9.93432573e-01],
        [9.03000000e+02, 9.65903211e-01],
        ...,
        [1.27000000e+02, 0.00000000e+00],
     

In [115]:
# dataset_name = "pkdd1998"
# for key in cluster_plot_scores.keys():
#     cluster_id = int(key[0][0])
#     sub_cluster_id = key[1]
#     a,b,c = cluster_plot_scores[key]
#     budget =1
#     figures = plot(budget,a,b,c,key, outliers = cluster_indices[key], X_index = data_index)
#     #lo = LookOut(budget,a,b,c)
#     #print(len(lo[0]),len(lo[1]))
#    # for budget in range(5):
#    #     figures = plot(budget,a,b,c,key)
#    #     for i,figure in enumerate(figures):
#    #         fname = "assets/"+dataset_name+"/{0}-{1}-{2}-{3}-{4}.png".format("lookout",cluster_id,sub_cluster_id+1,budget+1, i+1)
#    #         figure.savefig(fname,bbox_inches = 'tight')
#    #         plt.close(figure)



In [116]:
cluster_plot_scores

{('2 clusters',
  0): (array([[[1.04300000e+03, 1.00000000e+00],
          [1.04900000e+03, 1.00000000e+00],
          [1.05200000e+03, 1.00000000e+00],
          ...,
          [1.03700000e+03, 0.00000000e+00],
          [1.03800000e+03, 0.00000000e+00],
          [1.03900000e+03, 0.00000000e+00]],
  
         [[6.10000000e+01, 1.00000000e+00],
          [7.20000000e+01, 1.00000000e+00],
          [2.14000000e+02, 1.00000000e+00],
          ...,
          [9.95000000e+02, 0.00000000e+00],
          [9.96000000e+02, 0.00000000e+00],
          [9.98000000e+02, 0.00000000e+00]],
  
         [[1.62000000e+02, 1.00000000e+00],
          [4.42000000e+02, 9.29318106e-01],
          [9.03000000e+02, 7.90062301e-01],
          ...,
          [6.48000000e+02, 0.00000000e+00],
          [6.84000000e+02, 0.00000000e+00],
          [7.13000000e+02, 0.00000000e+00]],
  
         ...,
  
         [[1.62000000e+02, 1.00000000e+00],
          [4.42000000e+02, 9.93432573e-01],
          [9.03000000e+02