In [None]:
#--------------#
### Build the model ###
#--------------#

In [None]:
# Dataset selection: the graph to process and where its files are stored.
root_path = 'data/'
graph_name = 'GDELT'
# Sub-directory of root_path holding this dataset's files (keeps the trailing slash).
dataset = '{}/'.format(graph_name)

In [None]:
# load graph

import pickle
from graph import Graph
# Build the graph for the selected dataset and persist it to
# <root_path><dataset>graph_new.pickle.
graph = Graph(graph_name, idify=False)
# A context manager guarantees the pickle file is flushed and closed, even if
# serialisation fails part-way (the bare open() call leaked the handle).
with open(root_path + dataset + "graph_new.pickle", "wb") as graph_file:
    pickle.dump(graph, graph_file)
# 4m

In [None]:
# create a Searcher object to search for a model (set of rules) and build the raw model
from searcher import Searcher
# Search the graph for a static model and persist it to
# <root_path><dataset>static_model_new.pickle before printing its statistics.
searcher = Searcher(graph)
model = searcher.build_model()
# A context manager guarantees the pickle file is flushed and closed, even if
# serialisation fails part-way (the bare open() call leaked the handle).
with open(root_path + dataset + "static_model_new.pickle", "wb") as model_file:
    pickle.dump(model, model_file)
model.print_stats()
#1m40s

In [None]:
# build the temporal model
# Derive the temporal model from the static one and persist it to
# <root_path><dataset>temporal_model_new.pickle.
temporal_model, candidate_p, candidate_t = searcher.build_temporal_model(model)
# A context manager guarantees the pickle file is flushed and closed, even if
# serialisation fails part-way (the bare open() call leaked the handle).
with open(root_path + dataset + "temporal_model_new.pickle", "wb") as temporal_model_file:
    pickle.dump(temporal_model, temporal_model_file)
# NOTE(review): the static model is printed with `model.print_stats()` (no
# argument) while here the object is also passed explicitly — confirm that the
# temporal model's print_stats really expects this extra argument.
temporal_model.print_stats(temporal_model)
#10m

In [None]:
### directly read preserved files

import pickle
# NOTE(review): this cell selects 'YAGO' whereas the parameter cell at the top
# of the notebook selects 'GDELT' — confirm which dataset is intended before
# running both cells in one session.
graph_name = 'YAGO'
root_path = 'data/'
dataset = graph_name+'/'

# Reload the artefacts written by the build cells. Each file is opened in a
# context manager so the handle is closed as soon as the object is loaded.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced locally by this notebook.
with open(root_path + dataset + "graph_new.pickle", "rb") as graph_file:
    graph = pickle.load(graph_file)
with open(root_path + dataset + "static_model_new.pickle", "rb") as model_file:
    model = pickle.load(model_file)
with open(root_path + dataset + "temporal_model_new.pickle", "rb") as temporal_model_file:
    temporal_model = pickle.load(temporal_model_file)


In [None]:
# initialize the detection function
import pickle
from anomaly_detector import AnomalyDetector
from model_updater import ModelUpdater

# Anomaly detector wrapping the temporal model built or loaded in an earlier cell.
detector = AnomalyDetector(temporal_model)

def read_file(input_file):
    """Parse tab-separated facts ``s<TAB>r<TAB>o<TAB>t`` into integer tuples.

    Args:
        input_file: an open text file (or any iterable of text lines). Only
            the first four tab-separated columns of each line are used; any
            further columns are ignored.

    Returns:
        A list of ``(s, r, o, t)`` tuples of ints, in input order.

    Raises:
        ValueError: if a non-blank line has fewer than four columns or one of
            the first four columns is not an integer.
    """
    raw_data = []
    # Iterate the file lazily instead of materialising it with readlines().
    for line in input_file:
        line = line.strip()
        if not line:
            # Blank lines carry no fact; they used to crash the 4-way
            # unpacking below with a ValueError.
            continue
        s, r, o, t = line.split('\t')[:4]
        raw_data.append((int(s), int(r), int(o), int(t)))
    return raw_data

# Load the genuine validation/test facts. Each file is opened in a context
# manager so its handle is closed right after reading (the bare open() calls
# leaked them).
with open(root_path + dataset + 'valid.txt', 'r') as valid_file:
    valid_pos = read_file(valid_file)
with open(root_path + dataset + 'test.txt', 'r') as test_file:
    test_pos = read_file(test_file)

# Load the pre-generated anomalies; every pickle unpacks into a (valid, test) pair.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(root_path + dataset + '/conceptual_errors.pickle', 'rb') as conceptual_file:
    valid_t_2_C_neg, test_t_2_C_neg = pickle.load(conceptual_file)
with open(root_path + dataset + '/time_errors.pickle', 'rb') as time_file:
    valid_t_2_T_neg, test_t_2_T_neg = pickle.load(time_file)
with open(root_path + dataset + '/missing_errors.pickle', 'rb') as missing_file:
    valid_t_2_M_neg, test_t_2_M_neg = pickle.load(missing_file)

# Group the genuine facts by timestamp: {t: [(s, r, o, t), ...]}.
# Plain dicts are kept on purpose so that looking up an unknown timestamp
# later still raises KeyError.
valid_t_2_pos = {}
test_t_2_pos = {}
for s, r, o, t in valid_pos:
    valid_t_2_pos.setdefault(t, []).append((int(s), int(r), int(o), int(t)))

for s, r, o, t in test_pos:
    test_t_2_pos.setdefault(t, []).append((int(s), int(r), int(o), int(t)))

In [None]:
# detect concept+time+missing anomaly
# next_step_rules_list = list(next_step_rules)
# for rule_pair in next_step_rules_list[:max_rule]:
# #for rule_pair in rules.keys():
        #    self.updater.update_temporal_model(self.temporal_model, rule_pair[0], rule_pair[1], rules[rule_pair], sample_size)

import itertools
import numpy as np
from tqdm import tqdm
import multiprocessing as mp
from multiprocessing.dummy import freeze_support, Manager
from sklearn import metrics
from sklearn.metrics import precision_recall_curve, accuracy_score, f1_score

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``.

    Args:
        z: a scalar or numpy array of raw scores.

    Returns:
        The element-wise logistic of ``z`` (same shape as ``z``).
    """
    # For very negative z, exp(-z) overflows to inf and the quotient becomes
    # exactly 0.0 — the correct limit. Silence only that overflow warning so
    # extreme scores no longer flood the output with RuntimeWarnings; the
    # returned values are unchanged.
    with np.errstate(over='ignore'):
        return 1/(1 + np.exp(-z))

def get_f1(p, r):
    """Return the F1 score (harmonic mean) of precision ``p`` and recall ``r``.

    Yields 0 when ``p + r`` is zero, avoiding a division by zero.
    """
    total = p + r
    return 2*(p*r)/total if total != 0 else 0

def get_f05(p, r):
    """Return the F0.5 score of precision ``p`` and recall ``r``.

    F0.5 = 1.25 * p * r / (0.25 * p + r); it weights precision more heavily
    than recall. Yields 0 when ``p + r`` is zero.
    """
    if (p + r) != 0:
        return 1.25*(p*r)/(0.25*p+r)
    return 0
        
def get_metric(pred, y, anomaly_type):
    """Print and return threshold-selection metrics for one anomaly type.

    Two operating points are picked on the precision/recall curve:

    * ``fscore``: the point with the highest F0.5 score. This is the point
      whose precision and F0.5 are printed (the printed heading still reads
      "Best result by F1-score").
    * ``precision``: a point whose precision is at least 0.80, chosen as
      described in the inline comments below.

    Args:
        pred: numpy array of scores; a larger score means "more likely label 1".
        y: numpy array of binary ground-truth labels (1 marks an anomaly).
        anomaly_type: text appended to the report header.

    Returns:
        dict with the AUC and, for both operating points, precision, recall,
        F1, threshold and accuracy (plus F0.5 for the first point). Callers
        that ignore the return value are unaffected.
    """
    precision, recall, threshold = precision_recall_curve(y, pred)
    n_points = len(precision)

    def threshold_at(index):
        # precision/recall hold one more entry than threshold (the final
        # recall-0 point has no threshold of its own), so clamp the lookup
        # instead of risking an IndexError.
        return threshold[min(index, len(threshold) - 1)]

    f1s = [get_f1(precision[i], recall[i]) for i in range(n_points)]
    f05s = [get_f05(precision[i], recall[i]) for i in range(n_points)]

    # --- Operating point 1: the threshold that maximises F0.5 ---------------
    optimal_index = np.argmax(f05s)
    # Disabled alternative kept from the original code: repeatedly zero the
    # F0.5 of points with precision == 1.0 and re-take the argmax.

    fpr, tpr, _ = metrics.roc_curve(y, pred, pos_label=1)
    auc = metrics.auc(fpr, tpr)

    optimal_P_by_fscore = precision[optimal_index]
    optimal_R_by_fscore = recall[optimal_index]
    optimal_F1_by_fscore = f1s[optimal_index]
    optimal_F05_by_fscore = f05s[optimal_index]
    optimal_T_by_fscore = threshold_at(optimal_index)
    # precision_recall_curve treats "score >= threshold" as a positive
    # prediction, so the accuracy must use >= to describe the same point.
    optimal_ACC_by_fscore = accuracy_score(y, pred >= optimal_T_by_fscore)

    print('----- Result stats -----' + anomaly_type)
    print('Number of test samples: ' + str(len(y)))
    print('Number of anomaly samples: ' + str(sum(y)))
    print('-----------------------')
    print('Best result by F1-score:')
    print('P: ' + str(optimal_P_by_fscore))
    print('F05: ' + str(optimal_F05_by_fscore))
    print('AUC: ' + str(auc))

    # --- Operating point 2: precision floor of 0.80 -------------------------
    # Start from the lowest precision that is still >= 0.80 (entries below the
    # floor are masked with 10 so argmin skips them).
    filter_precision = np.array(precision)
    filter_precision[filter_precision < 0.80] = 10
    optimal_index = np.argmin(filter_precision)
    if precision[optimal_index] == 1.0:
        # Only perfect-precision points met the floor: fall back to the
        # highest precision that is strictly below 1.0.
        filter_precision = np.array(precision)
        filter_precision[filter_precision == 1.0] = 0
        optimal_index = np.argmax(filter_precision)

    # The first point with recall >= 0.30 and precision >= 0.80, if any,
    # overrides the choice made above.
    for ind in range(n_points):
        if recall[ind] >= 0.30 and precision[ind] >= 0.80:
            optimal_index = ind
            break

    optimal_P_by_acc = precision[optimal_index]
    optimal_R_by_acc = recall[optimal_index]
    optimal_F1_by_acc = f1s[optimal_index]
    optimal_T_by_acc = threshold_at(optimal_index)
    optimal_ACC_by_acc = accuracy_score(y, pred >= optimal_T_by_acc)

    return {
        'auc': auc,
        'fscore': {
            'precision': optimal_P_by_fscore,
            'recall': optimal_R_by_fscore,
            'f1': optimal_F1_by_fscore,
            'f05': optimal_F05_by_fscore,
            'threshold': optimal_T_by_fscore,
            'accuracy': optimal_ACC_by_fscore,
        },
        'precision': {
            'precision': optimal_P_by_acc,
            'recall': optimal_R_by_acc,
            'f1': optimal_F1_by_acc,
            'threshold': optimal_T_by_acc,
            'accuracy': optimal_ACC_by_acc,
        },
    }

# init parameters
# The trailing comment on each setting lists the value used per dataset.

# --- scoring settings passed to the detector ---
hop = 2  # ICEWS14 2; ICEWS05 2; YAGO 10 GD 2
step = 2  # ICEWS14 2; ICEWS05 2; YAGO 1 GD 2
max_rule = 10000  # ICEWS14 10000; ICEWS05 10000; YAGO 10000 GD 10000
and_or = 'and'  # ICEWS14 or; ICEWS05 and; YAGO or GD and
time_specific = False  # ICEWS14 F; ICEWS05 F; YAGO F
distribution_aware = False  # ICEWS14 T; ICEWS05 F; YAGO F GD F
aux_score = True  # ICEWS14 T; ICEWS05 T; YAGO F GD F
aux_score_m = False  # ICEWS14 F; ICEWS05 F; YAGO F GD F

# --- time spans: concept scoring, temporal scoring, missing-fact scoring ---
span = 2000  # ICEWS14 1000; ICEWS05 2000; YAGO 50 GD 12000
span_t = 2000  # ICEWS14 200; ICEWS05 2000; YAGO 50 GD 12000
span_m = 2000  # ICEWS14 150; ICEWS05 2000; YAGO 50 GD 12000

# --- settings passed to detector.update ---
update_sample_size = 10  # ICEWS14 10; ICEWS05 10; YAGO 20 GD 5
update_span = 3000  # ICEWS14 200; ICEWS05 3000; YAGO 20 GD 12000

# --- accumulators over all evaluated timestamps (scores and labels) ---
pred_list_C, label_list_C = [], []  # for concept anomaly
pred_list_T, label_list_T = [], []  # for time anomaly
pred_list_M, label_list_M = [], []  # for missing anomaly

# --- accumulators holding one list per evaluated timestamp ---
time_specific_pred_list_C, time_specific_label_list_C = [], []
time_specific_pred_list_T, time_specific_label_list_T = [], []
time_specific_pred_list_M, time_specific_label_list_M = [], []

# (fact, projected rules) pairs collected for the model update of the current timestamp
all_proj_rules = []
def update_helper_valid(m):
    """Score one validation fact temporally, then update the detector with it.

    Args:
        m: a fact ``(s, r, o, t)``; the four fields are cast to int for
            scoring, while ``m`` itself is handed unchanged to ``detector.update``.
    """
    fact = tuple(int(m[i]) for i in range(4))
    # Only the projected rules are needed here; the raw score is discarded.
    _, project_rules = detector.score_edge_temporal(
        fact,
        hop=hop,
        max_span=span_t,
        and_or=and_or,
        max_step=step,
        aux_score=aux_score,
        dsa=distribution_aware,
        max_rule=max_rule,
        file_type='valid',
    )
    detector.update(m, project_rules, update_sample_size, update_span)

def update_helper_test(x_y):
    """Update the detector with one already-scored test fact.

    Args:
        x_y: a pair ``[fact, project_rules]``; both elements are forwarded
            unchanged to ``detector.update``.
    """
    fact, project_rules = x_y[0], x_y[1]
    detector.update(fact, project_rules, update_sample_size, update_span)

def concept_detect_helper(fact_label_t):
    """Score one fact for a conceptual anomaly.

    Args:
        fact_label_t: ``[fact, label, t]``. The fact's first three fields and
            ``t`` are cast to int and form the scored quadruple.

    Returns:
        ``[pred, label]`` where ``pred`` is the sigmoid of the detector's raw score.
    """
    m, label, t = fact_label_t[0], fact_label_t[1], fact_label_t[2]
    quad = (int(m[0]), int(m[1]), int(m[2]), int(t))
    raw_score = detector.score_edge(
        quad,
        hop=hop,
        max_span=span,
        and_or=and_or,
        time_specific=time_specific,
    )
    return [sigmoid(raw_score), label]

def temporal_detect_helper(fact_label_t):
    """Score one fact for a temporal anomaly.

    Args:
        fact_label_t: ``[fact, label, t]``. The fact's first three fields and
            ``t`` are cast to int and form the scored quadruple.

    Returns:
        ``[pred, label, fact, project_rules]``: the sigmoid of the raw score,
        the given label, the original fact object and the rules returned by
        the detector alongside the score.
    """
    m, label, t = fact_label_t[0], fact_label_t[1], fact_label_t[2]
    quad = (int(m[0]), int(m[1]), int(m[2]), int(t))
    raw_score, project_rules = detector.score_edge_temporal(
        quad,
        hop=hop,
        max_span=span_t,
        and_or=and_or,
        max_step=step,
        aux_score=aux_score,
        dsa=distribution_aware,
        max_rule=max_rule,
        file_type='test',
    )
    return [sigmoid(raw_score), label, m, project_rules]

def missing_detect_helper(fact_label_t):
    """Score one fact for a missing-fact anomaly.

    Args:
        fact_label_t: ``[fact, label, t]``. The fact's first three fields and
            ``t`` are cast to int and form the scored quadruple.

    Returns:
        ``[pred, label]`` where ``pred`` is the sigmoid of the *negated* raw score.
    """
    m, label, t = fact_label_t[0], fact_label_t[1], fact_label_t[2]
    quad = (int(m[0]), int(m[1]), int(m[2]), int(t))
    raw_score = detector.score_edge_missing(
        quad,
        hop_c=hop,
        max_span_c=span,
        max_span_t=span_m,
        and_or=and_or,
        max_step=step,
        aux_score=aux_score_m,
        dsa=distribution_aware,
    )
    # The sign is flipped before squashing, unlike the concept/temporal helpers.
    return [sigmoid(-raw_score), label]

def _preds_and_labels(rows):
    """Split helper results ``[pred, label, ...]`` into parallel pred/label lists."""
    return [row[0] for row in rows], [row[1] for row in rows]


# Worker pool used for all scoring and update calls below.
# NOTE(review): mp.Pool runs the helpers in separate worker processes, so the
# detector.update(...) calls made through update_helper_test presumably change
# only the copy of `detector` living in whichever worker handled the item —
# confirm that the online model update is really visible to later scoring.
p = mp.Pool(50)
try:
    # update model in validate set
    # NOTE(review): the update call is commented out, so this loop currently
    # only walks the validation timestamps and draws a progress bar.
    for t in tqdm(valid_t_2_pos.keys()):
        pos_samples = valid_t_2_pos[t]
        #p.map(update_helper_valid, pos_samples)

    # detect anomalies in test set
    # NOTE(review): only the first 10 timestamps are evaluated — confirm this
    # limit is intended and not a leftover from debugging.
    for t in tqdm(list(test_t_2_C_neg.keys())[:10]):
        pos_samples = test_t_2_pos[t]
        pos_missing = test_t_2_M_neg[1][t]

        # Genuine facts get label 0 for the concept and time detectors; the
        # facts taken from test_t_2_M_neg[1] get label 1 for the missing detector.
        concept_temporal_p = [[fact, 0, t] for fact in pos_samples]
        missing_p = [[fact, 1, t] for fact in pos_missing]
        concept_score_label_p = p.map(concept_detect_helper, concept_temporal_p)
        temporal_score_label_proj_p = p.map(temporal_detect_helper, concept_temporal_p)
        missing_score_label_p = p.map(missing_detect_helper, missing_p)

        # Keep (fact, projected rules) of the genuine facts only; they drive
        # the model update at the end of this timestamp.
        all_proj_rules.extend([row[2], row[3]] for row in temporal_score_label_proj_p)

        # Injected concept/time errors get label 1; the facts taken from
        # test_t_2_M_neg[0] get label 0 for the missing detector.
        concept_n = [[fact, 1, t] for fact in test_t_2_C_neg[t]]
        temporal_n = [[fact, 1, t] for fact in test_t_2_T_neg[t]]
        missing_n = [[fact, 0, t] for fact in test_t_2_M_neg[0][t]]
        concept_score_label_n = p.map(concept_detect_helper, concept_n)
        temporal_score_label_proj_n = p.map(temporal_detect_helper, temporal_n)
        missing_score_label_n = p.map(missing_detect_helper, missing_n)

        # Scores/labels of this timestamp, first-group results before second-group results.
        tmp_pred_C, tmp_label_C = _preds_and_labels(concept_score_label_p + concept_score_label_n)
        tmp_pred_T, tmp_label_T = _preds_and_labels(temporal_score_label_proj_p + temporal_score_label_proj_n)
        tmp_pred_M, tmp_label_M = _preds_and_labels(missing_score_label_p + missing_score_label_n)

        # Accumulate over all timestamps ...
        pred_list_C.extend(tmp_pred_C)
        label_list_C.extend(tmp_label_C)
        pred_list_T.extend(tmp_pred_T)
        label_list_T.extend(tmp_label_T)
        pred_list_M.extend(tmp_pred_M)
        label_list_M.extend(tmp_label_M)

        # ... and keep one list per timestamp as well.
        time_specific_pred_list_C.append(tmp_pred_C)
        time_specific_label_list_C.append(tmp_label_C)
        time_specific_pred_list_T.append(tmp_pred_T)
        time_specific_label_list_T.append(tmp_label_T)
        time_specific_pred_list_M.append(tmp_pred_M)
        time_specific_label_list_M.append(tmp_label_M)

        # update model in test set
        p.map(update_helper_test, all_proj_rules)
        all_proj_rules = []

    get_metric(np.array(pred_list_C), np.array(label_list_C), 'concept') # concept
    get_metric(np.array(pred_list_T), np.array(label_list_T), 'time') # time
    get_metric(np.array(pred_list_M), np.array(label_list_M), 'missing') # missing
    detector.re_fresh()
finally:
    # Always release the 50 workers, even when scoring fails part-way;
    # join() waits until they have actually exited.
    p.close()
    p.join()
# 19min