In [None]:
import json
import pandas as pd
import networkx as nx
import numpy as np
from math import dist
import copy
import matplotlib.pyplot as plt
##### model
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import pairwise_distances
from stellargraph.data import BiasedRandomWalk
from stellargraph import StellarGraph
from gensim.models import Word2Vec
from sklearn.cluster import DBSCAN


import warnings
import collections
from stellargraph import datasets
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import torch
from torch_geometric.utils.convert import from_networkx
from itables import init_notebook_mode,show
# init_notebook_mode(all_interactive=True)

%matplotlib inline



In [None]:
# Lower-case event-name prefixes grouped by the category they map to.
# change_name() scans the entries in insertion order and returns the first
# category with a matching prefix, so the order of the keys is significant.
starts_with_di = {
    'CreateObject': [
        'accept', 'request', 'create', 'copy', 'run',
        'purchase', 'allocate', 'import', 'startinstances',
    ],
    'ModifyExistingResource': [
        'modify', 'update', 'set', 'tag', 'deregister',
        'deprecate', 'un', 'reject', 'register',
    ],
    'ListResources': ['list'],
    'Download/UploadObjects': ['enc', 'getobjects', 'upload'],
    'GetInfo': ['describe', 'get', 'search'],
    'AssociateResources': ['associate', 'put'],
    'Login': ['assume', 'login', 'switch', 'renewrole', 'renewdelegate', 'checkmfa'],
    'RemovePermissions': ['remove'],
    'GrantPermissions': ['add', 'authorize'],
}

# Lower-case substrings grouped by the category they map to. change_name()
# consults this mapping only after the prefix table found nothing, scanning the
# entries in insertion order and comparing against the lower-cased event name,
# so every fragment here must itself be lower-case.
contains_di = {
    'Delete': ['delet', 'terminate', 'revoke', 'drop', 'releaseaddress'],
    'DisableObjects': ['disabl', 'stop', 'cancel', 'unlink', 'suspend'],
    'EnableObjects': ['enabl', 'start', 'invoke', 'subscribe', 'test', 'complete'],
    'SensitiveInfo': ['send', 'accesskey', 'secretkey', 'token', 'invite', 'exportapi', 'decode'],
    # 'Logout' used to appear twice in this literal; the second occurrence
    # silently replaced the first, so 'exit' was never matched. Both fragments
    # now live under the single key (kept at its original position).
    'Logout': ['exit', 'logout'],
    'CreateObject': ['create', 'copy'],
    'ModifyExistingResource': [
        'failover', 'change', 'confirm', 'promote', 'reboot', 'rotate',
        'replace', 'retire', 'detach', 'modify', 'update', 'set', 'tag',
        # was 'Deprecate': a capitalised fragment can never occur in a
        # lower-cased event name
        'deregister', 'deprecate', 'attach', 'upgrade', 'wipe', 'transfer',
        'validate', 'publish',
    ],
    'ListResources': ['list'],
    'Download/UploadObjects': ['getobjects', 'upload'],
    'GetInfo': ['describe', 'get', 'view', 'decrypt', 'generate', 'lookup', 'scan', 'simulate'],
    'AssociateResources': ['associate', 'put'],
    'Login': ['assume', 'login', 'renewrole', 'sign', 'forgot'],
}

In [None]:

# Create a list that collects one DataFrame per input file
awsc_list = list()

# JSON files to load
files_list = ['../Datasets/attack3.json']

# Load the event-name -> category lookup consulted first by change_name().
# The context manager closes the file even when json.load raises.
with open("../Utils/event_category.json", encoding="utf-8") as category_file:
    event_categories = json.load(category_file)


In [None]:
# Read every input file and keep its 'Records' array as a DataFrame.
for path in files_list:
    with open(path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

        # Append the dataframes
        records_df = pd.DataFrame.from_records(payload['Records'])
        awsc_list.append(records_df)

In [None]:
# Stack the per-file frames into one frame with a fresh 0..n-1 index.
awsc = pd.concat(awsc_list, ignore_index=True)

In [None]:
# Total number of loaded records.
len(awsc)

In [None]:
# Inspect the records whose event source is cognito-idp.amazonaws.com.
awsc[(awsc['eventSource']=='cognito-idp.amazonaws.com')]

In [None]:
# Flatten the nested userIdentity objects into columns and attach the event
# fields of the same row (both frames are aligned on the row index).
identity_df = pd.json_normalize(awsc['userIdentity'])
event_fields = awsc[['eventTime','eventName','eventSource','awsRegion','errorCode']]
users = identity_df.merge(event_fields, left_index=True, right_index=True)

In [None]:
# Resolve an identifier for every event: prefer userName, then fall back
# through the other identity columns in order. Columns that the normalised
# identity frame does not contain are skipped.
# The original chain filled from invokedBy twice (a no-op) and ended with the
# literal string 'accountId' instead of the accountId column; the column is now
# consulted first and the literal only remains as the last-resort placeholder,
# so the identifier is never NaN.
identifier = users['userName']
for fallback_col in ('arn', 'invokedBy', 'principalId', 'accountId'):
    if fallback_col in users.columns:
        identifier = identifier.fillna(users[fallback_col])
users['userName'] = identifier.fillna('accountId')

In [None]:
# Keep only the columns used downstream and give them the names the rest of
# the notebook works with.
kept_columns = ['eventTime','type','userName','eventName','eventSource','awsRegion','errorCode']
column_aliases = {
    'eventTime': 'timestamp',
    'userName': 'Identifier',
    'eventSource': 'TargetService',
    'errorCode': 'Error',
}
users = users[kept_columns].rename(columns=column_aliases)


In [None]:
# Inspect the events attributed to the identifier 'cloud_user'.
users[users['Identifier']=='cloud_user']


In [None]:
# Event names that could not be mapped to any category (filled by change_name).
li = []

def change_name(name):
    """Map a raw event name to its category.

    Lookup order: exact match in ``event_categories``, then the first
    category in ``starts_with_di`` with a prefix of the lower-cased name,
    then the first category in ``contains_di`` with a fragment contained in
    the lower-cased name. Unmatched names are appended to the global ``li``
    and ``None`` is returned.
    """
    if name in event_categories:
        return event_categories[name]

    lowered = name.lower()
    for category, prefixes in starts_with_di.items():
        if any(lowered.startswith(prefix) for prefix in prefixes):
            return category
    for category, fragments in contains_di.items():
        if any(fragment in lowered for fragment in fragments):
            return category

    li.append(name)
    return None

users['eventName'] = users['eventName'].apply(change_name)

In [None]:
# Distinct event names that change_name() could not categorise.
set(li)

In [None]:
# Convert the timestamps to whole epoch seconds, then order events by time.
parsed_times = pd.to_datetime(users['timestamp'])
epoch_seconds = parsed_times.map(pd.Timestamp.timestamp)
users['timestamp'] = epoch_seconds.map(int)
users = users.sort_values(by='timestamp', ignore_index=True)

In [None]:
def _second_part_if_pair(identifier):
    """Return the text after ':' when the value has exactly two ':'-separated parts."""
    parts = identifier.split(":")
    if len(parts) == 2:
        return parts[1]
    return identifier

users['Identifier'] = users['Identifier'].map(_second_part_if_pair)
users.nunique()

In [None]:
# users['identity-target'] = "source-"+users['Identifier'] + "|" + "target-"+ users['TargetService']
# Edge labels have the form "<start node>|<end node>"; add_edges() splits them on '|'.
users['identity-event'] = "source-"+users['Identifier'] + "|" + "action-"+users['eventName']
users['event-target'] = "action-"+ users['eventName'] + "|" + "target-"+ users['TargetService']
# Drop the events change_name() could not categorise. The explicit copy makes
# `users` an independent frame, so the column assignments in later cells do not
# operate on a slice of the old frame (SettingWithCopyWarning).
users = users[users['eventName'].notna()].copy()

In [None]:
# Preview the frame after the edge-label columns were added.
users.head()

In [None]:
# Prefix every node value with its node type so the names match the two halves
# of the edge labels ("source-…", "action-…", "target-…").
for node_column, node_kind in (
    ('Identifier', 'source'),
    ('TargetService', 'target'),
    ('eventName', 'action'),
):
    users[node_column] = users[node_column].map(
        lambda value, node_kind=node_kind: f"{node_kind}-{value}"
    )

In [None]:
# Number of distinct target services.
len(users['TargetService'].unique())

In [None]:
def get_time_df(start,end,minutes,df,col='timestamp'):
    """Slice ``df`` into cumulative time windows.

    Every window begins at ``start`` and ends (exclusively) at a boundary that
    advances in steps of ``minutes``; each window therefore contains all rows
    of the previous one. A window is emitted only when its row count differs
    from the last emitted window (an initial empty window is never emitted).

    Parameters
    ----------
    start, end : number
        First and last value of ``col`` to cover, in seconds.
    minutes : number
        Window step in minutes; must be positive.
    df : pandas.DataFrame
        Frame to slice.
    col : str
        Column holding the time values compared against the window bounds.

    Returns
    -------
    list of pandas.DataFrame

    Raises
    ------
    ValueError
        If ``minutes`` is not positive (the loop could never terminate).
    """
    if minutes <= 0:
        raise ValueError("minutes must be positive")
    step = minutes * 60
    df_list = []
    window_start = start
    cursor = start
    emitted_len = 0
    # `<=` (not `<`): when `end` falls exactly on a window boundary the rows
    # stamped `end` are only reached by one more window.
    while cursor <= end:
        cur_df = df[(df[col] >= window_start) & (df[col] < cursor + step)]
        if len(cur_df) != emitted_len:
            df_list.append(cur_df)
            emitted_len = len(cur_df)
        cursor += step
    return df_list

In [None]:
# Window step: 30 days expressed in minutes.
WINDOW_MINUTES = 60*24*30
# `users` is sorted by timestamp, so the first/last rows bound the time range.
first_ts = users['timestamp'].iloc[0]
last_ts = users['timestamp'].iloc[-1]
df_list = get_time_df(first_ts, last_ts, WINDOW_MINUTES, users)

In [None]:
# Number of time windows that were emitted.
len(df_list)

# Create the graph

In [None]:
# Columns whose unique values become graph nodes (read by generate_graph).
nodes = [
    'Identifier',
    'eventName',
    'TargetService',
]
# Columns holding "<start>|<end>" edge labels (read by generate_graph).
edges = [
    'identity-event',
    'event-target',
]


In [None]:
def add_nodes(g,df,col,x,jump=10):
    """Add one node per unique value of ``df[col]`` to ``g``.

    Each node is named after the value's string form and receives a ``pos``
    attribute ``(x, y)``; ``y`` starts at 10 and grows by ``jump`` for every
    node added, so the nodes of one column line up vertically.

    Parameters
    ----------
    g : graph exposing ``add_node(name, pos=...)``
    df : pandas.DataFrame
    col : str
        Column whose unique values become nodes.
    x : number
        Horizontal position shared by all nodes of this column.
    jump : number
        Vertical spacing between consecutive nodes.
    """
    y = 10
    for value in df[col].unique():
        # Identity check instead of `!= None`: missing entries are skipped
        # without invoking the value's own __eq__/__ne__.
        if value is not None:
            g.add_node(f'{value}',pos=(x,y))
            y+=jump
#     for i in range(len(unique_nodes)-1):
#         g.add_edge(unique_nodes[i],unique_nodes[i+1])

def add_edges(g,df,col):
    """Add weighted edges to ``g`` from the "<start>|<end>" labels in ``df[col]``.

    The weight of an edge is the number of occurrences of its label divided by
    the count of the most frequent label. Labels whose start or end part
    contains the text "None" are skipped.
    """
    label_counts = df[col].value_counts()
    relative_weights = label_counts / label_counts.max()
    for label, weight in relative_weights.items():
        start_node, end_node = label.split('|')
        if "None" in start_node or "None" in end_node:
            continue
        g.add_edge(start_node, end_node, weight=weight)

In [None]:
def generate_graph(G,df_nodes,df_edges):
    """Populate ``G`` with nodes from ``df_nodes`` and edges from ``df_edges``.

    The node columns come from the global ``nodes`` list; the n-th column is
    placed at x = 10 + n with a vertical spacing of 100. The edge-label
    columns come from the global ``edges`` list.
    """
    for x_position, node_column in enumerate(nodes, start=10):
        add_nodes(G, df_nodes, node_column, x_position, 100)
    for edge_column in edges:
        add_edges(G, df_edges, edge_column)

In [None]:
# Undirected graph that will hold the complete event history.
G = nx.Graph()

In [None]:
# Build the full graph: nodes and edges both come from the whole `users` frame.
generate_graph(G,users,users)

In [None]:
def generate_graphs(df,df_list):
    """Build one graph per frame in ``df_list``.

    Every graph takes its nodes from ``df`` and its edges from the frame; a
    frame with the same number of rows as the previously used one is skipped.
    Returns the list of graphs in input order.
    """
    built_graphs = []
    previous_size = 0
    for window in df_list:
        if len(window) == previous_size:
            continue
        window_graph = nx.Graph()
        generate_graph(window_graph, df, window)
        built_graphs.append(window_graph)
        previous_size = len(window)
    return built_graphs

In [None]:
# One graph per time window: nodes from the full frame, edges from the window.
graphs = generate_graphs(users,df_list)

In [None]:
# Number of per-window graphs.
len(graphs)

In [None]:
# from pylab import rcParams
# rcParams['figure.figsize'] = 14, 10
# pos=nx.get_node_attributes(G,'pos')
# # pos = nx.spring_layout(G, scale=20, k=3/np.sqrt(G.order()))
# d = dict(G.degree)
# nx.draw(G, pos, node_color='lightblue', 
#         with_labels=True, 
#         nodelist=d, 
#         node_size=[d[k]*300 for k in d])
# labels = nx.get_edge_attributes(G,'weight')
# nx.draw_networkx_edge_labels(G,pos,edge_labels=labels,font_size=5)
# import matplotlib.pyplot as plt
# plt.savefig(f'plotgraph.png', dpi=300, bbox_inches='tight')


# pos=nx.get_node_attributes(G,'pos')
# nx.draw(G,pos)
# plt.show()

In [None]:
# from sklearn.manifold import SpectralEmbedding
# embedding = SpectralEmbedding(n_components=74)
# X = embedding.fit_transform(df)
walk_length = 20  # maximum length of a random walk to use throughout this notebook

In [None]:
def gen_stellar(graphs):
    """Convert each networkx graph to a StellarGraph paired with a random walker.

    Returns a list of ``(StellarGraph, BiasedRandomWalk)`` tuples, one per
    input graph, in input order.
    """
    pairs = []
    for nx_graph in graphs:
        stellar = StellarGraph.from_networkx(nx_graph)
        walker = BiasedRandomWalk(stellar)
        pairs.append((stellar, walker))
    return pairs

In [None]:
def weighted_walks_graphs(SL):
    """Run the weighted biased random walks for every ``(graph, walker)`` pair.

    Returns one collection of walks per pair, in input order. The walk length
    is read from the global ``walk_length``.
    """
    return [
        walker.run(
            nodes=stellar.nodes(),  # every node is a root node
            length=walk_length,     # maximum length of a random walk
            n=10,                   # number of random walks per root node
            p=0.5,                  # 1/p: (unnormalised) probability of returning to the source node
            q=2.0,                  # 1/q: (unnormalised) probability of moving away from the source node
            weighted=True,          # follow edge weights
            seed=42,                # fixed seed for reproducibility
        )
        for stellar, walker in SL
    ]

In [None]:
def weighted_models(WW):
    """Train one skip-gram Word2Vec model (128 dimensions) per collection of walks.

    Returns the models in input order.
    """
    return [
        Word2Vec(walks, vector_size=128, window=5, min_count=0, sg=1, workers=1)
        for walks in WW
    ]

In [None]:
def node_embedding(weighted_model_list):
    """Extract ``(node_ids, vectors)`` from every trained model.

    ``node_ids`` is the model's ``wv.index_to_key`` and ``vectors`` its
    ``wv.vectors``; the pairs are returned in input order.
    """
    return [
        (model.wv.index_to_key, model.wv.vectors)
        for model in weighted_model_list
    ]

In [None]:
def d2_graphs(embedding):
    """Project every embedding to two dimensions with t-SNE.

    Parameters
    ----------
    embedding : list of (node_ids, vectors)
        As returned by ``node_embedding``.

    Returns
    -------
    list of (node_ids, 2-D coordinates), in input order.
    """
    embedding2d_annotation = []
    for node_ids, weighted_node_embeddings in embedding:
        # t-SNE requires perplexity < number of samples; the library default
        # of 30 therefore fails for graphs with 30 nodes or fewer. Clamp it,
        # which leaves larger graphs on the default value.
        n_samples = len(weighted_node_embeddings)
        perplexity = min(30.0, max(n_samples - 1, 1))
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
        weighted_node_embeddings_2d = tsne.fit_transform(weighted_node_embeddings)
        embedding2d_annotation.append((node_ids, weighted_node_embeddings_2d))
    return embedding2d_annotation

In [None]:
# Embedding pipeline for the full-history graph G:
# StellarGraph + walker -> weighted walks -> Word2Vec -> node vectors -> 2-D t-SNE.
SL = gen_stellar([G])
WW = weighted_walks_graphs(SL)
weighted_model_list = weighted_models(WW)
embedding = node_embedding(weighted_model_list)
d2_graphs_list = d2_graphs(embedding)
# 2-D coordinates of G's nodes (row order follows d2_graphs_list[0][0]).
X = d2_graphs_list[0][1]

In [None]:
# The same embedding pipeline, applied to every per-window graph.
SL_graphs = gen_stellar(graphs)
WW_graphs = weighted_walks_graphs(SL_graphs)
weighted_model_list_graphs = weighted_models(WW_graphs)
embedding_graphs = node_embedding(weighted_model_list_graphs)
d2_graphs_li = d2_graphs(embedding_graphs)

In [None]:
# d2_graphs_li

In [None]:
# count = 0
# for j in range(len(d2_graphs_li)):
#     i = d2_graphs_li[j]
#     G1 = nx.Graph()
#     color_map = []
#     for k in range(len(i[0])):
#         if i[0][k] == "source-cloud_user":
#             color_map.append("red")
#             G1.add_node(i[0][k],pos=i[1][k])
#         else:
#             color_map.append("green")
#             G1.add_node(i[0][k],pos=i[1][k])
#     pos=nx.get_node_attributes(G1,'pos')
# #     if j == len(d2_graphs_li)-1:
#     nx.draw(G1,pos,node_color=color_map)#node_color=color_map)
# #     else:
# #         nx.draw(G1,pos)#node_color=color_map)
# #     plt.savefig(f"figure{count}.png")
#     plt.show()
#     count+=1

In [None]:
# Weighted adjacency matrix (as a float DataFrame) of every per-window graph.
df_from_graphs = [
    nx.to_pandas_adjacency(window_graph, dtype=float)
    for window_graph in graphs
]

In [None]:
# Node-by-node matrix that later receives nearest-neighbour distances.
# Initialised with float zeros: the values written into it are floats, and
# writing floats into integer columns is a deprecated implicit upcast in pandas.
distance_df = pd.DataFrame(0.0, index=embedding[0][0], columns=embedding[0][0])

In [None]:
# d2_graphs_list[0][0].index('source-cloud_user')

In [None]:
# Scatter the rows of X and label every point with its row number.
fig, ax = plt.subplots()
x_coords, y_coords = X[:, 0], X[:, 1]
ax.scatter(x_coords, y_coords)

for row_number, point in enumerate(X):
    ax.annotate(row_number, (point[0], point[1]))

In [None]:
# Rebind X to the vectors stored in embedding[0] (the embedding of G).
X = embedding[0][1]

In [None]:
# Display the embedding matrix.
X

In [None]:
from sklearn.neighbors import NearestNeighbors

# For every time window and every action node, take the source nodes linked to
# that action, find each source's nearest neighbour among them in the window's
# embedding, and record the distance.
#
# ans_di[source][window] -> largest nearest-neighbour distance observed for
#                           that source in that window.
# distance_df[source, neighbour] -> first non-zero distance recorded for the pair.
#
# Loop-local names are deliberately distinct from the notebook globals `nodes`
# (read by generate_graph) and `X`, which the earlier version of this cell
# overwrote.
ans_di = {}
for window_idx, df_graph in enumerate(df_from_graphs):
    window_ids, window_vectors = embedding_graphs[window_idx]
    action_cols = [col for col in df_graph.columns if 'action-' in col]
    for col in action_cols:
        # Rows with a positive weight are the nodes adjacent to this action.
        linked = df_graph[col].loc[df_graph[col] > 0]
        source_names = [
            key for key in linked.index
            if 'source-' in key and 'resource-' not in key
        ]
        # Row of each source inside this window's embedding matrix.
        embedding_rows = [window_ids.index(name) for name in source_names]
        if len(source_names) <= 1:
            continue  # a nearest neighbour needs at least two sources

        x = window_vectors[embedding_rows]
        nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(x)
        distances, indices = nbrs.kneighbors(x)
        for i, value in enumerate(source_names):
            # Column 0 of the result is expected to be the point itself;
            # column 1 is its closest other source.
            nearest_neighbor_ind = source_names[indices[i][1]]
            nearest_distance = distances[i][1]

            per_window = ans_di.setdefault(value, {})
            per_window[window_idx] = max(per_window.get(window_idx, 0), nearest_distance)

            if distance_df.loc[value, nearest_neighbor_ind] == 0:
                distance_df.loc[value, nearest_neighbor_ind] = nearest_distance

In [None]:
# Restrict the distance matrix to labels that mention neither 'action' nor
# 'target', on both the column and the row axis.
cols = [
    label for label in distance_df.columns
    if not ('action' in label or 'target' in label)
]

rows_to_drop = distance_df.columns.difference(cols)
distance_df = distance_df.loc[:, cols].drop(index=rows_to_drop)

In [None]:
# Graph whose edges are the non-zero entries of the distance matrix.
# NOTE(review): distance_df is filled one direction at a time and may be
# asymmetric, while this builds an undirected graph — confirm that is intended.
result = nx.from_pandas_adjacency(distance_df)

In [None]:
# 2-D t-SNE position of every source node of the full graph, keyed by node id.
layout_ids, layout_points = d2_graphs_list[0]
pos_di = {
    node_id: point
    for node_id, point in zip(layout_ids, layout_points)
    if 'source-' in node_id and 'resource-' not in node_id
}

In [None]:

# Draw the distance graph at the t-SNE positions; the two highlighted
# identities are red, every other node is green.
highlighted_nodes = {'source-cloud_user', 'source-sec-check'}
color_map = ['red' if node in highlighted_nodes else 'green' for node in result]
# pos=nx.spring_layout(result) # pos = nx.nx_agraph.graphviz_layout(G)
nx.draw(
    result,
    pos=pos_di,
    node_color=color_map,
    with_labels=True,
    node_size=60,
    font_size=8,
)
# labels = nx.get_edge_attributes(result,'weight')
# nx.draw_networkx_edge_labels(result,pos=pos)
plt.savefig("res.png")

In [None]:
# distance_df.loc['source-cloud_user']

In [None]:
# distance_df['source-cloud_user']

In [None]:
# Preview the source-by-source distance matrix.
distance_df.head()

In [None]:
# Anomaly score of a node = largest distance stored in its row.
row_maxima = distance_df.max(axis=1)
ans = row_maxima.reset_index().rename(columns={0:'Anomaly Score'})

In [None]:
# Order the scores by node name and renumber the rows.
ans = ans.sort_values(by='index', ignore_index=True)

In [None]:
# Row number of each node, used as the x-axis label in the bar chart.
ans['Node'] = ans.index

In [None]:
# Bar chart of the anomaly score per node number.
# NOTE(review): the y-axis upper limit 2.7 is hard-coded — presumably tuned to
# this dataset; confirm before reusing with other data.
ax = ans.plot.bar(x='Node',
                      y='Anomaly Score',ylim=(0,2.7))

In [None]:
# Legend for the chart: print (node number, node name) pairs.
for number_and_name in enumerate(ans['index']):
    print(number_and_name)

In [None]:
# Nodes whose score exceeds the mean by more than two standard deviations.
ans[ans['Anomaly Score'] > ans['Anomaly Score'].mean() + 2*ans['Anomaly Score'].std()]

In [None]:
# Threshold value: mean + 2 standard deviations.
ans['Anomaly Score'].mean() + 2*ans['Anomaly Score'].std()

In [None]:
# Threshold value: mean + 1 standard deviation.
ans['Anomaly Score'].mean() + 1*ans['Anomaly Score'].std()

In [None]:
# Mean anomaly score.
ans['Anomaly Score'].mean()

In [None]:
# Standard deviation squared, i.e. the variance of the anomaly scores.
ans['Anomaly Score'].std()*ans['Anomaly Score'].std()