In [None]:
import json
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
import networkx as nx
import numpy as np
from math import dist
import copy
import matplotlib.pyplot as plt
##### model
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import pairwise_distances
from stellargraph.data import BiasedRandomWalk
from stellargraph import StellarGraph
from gensim.models import Word2Vec
from sklearn.cluster import DBSCAN


import warnings
import collections
from stellargraph import datasets
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import torch
from torch_geometric.utils.convert import from_networkx


%matplotlib inline



In [None]:

# Create a list to collect one DataFrame per input file
awsc_list = list()

# Input log files (paths are relative to the notebook's working directory)
files_list = ['../Datasets/attack3.json']

# Load the event-category mapping. The context manager closes the handle even
# when json.load raises, which a manual open()/close() pair does not.
with open("../Utils/event_category.json") as category_file:
    event_categories = json.load(category_file)

# Encoders are only instantiated here; nothing is fitted in this cell.
encoder = OneHotEncoder(handle_unknown='ignore')
label_encoder = LabelEncoder()

In [None]:
# Start from an empty list so that re-running this cell does not append the
# same records a second time (the list object itself is reused).
awsc_list.clear()
for file in files_list:
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # One DataFrame per file, built from its top-level 'Records' array
    awsc_list.append(pd.DataFrame.from_records(data['Records']))

In [None]:
awsc = pd.concat(awsc_list).reset_index(drop=True)

In [None]:
awsc[(awsc['eventSource']=='cognito-idp.amazonaws.com')]

In [None]:
users = pd.json_normalize(awsc['userIdentity']).merge(awsc[['eventTime','eventName','eventSource','awsRegion','errorCode']], left_index=True,right_index=True)

In [None]:
# Resolve one identifier per event, in order of preference:
# userName -> arn -> invokedBy -> principalId -> accountId column.
identifier = users['userName'].fillna(users['arn']).fillna(users['invokedBy']) \
    .fillna(users['principalId'])
# Fall back to the accountId *column* when the normalized data has one; filling
# with the bare string 'accountId' alone would merge every otherwise
# unidentified caller into a single node.
if 'accountId' in users.columns:
    identifier = identifier.fillna(users['accountId'])
# Last-resort sentinel so later string operations never see a missing value.
users['userName'] = identifier.fillna('accountId')

In [None]:
users = users[['eventTime','type','userName','eventName','eventSource','awsRegion','errorCode']]
users = users.rename(columns = {'eventTime':'timestamp','userName':'Identifier','eventSource':'TargetService','errorCode':'Error'})


In [None]:
users.head()


In [None]:
users['timestamp'] = pd.to_datetime(users['timestamp']).map(pd.Timestamp.timestamp).map(int)
users = users.sort_values('timestamp',ascending=True,ignore_index=True)

In [None]:
users['Identifier'] = users['Identifier'].apply(lambda x : x if (len(x.split(":"))!=2) else x.split(":")[1])
users.nunique()

In [None]:
users['identity-target'] = "source-"+users['Identifier'] + "|" + "target-"+ users['TargetService']

In [None]:
users.head()

In [None]:
users['Identifier'] = users['Identifier'].apply(lambda x : f"source-{x}")
users['TargetService'] = users['TargetService'].apply(lambda x : f"target-{x}")

In [None]:
def get_time_df(start,end,minutes,df,col='timestamp'):
    """Return cumulative snapshots of ``df`` over growing time windows.

    Every snapshot starts at ``start``; the upper bound advances by
    ``minutes`` each step. A snapshot is kept only when its row count differs
    from the previously kept one, so steps that add no rows are skipped.

    Args:
        start: Lower bound (inclusive) shared by all snapshots, in the same
            unit as ``df[col]`` (the window length is ``minutes * 60``).
        end: Last value that must be covered by the final snapshot.
        minutes: Window step in minutes; must be positive.
        df: DataFrame to slice.
        col: Name of the column holding the time values.

    Returns:
        list of DataFrames, each a filtered view of ``df``.

    Raises:
        ValueError: if ``minutes`` is not positive (the loop could not advance).
    """
    if minutes <= 0:
        raise ValueError("minutes must be positive")
    window = minutes * 60
    snapshots = []
    last_len = 0
    upper = start
    # `<=` (not `<`) so a row sitting exactly on `end` is still covered when the
    # span is a whole number of windows, or when start == end.
    while upper <= end:
        upper += window
        snapshot = df[(df[col] >= start) & (df[col] < upper)]
        if len(snapshot) != last_len:
            snapshots.append(snapshot)
            last_len = len(snapshot)
    return snapshots

In [None]:
df_list = get_time_df(users['timestamp'].iloc[0],users['timestamp'].iloc[-1],60*24*30,users)

In [None]:
len(df_list)

# Create the graph

In [None]:
# Column names read as module-level globals by generate_graph:
# each column listed in `nodes` contributes one layer of graph nodes, and each
# column listed in `edges` holds "start|end" labels that add_edges turns into edges.
nodes = ['Identifier','TargetService']
edges = ['identity-target']


In [None]:
def add_nodes(g,df,col,x,jump=10):
    """Add every distinct, non-missing value of ``df[col]`` to ``g`` as a node.

    Nodes are named with the string form of the value and stacked vertically:
    each gets a ``pos`` attribute ``(x, y)`` where ``y`` starts at 10 and grows
    by ``jump`` for every node added.

    Args:
        g: Graph-like object exposing ``add_node(name, pos=...)``.
        df: DataFrame holding the node labels.
        col: Column of ``df`` to read the labels from.
        x: Horizontal coordinate shared by all nodes from this column.
        jump: Vertical spacing between consecutive nodes.
    """
    y = 10
    for label in df[col].unique():
        # pd.isna covers None, NaN and pd.NA; a plain `!= None` comparison lets
        # NaN through as a bogus "nan" node and raises on pd.NA.
        if pd.isna(label):
            continue
        g.add_node(f'{label}',pos=(x,y))
        y += jump
#     for i in range(len(unique_nodes)-1):
#         g.add_edge(unique_nodes[i],unique_nodes[i+1])

def add_edges(g,df,col):
    """Add a weight-1 edge to ``g`` for every distinct ``"start|end"`` label in ``df[col]``.

    Labels are split on ``'|'`` into the two endpoint names. A label is
    skipped when either endpoint name contains the text ``"None"``.
    """
    for label in df[col].value_counts().index:
        start_node, end_node = label.split('|')
        if "None" in start_node or "None" in end_node:
            continue
        g.add_edge(start_node, end_node, weight=1)

In [None]:
def generate_graph(G,df_nodes,df_edges):
    """Populate ``G`` with layered nodes, weight-0 scaffold edges and weight-1 observed edges.

    Reads the module-level lists ``nodes`` and ``edges`` for the column names.

    Steps:
      1. For each column in ``nodes``, add its values from ``df_nodes`` via
         ``add_nodes``; layers are placed at x = 10, 11, ... with a vertical
         spacing of 100.
      2. Connect every value of one layer to every value of the next layer
         with ``weight=0``.
      3. For each column in ``edges``, add the edges found in ``df_edges``
         via ``add_edges``.
    """
    layers = []
    for offset, node_col in enumerate(nodes):
        add_nodes(G, df_nodes, node_col, 10 + offset, 100)
        layers.append(df_nodes[node_col].unique())

    # Full bipartite scaffold between each pair of consecutive layers.
    for left_layer, right_layer in zip(layers, layers[1:]):
        for left in left_layer:
            for right in right_layer:
                G.add_edge(left, right, weight=0)

    for edge_col in edges:
        add_edges(G, df_edges, edge_col)

In [None]:
# Graph over the whole dataset: nodes and edges both come from the full `users` frame.
G = nx.Graph()

In [None]:
# generate_graph mutates G in place and returns nothing.
generate_graph(G,users,users)

In [None]:
def generate_graphs(df,df_list):
    """Build one graph per frame in ``df_list``.

    Each graph takes its nodes from ``df`` and its edges from the
    corresponding frame, via ``generate_graph``. Returns the graphs as a list
    in the same order as ``df_list``.
    """
    def _build(edge_df):
        graph = nx.Graph()
        generate_graph(graph, df, edge_df)
        return graph

    return [_build(edge_df) for edge_df in df_list]

In [None]:
# One graph per time slice: nodes always come from the full `users` frame, edges from the slice.
graphs = generate_graphs(users,df_list)

In [None]:
# from pylab import rcParams
# rcParams['figure.figsize'] = 14, 10
# pos=nx.get_node_attributes(G,'pos')
# # pos = nx.spring_layout(G, scale=20, k=3/np.sqrt(G.order()))
# d = dict(G.degree)
# nx.draw(G, pos, node_color='lightblue', 
#         with_labels=True, 
#         nodelist=d, 
#         node_size=[d[k]*300 for k in d])
# labels = nx.get_edge_attributes(G,'weight')
# nx.draw_networkx_edge_labels(G,pos,edge_labels=labels,font_size=5)
# import matplotlib.pyplot as plt
# plt.savefig(f'plotgraph.png', dpi=300, bbox_inches='tight')


# pos=nx.get_node_attributes(G,'pos')
# nx.draw(G,pos)
# plt.show()

In [None]:
df = nx.to_pandas_adjacency(G, dtype=float)

In [None]:
df.head()

In [None]:
from sklearn.manifold import SpectralEmbedding
embedding = SpectralEmbedding(n_components=74)
X = embedding.fit_transform(df)

In [None]:
X

In [None]:
import umap
# Reduce the embedding X to two dimensions with UMAP.
umap_2d = umap.UMAP(n_components=2)
X_umap_2d = umap_2d.fit_transform(X)

# Scatter the projected points and label each one with its row number in X.
fig, ax = plt.subplots()
ax.scatter(X_umap_2d[:, 0], X_umap_2d[:, 1])
for i, (x, y) in enumerate(X_umap_2d):
    ax.text(x, y, i)
ax.set_title('2D UMAP visualization')
plt.show()

In [None]:
df_from_graphs = []
for graph in graphs:
    df_from_graphs.append(nx.to_pandas_adjacency(graph, dtype=float))

In [None]:
distance_df = df.replace(1,0).copy(deep=True)

In [None]:
distance_df

In [None]:
from sklearn.neighbors import NearestNeighbors
# Multiplier applied to every recorded distance (currently a no-op factor of 1).
a = 1
for df_graph in df_from_graphs:
#     a+=1
    target_cols = [col for col in df_graph.columns if 'target' in col]
    for col in target_cols:
        # Labels of the nodes connected (weight 1) to this target in the window.
        # Deliberately NOT called `nodes`: that module-level name is read by
        # generate_graph and must stay a list of column names.
        linked_labels = df_graph[col].loc[df_graph[col]==1].index.tolist()
        if len(linked_labels) > 1:
            # X was fitted on `df`, so rows of X are looked up through df's index.
            embedding_rows = [df.index.get_loc(label) for label in linked_labels]
            x = X[embedding_rows]
            nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(x)
            distances, indices = nbrs.kneighbors(x)
            for i, label in enumerate(linked_labels):
                # Column 0 of the result is taken to be the point itself, so
                # column 1 is the closest other node sharing this target.
                neighbor_label = linked_labels[indices[i][1]]
                # Only the first distance recorded for a pair is kept.
                if distance_df.loc[label,neighbor_label] ==  0:
                    distance_df.loc[label,neighbor_label] = distances[i][1]*a

In [None]:
# Keep only the labels that do not contain 'target', on both axes.
cols = [name for name in distance_df.columns if 'target' not in name]

other_labels = distance_df.columns.difference(cols)
distance_df = distance_df.drop(index=other_labels)[cols]

In [None]:
result = nx.from_pandas_adjacency(distance_df)

In [None]:
color_map = ['red' if node == 'source-cloud_user' else 'green' for node in result]        
# pos=nx.spring_layout(result) # pos = nx.nx_agraph.graphviz_layout(G)
nx.draw(result,node_color=color_map)
# labels = nx.get_edge_attributes(result,'weight')
# nx.draw_networkx_edge_labels(result,pos=pos)
plt.savefig("res.png")

In [None]:
distance_df

In [None]:
ans = distance_df.T.max().reset_index().rename(columns={0:'Anomaly Score'})

In [None]:
ans.sort_values('index',inplace = True,ignore_index=True)

In [None]:
# a, b = 0, 2.5
# x, y = ans['Anomaly Score'].min(), ans['Anomaly Score'].max()
# ans['Anomaly Score'] = (ans['Anomaly Score'] - x) / (y - x) * (b - a) + a

In [None]:
ans['Node'] = ans.index

In [None]:
ax = ans.plot(kind = "bar",x='Node',
                      y='Anomaly Score',ylim=(0,2.5))

In [None]:
for i in enumerate(ans['index']):
    print(i)

In [None]:
ans['Anomaly Score'].std()

In [None]:
ans['Anomaly Score'].mean()

In [None]:
ans[ans['Anomaly Score'] > ans['Anomaly Score'].mean() + 2*ans['Anomaly Score'].std()] 

In [None]:
ans['Anomaly Score'].mean()

In [None]:
ans['Anomaly Score'].mean() + 2*ans['Anomaly Score'].std()

In [None]:
ans['Anomaly Score'].mean() + 1*ans['Anomaly Score'].std()

In [None]:
ans['Anomaly Score'].std()*ans['Anomaly Score'].std()