# Extract features from the dataset

In [None]:
import json
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
import networkx as nx
import numpy as np

import copy
import matplotlib.pyplot as plt
##### model
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import pairwise_distances
from stellargraph.data import BiasedRandomWalk
from stellargraph import StellarGraph
from gensim.models import Word2Vec
from sklearn.cluster import DBSCAN


import warnings
import collections
from stellargraph import datasets
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import torch
from torch_geometric.utils.convert import from_networkx


%matplotlib inline



In [None]:

# Create a list to collect one dataframe per input file
awsc_list = []

# List of log files to load
files_list = ['../Datasets/flaws_cloudtrail00.json']

# Load the explicit event-name -> category lookup used by change_name().
# The context manager closes the handle even if json.load raises.
with open("../Utils/event_category.json", encoding='utf-8') as category_file:
    event_categories = json.load(category_file)

# Encoders for categorical columns (only referenced by commented-out code
# in the visible part of this notebook).
encoder = OneHotEncoder(handle_unknown='ignore')
label_encoder = LabelEncoder()

In [None]:
# Category -> prefixes.  change_name() lower-cases the event name and returns
# the first category (in insertion order) owning a prefix the name starts
# with, so every pattern here must be lower-case.
starts_with_di = {
    'CreateObject': ['request', 'create', 'copy', 'run', 'purchase', 'allocate', 'import'],
    'ModifyExistingResource': ['modify', 'update', 'set', 'tag', 'deregister', 'deprecate', 'un', 'reject', 'register'],
    'ListResources': ['list'],
    'Download/UploadObjects': ['getobjects', 'upload'],
    'GetInfo': ['describe', 'get', 'search'],
    'AssociateResources': ['associate', 'put'],
    'Login': ['assume', 'login', 'switch', 'renewrole', 'renewdelegate'],
    'RemovePermissions': ['remove'],
    'GrantPermissions': ['add', 'authorize'],
}

# Category -> substrings, consulted only when no prefix matched.  Insertion
# order is the match priority.  Each category appears exactly once: a repeated
# key in a dict literal silently replaces the earlier value, which previously
# discarded the 'exit' pattern of 'Logout'.
contains_di = {
    'Delete': ['delet', 'terminate', 'revoke', 'drop', 'releaseaddress'],
    'DisableObjects': ['disabl', 'stop', 'cancel', 'unlink', 'suspend'],
    'EnableObjects': ['enabl', 'start', 'invoke', 'subscribe', 'test', 'complete'],
    'SensitiveInfo': ['send', 'accesskey', 'secretkey', 'token', 'invite', 'exportapi'],
    'Logout': ['exit', 'logout'],
    'CreateObject': ['create', 'copy'],
    'ModifyExistingResource': [
        'failover', 'change', 'confirm', 'promote', 'reboot', 'rotate',
        'replace', 'retire', 'detach', 'modify', 'update', 'set', 'tag',
        'deregister', 'deprecate', 'attach', 'upgrade', 'wipe', 'transfer',
        'validate', 'publish',
    ],
    'ListResources': ['list'],
    'Download/UploadObjects': ['getobjects', 'upload'],
    'GetInfo': ['describe', 'get', 'view'],
    'AssociateResources': ['associate', 'put'],
    'Login': ['assume', 'login', 'renewrole', 'sign', 'forgot'],
}

In [None]:
# Parse every log file and keep its 'Records' array as one dataframe.
for file in files_list:
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # The handle is no longer needed once the JSON is parsed.
    awsc_list.append(pd.DataFrame.from_records(data['Records']))

In [None]:
# Stack the per-file dataframes into one frame with a fresh 0..n-1 index
awsc = pd.concat(awsc_list, ignore_index=True)

In [None]:
# Display the events whose eventSource is cognito-idp.amazonaws.com (inspection only)
awsc[(awsc['eventSource']=='cognito-idp.amazonaws.com')]

In [None]:
# Flatten the nested userIdentity records into columns, then attach the
# per-event columns by joining on the shared row index.
identity_columns = pd.json_normalize(awsc['userIdentity'])
event_columns = awsc[['eventTime','eventName','eventSource','awsRegion','errorCode']]
users = identity_columns.merge(event_columns, left_index=True, right_index=True)

In [None]:
# Pick the first identifier available for each event, in order of preference:
# userName, arn, invokedBy, principalId, accountId.
identifier = (
    users['userName']
    .fillna(users['arn'])
    .fillna(users['invokedBy'])
    .fillna(users['principalId'])
)
# NOTE(review): the original filled with the literal string 'accountId';
# the accountId column looks like the intended fallback -- confirm.
if 'accountId' in users.columns:
    identifier = identifier.fillna(users['accountId'])
# Keep the literal placeholder as the very last resort so no NaN survives
# (later cells call str methods on every identifier).
users['userName'] = identifier.fillna('accountId')

In [None]:
# Keep only the columns used downstream and give them the notebook's names.
kept_columns = ['eventTime','type','userName','eventName','eventSource','awsRegion','errorCode']
new_names = {
    'eventTime': 'timestamp',
    'userName': 'Identifier',
    'eventSource': 'TargetService',
    'errorCode': 'Error',
}
users = users[kept_columns].rename(columns=new_names)


In [None]:
# Preview the first rows of the trimmed/renamed users frame
users.head()


In [None]:
# Preview the first rows of the raw merged records for comparison
awsc.head()


In [None]:
# Event names that matched neither the lookup table nor any pattern
li = []


def change_name(name):
    """Map a raw event name to its category.

    Resolution order: exact hit in ``event_categories``; first category in
    ``starts_with_di`` with a prefix of the lower-cased name; first category
    in ``contains_di`` with a substring of the lower-cased name.  Names that
    match nothing are recorded in the module-level ``li`` list and ``None``
    is returned.
    """
    if name in event_categories:
        return event_categories[name]

    lowered = name.lower()

    for category, prefixes in starts_with_di.items():
        if lowered.startswith(tuple(prefixes)):
            return category

    for category, fragments in contains_di.items():
        if any(fragment in lowered for fragment in fragments):
            return category

    li.append(name)
    return None


users['eventName'] = users['eventName'].apply(change_name)

# users['eventName'] = pd.DataFrame({'eventName':encoder.fit_transform(users[['eventName']]).toarray().tolist()},)
# users = users.join(encoder_df)
# users = users.drop('eventName', axis=1)

In [None]:
# Parse the event times, convert each to whole epoch seconds, then order
# the frame chronologically with a fresh index.
event_times = pd.to_datetime(users['timestamp'])
epoch_seconds = event_times.map(pd.Timestamp.timestamp)
users['timestamp'] = epoch_seconds.map(int)
users = users.sort_values(by='timestamp', ignore_index=True)

In [None]:
def get_time_df(start,end,minutes,df,col='timestamp'):
    """Split *df* into consecutive, non-empty time windows.

    Windows are half-open intervals ``[t, t + minutes*60)`` over ``df[col]``,
    starting at *start*.  The column is therefore expected to hold values in
    seconds.  Windows containing no rows are skipped.

    Parameters:
        start: lower bound of the first window.
        end: last value to cover; the window containing *end* is included,
            so a row whose value equals *end* is never dropped.
        minutes: window width in minutes; must be positive.
        df: dataframe to split.
        col: name of the column holding the time values.

    Returns:
        list of dataframes, one per non-empty window, in chronological order.

    Raises:
        ValueError: if *minutes* is not positive (the loop would never end).
    """
    if minutes <= 0:
        raise ValueError("minutes must be positive")
    step = minutes * 60
    values = df[col]
    df_list = []
    # `<=` (not `<`) so the window that starts exactly at `end` is visited.
    while start <= end:
        window = df[(values >= start) & (values < start + step)]
        if len(window):
            df_list.append(window)
        start += step
    return df_list

In [None]:
def _strip_single_prefix(identifier):
    """Return the part after ':' when the identifier has exactly one ':'."""
    pieces = identifier.split(":")
    if len(pieces) == 2:
        return pieces[1]
    return identifier


users['Identifier'] = users['Identifier'].apply(_strip_single_prefix)
# Distinct-value count per column (inspection only)
users.nunique()

In [None]:
# With region
# users['identity-event'] = "source-"+users['Identifier'] + "|" + users['eventName']
# users['event-region'] = users['eventName'] + "|" + users['awsRegion']
# users['region-target'] = users['awsRegion'] + "|" + "target-"+ users['TargetService']

# Without region

# Drop events that change_name() could not categorise (eventName is missing).
# The explicit copy makes `users` an independent frame, so the column
# assignments below and in later cells do not operate on a slice of the old
# frame (which triggers pandas' SettingWithCopyWarning).
users = users[users['eventName'].notna()].copy()

# Edge labels "start|end" that add_edges() later splits on '|'
users['identity-event'] = "source-"+users['Identifier'] + "|" + users['eventName']
users['event-target'] = users['eventName'] + "|" + "target-"+ users['TargetService']

# print("Number of nodes")

In [None]:
# Preview the frame after the edge-label columns were added
users.head()

In [None]:
# Tag node names with their role, using the same 'source-' / 'target-'
# prefixes that the edge-label columns carry.
users['Identifier'] = users['Identifier'].apply("source-{}".format)
users['TargetService'] = users['TargetService'].apply("target-{}".format)

# NOTE(review): get_time_df multiplies its `minutes` argument by 60, so
# 60*60*24 asks for 86400-minute (60-day) windows -- confirm this is intended
# rather than one-day windows.
first_timestamp = users['timestamp'].iloc[0]
last_timestamp = users['timestamp'].iloc[-1]
df_list = get_time_df(first_timestamp, last_timestamp, 60*60*24, users)

# Number of events in each window
for window in df_list:
    print(len(window))

In [None]:
# Rows that still have a missing value in any column other than 'Error'
t = users.drop('Error', axis=1)
has_missing = t.isna().any(axis='columns')
df1 = t.loc[has_missing]
df1

In [None]:
def add_nodes(g,df,col,x,jump=10):
    """Add one node per distinct value of ``df[col]`` and chain them.

    Nodes are named with the string form of the value and placed in a
    vertical column at horizontal position *x*: the first at ``y = 10`` and
    each following one *jump* higher.  Consecutive nodes of the column are
    then linked by an edge with no explicit weight.

    Missing values (None/NaN) are skipped for both nodes and chain edges,
    and chain edges use the same string names as the nodes, so no extra
    position-less node can be created.

    Parameters:
        g: graph object exposing ``add_node`` and ``add_edge``.
        df: dataframe providing the values.
        col: column whose distinct values become nodes.
        x: horizontal coordinate stored in each node's ``pos`` attribute.
        jump: vertical distance between consecutive nodes.
    """
    node_names = [f'{value}' for value in df[col].unique() if not pd.isna(value)]
    for rank, name in enumerate(node_names):
        g.add_node(name, pos=(x, 10 + rank * jump))
    for lower, upper in zip(node_names, node_names[1:]):
        g.add_edge(lower, upper)

def add_edges(g,df,col):
    """Add one weighted edge per distinct "start|end" label in ``df[col]``.

    The edge weight is the number of rows carrying that label.  Labels whose
    start or end part contains the text "None" are ignored.
    """
    label_counts = df[col].value_counts()
    for label, count in label_counts.items():
        start_node, end_node = label.split('|')
        if "None" in start_node or "None" in end_node:
            continue
        g.add_edge(start_node, end_node, weight=count)

# Columns whose distinct values become graph nodes (one visual column each)
nodes = ['Identifier','eventName','TargetService']
# Columns holding the "start|end" labels that become weighted edges
edges = ['identity-event','event-target']
def generate_graph(G,df_nodes,df_edges):
    """Populate *G* with the identity / event / service graph.

    Nodes come from *df_nodes* (one column of nodes per entry of ``nodes``,
    at x = 10, 11, 12 with a vertical spacing of 100).  Weighted edges come
    from the label columns of *df_edges* listed in ``edges``.

    Each edge column is processed once; the previous duplicate call only
    recomputed and re-set the same weights.

    A region-aware variant (kept here as a note instead of commented-out
    code) would add an 'awsRegion' node column and use the
    'identity-event', 'event-region' and 'region-target' label columns.
    """
    for x, node in enumerate(nodes, start=10):
        add_nodes(G,df_nodes,node,x,100)

    for edge in edges:
        add_edges(G,df_edges,edge)

In [None]:
def generate_graphs(df,df_list):
    """Build one undirected graph per time window.

    Every graph takes its nodes from the full frame *df* and its weighted
    edges from the corresponding window in *df_list*.  Returns the graphs in
    window order.
    """
    def _graph_for(window):
        graph = nx.Graph()
        generate_graph(graph, df, window)
        return graph

    return [_graph_for(window) for window in df_list]

In [None]:
# One graph per time window: nodes from the full frame, edges from the window
graphs = generate_graphs(users,df_list)

In [None]:
from pylab import rcParams

# The figure size is identical for every window, so set it once instead of
# on each iteration (matplotlib.pyplot is already imported as plt at the top
# of the notebook and is not re-imported inside the loop).
rcParams['figure.figsize'] = 14, 10

for i,G in enumerate(graphs):
    # Column layout stored on the nodes by add_nodes()
    pos = nx.get_node_attributes(G,'pos')
    # pos = nx.spring_layout(G, scale=20, k=3/np.sqrt(G.order()))
    d = dict(G.degree)
    # Node size grows with degree
    nx.draw(G, pos, node_color='lightblue',
            with_labels=True,
            nodelist=list(d),
            node_size=[d[k]*300 for k in d])
    labels = nx.get_edge_attributes(G,'weight')
    nx.draw_networkx_edge_labels(G,pos,edge_labels=labels,font_size=5)
    plt.savefig(f'{i}-plotgraph.png', dpi=300, bbox_inches='tight')
    # The extra unlabeled nx.draw(G, pos) that used to follow the save only
    # painted default-styled nodes over the finished figure; it is dropped,
    # matching the embedded-layout plot cell where it is commented out.
    plt.show()

In [None]:
# from pylab import rcParams
# rcParams['figure.figsize'] = 14, 10
# pos=nx.get_node_attributes(G,'pos')
# # pos = nx.spring_layout(G, scale=20, k=3/np.sqrt(G.order()))
# d = dict(G.degree)
# nx.draw(G, pos, node_color='lightblue', 
#         with_labels=True, 
#         nodelist=d, 
#         node_size=[d[k]*300 for k in d])
# labels = nx.get_edge_attributes(G,'weight')
# nx.draw_networkx_edge_labels(G,pos,edge_labels=labels)
# pos=nx.get_node_attributes(G,'pos')
# nx.draw(G,pos)

In [None]:
# Read as a global by weighted_walks_graphs() when it runs the random walks
walk_length = 20  # maximum length of a random walk to use throughout this notebook

In [None]:
def gen_stellar(graphs):
    """Wrap each networkx graph for random-walk generation.

    Returns a list of ``(stellar_graph, walker)`` pairs, where the first item
    is the graph converted with ``StellarGraph.from_networkx`` and the second
    a ``BiasedRandomWalk`` built on it.
    """
    stellar_list = []
    for nx_graph in graphs:
        stellar_graph = StellarGraph.from_networkx(nx_graph)
        walker = BiasedRandomWalk(stellar_graph)
        stellar_list.append((stellar_graph, walker))
    return stellar_list

In [None]:
# (StellarGraph, BiasedRandomWalk) pair for every window graph
SL = gen_stellar(graphs)

In [None]:
def weighted_walks_graphs(SL):
    """Run the biased, weighted random walks for every graph in *SL*.

    *SL* is a list of ``(graph, walker)`` pairs.  Every node of each graph is
    used as a root, with 10 walks per root of at most ``walk_length`` steps
    (module-level setting).  Returns one walk collection per graph.
    """
    return [
        rw.run(
            nodes=G.nodes(),  # root nodes
            length=walk_length,  # maximum length of a random walk
            n=10,  # number of random walks per root node
            p=0.5,  # defines the (unnormalised) probability, 1/p, of returning to the source node
            q=2.0,  # defines the (unnormalised) probability, 1/q, of moving away from the source node
            weighted=True,  # follow edge weights
            seed=42,  # fixed seed for reproducibility
        )
        for G, rw in SL
    ]

In [None]:
# Random-walk collections, one per window graph
WW = weighted_walks_graphs(SL)

In [None]:
# G = StellarGraph.from_networkx(G)
# rw = BiasedRandomWalk(G)

In [None]:
# weighted_walks = rw.run(
#   nodes=G.nodes(),  # root nodes
#   length=walk_length,  # maximum length of a random walk
#   n=10,  # number of random walks per root node
#   p=0.5,  # Defines (unormalised) probability, 1/p, of returning to source node
#   q=2.0,  # Defines (unormalised) probability, 1/q, for moving away from source node
#   weighted=True,  # for weighted random walks
#   seed=42,  # random seed fixed for reproducibility
# )

In [None]:
# weighted_model = Word2Vec(
#   weighted_walks, vector_size=128, window=5, min_count=0, sg=1, workers=1,
# )

In [None]:
def weighted_models(WW):
    """Train one skip-gram Word2Vec model per collection of walks.

    Each model uses 128-dimensional vectors, a window of 5, keeps every node
    (``min_count=0``) and runs on a single worker.  Returns the models in the
    order of *WW*.
    """
    return [
        Word2Vec(walks, vector_size=128, window=5, min_count=0, sg=1, workers=1)
        for walks in WW
    ]

In [None]:
# One trained Word2Vec model per window graph
weighted_model_list = weighted_models(WW)

In [None]:
def node_embedding(weighted_model_list):
    """Extract ``(node_ids, vectors)`` from every trained model.

    ``node_ids`` is the model's ``wv.index_to_key`` and ``vectors`` its
    ``wv.vectors``; the result keeps the order of the input models.
    """
    return [
        (model.wv.index_to_key, model.wv.vectors)
        for model in weighted_model_list
    ]
# (node ids, embedding matrix) pair for every window graph
embedding = node_embedding(weighted_model_list)

In [None]:
# node_ids = weighted_model.wv.index_to_key  # list of node IDs
# weighted_node_embeddings = (
#   weighted_model.wv.vectors
# ) 

In [None]:
# tsne = TSNE(n_components=2, random_state=42)
# weighted_node_embeddings_2d = tsne.fit_transform(weighted_node_embeddings)

In [None]:
def d2_graphs(embedding):
    """Project every node-embedding matrix to 2-D with t-SNE.

    *embedding* is a list of ``(node_ids, vectors)`` pairs; the result is a
    list of ``(node_ids, vectors_2d)`` pairs in the same order.

    t-SNE's default perplexity (30) must stay below the number of samples,
    so it is lowered to ``n_samples - 1`` for graphs with 30 nodes or fewer.
    Larger graphs keep the default and are unaffected.
    """
    embedding2d_annotation = []
    for node_ids,weighted_node_embeddings in embedding:
        n_samples = len(weighted_node_embeddings)
        perplexity = min(30.0, float(max(n_samples - 1, 1)))
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
        weighted_node_embeddings_2d = tsne.fit_transform(weighted_node_embeddings)
        embedding2d_annotation.append((node_ids,weighted_node_embeddings_2d))
    return embedding2d_annotation
# (node ids, 2-D coordinates) pair for every window graph
d2_graphs_list = d2_graphs(embedding)

In [None]:
# import seaborn as sns
# For each window: scatter the 2-D embedding, label every point with its
# index in node_ids, and remember node id -> (x, y) for the graph drawing
# in a later cell.
pos_list = []
alpha = 0.7
for node_ids,weighted_node_embeddings_2d in d2_graphs_list:
    plt.figure(figsize=(4, 4))
    # No `cmap` here: without a `c` argument matplotlib ignores it and warns.
    plt.scatter(
        weighted_node_embeddings_2d[:, 0],
        weighted_node_embeddings_2d[:, 1],
        alpha=alpha,
    )
    di = {}
    for i,txt in enumerate(node_ids):
        point = (weighted_node_embeddings_2d[i][0], weighted_node_embeddings_2d[i][1])
        plt.annotate(i, point)
        print((i, point))
        di[txt] = point
    pos_list.append(di)
    # `file` is the loop variable left over from the loading cell, i.e. the
    # last loaded file name.
    plt.title(file)
    plt.show()

In [None]:
from pylab import rcParams

# Same drawing as the column-layout plot, but with every node placed at its
# 2-D embedding coordinates.  The figure size is set once, pyplot is not
# re-imported inside the loop, and the unused re-read of the 'pos' node
# attribute after saving is gone.
rcParams['figure.figsize'] = 14, 10

for i,G in enumerate(graphs):
    d = dict(G.degree)
    # Node id -> (x, y) collected while plotting the embeddings
    pos = pos_list[i]
    nx.draw(G, pos, node_color='lightblue',
            with_labels=True,
            nodelist=list(d),
            node_size=[d[k]*300 for k in d])
    labels = nx.get_edge_attributes(G,'weight')
    nx.draw_networkx_edge_labels(G,pos,edge_labels=labels,font_size=5)
    plt.savefig(f'{i}-embedded-plotgraph.png', dpi=300, bbox_inches='tight')
    plt.show()


In [None]:
# Index -> node id legend for the numeric point labels.  `node_ids` is the
# variable left behind by the most recent loop over d2_graphs_list, so this
# shows the last graph only.
list(enumerate(node_ids))

In [None]:
# Fit one DBSCAN clustering (eps=1.5, min_samples=5) per 2-D embedding
models = [
    DBSCAN(eps=1.5, min_samples=5).fit(points_2d)
    for _node_ids, points_2d in d2_graphs_list
]

In [None]:
# Number of labels (one per clustered point) for each fitted model
for clustering in models:
    print(len(clustering.labels_))

In [None]:
# Print the repr of each fitted estimator
for clustering in models:
    print(clustering)

In [None]:
# Print the label array assigned by each fitted model
for clustering in models:
    print(clustering.labels_)

In [None]:
# import seaborn as sns
# Scatter every 2-D embedding coloured by its DBSCAN label and annotate each
# point with that label.  The pair is unpacked directly in the loop header
# so the loop no longer rebinds `li`, the global list in which change_name()
# records unmatched event names.
alpha = 0.7
for (node_ids,weighted_node_embeddings_2d),clustering in zip(d2_graphs_list,models):
    plt.figure(figsize=(4, 4))
    plt.scatter(
        weighted_node_embeddings_2d[:, 0],
        weighted_node_embeddings_2d[:, 1],
        c=clustering.labels_,
        cmap="jet",
        alpha=alpha,
    )
    for i,txt in enumerate(node_ids):
        point = (weighted_node_embeddings_2d[i][0], weighted_node_embeddings_2d[i][1])
        plt.annotate(clustering.labels_[i], point)
        print((i, point))
    # `file` is the loop variable left over from the loading cell, i.e. the
    # last loaded file name.
    plt.title(file)
    plt.show()
    

In [None]:
# Build a table with the node id in column 0 and, in column k, the cluster
# label that node received in the k-th window.
# Each model's labels follow the node order of its own graph, so every label
# array is indexed by that graph's node ids and the columns are aligned on
# node id.  Previously the ids left over from the last plotted graph were
# paired with every model's labels.  That mislabels rows and fails when two
# graphs have different node counts.  A node absent from a window gets NaN
# there, which makes that column float.
di = {
    i: pd.Series(clustering.labels_, index=list(graph_node_ids))
    for i, ((graph_node_ids, _), clustering) in enumerate(zip(d2_graphs_list, models), start=1)
}
df = pd.DataFrame(data=di)
df.insert(0, 0, df.index)
df = df.reset_index(drop=True)

In [None]:
# Write the node / cluster-label table to test.csv in the working directory
df.to_csv("test.csv")