<a href="https://colab.research.google.com/github/venkiharvgit/csci-e-599a-2024-cybersecurity/blob/main/graph_similarity_using_karateclub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Mount Google Drive at /content/drive so the data paths used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
!pip install karateclub



In [10]:
import random as random
import math

import networkx as nx
import karateclub as kc
from sklearn.metrics.pairwise import cosine_similarity

import numpy
import pandas as pd
# import dask.dataframe as dd

In [11]:
# Google Drive locations used by this notebook.
dir_data = '/content/drive/MyDrive/capstone-project/data/chunked_data/'
dir_model = '/content/drive/MyDrive/capstone-project/model'

# Columns requested when the flow dataset is read from parquet.
cols = [
    'IPV4_SRC_ADDR',
    'IPV4_DST_ADDR',
    'L4_SRC_PORT',
    'L4_DST_PORT',
    'PROTOCOL',
    'IN_BYTES',
    'OUT_BYTES',
    'IN_PKTS',
    'OUT_PKTS',
    'TCP_FLAGS',
    'FLOW_DURATION_MILLISECONDS',
    'Label',
]

In [12]:
# Alternative loader kept for reference: read a single chunk instead of the whole file.
# dataset = "NF-BoT-IoT-v2_chunks"
# chunk_no = 0
# raw_data=pd.read_parquet(dir_data + dataset + f"/part.{chunk_no}.parquet", columns = cols)

# Read the selected columns, then replace the separate address/port columns with
# combined "address:port" endpoint ids.
raw_data = (
    pd.read_parquet(
        "/content/drive/MyDrive/capstone-project/data/parquet/NF-BoT-IoT-v2.parquet",
        columns=cols,
    )
    .assign(
        src_id=lambda flows: flows['IPV4_SRC_ADDR'].astype(str) + ":" + flows['L4_SRC_PORT'].astype(str),
        dst_id=lambda flows: flows['IPV4_DST_ADDR'].astype(str) + ":" + flows['L4_DST_PORT'].astype(str),
    )
    .drop(columns=['IPV4_SRC_ADDR', 'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT'])
)

In [13]:
def calculate_sampled_list_of_embeddings(raw_data: pd.DataFrame, frac: float, n_samples: int, model_type: type, starting_seed: int = 10) -> numpy.ndarray:
  """
  Calculates embeddings for the full graph and for graphs sampled from the given input dataset.

  One directed graph is built from every row of ``raw_data`` and ``n_samples``
  further graphs are built from rows sampled with replacement. A single
  ``model_type`` instance is then fitted on all of them together.

  Args:
    raw_data: Edge list with ``src_id`` and ``dst_id`` columns; all remaining
      columns are attached to the edges as attributes.
    frac: Fraction of rows drawn (with replacement) for each sampled graph.
    n_samples: Number of sampled graphs to build.
    model_type: Embedding class that is called with no arguments and whose
      instances provide ``fit(list_of_graphs)`` and ``get_embedding()``
      (the notebook passes ``kc.GL2Vec``).
    starting_seed: Sample number ``k`` (0-based) is drawn with
      ``random_state=starting_seed + k``.

  Returns:
    The result of ``model.get_embedding()``. The full graph is the first graph
    handed to ``fit``, so its embedding is expected to be the first element.
  """

  # Full graph goes first so that it is the first graph the model sees.
  print('...Creating Full Graph')
  all_graphs = [_edgelist_to_indexed_digraph(raw_data)]

  for sample_no in range(n_samples):
    print(f"...Creating Sample Graph {sample_no+1} of {n_samples}")
    # DataFrame.sample draws from NumPy's random generator, not from Python's
    # `random` module, so the seed only takes effect when passed as random_state.
    sampled_rows = raw_data.sample(frac=frac, replace=True, random_state=starting_seed + sample_no)
    all_graphs.append(_edgelist_to_indexed_digraph(sampled_rows))

  print(f"...Calculating embeddings using {model_type}")
  model = model_type()
  model.fit(all_graphs)
  return model.get_embedding()


def _edgelist_to_indexed_digraph(edges):
  """Build a DiGraph from a src_id/dst_id edge list and relabel its nodes 0..n-1."""
  digraph = nx.from_pandas_edgelist(edges, 'src_id', 'dst_id', create_using=nx.DiGraph(), edge_attr=True)
  node_to_index = {node: index for index, node in enumerate(digraph.nodes())}
  return nx.relabel_nodes(digraph, node_to_index)

In [None]:
# Embed the full graph plus 12 graphs each built from a 10% sample of the rows.
first_test = calculate_sampled_list_of_embeddings(
    raw_data,
    frac=0.1,
    n_samples=12,
    model_type=kc.GL2Vec,
)

...Creating Full Graph


In [88]:
# Compare the first embedding (full graph) against each of the remaining ones.
full_graph_embedding = first_test[0].reshape(1, -1)
[
    cosine_similarity(full_graph_embedding, sample_embedding.reshape(1, -1))
    for sample_embedding in first_test[1:]
]

[array([[0.08887498]], dtype=float32),
 array([[0.11706911]], dtype=float32),
 array([[0.22197089]], dtype=float32),
 array([[0.21863353]], dtype=float32),
 array([[0.2261459]], dtype=float32),
 array([[0.23415993]], dtype=float32),
 array([[0.20656711]], dtype=float32),
 array([[0.24609858]], dtype=float32),
 array([[0.21648082]], dtype=float32),
 array([[0.1825842]], dtype=float32),
 array([[0.17975272]], dtype=float32),
 array([[0.07656963]], dtype=float32)]

In [45]:
def _sample_to_indexed_digraph(flows, seed):
  """Draw a seeded 10% sample (with replacement) and return (sample, integer-labelled DiGraph)."""
  sampled = flows.sample(frac=0.1, replace=True, random_state=seed)
  digraph = nx.from_pandas_edgelist(sampled, 'src_id', 'dst_id', create_using=nx.DiGraph(), edge_attr=True)
  node_to_index = {node: index for index, node in enumerate(digraph.nodes())}
  return sampled, nx.relabel_nodes(digraph, node_to_index)


# Two independent samples; distinct fixed seeds make the comparison reproducible
# on a fresh kernel while keeping the two samples different from each other.
raw_one, graph_one = _sample_to_indexed_digraph(raw_data, seed=10)
raw_two, graph_two = _sample_to_indexed_digraph(raw_data, seed=11)

# Fit one model on both graphs together.
model = kc.GL2Vec()
model.fit([graph_one, graph_two])

In [50]:
# Pull the embeddings out of the most recently fitted `model`.
X = model.get_embedding()

In [61]:
# Cosine similarity between the first two embeddings in X.
# cosine_similarity expects 2-D input, hence the reshape of each vector to a single row.
cosine_similarity(X[0].reshape(1, -1), X[1].reshape(1, -1))

array([[-0.03822189]], dtype=float32)

In [54]:
# FIXME(review): the two statements below reference `embeddings1` / `embeddings2`,
# which no visible cell defines, so they would raise NameError. They are kept
# here as comments until the intended inputs are identified.
# embeddings1 = embeddings1.reshape(-1, embeddings1.shape[1])
# embeddings2 = embeddings2.reshape(-1, embeddings2.shape[1])

# `shape` is a tuple attribute, not a method; calling it raises TypeError.
X[0].shape




(128,)

In [31]:
# NOTE(review): neither `x` nor `embeddings` is defined in any visible cell, so this
# cell depends on leftover kernel state and will fail on Restart & Run All.
# Presumably it compares one reference embedding against a list of embeddings - TODO confirm.
[cosine_similarity(x[0], emb) for emb in embeddings]

[array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32)]

In [34]:
# Directed graph over every row of raw_data; the remaining columns become edge attributes.
graph = nx.from_pandas_edgelist(
    raw_data,
    source='src_id',
    target='dst_id',
    edge_attr=True,
    create_using=nx.DiGraph(),
)

# Relabel the "address:port" node names as consecutive integers starting at 0.
mapping = dict(zip(graph.nodes(), range(graph.number_of_nodes())))
graph = nx.relabel_nodes(graph, mapping)

In [12]:
# Display the current contents of X.
# NOTE(review): prints the whole array; which cell produced this X is unclear from the execution counts.
X

array([[ 0.14285797,  0.1674583 ,  0.1427555 , -0.13704993, -0.15173559,
        -0.08096453, -0.07658111,  0.08233593, -0.11917418, -0.15119554,
        -0.08267602,  0.17365056, -0.08331024,  0.0268129 ,  0.1572449 ,
         0.07747612, -0.01737475,  0.16686738, -0.10635813,  0.09885111,
         0.07211719, -0.06293347, -0.01668796, -0.15748975,  0.00094269,
        -0.04746135,  0.17727219, -0.08440731, -0.12084486,  0.02008694,
        -0.13981624, -0.14246827, -0.11725336, -0.10903737, -0.07776565,
        -0.03511777, -0.04440962,  0.10313946,  0.12222339, -0.08110882,
        -0.11345202,  0.0139545 , -0.0806457 ,  0.00046206,  0.12835187,
         0.06947093, -0.09677208, -0.02294553,  0.00327204, -0.14455448,
         0.17638294,  0.14872295, -0.00053341,  0.10463204,  0.08060015,
         0.1549062 ,  0.06529004, -0.16598831, -0.04384575,  0.06416641,
        -0.01024376,  0.08766906,  0.06306204,  0.0791473 ,  0.09914115,
         0.10318878, -0.0892904 ,  0.08281241,  0.1

In [97]:
# from karateclub import FeatherGraph

# Fit GL2Vec on a list containing only the full graph.
# NOTE(review): the recorded run of this cell ended with
# "RuntimeError: you must first build vocabulary before training the model",
# so X was not reassigned by that run. Cause not established here - TODO investigate.
model = kc.GL2Vec()
model.fit([graph])
X = model.get_embedding()

RuntimeError: you must first build vocabulary before training the model

In [96]:
# Dimensions of the current X.
X.shape

(1, 160)

In [36]:
def parquet_to_graph(path: str) -> nx.classes.digraph.DiGraph:
  """
  Read a parquet file of flows and return it as a directed graph.

  Source and destination endpoints are identified by "address:port" strings
  built from the IPV4_*_ADDR and L4_*_PORT columns. Those four columns are then
  dropped, and every remaining column becomes an edge attribute. Nodes are
  relabelled as integers in the order the graph lists them.
  """
  flows = pd.read_parquet(path)
  flows = flows.assign(
      src_id=flows['IPV4_SRC_ADDR'].astype(str) + ":" + flows['L4_SRC_PORT'].astype(str),
      dst_id=flows['IPV4_DST_ADDR'].astype(str) + ":" + flows['L4_DST_PORT'].astype(str),
  ).drop(columns=['IPV4_SRC_ADDR', 'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT'])

  named_graph = nx.from_pandas_edgelist(
      flows,
      'src_id',
      'dst_id',
      create_using=nx.DiGraph(),
      edge_attr=True,
  )

  # The embedding functions need integer, monotonically increasing node ids,
  # so the string endpoint names are swapped for 0, 1, 2, ...
  node_to_index = dict(zip(named_graph.nodes(), range(named_graph.number_of_nodes())))
  return nx.relabel_nodes(named_graph, node_to_index)