In [1]:
#Required library
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
plt.switch_backend('agg')
from networkx.algorithms import bipartite
import numpy as np
from datetime import datetime
from scipy.stats import cauchy, gamma
from random import randint, uniform
%matplotlib inline
from sklearn import preprocessing
from scipy.interpolate import make_interp_spline
from scipy.sparse import csr_matrix
import math
import copy
import multiprocessing
import time
import random
from sdv.tabular import CTGAN
from multiprocessing import Pool

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-c4fr1ksa because the default path (/home/jovyan/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
print("Number of cpu count:"+str(multiprocessing.cpu_count()))

Number of cpu count:128


In [3]:
#################################---G2A2 - Dynamic Graph Generation -----#######################################

In [4]:
################################---------Important functions-------------######################################

In [5]:
# Cauchy distribution
def cauchyDistribution(y, l, s):
    y1 = cauchy.pdf(y, loc = l, scale = s)
    return y1

In [6]:
#Gamma distribution
def gammaDistribution(y, a, l, s):
    y1 = gamma.pdf(y, a, loc = l, scale = s)
    return y1

In [7]:
# Samples node and its probability for the graph snapshot
def sampleNodes(x, size, p):  
    
    p[len(p) - 1] = 1 - np.sum(p[0:len(p)-1]) 
    
    sampledNodes = np.random.choice(x, size, p = p, replace = False)
    
    for i in range(len(x)):
        if x[i] not in sampledNodes:
            p[i] = 0
    
    p = probToOne(p)
    
    return p

In [8]:
# Sample degree by sampling nodes with repetation. Then counts the nodes to create the node sequence.
def sampleDegree(x, size, p):
    
    p = probToOne(p)
    p[len(p) - 1] = 1 - np.sum(p[0:len(p)-1]) 
    
    if p[len(p) - 1] < 0:
        p[len(p) - 1] = 0
        p = probToOne(p)

    sampleEdges = np.random.choice(x, size, p = p)
    
    seq = np.zeros(len(x), dtype = np.int32)
    
    for i in range(len(sampleEdges)):
        seq[sampleEdges[i]] += 1
    
    return seq

In [9]:
# Make the array sum equal to one
def probToOne(x):
    for i in range(len(x)):
        if x[i] < 0:
            x[i] = 0
            
    x = x/x.sum()
    return x

In [10]:
# Generates graph snapshot for the time stamp t

def genSS(p_u, p_v, c_u, c_v, c_e, uNodeIndex, vNodeIndex, t):   

    if c_u[t] != 0 and c_v[t] != 0 and c_e[t] != 0:
        P_su = sampleNodes(uNodeIndex, c_u[t], copy.deepcopy(p_u))
        P_sv = sampleNodes(vNodeIndex, c_v[t], copy.deepcopy(p_v))

        Seq_u = sampleDegree(uNodeIndex, c_e[t], P_su)
        Seq_v = sampleDegree(vNodeIndex - len(uNodeIndex), c_e[t], P_sv)

        bGraph = bipartite.configuration_model(Seq_u, Seq_v)

        biEdgeList = nx.to_pandas_edgelist(bGraph)
        biEdgeList['t'] = t
        biEdgeList['isAnomaly'] = 0
        
    DyBAM[t] = biEdgeList  

In [11]:
################################-----------Main code----------------######################################

In [124]:
#Cauchy parameters
l_e_lower = 12
l_e_upper = 13
s_e_lower = 15
s_e_upper = 16

l_u_lower = 12
l_u_upper = 13
s_u_lower = 15
s_u_upper = 16

l_v_lower = 12
l_v_upper = 13
s_v_lower = 15
s_v_upper = 16

#Gamma parameters

a_gu = 0.01
l_gu = 0.75
s_gu = 10


a_gv = 0.001
l_gv = 3
s_gv = 300

#Other parameters
#Total snapshots
T = 144

#cycle duration
x = 24

#Total U nodes
p = 200000

#Total V nodes
q = 20000

#Total number of edges
m = 1000000

#Total number of cores
c = 28

#Number of cycles
N_c = T//x

#Storing graph
#snapshot = csr_matrix((p, q), dtype=np.int32)

manager = multiprocessing.Manager()
DyBAM = manager.list(range(T))

#DyBAM = np.repeat(snapshot, T)

D_e = []
D_u = []
D_v = []

y = np.linspace(0, x - 1, x, dtype = np.int32)

#Generate index for U and V
uNodeIndex = np.linspace(0, p - 1, p, dtype = np.int32)
vNodeIndex = np.linspace(p, p + q - 1, q, dtype = np.int32)

In [125]:
#Appending Cauchy distribution
for i in range(N_c):
    l_e = random.uniform(l_e_lower, l_e_upper)
    s_e = random.uniform(s_e_lower, s_e_upper)
    l_u = random.uniform(l_u_lower, l_u_upper)
    s_u = random.uniform(s_u_lower, s_u_upper)
    l_v = random.uniform(l_v_lower, l_v_upper)
    s_v = random.uniform(s_v_lower, s_v_upper)
    
    D_e.extend(cauchyDistribution(y, l_e, s_e))
    D_u.extend(cauchyDistribution(y, l_u, s_u))
    D_v.extend(cauchyDistribution(y, l_v, s_v))

In [126]:
#Convert back to numpy
D_e = np.array(D_e)
D_u = np.array(D_u)
D_v = np.array(D_v)

In [127]:
#Temporal count of edges, Node U and Node V
C_e = probToOne(D_e) * m
C_u = probToOne(D_u) * p
C_v = probToOne(D_v) * q

#Convert to int
C_e = np.ceil(C_e).astype(int)
C_u = np.ceil(C_u).astype(int)
C_v = np.ceil(C_v).astype(int)

In [128]:
#Calculate the probability of all nodes using gamma distribution
P_u = gamma.rvs(a = a_gu, loc = l_gu, scale = s_gu, size = p)
P_v = gamma.rvs(a = a_gv, loc = l_gv, scale = s_gv, size = q)

#Normalize the probability so that its equal to 1
P_u = probToOne(P_u)
P_v = probToOne(P_v)

In [129]:
#Each loop is processed by a different core
jobs = []
for i in range(T):

    jobs.append([P_u, P_v, C_u, C_v, C_e, uNodeIndex, vNodeIndex, i])

print("Number of processes: "+str(c))

stime = time.time()
print()
pool = Pool(processes=c)
pool.starmap(genSS, jobs)
pool.close()
etime = time.time()

Number of processes: 28



In [130]:
import bicm

In [131]:
print("Time taken:", etime - stime)

Time taken: 60.86045813560486


In [132]:
#Initialize Feature list with number of edges(m) rows and 4 columns (u, v, t, label). 
F_org = DyBAM[0]

for i in range(1, T):
    F_org = pd.concat([F_org, DyBAM[i]], axis=0)

In [133]:
#Before anomaly injection graph stats
nU = F_org['source'].nunique()
nV = F_org['target'].nunique()
nE = F_org.shape[0]

In [134]:
#Graph Stats:
print("Number of Nodes U: "+str(nU))
print("Number of Nodes V: "+str(nV))
print("Number of Edges: "+str(nE))

Number of Nodes U: 55792
Number of Nodes V: 6141
Number of Edges: 1000365


In [135]:
#Convert to numpy
F_org = F_org.to_numpy()

#Clear all zero entry
F_org = F_org[~np.all(F_org == 0, axis=1)]

F = copy.deepcopy(F_org)

In [136]:
##################################--------Anomaly Injection--------------################################

In [137]:
################################---------Important functions-------------######################################

In [138]:
# Samples node and its probability for the graph snapshot
def sampleNodesAnomaly(x, size, p):  
    
    p[len(p) - 1] = 1 - np.sum(p[0:len(p)-1])    
    sampledNodes = np.random.choice(x, size, p = p, replace = False)
    
    return sampledNodes

In [139]:
def sampleEdgeAnomaly(L_u, L_v, P_u, P_v, count):
    
    P_u[len(P_u) - 1] = 1 - np.sum(P_u[0:len(P_u)-1]) 
    P_v[len(P_v) - 1] = 1 - np.sum(P_v[0:len(P_v)-1]) 
    
    sE_u = np.random.choice(L_u, count, p = P_u)
    sE_v = np.random.choice(L_v, count, p = P_v)
    
    return sE_u, sE_v

In [140]:
#Sum the number of edges in a graph snapshot
def graphSum(DyBAM, t):
    return DyBAM[t].sum()

In [141]:
#Update Feature Matrix
def updateFeatureMatrix(F, L_u, L_v, t):
    
    #Source and destination
    temp = np.concatenate((np.array(L_u).reshape(-1,1) , np.array(L_v).reshape(-1,1)), axis = 1).reshape(-1,2) 
    
    #Set time 
    time_column = np.repeat(t, temp.shape[0])
    #Set label
    label_column = np.ones(temp.shape[0], dtype=np.int32)
    
    #Merge the edge list, time and label together
    temp = np.insert(temp, 2, time_column, axis=1)
    temp = np.insert(temp, 3, label_column, axis=1)
    
    #Combine it with the original edgelist
    F = np.concatenate((F, temp), axis=0)
    
    return F

In [142]:
################################-----------Main code----------------######################################

In [143]:
#Flag to check if U and V are anomalous
isU_a = True
isV_a = True

#Initial number of anomalous U and V nodes
c_ua = 100
c_va = 5

#Anomaly subgraph properties: 
#Anomaly percentage (ratio)
p_a = 0.1

#Burstiness value
G_b = 2
#Propagation flag
isG_p = True
#Propagation ratio
G_p = 0.01
#Start time
T_s = 1
#Duration
t_a = 47

In [144]:
#To minimize rounding error
def shiftFloatingAnomalies(arr, G_b):
    sum1 = 0
    for i in range(len(arr)):
        if sum1 + arr[i] < G_b:
            sum1 += arr[i]
            arr[i] = 0
        else:
            sum2 = sum1 + arr[i]
            arr[i] = math.floor(sum2)
            sum1 = sum2 - arr[i]
        
    arr[len(arr) - 1] = math.ceil(arr[len(arr) - 1] + sum1)
    
    return arr    

In [145]:
#Number of anomalies per snapshot is also proportional to number of edges
C_a = probToOne(D_e[T_s : (T_s + t_a)]) * (p_a * nE)

C_a = shiftFloatingAnomalies(C_a, G_b).astype(int)

print(sum(C_a))

10004


In [146]:
# Randomly select nodes that are anomalous during the anomaly injection
ML_ua = sampleNodesAnomaly(uNodeIndex, c_ua, P_u)
ML_va = sampleNodesAnomaly(vNodeIndex, c_va, P_v)

P_ua = np.repeat(1/c_ua, c_ua)
P_va = np.repeat(1/c_va, c_va)

In [147]:
#Anomaly injection function
def anomalyInjection(
    F,           # Edge List
    isU_a,       # Flag if U can be anomalous
    isV_a,       # Flag if V can be anomalous
    c_ua,        # Number of initial U anomalous nodes
    c_va,        # Number of initial V anomalous nodes
    C_a,         # List of number of anomalous edges per snapshot
    G_b,         # Burstiness ratio
    isG_p,       # Flag if anomaly propagation is allowed
    G_p,         # Propagation ratio
    T_s,         # Start Time
    t_a,         # Duration
    uNodeIndex,  # U Node list
    vNodeIndex   # V Node List
):
    #Probability of U and V node of being selected for attacking node or attacked node
    P_u = np.repeat(1/len(uNodeIndex), len(uNodeIndex))
    P_v = np.repeat(1/len(vNodeIndex), len(vNodeIndex))
                        
    # Randomly select nodes that are anomalous during the anomaly injection
    ML_ua = sampleNodesAnomaly(uNodeIndex, c_ua, P_u)
    ML_va = sampleNodesAnomaly(vNodeIndex, c_va, P_v)
    
    for i in range(t_a):
        #Probability of Attacking nodes
        P_ua = np.repeat(1/ML_ua.shape[0], ML_ua.shape[0])
        P_va = np.repeat(1/ML_va.shape[0], ML_va.shape[0])
        
        #If both U and V nodes are attacking
        if isU_a and isV_a and C_a[i] != 0:
            #Calculate the number of attacks from u -> v and v -> u
            cAu_v = (C_a[i]//2)
            cAv_u = (C_a[i] - C_a[i]//2)
            
            #List of nodes that got attacked
            L_v = sampleNodesAnomaly(vNodeIndex, cAu_v * G_b, P_v)
            L_u = sampleNodesAnomaly(uNodeIndex, cAv_u * G_b, P_u)
            
            #Probability of attacked nodes
            P_sv = np.repeat(1/(cAu_v * G_b), cAu_v * G_b)
            P_su = np.repeat(1/(cAv_u * G_b), cAv_u * G_b)          
            
            #Pick U and V nodes cAu_v times from attacking U and attacked V 
            seqU, seqV = sampleEdgeAnomaly(ML_ua, L_v, P_ua, P_sv, cAu_v)
            
            #Update Feature List
            F = updateFeatureMatrix(F, seqU, seqV, T_s + i)
            
            #Pick U and V nodes cAv_u times from attacked V and attacking U 
            seqU, seqV = sampleEdgeAnomaly(L_u, ML_va, P_su, P_va, cAv_u)
            
            #Update Feature List
            F = updateFeatureMatrix(F, seqU, seqV, T_s + i)
        
        #If only U is attacking
        elif isU_a and C_a[i] != 0:
            #Calculate the number of attacks from u -> v
            cAu_v = C_a[i]     
            
            #List of nodes that got attacked
            L_v = sampleNodesAnomaly(vNodeIndex, cAu_v * G_b, P_v)
            
            #Probability of attacked nodes
            P_sv = np.repeat(1/(cAu_v * G_b), cAu_v * G_b)     
            
            #Pick U and V nodes cAu_v times from attacking U and attacked V 
            seqU, seqV = sampleEdgeAnomaly(ML_ua, L_v, P_ua, P_sv, cAu_v)
            
            #Update Feature List
            F = updateFeatureMatrix(F, seqU, seqV, T_s + i)
        
        #If only V is attacking
        elif isV_a and C_a[i] != 0:
            #Calculate the number of attacks from v -> u
            cAv_u = C_a[i]        
            
            #List of nodes that got attacked
            L_u = sampleNodesAnomaly(uNodeIndex, cAv_u * G_b, P_u)
            
            #Probability of attacked nodes
            P_su = np.repeat(1/(cAv_u * G_b), cAv_u * G_b)      
            
            #Pick U and V nodes cAv_u times from attacked V and attacking U
            seqU, seqV = sampleEdgeAnomaly(L_u, ML_va, P_su, P_va, cAv_u)
            
            #Update Feature List
            F = updateFeatureMatrix(F, seqU, seqV, T_s + i)            

        #Only a portion of attacked nodes (propagation ratio) will be infected. Add the infected nodes to the list of existing infected nodes
        if isG_p:
            if isU_a:
                np.append(ML_ua, sampleNodesAnomaly(L_u, int(G_p * len(L_u)), np.repeat(1/len(L_u), len(L_u))))

            if isV_a:
                np.append(ML_va, sampleNodesAnomaly(L_v, int(G_p * len(L_v)), np.repeat(1/len(L_v), len(L_v))))
                
    return F

In [148]:
#Inject anomaly
F = anomalyInjection(F, isU_a, isV_a, c_ua, c_va, C_a, G_b, isG_p, G_p, T_s, t_a, uNodeIndex, vNodeIndex)

TypeError: 'numpy.float64' object cannot be interpreted as an integer

In [149]:
#Final graph stats
nU = np.unique(F[:, 0]).shape[0]
nV = np.unique(F[:, 1]).shape[0]
nE = F.shape[0]
nA = F[F[:, 3] == 1].shape[0]

In [113]:
#Graph Stats:
print("Number of Nodes U: "+str(nU))
print("Number of Nodes V: "+str(nV))
print("Number of Edges: "+str(nE))
print("Number of Anomalies: "+str(nA))

Number of Nodes U: 76223
Number of Nodes V: 18633
Number of Edges: 1100079
Number of Anomalies: 100008


In [114]:
#####################################-------Attributes Generation-----------#########################################

In [115]:
#datasetName = 'Reddit'
datasetName = 'pcore'
#datasetName = 'Wiki'

In [116]:
#####################################----------Sampling----------------#########################################

In [117]:
nTrainData = pd.read_csv('../Dataset/'+ datasetName + 'Normal.csv')
aTrainData = pd.read_csv('../Dataset/'+ datasetName + 'Mal.csv')

In [None]:
#Skip this if you have pre-trained model
#Training normal dataset
model_normal = CTGAN(
    verbose = True,
    cuda = True
)
model_normal.fit(nTrainData)
model_normal.save(datasetName + 'Normal.pkl')

#Training anomaly dataset
model_anomaly = CTGAN(
    verbose = True,
    cuda = True
)
model_anomaly.fit(aTrainData)
model_anomaly.save(datasetName +'Mal.pkl')

In [118]:
#Load from the trained model
model_normal = CTGAN.load('../Saved_models/'+ datasetName + 'Normal.pkl')

model_anomaly = CTGAN.load('../Saved_models/'+ datasetName + 'Mal.pkl')

In [119]:
#Generate normal Attributes
attributes_normal = model_normal.sample(num_rows=(nE - nA))

#Generate anomaly attributes
attributes_anomaly = model_anomaly.sample(num_rows=nA)

In [120]:
#####################################----------Mapping----------------#########################################

In [121]:
df_normal = pd.DataFrame(F[F[:,3] == 0], columns = ['src', 'dest', 't', 'isAnomaly'])
df_anomaly = pd.DataFrame(F[F[:,3] == 1], columns = ['src', 'dest', 't', 'isAnomaly'])

df_normal = pd.concat([df_normal, attributes_normal], axis=1)
df_anomaly = pd.concat([df_anomaly, attributes_anomaly], axis=1)

df_final = pd.concat([df_normal, df_anomaly], axis=0)

In [122]:
id=3

In [123]:
df_final.to_csv(datasetName + '_Syn'+'genID_'+str(id)+'.csv',index=False)