In [2]:
import pandas as pd
import networkx as nx
import numpy as np
import random
from sklearn.utils import shuffle
import json


import math
from datetime import datetime, timedelta
from tqdm import tqdm
from collections import defaultdict
import matplotlib.pyplot as plt


In [81]:
def read_file(path):
    print(f'Reading file: {path}')
    df = pd.read_csv(path, low_memory=False, names=['Date','Source','Target','Calls','some','duration','rate'])
    print('Droping Duplicates')
    df = df.drop_duplicates(subset=['Source', 'Target'])
    df['Class'] = 1
    df['Date']= df['Date'].astype('datetime64[ns]')
    df['Source'] = df['Source'].astype(str)
    df['Target'] = df['Target'].astype(str)
    df = df[['Date', 'Source', 'Target','Class']]
    return df


def edges_to_df(g, edges):
    subgraph = g.edge_subgraph(edges)
    df = nx.to_pandas_edgelist(subgraph, source='Source', target='Target')
    df = df.drop_duplicates(subset=['Source','Target'])
    df['Class'] = 1
    df['Date']= df['Date'].astype('datetime64[ns]')
    df['Source'] = df['Source'].astype(str)
    df['Target'] = df['Target'].astype(str)
    df = df[['Date', 'Source', 'Target','Class']]
    return df

def node_sampling(g, size):
    print('Node Sampling')
    nodes = list(g.nodes())
    edges = set()
    while len(edges) < size:
        sampled_nodes = random.sample(nodes,1000)
        edges.update(g.edges(sampled_nodes))
    return edges_to_df(g,edges)


def bfs_sampling(g, size):
    nodes = list(g.nodes())
    root = random.choice(nodes)
    sampled_edges = set()
    sampled_nodes = set(root)
    node_queue = [root]
    while len(sampled_edges) < size:
        if node_queue:
            node = node_queue.pop(0)
            edges = g.edges(node)
            new_nodes = [t for s,t in edges if s not in sampled_nodes]
            sampled_nodes.update(new_nodes)
            node_queue += new_nodes
            sampled_edges.update(edges)
        else:
            node_queue.append(random.choice(nodes))
    return edges_to_df(g,sampled_edges)
        
    
def dfs_sampling(g, size):
    nodes = list(g.nodes())
    root = random.choice(nodes)
    sampled_edges = set()
    sampled_nodes = set(root)
    node_stack = [root]
    while len(sampled_edges) < size:
        if node_stack:
            node = node_stack.pop()
            edges = g.edges(node)
            new_nodes = [t for s,t in edges if s not in sampled_nodes]
            sampled_nodes.update(new_nodes)
            node_stack += new_nodes
            sampled_edges.update(edges)
        else:
            node_stack.append(random.choice(nodes))
    return edges_to_df(g,sampled_edges)

def negative_edge_sampling(g, df_train, df_test):
    # Sample negative edges
    nodes = list(g.nodes())
    source = list(df_train.Source.values[:(len(df_train)//2)]) + random.choices(nodes, k=len(df_train)//2)
    target = random.choices(nodes, k=len(df_train)//2) + list(df_train.Target.values[:(len(df_train)//2)])
    train_non_edges = set()
    for s,t in zip(source,target):
        if s == t:
            continue
        if (s,t) in train_non_edges:
            continue
        if g.has_edge(s,t):
            continue
        train_non_edges.add((s,t))

    source = random.choices(nodes, k=len(df_test))
    target = random.choices(nodes, k=len(df_test))
    test_non_edges = set()
    for s,t in zip(source,target):
        if s == t:
            continue
        if (s,t) in test_non_edges or (s,t) in train_non_edges:
            continue
        if g.has_edge(s,t):
            continue
        test_non_edges.add((s,t))

    df_neg_train = pd.DataFrame(list(train_non_edges), columns=['Source', 'Target'])
    df_neg_test = pd.DataFrame(list(test_non_edges), columns=['Source', 'Target'])
    df_neg_train['Class'] = 0
    df_neg_test['Class'] = 0
    df_train = df_train.append(df_neg_train, sort=True)
    df_test = df_test.append(df_neg_test, sort=True)
    df_train= df_train[['Date','Source', 'Target','Class']]
    df_test= df_test[['Date','Source', 'Target','Class']]
    print('size train:', len(df_train))
    print('size test:', len(df_test))
    return df_train, df_test


In [20]:
df_train_full = read_file('clean/2008-07-28.txt.gz')

Reading file: clean/2008-07-28.txt.gz
Droping Duplicates


In [12]:
df_train = shuffle(df_train_full).iloc[0:100000]

In [21]:
G = nx.from_pandas_edgelist(df_train_full.iloc[0:2000000], source='Source', target='Target',edge_attr='Date', create_using=nx.DiGraph())

In [22]:
df_node = node_sampling(G,100000)
g_node = nx.from_pandas_edgelist(df_node, source='Source', target='Target', create_using=nx.DiGraph())

df_random = shuffle(df_train).iloc[0:100000]
g_random = nx.from_pandas_edgelist(df_random, source='Source', target='Target',edge_attr='Date', create_using=nx.DiGraph())

df_bfs = bfs_sampling(G,100000)
g_bfs = nx.from_pandas_edgelist(df_bfs, source='Source', target='Target',edge_attr='Date', create_using=nx.DiGraph())

df_dfs = dfs_sampling(G,100000)
g_dfs = nx.from_pandas_edgelist(df_dfs, source='Source', target='Target',edge_attr='Date', create_using=nx.DiGraph())

Node Sampling


In [82]:
df_train, df_test = negative_edge_sampling(g_node, df_node, df_node)

size train: 201343
size test: 201466


In [7]:
def get_metrics(g):
    out_degree = [v for i,v in g.out_degree]
    in_degree = [v for i,v in g.in_degree]
    betweeness =list(nx.betweenness_centrality(g,k=10).values())
    closeness = list(nx.closeness_centrality(g).values())
    clustering = list(nx.clustering(g).values())
    n_degree = [v for v in nx.average_neighbor_degree(g).values()]
    scc = [len(s) for s in nx.strongly_connected_components(g)]
    wcc = [len(s) for s in nx.weakly_connected_components(g)]
    
    metrics = {
        'min-out-degree':min(out_degree),
        'max-out-degree':max(out_degree),
        'avg-out-degree':np.mean(out_degree),
        'std-out-degree':np.std(out_degree),
        
        'min-in-degree':min(in_degree),
        'max-in-degree':max(in_degree),
        'avg-in-degree':np.mean(in_degree),
        'std-in-degree':np.std(in_degree),
        
        'min-betweeness':min(betweeness),
        'max-betweeness':max(betweeness),
        'avg-betweeness':np.mean(betweeness),
        'std-betweeness':np.std(betweeness),
        
        'min-closeness':min(closeness),
        'max-closeness':max(closeness),
        'avg-closeness':np.mean(closeness),
        'std-closeness':np.std(closeness),
        
        'min-neighboor-degree':min(n_degree),
        'max-neighboor-degree':max(n_degree),
        'avg-neighboor-degree':np.mean(n_degree),
        'std-neighboor-degree':np.std(n_degree),
        
        'count-scc':len(scc),
        'min-scc':min(scc),
        'max-scc':max(scc),
        'avg-scc':np.mean(scc),
        'std-scc':np.std(scc),
        
        'count-wcc':len(wcc),
        'min-wcc':min(wcc),
        'max-wcc':max(wcc),
        'avg-wcc':np.mean(wcc),
        'std-wcc':np.std(wcc),
    }
    return pd.Series(metrics)
    

    

In [8]:
metrics_node = get_metrics(g_node)
metrics_random = get_metrics(g_random)
metrics_bfs = get_metrics(g_bfs)
metrics_dfs = get_metrics(g_dfs)

In [9]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [10]:
pd.DataFrame({'node': metrics_node,
              'random': metrics_random,
              'bfs': metrics_bfs,
              'dfs': metrics_dfs,
             })

Unnamed: 0,node,random,bfs,dfs
min-out-degree,0.0,0.0,0.0,0.0
max-out-degree,46.0,4.0,22.0,39.0
avg-out-degree,0.67158,0.54828,0.74672,0.74755
std-out-degree,1.00531,0.51906,1.05716,1.0658
min-in-degree,0.0,0.0,0.0,0.0
max-in-degree,3849.0,3843.0,4013.0,4059.0
avg-in-degree,0.67158,0.54828,0.74672,0.74755
std-in-degree,11.82988,10.71578,12.6496,12.78281
min-betweeness,0.0,0.0,0.0,0.0
max-betweeness,0.0,0.0,0.0,0.0
