In [1]:
import pandas as pd
import csv
import os, inspect, glob
import numpy as np, scipy.sparse as sp
import pickle
import networkx as nx
import random
from sklearn.model_selection import train_test_split




In [6]:
def combine_csv():
    os.chdir(os.getcwd())
    extension = 'csv'
    all_filenames = [i for i in glob.glob('*.{extension}'.format(extension=extension))]
    combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
    combined_csv.to_csv("combined_csv.csv", index=False, encoding='utf-8-sig')

In [7]:
combine_csv()

df = pd.read_csv("combined_csv.csv")

In [8]:
columns = ['c_asn', 'c_bytes_all', 'c_durat', 'c_first_abs', 'c_ip', 'c_isint',
   'c_pkts_all', 'c_port', 'c_type', 's_asn', 's_bytes_all', 's_durat',
   's_first_abs', 's_ip', 's_isint', 's_pkts_all', 's_port', 's_type']
df['c_asn'] = pd.to_numeric(df['c_asn'], errors='coerce')
df['s_asn'] = pd.to_numeric(df['s_asn'], errors='coerce')
df = df[~df['c_asn'].isnull()]
df = df[~df['s_asn'].isnull()]
# df = df[df['c_asn'].isnumeric()]
# df = df[df['s_asn'].isnumeric()]
df = df.dropna(subset=['c_asn'])
df = df.dropna(subset=['s_asn'])

In [74]:
df['c_asn'] = df['c_asn'].astype('int32')
df['s_asn'] = df['s_asn'].astype('int32')

asn_set = set(df['c_asn']).union(set(df['s_asn']))
asn_list= list(asn_set)
asn_dict = {asn_list[nodeid] : nodeid for nodeid in range(len(asn_list))}


In [75]:
# generate hypergraph
c_asn_ids = [asn_dict[asn] for asn in df['c_asn']]
s_asn_ids = [asn_dict[asn] for asn in df['s_asn']]
edges = list(zip(c_asn_ids, s_asn_ids))
hypergraph = {i: edges[i] for i in range(len(edges))}


In [81]:
attribute_names = ['c_asn', 'c_durat', 'c_first_abs', 'c_ip', 'c_isint', 'c_port', 
                   's_asn','s_durat', 's_first_abs', 's_ip', 's_isint', 's_port'] # select attributes name
label_names = ['c_type'] # select label to predict

In [82]:
c_nodes = c_asn_ids
attribute_mat = np.zeros((len(c_nodes), len(attribute_names)))
for c, name in enumerate(attribute_names):
    attribute_mat[:, c] = df[name]


In [83]:
#generate features
attr_dict = dict(zip(c_nodes, attribute_mat))
attr_dict_sorted = {}
for key in sorted(attr_dict.keys()):
    attr_dict_sorted[key] = attr_dict[key]
    
features = []
for v in range(len(attr_dict_sorted)):
    if v in attr_dict_sorted.keys():
        features.append(attr_dict_sorted[v])
    else:
        features.append(np.zeros(len(attribute_names)))
        
features = sp.csr_matrix(features)
features

<11632x12 sparse matrix of type '<class 'numpy.float64'>'
	with 76995 stored elements in Compressed Sparse Row format>

In [84]:
# generate labels
label_dict = dict(zip(c_asn_ids, df['c_type'])) #c
label_dict_sorted = {}
for key in sorted(label_dict.keys()):
    label_dict_sorted[key] = label_dict[key]
    
labels = []
for v in range(V):
    if v in label_dict_sorted.keys():
        labels.append(label_dict_sorted[v])
    else:
        labels.append(-1)

In [85]:
# train/test split
splits = {}
node_sorted = list(range(V))
random.shuffle(node_sorted)
train_size = round(0.75*V)
test_size = V - train_size
train_set, test_set = train_test_split(node_sorted, train_size=train_size, test_size=test_size)
splits['train'] = train_set
splits['test'] = test_set


In [86]:
with open("hypergraph.pickle", "wb") as hypergraph_out:
    pickle.dump(hypergraph, hypergraph_out)
    hypergraph_out.close()
with open("features.pickle", "wb") as features_out:
    pickle.dump(features, features_out)
    features_out.close()
with open("labels.pickle", "wb") as labels_out:
    pickle.dump(labels, labels_out)
    labels_out.close()
with open("splits.pickle", "wb") as splits_out:
    pickle.dump(splits, splits_out)
    splits_out.close()