In [None]:
from google.colab import drive
# Mount Google Drive into the Colab runtime; the data files read later in this notebook
# live under '/content/drive/My Drive/...'.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import datetime
import numpy as np
import networkx as nx
import pandas as pd
from scipy.stats import entropy

class NodeEngFeatures:
    """
    Engineered (hand-crafted) features for the nodes of a transaction multi-digraph.

    NOTE(review): this notebook defines ``NodeEngFeatures`` a second time in a later cell; that later
    definition shadows this one, so only a single definition should be kept.
    """

    def __init__(self, nodes, edges):
        """
        :param nodes: dataframe of nodes; the columns 'node' and 'isp' are read
        :param edges: dataframe of transactions; the columns 'source', 'target', 'timestamp' and
                      'amount' are read
        """
        self.nodes = nodes  # a dataframe
        self.edges = edges  # a dataframe
        self.G = nx.from_pandas_edgelist(self.edges, source='source', target='target',
                                         edge_attr=['timestamp', 'amount'],
                                         create_using=nx.MultiDiGraph())
        print("*** Original MD-Graph ***")
        # nx.info() was removed in networkx 3.0, so the short summary is assembled by hand
        print("{} with {} nodes and {} edges".format(type(self.G).__name__,
                                                     self.G.number_of_nodes(),
                                                     self.G.number_of_edges()))
        self.node_feature_names = self.retrieve_feature_name()

    def retrieve_feature_name(self):
        """
        retrieve the names of the features for the nodes
        :return: list of the names in the 'feature' column of FeatureStatus().feature_stat
        """
        feature_stat_df = FeatureStatus().feature_stat
        feature_name_list = feature_stat_df['feature'].tolist()
        return feature_name_list

    @staticmethod
    def _summary_stats(values, name):
        """
        summary statistics of a list of numbers, keyed as '<stat>_<name>'
        :param values: list of numbers (may be empty)
        :param name: suffix of the feature names, e.g. 'amount_in_tx'
        :return: dict with the keys avg_, min_, max_, sum_, std_ and ent_ (in this order);
                 avg/min/max/std are 0 for an empty list, ent is 0 when the values sum to 0
        """
        has_values = len(values) > 0
        total = np.sum(values)
        return {
            'avg_' + name: np.mean(values) if has_values else 0,
            'min_' + name: np.min(values) if has_values else 0,
            'max_' + name: np.max(values) if has_values else 0,
            'sum_' + name: total,
            'std_' + name: np.std(values) if has_values else 0,
            # entropy() normalizes by the sum, so an all-zero (or empty) list is mapped to 0 instead
            'ent_' + name: entropy(values) if total != 0 else 0,
        }

    def get_tx_amount_and_interval_list(self, node, opt):
        """
        returns the list of amounts and the list of intervals between consecutive (opt-) transactions
        :param node: the node that we focus on
        :param opt: 'in', 'out', or 'all' transactions
        :return: (amount_list, tx_interval); tx_interval is in minutes, computed on the sorted timestamps
        :raises ValueError: if opt is not one of 'in', 'out', 'all'
        """
        if opt == 'in':  # incoming tx
            tx_mask = self.edges['target'] == node
        elif opt == 'out':  # outgoing tx
            tx_mask = self.edges['source'] == node
        elif opt == 'all':  # all tx
            tx_mask = (self.edges['target'] == node) | (self.edges['source'] == node)
        else:
            raise ValueError("Option unavailable!")
        node_tx_df = self.edges[tx_mask]

        amount_list = node_tx_df['amount'].tolist()
        # Work on the numeric timestamps directly (assumed to be in seconds, as the original
        # datetime.fromtimestamp() conversion implied): going through naive local datetimes made the
        # intervals depend on the machine's time zone / DST transitions.
        sorted_timestamps = np.sort(node_tx_df['timestamp'].to_numpy(dtype=float))
        # interval of txs in minutes
        tx_interval = (np.diff(sorted_timestamps) / 60).tolist()

        return amount_list, tx_interval

    def neighbor_degree_features(self, node):
        """
        get the features related to the degree distributions of the neighbors of the node
        :return: (no. of edges within the egonet, no. of in-edges to the egonet, no. of out-edges from
                 the egonet, neighbor degrees, neighbor weighted degrees, neighbor in-degrees,
                 neighbor out-degrees); the degrees are measured inside the egonet
        """
        # a node that never appears in an edge is not part of the graph; nx.ego_graph would raise for it
        if node not in self.G:
            return 0, 0, 0, [], [], [], []

        # extract the egonet of the node
        # NOTE(review): on a directed graph nx.ego_graph follows out-going edges only unless
        # undirected=True is passed - confirm that this is the intended neighborhood.
        egonet = nx.ego_graph(self.G, node)

        # prerequisite for some neighborhood features
        no_edge_egonet_in = 0  # number of in-coming edges to egonet
        no_edge_egonet_out = 0  # number of out-going edges from egonet
        for nb_node in nx.nodes(egonet):
            if node != nb_node:
                # edges a neighbor has in the full graph but not inside the egonet cross its boundary
                no_edge_egonet_in += (self.G.in_degree[nb_node] - egonet.in_degree[nb_node])
                no_edge_egonet_out += (self.G.out_degree[nb_node] - egonet.out_degree[nb_node])

        neighbor_degrees = [d for n, d in egonet.degree() if n != node]
        neighbor_w_degrees = [d for n, d in egonet.degree(weight='amount') if n != node]
        neighbor_in_degrees = [d for n, d in egonet.in_degree() if n != node]
        neighbor_out_degrees = [d for n, d in egonet.out_degree() if n != node]

        no_edge_egonet = egonet.number_of_edges()

        return no_edge_egonet, no_edge_egonet_in, no_edge_egonet_out, \
               neighbor_degrees, neighbor_w_degrees, neighbor_in_degrees, neighbor_out_degrees

    def gen_node_features_single(self, node):
        """
        generate the features for the node
        :param node: node of interest; it must appear in the 'node' column of self.nodes
        :return: dict mapping feature name to value
        """
        no_edge_egonet, no_edge_egonet_in, no_edge_egonet_out, \
            neighbor_degrees, neighbor_w_degrees, neighbor_in_degrees, neighbor_out_degrees = \
            self.neighbor_degree_features(node)
        amnt_in_list, interval_in_tx = self.get_tx_amount_and_interval_list(node, 'in')
        amnt_out_list, interval_out_tx = self.get_tx_amount_and_interval_list(node, 'out')
        amnt_all_list, interval_all_tx = self.get_tx_amount_and_interval_list(node, 'all')
        node_row = self.nodes.loc[self.nodes['node'] == node]

        # the 'address', 'is_anchor', 'balance' and 'w_degree' features are intentionally not generated
        node_feature_dict = {
            'node': node_row['node'].values[0],
            'isp': node_row['isp'].values[0],

            # structural
            'degree': len(amnt_all_list),
            'in_degree': len(amnt_in_list),
            'out_degree': len(amnt_out_list),
        }

        # transactional
        transactional_groups = (
            (amnt_in_list, 'amount_in_tx'),
            (interval_in_tx, 'in_tx_interval'),
            (amnt_out_list, 'amount_out_tx'),
            (interval_out_tx, 'out_tx_interval'),
            (amnt_all_list, 'amount_all_tx'),  # all tx: in & out
            (interval_all_tx, 'all_tx_interval'),
        )
        for values, name in transactional_groups:
            node_feature_dict.update(self._summary_stats(values, name))

        # regional features
        node_feature_dict.update({
            'no_edge_within_egonet': no_edge_egonet,  # number of edges within the egonet for all nodes
            'no_edge_in_egonet': no_edge_egonet_in,  # number of in-edges to the egonet
            'no_edge_out_egonet': no_edge_egonet_out,  # number of out-edges from the egonet
            'no_edge_all_egonet': no_edge_egonet_in + no_edge_egonet_out,  # total number of edges to/from the egonet
        })

        # neighborhood features
        neighborhood_groups = (
            (neighbor_degrees, 'neighbor_degree'),
            (neighbor_w_degrees, 'neighbor_w_degree'),
            (neighbor_in_degrees, 'neighbor_in_degree'),
            (neighbor_out_degrees, 'neighbor_out_degree'),
        )
        for values, name in neighborhood_groups:
            node_feature_dict.update(self._summary_stats(values, name))

        return node_feature_dict

    def gen_node_features_list(self, node_list):
        """
        generate features for each node in the node_list
        :param node_list: a list of different nodes
        :return node_feature_df: a dataframe of the nodes and their features
        """
        node_features_dict_list = [self.gen_node_features_single(node) for node in node_list]
        node_feature_df = pd.DataFrame(node_features_dict_list, columns=self.node_feature_names)
        return node_feature_df

In [None]:
import datetime
import numpy as np
import networkx as nx
import pandas as pd
from scipy.stats import entropy

class NodeEngFeatures:
    """
    Engineered (hand-crafted) features for the nodes of a transaction multi-digraph.

    NOTE(review): this notebook also defines ``NodeEngFeatures`` in an earlier cell; this later
    definition is the one in effect, so only a single definition should be kept.
    """

    def __init__(self, nodes, edges):
        """
        :param nodes: dataframe of nodes; the columns 'node' and 'isp' are read
        :param edges: dataframe of transactions; the columns 'source', 'target', 'timestamp' and
                      'amount' are read
        """
        self.nodes = nodes  # a dataframe
        self.edges = edges  # a dataframe
        self.G = nx.from_pandas_edgelist(self.edges, source='source', target='target',
                                         edge_attr=['timestamp', 'amount'],
                                         create_using=nx.MultiDiGraph())
        print("*** Original MD-Graph ***")
        # nx.info() was removed in networkx 3.0, so the short summary is assembled by hand
        print("{} with {} nodes and {} edges".format(type(self.G).__name__,
                                                     self.G.number_of_nodes(),
                                                     self.G.number_of_edges()))
        self.node_feature_names = self.retrieve_feature_name()

    def retrieve_feature_name(self):
        """
        retrieve the names of the features for the nodes
        :return: list of the names in the 'feature' column of FeatureStatus().feature_stat
        """
        feature_stat_df = FeatureStatus().feature_stat
        feature_name_list = feature_stat_df['feature'].tolist()
        return feature_name_list

    @staticmethod
    def _summary_stats(values, name):
        """
        summary statistics of a list of numbers, keyed as '<stat>_<name>'
        :param values: list of numbers (may be empty)
        :param name: suffix of the feature names, e.g. 'amount_in_tx'
        :return: dict with the keys avg_, min_, max_, sum_, std_ and ent_ (in this order);
                 avg/min/max/std are 0 for an empty list, ent is 0 when the values sum to 0
        """
        has_values = len(values) > 0
        total = np.sum(values)
        return {
            'avg_' + name: np.mean(values) if has_values else 0,
            'min_' + name: np.min(values) if has_values else 0,
            'max_' + name: np.max(values) if has_values else 0,
            'sum_' + name: total,
            'std_' + name: np.std(values) if has_values else 0,
            # entropy() normalizes by the sum, so an all-zero (or empty) list is mapped to 0 instead
            'ent_' + name: entropy(values) if total != 0 else 0,
        }

    def get_tx_amount_and_interval_list(self, node, opt):
        """
        returns the list of amounts and the list of intervals between consecutive (opt-) transactions
        :param node: the node that we focus on
        :param opt: 'in', 'out', or 'all' transactions
        :return: (amount_list, tx_interval); tx_interval is in minutes, computed on the sorted timestamps
        :raises ValueError: if opt is not one of 'in', 'out', 'all'
        """
        if opt == 'in':  # incoming tx
            tx_mask = self.edges['target'] == node
        elif opt == 'out':  # outgoing tx
            tx_mask = self.edges['source'] == node
        elif opt == 'all':  # all tx
            tx_mask = (self.edges['target'] == node) | (self.edges['source'] == node)
        else:
            raise ValueError("Option unavailable!")
        node_tx_df = self.edges[tx_mask]

        amount_list = node_tx_df['amount'].tolist()
        # Work on the numeric timestamps directly (assumed to be in seconds, as the original
        # datetime.fromtimestamp() conversion implied): going through naive local datetimes made the
        # intervals depend on the machine's time zone / DST transitions.
        sorted_timestamps = np.sort(node_tx_df['timestamp'].to_numpy(dtype=float))
        # interval of txs in minutes
        tx_interval = (np.diff(sorted_timestamps) / 60).tolist()

        return amount_list, tx_interval

    def neighbor_degree_features(self, node):
        """
        get the features related to the degree distributions of the neighbors of the node
        :return: (no. of edges within the egonet, no. of in-edges to the egonet, no. of out-edges from
                 the egonet, neighbor degrees, neighbor weighted degrees, neighbor in-degrees,
                 neighbor out-degrees); the degrees are measured inside the egonet
        """
        # a node that never appears in an edge is not part of the graph; nx.ego_graph would raise for it
        if node not in self.G:
            return 0, 0, 0, [], [], [], []

        # extract the egonet of the node
        # NOTE(review): on a directed graph nx.ego_graph follows out-going edges only unless
        # undirected=True is passed - confirm that this is the intended neighborhood.
        egonet = nx.ego_graph(self.G, node)

        # prerequisite for some neighborhood features
        no_edge_egonet_in = 0  # number of in-coming edges to egonet
        no_edge_egonet_out = 0  # number of out-going edges from egonet
        for nb_node in nx.nodes(egonet):
            if node != nb_node:
                # edges a neighbor has in the full graph but not inside the egonet cross its boundary
                no_edge_egonet_in += (self.G.in_degree[nb_node] - egonet.in_degree[nb_node])
                no_edge_egonet_out += (self.G.out_degree[nb_node] - egonet.out_degree[nb_node])

        neighbor_degrees = [d for n, d in egonet.degree() if n != node]
        neighbor_w_degrees = [d for n, d in egonet.degree(weight='amount') if n != node]
        neighbor_in_degrees = [d for n, d in egonet.in_degree() if n != node]
        neighbor_out_degrees = [d for n, d in egonet.out_degree() if n != node]

        no_edge_egonet = egonet.number_of_edges()

        return no_edge_egonet, no_edge_egonet_in, no_edge_egonet_out, \
               neighbor_degrees, neighbor_w_degrees, neighbor_in_degrees, neighbor_out_degrees

    def gen_node_features_single(self, node):
        """
        generate the features for the node
        :param node: node of interest; it must appear in the 'node' column of self.nodes
        :return: dict mapping feature name to value
        """
        no_edge_egonet, no_edge_egonet_in, no_edge_egonet_out, \
            neighbor_degrees, neighbor_w_degrees, neighbor_in_degrees, neighbor_out_degrees = \
            self.neighbor_degree_features(node)
        amnt_in_list, interval_in_tx = self.get_tx_amount_and_interval_list(node, 'in')
        amnt_out_list, interval_out_tx = self.get_tx_amount_and_interval_list(node, 'out')
        amnt_all_list, interval_all_tx = self.get_tx_amount_and_interval_list(node, 'all')
        node_row = self.nodes.loc[self.nodes['node'] == node]

        # the 'address', 'is_anchor', 'balance' and 'w_degree' features are intentionally not generated
        node_feature_dict = {
            'node': node_row['node'].values[0],
            'isp': node_row['isp'].values[0],

            # structural
            'degree': len(amnt_all_list),
            'in_degree': len(amnt_in_list),
            'out_degree': len(amnt_out_list),
        }

        # transactional
        transactional_groups = (
            (amnt_in_list, 'amount_in_tx'),
            (interval_in_tx, 'in_tx_interval'),
            (amnt_out_list, 'amount_out_tx'),
            (interval_out_tx, 'out_tx_interval'),
            (amnt_all_list, 'amount_all_tx'),  # all tx: in & out
            (interval_all_tx, 'all_tx_interval'),
        )
        for values, name in transactional_groups:
            node_feature_dict.update(self._summary_stats(values, name))

        # regional features
        node_feature_dict.update({
            'no_edge_within_egonet': no_edge_egonet,  # number of edges within the egonet for all nodes
            'no_edge_in_egonet': no_edge_egonet_in,  # number of in-edges to the egonet
            'no_edge_out_egonet': no_edge_egonet_out,  # number of out-edges from the egonet
            'no_edge_all_egonet': no_edge_egonet_in + no_edge_egonet_out,  # total number of edges to/from the egonet
        })

        # neighborhood features
        neighborhood_groups = (
            (neighbor_degrees, 'neighbor_degree'),
            (neighbor_w_degrees, 'neighbor_w_degree'),
            (neighbor_in_degrees, 'neighbor_in_degree'),
            (neighbor_out_degrees, 'neighbor_out_degree'),
        )
        for values, name in neighborhood_groups:
            node_feature_dict.update(self._summary_stats(values, name))

        return node_feature_dict

    def gen_node_features_list(self, node_list):
        """
        generate features for each node in the node_list
        :param node_list: a list of different nodes
        :return node_feature_df: a dataframe of the nodes and their features
        """
        node_features_dict_list = [self.gen_node_features_single(node) for node in node_list]
        node_feature_df = pd.DataFrame(node_features_dict_list, columns=self.node_feature_names)
        return node_feature_df

In [None]:
import pandas as pd


class FeatureStatus:
    """
    Catalogue of the node features together with a flag telling whether each one is selected.
    """

    def __init__(self):
        self.feature_stat = self.generate_feature_status()

    def generate_feature_status(self):
        """
        generate the status of different features
        :return: dataframe with the columns 'feature' (feature name) and 'select' (1 = selected,
                 0 = not selected), indexed by the integer id of the feature
        """
        stat_prefixes = ('avg', 'min', 'max', 'sum', 'std', 'ent')
        transactional_groups = ('amount_in_tx', 'in_tx_interval',
                                'amount_out_tx', 'out_tx_interval',
                                'amount_all_tx', 'all_tx_interval')
        egonet_features = ('no_edge_within_egonet', 'no_edge_in_egonet',
                           'no_edge_out_egonet', 'no_edge_all_egonet')
        neighborhood_groups = ('neighbor_degree', 'neighbor_w_degree',
                               'neighbor_in_degree', 'neighbor_out_degree')

        def expand(groups):
            # every group contributes one feature per statistic, e.g. 'avg_amount_in_tx'
            return [prefix + '_' + group for group in groups for prefix in stat_prefixes]

        # major features first ('isp' must NEVER be selected), then the structural ones
        feature_names = ['node', 'isp', 'degree', 'in_degree', 'out_degree']
        feature_names += expand(transactional_groups)
        feature_names += list(egonet_features)
        feature_names += expand(neighborhood_groups)

        # ids 1 ('address') and 3 ('is_anchor') belong to disabled features and id 4 is unused,
        # so the numbering goes 0, 2, 5, 6, ...
        feature_ids = [0, 2] + list(range(5, 5 + len(feature_names) - 2))

        # the features that are currently switched on
        selected = {
            'node',
            'in_degree',
            'sum_amount_in_tx', 'std_amount_in_tx',
            'avg_in_tx_interval', 'max_in_tx_interval', 'sum_in_tx_interval',
            'sum_amount_out_tx',
            'ent_amount_all_tx',
            'no_edge_in_egonet', 'no_edge_all_egonet',
        }

        feature_dict = {feature_id: [name, 1 if name in selected else 0]
                        for feature_id, name in zip(feature_ids, feature_names)}

        return pd.DataFrame.from_dict(feature_dict, orient='index',
                                      columns=['feature', 'select'])

In [None]:
""""
Preparation of the graph that is going to be used by RiWalk --- Ethereum
"""

# --- import statements ---

import pandas as pd
import numpy as np
import networkx as nx
import time
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler


# --- variables ---
rnd_seed = 42  # random seed; presumably meant for the stochastic steps (e.g. t-SNE) so that runs are reproducible


class NodeFeature:
    """
    a class to define the appropriate features for ALL nodes of the graph
    this facilitates the modifications to the RiWalk
    """

    def __init__(self, nodes, edges):
        """
        :param nodes: dataframe of nodes; the column 'node' is read here
        :param edges: dataframe of transactions; the columns 'source', 'target', 'amount' and
                      'timestamp' are read here
        """
        self.nodes = nodes
        self.edges = edges
        self.n_fe_features = NodeEngFeatures(self.nodes, self.edges)

    def generate_edge_list_for_RiWalk_unweighted(self):
        """
        generates graphs for the original RiWalk method
        :return unweighted, directed, simple graph as a dataframe with the columns 'source' and
                'target'; one row per distinct (source, target) pair
        """
        grouped_edges = self.edges.groupby(['source', 'target'])
        # the group keys are exactly the distinct (source, target) pairs
        distinct_pairs = [(key[0], key[1]) for key, _ in grouped_edges]
        return pd.DataFrame(distinct_pairs, columns=['source', 'target'])

    def generate_edge_list_for_RiWalk_weighted(self):
        """
        generate graph for RiWalk
        convert MDG to a simple directed weighted graph
        :return: dataframe with the columns 'source', 'target' and 'weight', where weight is the
                 product of the normalized amount, normalized timestamp and normalized no. of txs
        """
        mdg = nx.from_pandas_edgelist(self.edges, source='source', target='target',
                                      edge_attr=['amount', 'timestamp'], create_using=nx.MultiDiGraph())

        # generate simple graph: parallel edges are merged into a single edge
        simG = nx.DiGraph()
        for u, v, data in mdg.edges(data=True):
            a = data.get('amount', 0)
            t = data.get('timestamp', 0)

            if simG.has_edge(u, v):
                merged = simG[u][v]
                merged['amount'] += a  # sum of amounts
                merged['timestamp'] = max(t, merged['timestamp'])  # more recent
                merged['n_tx'] += 1
            else:
                simG.add_edge(u, v, amount=a, timestamp=t, n_tx=1)

        # Snapshot the degrees BEFORE normalizing. Querying simG.degree(..., weight=...) inside the
        # loop below would read attributes that the loop has already overwritten, which made the
        # result depend on the iteration order of the edges.
        amount_degree = dict(simG.degree(weight='amount'))
        timestamp_degree = dict(simG.degree(weight='timestamp'))
        plain_degree = dict(simG.degree())

        # normalize weight values by the (in + out) degree of the source node
        for u, v in simG.edges():
            attrs = simG[u][v]
            attrs['amount'] = attrs['amount'] / amount_degree[u] if amount_degree[u] != 0 else 0
            # same zero guard as for the amount; previously an all-zero timestamp degree raised
            attrs['timestamp'] = attrs['timestamp'] / timestamp_degree[u] if timestamp_degree[u] != 0 else 0
            attrs['n_tx'] = attrs['n_tx'] / plain_degree[u]  # u has at least this edge, so degree >= 1

            # aggregated weight value
            attrs['weight'] = attrs['amount'] * attrs['timestamp'] * attrs['n_tx']

        edge_list_df = nx.to_pandas_edgelist(simG, source='source', target='target')
        # only preserve the 'weight'
        edge_list_df = edge_list_df.drop(['amount', 'timestamp', 'n_tx'], axis=1)

        return edge_list_df

    def gen_features_all_nodes(self):
        """
        get a dataframe containing the selected features for all nodes
        """
        node_list = self.nodes['node'].tolist()  # select all the nodes
        all_feature_df = self.n_fe_features.gen_node_features_list(node_list)
        return all_feature_df

    def gen_t_SNE_components_2d(self, features_df, shift_coff, random_state=None):
        """
        generate the t-SNE components of a set of features
        * the 2 first most important components
        :param features_df: dataframe of features; the columns 'node' and 'isp' are dropped before
                            the embedding is computed
        :param shift_coff: each component is shifted by |min(component)| * shift_coff
        :param random_state: seed of the t-SNE; defaults to the module-level rnd_seed
        :return: dict with the keys 'tsne-2d-first' and 'tsne-2d-second'
        """
        import inspect  # local import: only needed for the scikit-learn version check below

        values = features_df.drop(['node', 'isp'], axis=1).values.tolist()
        values_scaled = MinMaxScaler().fit_transform(values)

        # t-SNE is stochastic; without a seed every run produced a different embedding
        seed = rnd_seed if random_state is None else random_state
        # scikit-learn 1.5 renamed TSNE's n_iter to max_iter (n_iter is removed in 1.7)
        tsne_params = inspect.signature(TSNE).parameters
        iter_kwarg = 'max_iter' if 'max_iter' in tsne_params else 'n_iter'

        print('\tGraphPrep; t-SNE starts.')
        time_start = time.time()
        tsne = TSNE(n_components=2, verbose=0, perplexity=40, random_state=seed, **{iter_kwarg: 300})
        tsne_results = tsne.fit_transform(values_scaled)
        print('\tt-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))

        # shift both components by a multiple of the magnitude of their minimum
        shift_first = np.abs(np.min(tsne_results[:, 0])) * shift_coff
        shift_second = np.abs(np.min(tsne_results[:, 1])) * shift_coff
        tsne_data = {'tsne-2d-first': tsne_results[:, 0] + shift_first,
                     'tsne-2d-second': tsne_results[:, 1] + shift_second,
                     }
        return tsne_data


def generate_edgelist_for_RiWalk(n_feature, edgelist_RiWalk_filename):
    """
    Build the weighted edge list for RiWalk and store it as a CSV file (with header, without index).

    :param n_feature: object providing generate_edge_list_for_RiWalk_weighted()
    :param edgelist_RiWalk_filename: path of the CSV file to write
    """
    print("\tEdge-list generation starts.")
    began_at = time.time()

    # the weighted variant is the active one; generate_edge_list_for_RiWalk_unweighted() together
    # with a space-separated, header-less to_csv() is the alternative for the original RiWalk
    weighted_edges = n_feature.generate_edge_list_for_RiWalk_weighted()
    weighted_edges.to_csv(edgelist_RiWalk_filename, index=False)

    elapsed = time.time() - began_at
    print("\tEdge-list generation lasted {} seconds.".format(elapsed))


def generate_features_for_all_nodes(n_feature, features_filename):
    """
    Compute the features of all nodes (node-list with features for RiWalk-NA) and store them as CSV.

    :param n_feature: object providing gen_features_all_nodes()
    :param features_filename: path of the CSV file to write (without index)
    :return: the dataframe returned by n_feature.gen_features_all_nodes()
    """
    print("\tFeatures generation starts.")
    began_at = time.time()

    feature_table = n_feature.gen_features_all_nodes()
    feature_table.to_csv(features_filename, index=False)

    elapsed = time.time() - began_at
    print("\tFeature generation lasted {} seconds.".format(elapsed))
    return feature_table


def generate_node_list_for_RiWalk(nodes_all_features_df, selected_features,
                                  n_feature, tsne, nodelist_RiWalk_filename):
    """
    Write the node list consumed by RiWalk as a CSV file (index omitted).

    :param nodes_all_features_df: DataFrame with one row per node; must contain a 'node' column
                                  when `tsne` is set
    :param selected_features: columns copied to the node list when `tsne` is not set
    :param n_feature: object exposing `gen_t_SNE_components_2d(features_df, shift)`; only used
                      when `tsne` is set
    :param tsne: truthy -> the node list holds 'node' plus the two t-SNE components
                 ('tsne_2d_1', 'tsne_2d_2'); falsy -> it holds `selected_features`
    :param nodelist_RiWalk_filename: destination of the CSV file
    """
    print("\tSelecting features for RiWalk node-list.")
    tic = time.time()

    if tsne:
        # t-SNE components; the shift coefficient is forwarded unchanged to n_feature
        shift_coeff = 10
        components = n_feature.gen_t_SNE_components_2d(nodes_all_features_df, shift_coeff)
        rows = list(zip(nodes_all_features_df['node'].tolist(),
                        components['tsne-2d-first'],
                        components['tsne-2d-second']))
        node_list_df = pd.DataFrame(rows, columns=['node', 'tsne_2d_1', 'tsne_2d_2'])
    else:
        # plain column selection
        node_list_df = nodes_all_features_df[selected_features].copy()

    node_list_df.to_csv(nodelist_RiWalk_filename, index=False)
    elapsed = time.time() - tic
    print("\tFeature selection lasted {} seconds.".format(elapsed))


def main():
    """
    Graph-preparation experiment.

    `gprep_opt` selects the task:
      * 'feature': compute the features of every node and store them in `features_filename`
      * 'ri': write the RiWalk edge list, then build the RiWalk node list from the stored
              features, keeping 'node' plus the first ten entries of the 'feature' column
              read from `feat_imp_filename`
    Any other value raises a ValueError (after the input data has been loaded).
    """
    gprep_opt = 'ri'

    # input data
    node_filename = "/content/drive/My Drive/Baseline/Dataset/nodeData.csv"
    edge_filename = "/content/drive/My Drive/Baseline/Dataset/edgeData.csv"
    # files written and/or read by the preparation tasks
    features_filename = "/content/drive/My Drive/Baseline/Dataset/features.csv"
    feat_imp_filename = "/content/drive/My Drive/Baseline/Dataset/imp_features.csv"
    nodelist_RiWalk_filename = "/content/drive/My Drive/Baseline/Dataset/nodelist.nodelist"
    edgelist_RiWalk_filename = "/content/drive/My Drive/Baseline/Dataset/edgelist.edgelist"

    n_feature = NodeFeature(pd.read_csv(node_filename), pd.read_csv(edge_filename))

    # graph preparation tasks
    if gprep_opt not in ('feature', 'ri'):
        raise ValueError("Incorrect value for graph preparation option!")

    if gprep_opt == 'feature':
        generate_features_for_all_nodes(n_feature, features_filename)
        return

    # gprep_opt == 'ri': edge list first, node list second
    generate_edgelist_for_RiWalk(n_feature, edgelist_RiWalk_filename)

    all_features = pd.read_csv(features_filename)
    ranked_features = pd.read_csv(feat_imp_filename)['feature'].tolist()
    node_list_columns = ['node'] + ranked_features[:10]

    use_tsne = False  # True -> node list made of the t-SNE components
    generate_node_list_for_RiWalk(all_features, node_list_columns, n_feature,
                                  use_tsne, nodelist_RiWalk_filename)


if __name__ == '__main__':
    main()

*** Original MD-Graph ***
MultiDiGraph with 35417 nodes and 81000 edges
	Edge-list generation starts.
	Edge-list generation lasted 23.30220079421997 seconds.
	Selecting features for RiWalk node-list.
	Feature selection lasted 0.4857761859893799 seconds.


In [None]:
"""
Implement the node classification task
In fact, this file is a collection of utility functions
"""

# --- import statements ---
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score, roc_auc_score
from sklearn.manifold import TSNE
import time
import seaborn as sns
import math


# --- parameters ---
rnd_seed = 42  # seed handed to the splitter and to the classifiers defined below
random.seed(rnd_seed)  # seeds Python's built-in `random` module only
test_size = 0.2  # test fraction used by train_test_split (StratifiedShuffleSplit)


# --- utility functions ---

def perf_report(identifier, y_true, y_pred, binary, print_enable=False):
    """
    Compute precision, recall, F1-score and accuracy of a set of predictions.

    :param identifier: label used in the header of the printed report
    :param y_true: ground-truth labels
    :param y_pred: predicted labels
    :param binary: True -> scores computed with scikit-learn's 'binary' averaging;
                   False -> macro-averaged scores over all classes
    :param print_enable: also print the scores, the micro-averaged F1-score and the
                         per-class classification report
    :return: (precision, recall, f1, accuracy)
    """
    if binary:
        average = 'binary'
    else:
        print(">>> Multi-class Classification.")
        average = 'macro'

    # zero_division=0 yields the same value (0.0) as the default for a class that is never
    # predicted, but without emitting one UndefinedMetricWarning per metric call
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average=average,
                                                       zero_division=0)
    acc = accuracy_score(y_true, y_pred)

    if print_enable:
        # the value reported as "Micro-Average" has to be micro-averaged; re-using the
        # 'binary'/'macro' average would merely repeat the F1-score printed just above
        micro_f1 = f1_score(y_true, y_pred, average='micro', zero_division=0)
        print("\t*** {} performance reports: ***".format(str(identifier)))
        print("\t\tPrecision: %.3f \n\t\tRecall: %.3f \n\t\tF1-Score: %.3f" % (prec, rec, f1))
        print('\t\tMicro-Average F1-Score: %.3f' % micro_f1)
        print('\t\tAccuracy: %.3f' % acc)
        print(classification_report(y_true, y_pred, zero_division=0))
    return prec, rec, f1, acc


def train_test_split(X, y, rnd_seed):
    """
    Stratified, single-shot train/test split of a feature set and its labels.

    The test fraction is taken from the module-level `test_size`.
    :param X: feature set, should be array or list (indexed by integer position)
    :param y: labels, should be array or list (indexed by integer position)
    :param rnd_seed: random seed of the splitter
    :return: X_train, X_test, y_train, y_test (python lists)
    """
    def _pick(sequence, positions):
        # gather the items of `sequence` found at `positions`, in that order
        return [sequence[pos] for pos in positions]

    # the splitter works on row positions; the rows are gathered afterwards
    positions = list(range(len(y)))
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=rnd_seed)
    splitter.get_n_splits(positions, y)  # return value is not used
    train_positions, test_positions = next(splitter.split(positions, y))

    X_train = _pick(X, train_positions)
    X_test = _pick(X, test_positions)
    y_train = _pick(y, train_positions)
    y_test = _pick(y, test_positions)
    return X_train, X_test, y_train, y_test


def simple_classification(clf, clf_id, emb_flag, X_train, X_test, y_train, y_test,
                          binary, exp_id, print_enable=False):
    """
    train the model on the train set and evaluate it on both the train and the test set.
    no cross-validation is applied.
    important NOTE: for the ROC-AUC it is implicitly inferred that the positive label is 1
    (second column of `predict_proba`).

    :param clf: estimator providing fit / predict, and predict_proba when `binary` is True
    :param clf_id: classifier name stored in the result and used in the printed reports
    :param emb_flag: embedding / feature-set identifier stored in the result
    :param X_train: train set
    :param X_test: test set
    :param y_train: train set labels
    :param y_test: test set labels
    :param binary: binary (True) or multi-class (False) classification
    :param exp_id: "<index>;<id>" or just "<id>" (the index then defaults to 0)
    :param print_enable: print the performance reports and the resulting dictionary
    :return: (perf_dict, clf) -- the performance figures and the fitted estimator; the two
             '*_auc' entries are NaN when `binary` is False
    :raises ValueError: `exp_id` contains more than one ';'
    """
    # train the model
    clf.fit(X_train, y_train)

    # predict the training and the test set labels
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)

    # evaluate the performance on both sets
    tr_prec, tr_rec, tr_f1, tr_acc = perf_report(str(clf_id) + ' - Training Set',
                                                 y_train, y_train_pred, binary, print_enable)
    ts_prec, ts_rec, ts_f1, ts_acc = perf_report(str(clf_id) + ' - Test Set',
                                                 y_test, y_test_pred, binary, print_enable)

    # auc-roc: only computed for the binary task; NaN keeps both keys present (and the
    # function usable) for the multi-class task
    tr_roc_auc = ts_roc_auc = float('nan')
    if binary:
        y_test_proba = clf.predict_proba(X_test)[:, 1]
        y_train_proba = clf.predict_proba(X_train)[:, 1]
        tr_roc_auc = roc_auc_score(y_train, y_train_proba)
        ts_roc_auc = roc_auc_score(y_test, y_test_proba)

    # NOTE: the index stays a string when it comes from exp_id and is the int 0 otherwise
    exp_parts = exp_id.split(";")
    if len(exp_parts) == 2:
        index, exp_name = exp_parts
    elif len(exp_parts) == 1:
        index, exp_name = 0, exp_parts[0]
    else:
        raise ValueError("Incorrect Experiment ID!")

    perf_dict = {
        'index': index,
        'exp_id': exp_name,
        'emb_method': str(emb_flag),
        'classifier': str(clf_id),

        'train_prec': tr_prec,
        'train_rec': tr_rec,
        'train_f1': tr_f1,
        'train_acc': tr_acc,
        'train_auc': tr_roc_auc,

        'test_prec': ts_prec,
        'test_rec': ts_rec,
        'test_f1': ts_f1,
        'test_acc': ts_acc,
        'test_auc': ts_roc_auc
    }
    if print_enable:
        print(perf_dict)
    return perf_dict, clf


def rf_lr_classification(X_train, X_test, y_train, y_test, stats_file, flag,
                         binary, exp_id, print_report=False):
    """
    apply classification to input X with label y with "Random Forest" & "Logistic Regression"
    :param X_train: train set
    :param X_test: test set
    :param y_train: train set labels
    :param y_test: test set labels
    :param stats_file: CSV file meant to collect the results; currently unused because the
                       code appending to it is commented out below
    :param flag: embedding / feature-set identifier stored in the results
    :param binary: binary (True) or multi-class (False) classification
    :param exp_id: experiment identifier, see simple_classification
    :param print_report: whether print the results of classification or not
    :return the classification results: (rf_perf, rf_clf, lr_perf, lr_clf)
    """
    # define classifier
    rf_clf = RandomForestClassifier(n_estimators=50, max_features=10, max_depth=5,
                                    random_state=rnd_seed)
    # max_iter is given as an integer: scikit-learn releases that validate constructor
    # parameters do not accept a float such as 1e5 here
    lr_clf = LogisticRegression(penalty='l1', solver='liblinear', max_iter=100000,
                                random_state=rnd_seed)

    # apply classification
    rf_perf, rf_clf = simple_classification(rf_clf, 'RF', flag, X_train, X_test, y_train, y_test,
                                            binary, exp_id, print_report)
    lr_perf, lr_clf = simple_classification(lr_clf, 'LR', flag, X_train, X_test, y_train, y_test,
                                            binary, exp_id, print_report)

    # append the results to file
    # stats_df = pd.read_csv(stats_file)
    # stats_df = stats_df.append(rf_perf, ignore_index=True)
    # stats_df = stats_df.append(lr_perf, ignore_index=True)
    # stats_df.to_csv(stats_file, index=False)

    return rf_perf, rf_clf, lr_perf, lr_clf


def RF_sorted_feature_importance(clf, feature_name):
    """
    rank all the features of the clf model by importance, most important first
    assumption: clf is a trained model exposing `feature_importances_`

    :param clf: trained model
    :param feature_name: feature names, positionally aligned with `clf.feature_importances_`
    :return: DataFrame with the columns 'feature' and 'importance', one row per feature name
    """
    scores = clf.feature_importances_
    order = np.argsort(scores)[::-1]  # positions sorted by decreasing importance

    ranking = []
    for rank in range(len(feature_name)):
        position = order[rank]
        ranking.append((feature_name[position], scores[position]))

    return pd.DataFrame(ranking, columns=['feature', 'importance'])


def RF_feature_imp(X, y, feature_name, png_file):
    """
    fit a Random Forest Classifier, print its feature ranking and save a bar chart of the
    feature importance
    :param X: features
    :param y: labels
    :param feature_name: the name of the features
    :param png_file: file the bar chart is saved to
    """
    # define and fit classifier
    forest = RandomForestClassifier(n_estimators=100, max_features=16, max_depth=5,
                                    random_state=rnd_seed)
    forest.fit(X, y)

    # feature importance of the forest and its spread over the individual trees
    importances = forest.feature_importances_
    per_tree_importances = [tree.feature_importances_ for tree in forest.estimators_]
    std = np.std(per_tree_importances, axis=0)
    order = np.argsort(importances)[::-1]

    n_names = len(feature_name)

    # Print the feature ranking
    print("Feature ranking:")
    for rank in range(n_names):
        position = order[rank]
        print("%d. feature %d (%s) (%f)" % (rank + 1, position, feature_name[position],
                                            importances[position]))

    # Plot the impurity-based feature importances of the forest;
    # the x ticks show the feature positions, not the feature names
    bar_positions = range(n_names)
    plt.figure()
    plt.title("Feature Importance")
    plt.bar(bar_positions, importances[order], color="g", yerr=std[order], align="center")
    plt.xticks(bar_positions, order)
    plt.xlim([-1, n_names])
    plt.savefig(png_file)


def read_emb_and_node_list(emb_file, node_file):
    """
    load an embedding file and attach the 'isp' value of every node

    The embedding file is space separated; its first line is skipped and every other line
    holds the node id followed by the embedding values.
    :param emb_file: embedding file
    :param node_file: CSV file with (at least) the columns 'node' and 'isp'
    :return: DataFrame with the columns 'node', 'emb_0', 'emb_1', ..., 'isp'; the join is a
             left join on 'node', so an embedded node missing from `node_file` gets NaN as 'isp'
    """
    embedding = pd.read_csv(emb_file, sep=' ', skiprows=1, header=None)
    n_dims = embedding.shape[1] - 1
    embedding.columns = ['node'] + ['emb_{}'.format(dim) for dim in range(n_dims)]

    labels = pd.read_csv(node_file)[['node', 'isp']]

    return embedding.merge(labels, on='node', how='left')


def data_preproc_for_RiWalk_Binary_clf(emb_file, node_file):
    """
    turn the RiWalk generated embedding into train/test sets for node classification
    :param emb_file: embedding file
    :param node_file: CSV file holding the 'isp' label of the nodes
    :return: X_train, X_test, y_train, y_test, feature_names -- the labels are the 'isp'
             values, the features are every column except 'node' and 'isp'
    """
    merged = read_emb_and_node_list(emb_file, node_file)

    # labels and features for BINARY classification
    y = merged['isp'].tolist()
    feature_frame = merged.drop(columns=['node', 'isp'])
    feature_names = feature_frame.columns

    # split the train and test set
    X_train, X_test, y_train, y_test = train_test_split(feature_frame.values.tolist(), y,
                                                        rnd_seed)
    return X_train, X_test, y_train, y_test, feature_names


def prepare_data_for_concat_fe_emb(emb_file, fe_file):
    """
    pre-process the data for the node classification of a new dataset consisting of the
    engineered features and the embeddings

    The engineered features are standardised, the embedding values are left untouched.
    The scaler is fitted on the train rows ONLY and then applied to both sets, so that no
    statistic of the test rows leaks into the training data.

    :param emb_file: space separated embedding file; its first line is skipped and every
                     other line holds the node id followed by the embedding values
    :param fe_file: CSV file with the columns 'node', 'isp' and the engineered features
    :return: X_train, X_test, y_train, y_test (python lists); every row of X holds the
             embedding values followed by the engineered features
    """
    # read embedding
    emb_df = pd.read_csv(emb_file, sep=' ', skiprows=1, header=None)
    n_emb = emb_df.shape[1] - 1
    emb_df.columns = ['node'] + [f'emb_{i}' for i in range(n_emb)]

    # read the engineered features
    node_df = pd.read_csv(fe_file)

    # merge: one row per embedded node
    merged_df = emb_df.merge(node_df, on='node', how='left')

    # datasets for BINARY classification
    y = merged_df['isp'].tolist()
    X_df = merged_df.drop(['node', 'isp'], axis=1)
    # after the merge the embedding columns come first, the engineered features follow
    fe_positions = list(range(n_emb, X_df.shape[1]))

    # split the train and test set
    X_train, X_test, y_train, y_test = train_test_split(X_df.values.tolist(), y, rnd_seed)

    # scale the engineered features; fitted on the train set only
    if fe_positions and len(X_train) > 0:
        scaler = StandardScaler()
        train_arr = np.asarray(X_train, dtype=float)
        train_arr[:, fe_positions] = scaler.fit_transform(train_arr[:, fe_positions])
        X_train = train_arr.tolist()
        if len(X_test) > 0:
            test_arr = np.asarray(X_test, dtype=float)
            test_arr[:, fe_positions] = scaler.transform(test_arr[:, fe_positions])
            X_test = test_arr.tolist()

    return X_train, X_test, y_train, y_test


def plot_TSNE(values, labels, png_file):
    """
    plot the embeddings as a TSNE graph
    :param values: samples to project to two dimensions (one row per sample)
    :param labels: one label per sample; used to colour the points
    :param png_file: file the scatter plot is saved to
    """
    print('\tt-SNE starts.')
    time_start = time.time()
    # seeded with the module-level rnd_seed, like the classifiers, so that repeated runs
    # produce the same projection
    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=rnd_seed)
    tsne_results = tsne.fit_transform(values)
    print('\tt-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))

    # plotting
    p_data = {'tsne-2d-first': tsne_results[:, 0],
              'tsne-2d-second': tsne_results[:, 1],
              'label': labels,
              }

    plt.figure(figsize=(16, 10))
    sns.scatterplot(
        x="tsne-2d-first", y="tsne-2d-second",
        hue="label",
        palette=sns.color_palette("hls", len(set(labels))),  # one colour per distinct label
        data=p_data,
        legend="full",
        alpha=0.3
    )
    # plt.show()
    plt.savefig(png_file)


def EF_analysis_selected_nodes(output_path, graph, edges_filename, nodes_filename,
                               features_filename, stats_file, feat_imp_filename,
                               flag, binary, rnd_seed, exp_id, extra_analysis):
    """
    node classification based on the engineered features (FE)

    Steps: keep the feature rows of the nodes listed in `nodes_filename`, split them into
    train and test sets, min-max scale them (scaler fitted on the train set only), run the
    RF and LR classifiers and save the RF feature-importance ranking to `feat_imp_filename`.

    :param output_path: directory of the figures saved when `extra_analysis` is set
    :param graph: graph name, used in the figure file names
    :param edges_filename: not used by this function
    :param nodes_filename: CSV file with a 'node' column listing the nodes to classify
    :param features_filename: CSV file with the columns 'node', 'isp' and the features
    :param stats_file: forwarded to rf_lr_classification
    :param feat_imp_filename: destination CSV of the feature-importance ranking
    :param flag: identifier stored in the results and used in the figure file names
    :param binary: binary (True) or multi-class (False) classification
    :param rnd_seed: random seed of the train/test split (shadows the module-level value)
    :param exp_id: experiment identifier, forwarded to rf_lr_classification
    :param extra_analysis: also save the feature-importance chart and the t-SNE plot
    """
    node_table = pd.read_csv(nodes_filename)

    print("\tRetrieve anchor nodes for classification.")
    tic = time.time()
    anchor_nodes = node_table['node'].tolist()
    print("\t\tTime elapsed {} seconds.".format(time.time() - tic))

    print("\tRead features for anchor nodes.")
    tic = time.time()
    every_node_features = pd.read_csv(features_filename)
    is_anchor = every_node_features['node'].isin(anchor_nodes)
    anchor_features = every_node_features.loc[is_anchor]
    print("\t\tTime elapsed {} seconds.".format(time.time() - tic))

    # labels and raw (unscaled) features of the selected nodes
    y = anchor_features['isp'].tolist()
    feature_frame = anchor_features.drop(columns=['node', 'isp'])
    feature_names = feature_frame.columns
    X_orig = feature_frame.values.tolist()

    print("\tTrain-Test split.")
    X_train, X_test, y_train, y_test = train_test_split(X_orig, y, rnd_seed)

    # scaling; note that the scaler is fitted on the train set ONLY
    print('\tScaling the features.')
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print('\tApplying classification.')
    tic = time.time()
    _, rf_clf, _, _ = rf_lr_classification(X_train_scaled, X_test_scaled, y_train,
                                           y_test, stats_file, flag, binary,
                                           exp_id, print_report=True)
    print("\t\tTime elapsed {} seconds.".format(time.time() - tic))

    # feature-importance ranking of the fitted RF model, saved as CSV
    ranking = RF_sorted_feature_importance(rf_clf, feature_names)
    ranking.to_csv(feat_imp_filename, index=False)

    if extra_analysis:
        print("\tInvestigate feature importance.")
        png_prefix = output_path + '/' + graph + '_' + flag
        RF_feature_imp(X_train_scaled, y_train, feature_names,
                       png_prefix + '_FE_feature_impo.png')

        # the t-SNE plot uses the unscaled features of all the selected nodes
        print("\tt-SNE graph.")
        plot_TSNE(X_orig, y, png_prefix + '_FE_tsne.png')

    print("FE node classification finished.")


def RiWalk_analysis_selected_nodes(output_path, graph, emb_filename, nodes_filename, stats_filename,
                                   flag, binary, exp_id, extra_analysis):
    """
    node classification based on the RiWalk embedding

    :param output_path: directory of the figures saved when `extra_analysis` is set
    :param graph: graph name, used in the figure file names
    :param emb_filename: embedding file
    :param nodes_filename: CSV file holding the 'isp' label of the nodes
    :param stats_filename: forwarded to rf_lr_classification
    :param flag: identifier stored in the results and used in the figure file names
    :param binary: binary (True) or multi-class (False) classification
    :param exp_id: experiment identifier, forwarded to rf_lr_classification
    :param extra_analysis: also save the feature-importance chart and the t-SNE plot
    """
    # prepare the data
    print("\tPrepare data sets.")
    X_train, X_test, y_train, y_test, feature_names = data_preproc_for_RiWalk_Binary_clf(emb_filename,
                                                                                         nodes_filename)
    # classification
    print('\tApplying classification.')
    start_time = time.time()
    rf_lr_classification(X_train, X_test, y_train, y_test, stats_filename, flag,
                         binary, exp_id, print_report=True)
    print("\tTime elapsed {} seconds.".format(time.time() - start_time))

    if extra_analysis:
        # both figures share the '<output_path>/<graph>_<flag>' prefix
        png_prefix = output_path + '/' + graph + '_' + flag

        # Feature importance
        print("\tInvestigate feature importance.")
        png_file = png_prefix + '_Ri_feature_impo.png'
        RF_feature_imp(X_train, y_train, feature_names, png_file)

        # plot t-SNE graph on train and test rows together (list concatenation)
        print("\tPlot t-SNE.")
        values = X_train + X_test
        groups = y_train + y_test
        png_file = png_prefix + '_Ri_tsne.png'
        plot_TSNE(values, groups, png_file)

    print("RiWalk node classification finished.")


def nd_clf_fe_emb_combined(emb_file, fe_file, stats_file, flag, binary, exp_id):
    """
    node classification on the feature set obtained by concatenating the engineered features
    with the (structural) embedding generated by an automatic method like node2vec

    :param emb_file: embedding file
    :param fe_file: CSV file of the engineered features
    :param stats_file: forwarded to rf_lr_classification
    :param flag: identifier stored in the results
    :param binary: binary (True) or multi-class (False) classification
    :param exp_id: experiment identifier, forwarded to rf_lr_classification
    """
    print("\tConcatenating embedding with engineered features for node classification.")
    # data preparation
    split = prepare_data_for_concat_fe_emb(emb_file, fe_file)
    X_train, X_test, y_train, y_test = split

    # classification, reports printed
    print('\tApplying classification.')
    tic = time.time()
    rf_lr_classification(X_train, X_test, y_train, y_test, stats_file, flag,
                         binary, exp_id, print_report=True)
    elapsed = time.time() - tic
    print("\tTime elapsed {} seconds.".format(elapsed))


def main():
    """
    end-to-end classification

    `clf_opt` selects the experiment:
      * 'fe': classification on the engineered features
      * 'concat': classification on the engineered features concatenated with the embedding
      * 'ri': classification on the RiWalk embedding
    :raises ValueError: `clf_opt` is none of the values above
    """
    binary = True  # binary or multi-class classification.

    flag = 'sp'
    clf_opt = 'ri'  # one of 'fe', 'concat', 'ri'
    exp_id = '1;elliptic'

    nodes_filename = "/content/drive/My Drive/Baseline/Dataset/nodeData.csv"
    edges_filename = "/content/drive/My Drive/Baseline/Dataset/edgeData.csv"
    features_filename = "/content/drive/My Drive/Baseline/Dataset/features.csv"
    feat_imp_filename = "/content/drive/My Drive/Baseline/Dataset/imp_features.csv"
    prod_data_dir = "/content/drive/My Drive/Baseline/Dataset/Prod"
    graph_filename = 'graph_filename'
    stats_file = "/content/drive/My Drive/Baseline/Dataset/stats.csv"
    emb_filename = "/content/drive/My Drive/Baseline/Dataset/embeddings.emb"

    if clf_opt == 'fe':
        # ------------------ Feature Engineering ------------------
        # read the input file and generating the features and the labels set
        print("Node Classification --- Feature Engineering ---")

        EF_analysis_selected_nodes(prod_data_dir, graph_filename, edges_filename, nodes_filename,
                                   features_filename, stats_file, feat_imp_filename, 'FE', binary,
                                   rnd_seed, exp_id, extra_analysis=False)
        print("--- Node Classification Feature Engineering is done ---")
        # ---------------------------------------------------------

    elif clf_opt == 'concat':
        print("Node classification: Concat. FE &" + flag + " embeddings.")

        nd_clf_fe_emb_combined(emb_filename, features_filename, stats_file, flag, binary, exp_id)

    elif clf_opt == 'ri':
        # ------------------ RiWalk -------------------------------
        print("Node classification: --- RiWalk - " + flag + "---")
        RiWalk_analysis_selected_nodes(prod_data_dir, graph_filename, emb_filename, nodes_filename, stats_file,
                                       flag, binary, exp_id, extra_analysis=False)
        print("--- Classification RiWalk is done ---")
        # ---------------------------------------------------------

    else:
        # an unknown option must not silently run one of the experiments
        raise ValueError("Incorrect value for classification option!")


if __name__ == '__main__':
    main()


Node classification: --- RiWalk - sp---
	Prepare data sets.
	Applying classification.
C


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



D
	*** RF - Training Set performance reports: ***
		Precision: 0.000 
		Recall: 0.000 
		F1-Score: 0.000
		Micro-Average F1-Score: 0.000
		Accuracy: 0.967


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.97      1.00      0.98     27401
           1       0.00      0.00      0.00       932

    accuracy                           0.97     28333
   macro avg       0.48      0.50      0.49     28333
weighted avg       0.94      0.97      0.95     28333

	*** RF - Test Set performance reports: ***
		Precision: 0.000 
		Recall: 0.000 
		F1-Score: 0.000
		Micro-Average F1-Score: 0.000
		Accuracy: 0.967
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      6851
           1       0.00      0.00      0.00       233

    accuracy                           0.97      7084
   macro avg       0.48      0.50      0.49      7084
weighted avg       0.94      0.97      0.95      7084

{'index': '1', 'exp_id': 'elliptic', 'emb_method': 'sp', 'classifier': 'RF', 'train_prec': 0.0, 'train_rec': 0.0, 'train_f1': 0.0, 'train_acc': 0.9671054953587689, 'train_auc': 0.6824987434279598, 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


	*** LR - Training Set performance reports: ***
		Precision: 0.000 
		Recall: 0.000 
		F1-Score: 0.000
		Micro-Average F1-Score: 0.000
		Accuracy: 0.967
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     27401
           1       0.00      0.00      0.00       932

    accuracy                           0.97     28333
   macro avg       0.48      0.50      0.49     28333
weighted avg       0.94      0.97      0.95     28333

	*** LR - Test Set performance reports: ***
		Precision: 0.000 
		Recall: 0.000 
		F1-Score: 0.000
		Micro-Average F1-Score: 0.000
		Accuracy: 0.967
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      6851
           1       0.00      0.00      0.00       233

    accuracy                           0.97      7084
   macro avg       0.48      0.50      0.49      7084
weighted avg       0.94      0.97      0.95      7084

{'index': '1', 'exp_id': 'elliptic', 'emb_me