In [None]:
# Mount Google Drive (Colab only) so that the '/content/drive/...' paths used by the
# later cells of this notebook can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import OneClassSVM, SVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score, roc_auc_score
from sklearn.manifold import TSNE
import time
import seaborn as sns

# Global experiment configuration shared by the helper functions in this notebook.
rnd_seed = 42    # passed as random_state to the splitter and the classifiers
test_size = 0.2  # fraction of the samples held out as the test set

# Seed BOTH global RNGs: the random-walk code shuffles nodes with `random` but draws
# transitions with `np.random`, so seeding only `random` leaves numpy-based sampling
# in this process unseeded.
random.seed(rnd_seed)
np.random.seed(rnd_seed)

In [None]:
def perf_report(identifier, y_true, y_pred, binary, print_enable=False):
    """
    Score a set of predictions and optionally print a report.

    :param identifier: name shown in the report header (converted with str())
    :param y_true: ground-truth labels
    :param y_pred: predicted labels
    :param binary: True  -> precision/recall/F1 are computed with average='binary';
                   False -> they are computed with average='macro'
    :param print_enable: when True, print the scores and sklearn's classification_report
    :return: tuple (precision, recall, f1, accuracy)
    """
    if binary:
        print(">>> Binary Classification.")
        average = 'binary'
    else:
        print(">>> Multi-class Classification.")
        average = 'macro'

    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average=average)
    acc = accuracy_score(y_true, y_pred)

    if print_enable:
        # This value is printed under the label "Micro-Average", so it has to be
        # micro-averaged; previously it was computed with the same averaging as
        # `f1` and therefore just repeated the line above it.
        micro_f1 = f1_score(y_true, y_pred, average='micro')
        print("\t*** {} performance reports: ***".format(str(identifier)))
        print("\t\tPrecision: %.3f \n\t\tRecall: %.3f \n\t\tF1-Score: %.3f" % (prec, rec, f1))
        print('\t\tMicro-Average F1-Score: %.3f' % micro_f1)
        print('\t\tAccuracy: %.3f' % acc)
        print(classification_report(y_true, y_pred))
    return prec, rec, f1, acc

In [None]:
def train_test_split(X, y, rnd_seed):
    """
    Stratified single train/test split of features and labels.

    The held-out fraction is taken from the module-level ``test_size``.

    :param X: feature set, should be array or list (indexed by position)
    :param y: labels, should be array or list (indexed by position)
    :param rnd_seed: random seed handed to the splitter
    :return: X_train, X_test, y_train, y_test (each a plain Python list)
    """
    def _pick(seq, positions):
        # gather the elements of `seq` located at `positions`, in that order
        return [seq[pos] for pos in positions]

    # the splitter only needs one placeholder entry per sample plus the labels
    positions = list(range(len(y)))
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=rnd_seed)
    train_indices, test_indices = next(splitter.split(positions, y))

    X_train = _pick(X, train_indices)
    X_test = _pick(X, test_indices)
    y_train = _pick(y, train_indices)
    y_test = _pick(y, test_indices)
    return X_train, X_test, y_train, y_test

def simple_classification(clf, clf_id, emb_flag, X_train, X_test, y_train, y_test,
                          binary, exp_id, print_enable=False):
    """
    Fit ``clf`` on the train set and evaluate it on both the train and the test set.

    No cross-validation is applied. For binary problems the positive label is
    implicitly assumed to be 1 (column 1 of ``predict_proba`` is used for ROC-AUC).

    :param clf: unfitted estimator exposing fit / predict (and predict_proba when binary)
    :param clf_id: short classifier name stored in the result and shown in reports
    :param emb_flag: name of the embedding / feature method, stored in the result
    :param X_train: train features
    :param X_test: test features
    :param y_train: train labels
    :param y_test: test labels
    :param binary: True for binary classification, False for multi-class
    :param exp_id: "<index>;<name>" or just "<name>" (index then defaults to 0)
    :param print_enable: forwarded to perf_report
    :return: (perf_dict, fitted clf); the *_auc entries are NaN when ``binary`` is False
    :raises ValueError: if ``exp_id`` contains more than one ';'
    """
    # Parse the experiment id first so that a malformed id fails before the fit.
    exp_parts = exp_id.split(";")
    if len(exp_parts) == 2:
        index, exp_name = exp_parts
    elif len(exp_parts) == 1:
        index, exp_name = 0, exp_parts[0]
    else:
        raise ValueError("Incorrect Experiment ID!")

    # train the model and predict both splits
    clf.fit(X_train, y_train)
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)

    # evaluate the performance on the training and the test set
    tr_prec, tr_rec, tr_f1, tr_acc = perf_report(str(clf_id) + ' - Training Set',
                                                 y_train, y_train_pred, binary, print_enable)
    ts_prec, ts_rec, ts_f1, ts_acc = perf_report(str(clf_id) + ' - Test Set',
                                                 y_test, y_test_pred, binary, print_enable)

    # ROC-AUC is only computed for the binary case. The multi-class case used to
    # leave both variables undefined, which raised NameError when building the dict.
    if binary:
        tr_roc_auc = roc_auc_score(y_train, clf.predict_proba(X_train)[:, 1])
        ts_roc_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    else:
        tr_roc_auc = ts_roc_auc = float('nan')

    perf_dict = {
        'index': index,
        'exp_id': exp_name,
        'emb_method': str(emb_flag),
        'classifier': str(clf_id),

        'train_prec': tr_prec,
        'train_rec': tr_rec,
        'train_f1': tr_f1,
        'train_acc': tr_acc,
        'train_auc': tr_roc_auc,

        'test_prec': ts_prec,
        'test_rec': ts_rec,
        'test_f1': ts_f1,
        'test_acc': ts_acc,
        'test_auc': ts_roc_auc
    }

    print(perf_dict)

    return perf_dict, clf

In [None]:
def rf_lr_classification(X_train, X_test, y_train, y_test, stats_file, flag,
                         binary, exp_id, print_report=False):
    """
    Apply classification with "Random Forest" & "Logistic Regression".

    :param X_train: train set
    :param X_test: test set
    :param y_train: train set labels
    :param y_test: test set labels
    :param stats_file: path of the statistics CSV; accepted for interface
                       compatibility but currently not read or written
    :param flag: name of the embedding / feature method, stored in the results
    :param binary: True for binary classification, False for multi-class;
                   forwarded unchanged to BOTH classifiers
    :param exp_id: experiment id, see simple_classification
    :param print_report: whether print the results of classification or not
    :return: rf_perf, rf_clf, lr_perf, lr_clf
    """
    # define classifiers
    rf_clf = RandomForestClassifier(n_estimators=50, max_features=10, max_depth=5, random_state=rnd_seed)
    # max_iter must be an integer (1e5 is a float, which recent scikit-learn rejects)
    lr_clf = LogisticRegression(penalty='l1', solver='liblinear', max_iter=100000, random_state=rnd_seed)

    # apply classification; `binary` is no longer forced to True for LR, so both
    # models are evaluated under the setting the caller asked for
    rf_perf, rf_clf = simple_classification(rf_clf, 'RF', flag, X_train, X_test, y_train, y_test,
                                            binary, exp_id, print_report)
    lr_perf, lr_clf = simple_classification(lr_clf, 'LR', flag, X_train, X_test, y_train, y_test,
                                            binary, exp_id, print_report)

    return rf_perf, rf_clf, lr_perf, lr_clf

In [None]:
def RF_sorted_feature_importance(clf, feature_name):
    """
    Rank the features of a fitted model by importance.

    :param clf: fitted model exposing ``feature_importances_`` (e.g. a trained RF)
    :param feature_name: feature names, positionally aligned with the importances
    :return: DataFrame with columns 'feature' and 'importance', one row per entry
             of ``feature_name``, ordered from most to least important
    """
    scores = clf.feature_importances_
    # positions of the features from the highest to the lowest importance
    ranking = np.argsort(scores)[::-1]

    rows = []
    for rank in range(len(feature_name)):
        position = ranking[rank]
        rows.append((feature_name[position], scores[position]))

    return pd.DataFrame(rows, columns=['feature', 'importance'])

In [None]:
def RF_feature_imp(X, y, feature_name, png_file):
    """
    Fit a Random Forest, print its feature ranking and save an importance bar chart.

    :param X: features
    :param y: labels
    :param feature_name: the name of the features (positionally aligned with X's columns)
    :param png_file: path the bar chart is saved to
    """
    # define and fit the forest
    forest = RandomForestClassifier(n_estimators=100, max_features=16, max_depth=5,
                                    random_state=rnd_seed)
    forest.fit(X, y)

    # mean importance per feature plus its spread across the individual trees
    importances = forest.feature_importances_
    per_tree = [tree.feature_importances_ for tree in forest.estimators_]
    std = np.std(per_tree, axis=0)
    indices = np.argsort(importances)[::-1]

    n_names = len(feature_name)

    # textual ranking, most important feature first
    print("Feature ranking:")
    for rank in range(1, n_names + 1):
        col = indices[rank - 1]
        print("%d. feature %d (%s) (%f)" % (rank, col, feature_name[col], importances[col]))

    # bar chart of the impurity-based importances; bars are labelled by feature index
    bar_positions = range(n_names)
    plt.figure()
    plt.title("Feature Importance")
    plt.bar(bar_positions, importances[indices], color="g", yerr=std[indices], align="center")
    plt.xticks(bar_positions, indices)
    plt.xlim([-1, n_names])
    plt.savefig(png_file)

In [None]:
def read_emb_and_node_list(emb_file, node_file):
    """
    Load an embedding file and attach the 'isp' label of every node.

    :param emb_file: space-separated embedding file whose first line is skipped;
                     the first column is the node id, the remaining ones the vector
    :param node_file: CSV containing at least the columns 'node' and 'isp'
    :return: DataFrame with columns 'node', 'emb_0' ... 'emb_{d-1}', 'isp'
             (left join on 'node', so every embedding row is kept)
    """
    # embeddings: first column is the node id, the rest are the vector components
    embeddings = pd.read_csv(emb_file, sep=' ', skiprows=1, header=None)
    n_dims = embeddings.shape[1] - 1
    embeddings.columns = ['node'] + ['emb_' + str(dim) for dim in range(n_dims)]

    # labels: only the node id and its 'isp' value are needed
    labels = pd.read_csv(node_file)[['node', 'isp']]

    return embeddings.merge(labels, on='node', how='left')

In [None]:
def data_preproc_for_RiWalk_Binary_clf(emb_file, node_file):
    """
    Turn a RiWalk embedding file into train/test sets for node classification.

    :param emb_file: embedding file, see read_emb_and_node_list
    :param node_file: node list CSV with the 'isp' label
    :return: X_train, X_test, y_train, y_test, feature_names
    """
    # embeddings joined with the labels; every merged row is used
    merged = read_emb_and_node_list(emb_file, node_file)

    labels = merged['isp'].tolist()
    feature_frame = merged.drop(columns=['node', 'isp'])
    feature_names = feature_frame.columns

    # stratified train/test split on plain Python lists
    X_train, X_test, y_train, y_test = train_test_split(feature_frame.values.tolist(),
                                                        labels, rnd_seed)

    return X_train, X_test, y_train, y_test, feature_names

In [None]:
def prepare_data_for_concat_fe_emb(emb_file, fe_file):
    """
    Build train/test sets from the embeddings concatenated with the engineered features.

    The engineered features are standardised with a scaler that is fitted on the
    TRAIN rows only and then applied to the test rows. Fitting it on the full
    table before the split (as was done before) leaks test statistics into training.
    The embedding columns are left unscaled.

    :param emb_file: space-separated embedding file whose first line is skipped;
                     first column is the node id
    :param fe_file: CSV with 'node', 'isp' and the engineered feature columns
    :return: X_train, X_test, y_train, y_test (lists)
    """
    # read embedding
    emb_df = pd.read_csv(emb_file, sep=' ', skiprows=1, header=None)
    n_emb = emb_df.shape[1] - 1
    emb_df.columns = ['node'] + [f'emb_{i}' for i in range(n_emb)]

    # read engineered features and attach them to every embedded node
    node_df = pd.read_csv(fe_file)
    merged_df = emb_df.merge(node_df, on='node', how='left')

    y = merged_df['isp'].tolist()
    X_df = merged_df.drop(['node', 'isp'], axis=1)

    # split the train and test set
    X_train, X_test, y_train, y_test = train_test_split(X_df.values.tolist(), y, rnd_seed)

    # After the merge the embedding columns come first, so everything from column
    # `n_emb` onwards is an engineered feature. Scale those columns only.
    if X_df.shape[1] > n_emb:
        X_train_arr = np.asarray(X_train, dtype=float)
        X_test_arr = np.asarray(X_test, dtype=float)
        scaler = StandardScaler()
        X_train_arr[:, n_emb:] = scaler.fit_transform(X_train_arr[:, n_emb:])
        X_test_arr[:, n_emb:] = scaler.transform(X_test_arr[:, n_emb:])
        X_train, X_test = X_train_arr.tolist(), X_test_arr.tolist()

    return X_train, X_test, y_train, y_test

In [None]:
def plot_TSNE(values, labels, png_file):
    """
    Project the given vectors to 2-D with t-SNE and save a scatter plot coloured by label.

    :param values: vectors to project, one row per sample
    :param labels: one label per row of ``values``; used as the hue of the plot
    :param png_file: path the figure is saved to
    """
    print('\tt-SNE starts.')
    time_start = time.time()
    # random_state makes the projection repeatable between runs; without it every
    # call produced a different figure for the same data
    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=rnd_seed)
    tsne_results = tsne.fit_transform(values)
    print('\tt-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))

    # plotting
    p_data = {'tsne-2d-first': tsne_results[:, 0],
              'tsne-2d-second': tsne_results[:, 1],
              'label': labels,
              }

    plt.figure(figsize=(16, 10))
    sns.scatterplot(
        x="tsne-2d-first", y="tsne-2d-second",
        hue="label",
        # one colour per distinct label
        palette=sns.color_palette("hls", len(set(labels))),
        data=p_data,
        legend="full",
        alpha=0.3
    )
    plt.savefig(png_file)

In [None]:
def EF_analysis_selected_nodes(output_path, graph, edges_filename, nodes_filename,
                               features_filename, stats_file, feat_imp_filename,
                               flag, binary, rnd_seed, exp_id, extra_analysis):
    """
    Node classification on the engineered features (FE) of the selected nodes.

    :param output_path: directory the optional figures are written to
    :param graph: graph name, used as prefix of the figure file names
    :param edges_filename: not used by this function
    :param nodes_filename: CSV whose 'node' column lists the nodes to classify
    :param features_filename: CSV with 'node', 'isp' and the engineered features
    :param stats_file: forwarded to rf_lr_classification
    :param feat_imp_filename: CSV path the RF feature ranking is written to
    :param flag: method name, stored in the results and used in the figure file names
    :param binary: True for binary classification
    :param rnd_seed: random seed of the train/test split
    :param exp_id: experiment id, see simple_classification
    :param extra_analysis: when True, also save the importance chart and the t-SNE plot
    """
    elapsed_msg = "\t\tTime elapsed {} seconds."

    anchor_df = pd.read_csv(nodes_filename)

    print("\tRetrieve anchor nodes for classification.")
    tic = time.time()
    anchor_ids = anchor_df['node'].tolist()
    print(elapsed_msg.format(time.time() - tic))

    print("\tRead features for anchor nodes.")
    tic = time.time()
    every_node_df = pd.read_csv(features_filename)
    # keep only the rows that belong to a selected node
    selected_df = every_node_df.loc[every_node_df['node'].isin(anchor_ids)]
    print(elapsed_msg.format(time.time() - tic))

    # labels and raw (unscaled) feature matrix of the selected nodes
    y = selected_df['isp'].tolist()
    raw_frame = selected_df.drop(['node', 'isp'], axis=1)
    feature_names = raw_frame.columns
    X_orig = raw_frame.values.tolist()

    print("\tTrain-Test split.")
    X_train, X_test, y_train, y_test = train_test_split(X_orig, y, rnd_seed)

    # the scaler is fitted on the train set ONLY and then applied to both splits
    print('\tScaling the features.')
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print('\tApplying classification.')
    tic = time.time()
    rf_perf, rf_clf, lr_perf, lr_clf = rf_lr_classification(X_train_scaled, X_test_scaled, y_train,
                                                            y_test, stats_file, flag, binary,
                                                            exp_id, print_report=True)
    print(elapsed_msg.format(time.time() - tic))

    # rank the features with the fitted RF and persist the ranking
    RF_sorted_feature_importance(rf_clf, feature_names).to_csv(feat_imp_filename, index=False)

    if extra_analysis:
        figure_prefix = output_path + '/' + graph + '_' + flag

        print("\tInvestigate feature importance.")
        RF_feature_imp(X_train_scaled, y_train, feature_names, figure_prefix + '_FE_feature_impo.png')

        # t-SNE is drawn from the unscaled features of all selected nodes
        print("\tt-SNE graph.")
        plot_TSNE(X_orig, y, figure_prefix + '_FE_tsne.png')

    print("FE node classification finished.")

In [None]:
def RiWalk_analysis_selected_nodes(output_path, graph, emb_filename, nodes_filename, stats_filename,
                                   flag, binary, exp_id, extra_analysis):
    """
    Node classification on RiWalk embeddings.

    :param output_path: directory the optional figures are written to
    :param graph: graph name, used as prefix of the figure file names
    :param emb_filename: embedding file, see read_emb_and_node_list
    :param nodes_filename: node list CSV with the 'isp' label
    :param stats_filename: forwarded to rf_lr_classification
    :param flag: method name, stored in the results and used in the figure file names
    :param binary: True for binary classification
    :param exp_id: experiment id, see simple_classification
    :param extra_analysis: when True, also save the importance chart and the t-SNE plot
    """
    # prepare the data
    print("\tPrepare data sets.")
    X_train, X_test, y_train, y_test, feature_names = data_preproc_for_RiWalk_Binary_clf(emb_filename,
                                                                                         nodes_filename)
    # classification
    print('\tApplying classification.')
    start_time = time.time()
    rf_lr_classification(X_train, X_test, y_train, y_test, stats_filename, flag,
                         binary, exp_id, print_report=True)
    print("\tTime elapsed {} seconds.".format(time.time() - start_time))

    if extra_analysis:
        # Both figures share the "<output_path>/<graph>_<flag>" prefix. The t-SNE name
        # used to omit the '_' between graph and flag, unlike every other figure name.
        figure_prefix = output_path + '/' + graph + '_' + flag

        # Feature importance
        print("\tInvestigate feature importance.")
        RF_feature_imp(X_train, y_train, feature_names, figure_prefix + '_Ri_feature_impo.png')

        # plot t-SNE graph over all samples (train followed by test)
        print("\tPlot t-SNE.")
        values = X_train + X_test
        groups = y_train + y_test
        plot_TSNE(values, groups, figure_prefix + '_Ri_tsne.png')

    print("RiWalk node classification finished.")

In [None]:
def nd_clf_fe_emb_combined(emb_file, fe_file, stats_file, flag, binary, exp_id):
    """
    Node classification on the engineered features concatenated with an embedding.

    :param emb_file: embedding file produced by an automatic method (e.g. node2vec)
    :param fe_file: CSV with the engineered features and the 'isp' label
    :param stats_file: forwarded to rf_lr_classification
    :param flag: method name, stored in the results
    :param binary: True for binary classification
    :param exp_id: experiment id, see simple_classification
    """
    print("\tConcatenating embedding with engineered features for node classification.")
    splits = prepare_data_for_concat_fe_emb(emb_file, fe_file)
    X_train, X_test, y_train, y_test = splits

    print('\tApplying classification.')
    began = time.time()
    rf_lr_classification(X_train, X_test, y_train, y_test, stats_file, flag,
                         binary, exp_id, print_report=True)
    elapsed = time.time() - began
    print("\tTime elapsed {} seconds.".format(elapsed))

In [None]:
"""
Node2Vec graph embedding method
The source code is from the repository of the authors of the paper
"""

# Import Statements

import numpy as np
import networkx as nx
import gensim
from joblib import Parallel, delayed
from collections import defaultdict
import os
import random
import pandas as pd
import time
import csv


# -------------------------------------------------------------------------------
# Class Definition


class Node2Vec:
    """
    Random-walk based node embedding (node2vec skeleton with amount-weighted walks).

    On construction the transition probabilities are derived from the edge CSV
    (see ``_new_compute_prob``) and the walks are generated immediately; ``fit``
    then trains a gensim Word2Vec model on those walks.
    """
    # keys of the per-node dictionaries stored in ``self.d_graph``
    FIRST_TRAVEL_KEY = 'first_travel_key'
    PROBABILITIES_KEY = 'probabilities'
    NEIGHBORS_KEY = 'neighbors'
    WEIGHT_KEY = 'weight'
    # keys recognised inside ``sampling_strategy[node]``
    NUM_WALKS_KEY = 'num_walks'
    WALK_LENGTH_KEY = 'walk_length'
    P_KEY = 'p'
    Q_KEY = 'q'

    def __init__(self, node_list_file_name, edge_filename, graph: "nx.Graph", dimensions: int = 128,
                 walk_length: int = 80,
                 num_walks: int = 10,
                 p: float = 1,
                 q: float = 1, weight_key: str = 'weight', workers: int = 10,
                 sampling_strategy: dict = None,
                 quiet: bool = True, temp_folder: str = None):
        """
        Initiates the Node2Vec object, precomputes walking probabilities and generates the walks.

        :param node_list_file_name: path of the node list CSV; forwarded to ``_new_compute_prob``
        :param edge_filename: path of the edge CSV with columns 'source', 'target', 'amount'
        :param graph: Input graph; its nodes are the start points of the walks
        :param dimensions: Embedding dimensions (default: 128)
        :param walk_length: Number of nodes in each walk (default: 80)
        :param num_walks: Number of walks per node (default: 10)
        :param p: Return hyper parameter (default: 1); only used by ``_precompute_probabilities``
        :param q: Inout parameter (default: 1); only used by ``_precompute_probabilities``
        :param weight_key: On weighted graphs, this is the key for the weight attribute (default: 'weight')
        :param workers: Number of workers for parallel execution (default: 10)
        :param sampling_strategy: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'.
        Use these keys exactly. If not set, will use the global ones which were passed on the object initialization
        :param quiet: stored and forwarded to the walk generator
        :param temp_folder: Path to folder with enough space to hold the memory map of self.d_graph (for big graphs); to be passed joblib.Parallel.temp_folder
        :raises NotADirectoryError: if ``temp_folder`` is given but is not an existing directory
        """

        self.graph = graph
        self.dimensions = dimensions
        self.walk_length = walk_length
        self.num_walks = num_walks
        self.p = p
        self.q = q
        self.weight_key = weight_key
        self.workers = workers
        self.quiet = quiet
        self.d_graph = defaultdict(dict)

        if sampling_strategy is None:
            self.sampling_strategy = {}
        else:
            self.sampling_strategy = sampling_strategy

        self.temp_folder, self.require = None, None
        if temp_folder:
            if not os.path.isdir(temp_folder):
                raise NotADirectoryError("temp_folder does not exist or is not a directory. ({})".format(temp_folder))

            self.temp_folder = temp_folder
            self.require = "sharedmem"

        # Amount-weighted transition probabilities (trans2vec variant). To run plain
        # node2vec instead, call self._precompute_probabilities() here.
        self._new_compute_prob(edge_filename, node_list_file_name)
        self.walks = self._generate_walks()

    def _precompute_probabilities(self):
        """
        Precomputes second-order (p/q biased) transition probabilities for each node.

        Not called by ``__init__``; kept as the original node2vec alternative to
        ``_new_compute_prob``.
        """

        d_graph = self.d_graph

        for source in self.graph.nodes():

            # Init probabilities dict for first travel
            if self.PROBABILITIES_KEY not in d_graph[source]:
                d_graph[source][self.PROBABILITIES_KEY] = dict()

            for current_node in self.graph.neighbors(source):

                # Init probabilities dict
                if self.PROBABILITIES_KEY not in d_graph[current_node]:
                    d_graph[current_node][self.PROBABILITIES_KEY] = dict()

                unnormalized_weights = list()
                d_neighbors = list()

                # node-specific p / q override the global ones
                node_strategy = self.sampling_strategy.get(current_node, {}) \
                    if current_node in self.sampling_strategy else {}
                p = node_strategy.get(self.P_KEY, self.p)
                q = node_strategy.get(self.Q_KEY, self.q)

                # Calculate unnormalized weights
                for destination in self.graph.neighbors(current_node):
                    edge_weight = self.graph[current_node][destination].get(self.weight_key, 1)

                    if destination == source:  # Backwards probability
                        ss_weight = edge_weight * 1 / p
                    elif destination in self.graph[source]:  # If the neighbor is connected to the source
                        ss_weight = edge_weight
                    else:
                        ss_weight = edge_weight * 1 / q

                    # Assign the unnormalized sampling strategy weight, normalize during random walk
                    unnormalized_weights.append(ss_weight)
                    d_neighbors.append(destination)

                # Normalize
                unnormalized_weights = np.array(unnormalized_weights)
                d_graph[current_node][self.PROBABILITIES_KEY][
                    source] = unnormalized_weights / unnormalized_weights.sum()

                # Save neighbors
                d_graph[current_node][self.NEIGHBORS_KEY] = d_neighbors

            # Calculate first_travel weights for source
            first_travel_weights = []

            for destination in self.graph.neighbors(source):
                first_travel_weights.append(self.graph[source][destination].get(self.weight_key, 1))

            first_travel_weights = np.array(first_travel_weights)
            d_graph[source][self.FIRST_TRAVEL_KEY] = first_travel_weights / first_travel_weights.sum()

    def _new_compute_prob(self, edge_filename, node_list_file_name):
        """
        Fill ``self.d_graph`` with amount-weighted, first-order transition probabilities.

        The probability of stepping from a node to one of its targets is the amount
        sent to that target divided by the total amount the node sent (summed over
        all rows of the edge CSV).

        The walk generator reads three entries per node, all of which are set here:
        NEIGHBORS_KEY (candidate next nodes), FIRST_TRAVEL_KEY (probabilities of the
        first step) and PROBABILITIES_KEY (``{previous_node: probabilities}`` for the
        later steps). Previously NEIGHBORS_KEY was never set, so the generator treated
        every node as a dead end and every walk consisted of its start node only.

        :param edge_filename: CSV with the columns 'source', 'target' and 'amount'
        :param node_list_file_name: kept for interface compatibility; no longer read,
                                    because node labels are not needed to build the walks
        """
        edges = pd.read_csv(edge_filename)

        # total amount per directed (source, target) pair; sort=False keeps file order
        pair_amounts = edges.groupby(['source', 'target'], sort=False)['amount'].sum()

        out_edges = defaultdict(list)     # source -> [(target, amount), ...]
        predecessors = defaultdict(list)  # target -> [source, ...]
        for (src, dst), amount in pair_amounts.items():
            out_edges[src].append((dst, amount))
            predecessors[dst].append(src)

        d_graph = self.d_graph
        for node in self.graph.nodes():
            targets = [dst for dst, _ in out_edges.get(node, [])]
            weights = np.array([amount for _, amount in out_edges.get(node, [])], dtype=float)
            total = weights.sum()

            if not targets or not total > 0:
                # No outgoing amount: mark the node as a dead end for the walker
                # (also avoids dividing by a zero total).
                d_graph[node][self.NEIGHBORS_KEY] = []
                d_graph[node][self.FIRST_TRAVEL_KEY] = np.array([])
                d_graph[node][self.PROBABILITIES_KEY] = {}
                continue

            probabilities = weights / total
            d_graph[node][self.NEIGHBORS_KEY] = targets
            d_graph[node][self.FIRST_TRAVEL_KEY] = probabilities
            # First-order walk: the distribution does not depend on the previous node,
            # so every node the walker can arrive from maps to the same array.
            d_graph[node][self.PROBABILITIES_KEY] = {prev: probabilities
                                                     for prev in predecessors.get(node, [])}

    def _generate_walks(self) -> list:
        """
        Generates the random walks which will be used as the skip-gram input.
        :return: List of walks. Each walk is a list of nodes.
        """
        # Split num_walks for each worker
        num_walks_lists = np.array_split(range(self.num_walks), self.workers)

        walk_results = Parallel(n_jobs=self.workers, temp_folder=self.temp_folder, require=self.require)(
            delayed(parallel_generate_walks)(self.d_graph,
                                             self.walk_length,
                                             len(num_walks),
                                             idx,
                                             self.sampling_strategy,
                                             self.NUM_WALKS_KEY,
                                             self.WALK_LENGTH_KEY,
                                             self.NEIGHBORS_KEY,
                                             self.PROBABILITIES_KEY,
                                             self.FIRST_TRAVEL_KEY,
                                             self.quiet) for
            idx, num_walks
            in enumerate(num_walks_lists, 1))

        # flatten the per-worker lists into one list of walks
        return [walk for worker_walks in walk_results for walk in worker_walks]

    def fit(self, **skip_gram_params) -> "gensim.models.Word2Vec":
        """
        Creates the embeddings using gensim's Word2Vec.

        The gensim 3 keyword names ('size', 'iter') are accepted on every gensim
        version; on gensim >= 4 they are translated to 'vector_size' / 'epochs'.

        :param skip_gram_params: Parameters for gensim.models.Word2Vec; the vector size
                                 defaults to the Node2Vec 'dimensions' parameter and
                                 'workers' to the Node2Vec 'workers' parameter
        :type skip_gram_params: dict
        :return: A gensim word2vec model
        """
        params = dict(skip_gram_params)
        params.setdefault('workers', self.workers)

        gensim_major = int(str(gensim.__version__).split('.')[0])
        if gensim_major >= 4:
            for old_name, new_name in (('size', 'vector_size'), ('iter', 'epochs')):
                if old_name in params:
                    old_value = params.pop(old_name)
                    params.setdefault(new_name, old_value)
            params.setdefault('vector_size', self.dimensions)
        else:
            params.setdefault('size', self.dimensions)

        return gensim.models.Word2Vec(self.walks, **params)


def parallel_generate_walks(d_graph: dict, global_walk_length: int, num_walks: int, cpu_num: int,
                            sampling_strategy: dict = None, num_walks_key: str = None,
                            walk_length_key: str = None, neighbors_key: str = None,
                            probabilities_key: str = None, first_travel_key: str = None,
                            quiet: bool = False) -> list:
    """
    Generates the random walks which will be used as the skip-gram input.

    :param d_graph: per-node dictionaries holding the neighbours (``neighbors_key``),
                    the first-step probabilities (``first_travel_key``) and the
                    ``{previous_node: probabilities}`` mapping (``probabilities_key``)
    :param global_walk_length: walk length used when a node has no specific one
    :param num_walks: number of passes over all nodes performed by this call
    :param cpu_num: index of the worker; not used inside the function
    :param sampling_strategy: optional node specific settings; None means "no overrides"
    :param num_walks_key: key of the node specific number of walks
    :param walk_length_key: key of the node specific walk length
    :param neighbors_key: key of the neighbour list inside ``d_graph[node]``
    :param probabilities_key: key of the transition probabilities inside ``d_graph[node]``
    :param first_travel_key: key of the first-step probabilities inside ``d_graph[node]``
    :param quiet: not used inside the function
    :return: List of walks. Each walk is a list of nodes (as strings).
    """
    # The parameter defaults to None, but the code below does membership tests on it.
    if sampling_strategy is None:
        sampling_strategy = {}

    walks = list()

    for n_walk in range(num_walks):

        # Shuffle the nodes
        shuffled_nodes = list(d_graph.keys())
        random.shuffle(shuffled_nodes)

        # Start a random walk from every node
        for source in shuffled_nodes:
            node_strategy = sampling_strategy[source] if source in sampling_strategy else None

            # Skip nodes whose node specific num_walks is already exhausted
            if node_strategy is not None and \
                    num_walks_key in node_strategy and \
                    node_strategy[num_walks_key] <= n_walk:
                continue

            # Start walk
            walk = [source]

            # Calculate walk length
            if node_strategy is not None:
                walk_length = node_strategy.get(walk_length_key, global_walk_length)
            else:
                walk_length = global_walk_length

            # Perform walk
            while len(walk) < walk_length:

                walk_options = d_graph[walk[-1]].get(neighbors_key, None)

                # Skip dead end nodes
                if not walk_options:
                    break

                if len(walk) == 1:  # For the first step
                    probabilities = d_graph[walk[-1]][first_travel_key]
                else:  # later steps depend on the node the walker came from
                    probabilities = d_graph[walk[-1]][probabilities_key][walk[-2]]

                walk_to = np.random.choice(walk_options, size=1, p=probabilities)[0]
                walk.append(walk_to)

            walk = list(map(str, walk))  # Convert all to strings

            walks.append(walk)

    return walks


def main():
    """
    Run the Node2Vec pipeline end to end: build the graph from the edge list,
    generate the walks, fit the embeddings, save them and evaluate them with the
    RF / LR classifiers on the 'isp' label.
    """
    print("Node2Vec main method.")
    start_time = time.time()

    # walk / skip-gram hyper-parameters
    iter_num = 5
    num_walks = 20
    dim = 64
    walk_length = 5
    workers = 10
    window_size = 10
    p = 1
    q = 1
    exp_id = '1;elliptic'

    edge_list_file_name = "/content/drive/My Drive/Baseline/Dataset/edgelist.edgelist"
    node_list_file_name = "/content/drive/My Drive/Baseline/Dataset/nodeData.csv"
    edge_filename = "/content/drive/My Drive/Baseline/Dataset/edgeData.csv"
    stats_file = "/content/drive/My Drive/Baseline/Dataset/stats.csv"
    embeddings_filename = "/content/drive/My Drive/Baseline/Dataset/embeddings.emb"

    nx_g = nx.from_pandas_edgelist(pd.read_csv(edge_list_file_name), source='source', target='target',
                                   create_using=nx.DiGraph())
    # nx.info() was removed in NetworkX 3.0; build the same one-line summary by hand
    # so this works on both old and new NetworkX versions.
    print("Graph info:", "{} with {} nodes and {} edges".format(type(nx_g).__name__,
                                                                 nx_g.number_of_nodes(),
                                                                 nx_g.number_of_edges()))

    print("\tInstantiate a node2vec object.")
    node2vec = Node2Vec(node_list_file_name, edge_filename, nx_g, dimensions=dim, walk_length=walk_length,
                        num_walks=num_walks, workers=workers, p=p, q=q)
    print("\tFit node2vec.")
    model = node2vec.fit(window=window_size, sg=1, hs=0, min_count=1, iter=iter_num)

    # read node list
    print("\tExtract embeddings and labels for the anchor nodes.")
    nodes_df = pd.read_csv(node_list_file_name)

    # every node of the node list is used; walks store node ids as strings, so the
    # ids are converted to str before looking up their vectors
    anchor_nodes_df = nodes_df
    node_list = [str(node_id) for node_id in anchor_nodes_df['node'].tolist()]
    embeddings = [model.wv.get_vector(node) for node in node_list]
    model.wv.save_word2vec_format(embeddings_filename)
    labels = anchor_nodes_df['isp'].tolist()

    # classification
    print("\tApply classification.")
    rnd_seed = 42
    binary = True
    X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, rnd_seed)
    rf_lr_classification(X_train, X_test, y_train, y_test, stats_file, 'n2v',
                                            binary, exp_id, print_report=True)
    print("Total elapsed time:", str(time.time() - start_time))


# In a notebook cell __name__ is '__main__', so the pipeline runs when the cell is executed.
if __name__ == '__main__':
    main()

Node2Vec main method.
Graph info: DiGraph with 35417 nodes and 45377 edges
	Instantiate a node2vec object.




done
	Fit node2vec.
	Extract embeddings and labels for the anchor nodes.
Tanay
<class 'list'>
Tanay
	Apply classification.
done 1
done 2
done 3
>>> Binary Classification.
	*** RF - Training Set performance reports: ***
		Precision: 0.000 
		Recall: 0.000 
		F1-Score: 0.000
		Micro-Average F1-Score: 0.000
		Accuracy: 0.967


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.97      1.00      0.98     27401
           1       0.00      0.00      0.00       932

    accuracy                           0.97     28333
   macro avg       0.48      0.50      0.49     28333
weighted avg       0.94      0.97      0.95     28333

>>> Binary Classification.
	*** RF - Test Set performance reports: ***
		Precision: 0.000 
		Recall: 0.000 
		F1-Score: 0.000
		Micro-Average F1-Score: 0.000
		Accuracy: 0.967
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      6851
           1       0.00      0.00      0.00       233

    accuracy                           0.97      7084
   macro avg       0.48      0.50      0.49      7084
weighted avg       0.94      0.97      0.95      7084

done all
{'index': '1', 'exp_id': 'elliptic', 'emb_method': 'n2v', 'classifier': 'RF', 'train_prec': 0.0, 'train_rec': 0.0, 'train_f1': 0.0, 'train_acc': 0.96710549535876

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'index': '1', 'exp_id': 'elliptic', 'emb_method': 'n2v', 'classifier': 'LR', 'train_prec': 0.0, 'train_rec': 0.0, 'train_f1': 0.0, 'train_acc': 0.9671054953587689, 'train_auc': 0.5, 'test_prec': 0.0, 'test_rec': 0.0, 'test_f1': 0.0, 'test_acc': 0.9671089779785432, 'test_auc': 0.5}
Total elapsed time: 574.583459854126


In [None]:
# Driver cell: run the RiWalk embedding classification for experiment '1;elliptic'.
# NOTE: `clf_opt` and `edges_filename` are defined here but are not passed to the call below.
flag = 'sp'
binary = True
clf_opt = 'fe'
exp_id = '1;elliptic'
emb_filename = '/content/drive/My Drive/Embeddings/embeddings1.emb'
nodes_filename = "/content/drive/My Drive/Initial CSV for FeatureEngineering/nodeData1.csv"
edges_filename = '/content/drive/My Drive/Riwalk_T2/edgelist1.edgelist'
prod_data_dir = "/content/drive/My Drive/SIGTRAN/1/"
graph_filename = 'graph_filename'
stats_file = "/content/drive/My Drive/SIGTRAN/stats.csv"

# extra_analysis=False: only the classification is run, no figures are written
RiWalk_analysis_selected_nodes(prod_data_dir, graph_filename, emb_filename, nodes_filename, stats_file,flag, binary, exp_id, extra_analysis=False)

	Prepare data sets.
	Applying classification.
done 1
done 2
done 3
	*** SVC - Training Set performance reports: ***
		Precision: 0.905 
		Recall: 0.195 
		F1-Score: 0.321
		Micro-Average F1-Score: 0.321
		Accuracy: 0.974
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     28198
           1       0.91      0.20      0.32       932

    accuracy                           0.97     29130
   macro avg       0.94      0.60      0.65     29130
weighted avg       0.97      0.97      0.97     29130

	*** SVC - Test Set performance reports: ***
		Precision: 0.622 
		Recall: 0.099 
		F1-Score: 0.170
		Micro-Average F1-Score: 0.170
		Accuracy: 0.969
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      7050
           1       0.62      0.10      0.17       233

    accuracy                           0.97      7283
   macro avg       0.80      0.55      0.58      7283
weighted avg       0.96      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
