In [None]:
# Colab-only step: mount Google Drive at /content/drive, the location that the
# data paths used in this notebook point into.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import random
import networkx as nx
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import OneClassSVM, SVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score, roc_auc_score
from sklearn.manifold import TSNE
import time
import seaborn as sns
from sklearn.decomposition import PCA
from gensim.models import Word2Vec
import warnings
# NOTE(review): this silences every warning for the rest of the notebook, so
# warnings raised by the libraries used below are never shown -- consider
# narrowing the filter.
warnings.filterwarnings('ignore')

# Seed for Python's `random` module (used by the random walks) and for the
# `random_state` of the scikit-learn estimators created later on.
rnd_seed = 42
random.seed(rnd_seed)
# Fraction of the samples held out as the test set by train_test_split.
test_size = 0.2

Defining Paths

In [None]:
# Folder on the mounted Google Drive that holds every input and output file.
base_dir = '/content/drive/My Drive/BaselineToShow'

# The previous literal for main_data had typographic (curly) quotes embedded
# inside the string, so the path could never resolve to the real file.
main_data = base_dir + '/D1.csv'
edge_list_file_name = base_dir + '/edgelist.edgelist'
node_list_file_name = base_dir + '/nodeD1.csv'
edge_filename = base_dir + '/edgeD1.csv'
stats_file = base_dir + '/deepwalk/stats.csv'
embeddings_filename = base_dir + '/embeddings.emb'

Creating Graph

In [None]:
# Build a directed graph from the edge-list CSV, reading the endpoints of each
# edge from its 'source' and 'target' columns.
nx_g = nx.from_pandas_edgelist(pd.read_csv(edge_list_file_name), source='source', target='target',
                               create_using=nx.DiGraph())
# nx.info() was removed in NetworkX 3.0, so the one-line summary is formatted
# by hand from the graph's node and edge counts.
print("Graph info:", "{} with {} nodes and {} edges".format(
    type(nx_g).__name__, nx_g.number_of_nodes(), nx_g.number_of_edges()))

Graph info: DiGraph with 1403 nodes and 1504 edges


DeepWalk and Creating Embeddings

In [None]:
# NOTE(review): window_size is not referenced anywhere in this cell -- the
# Word2Vec call further down hard-codes window = 4. Confirm which is intended.
window_size = 10
# Number of random walks started from every node of the graph.
walk_per_node = 5
# Maximum number of nodes in a single random walk.
walk_length = 10

def get_randomwalk(node, path_length, graph=None):
    """Generate one self-avoiding random walk that starts at ``node``.

    At every step the walk moves to a uniformly chosen neighbour of the
    current node that has not been visited yet, and stops early once no such
    neighbour is left.

    Args:
        node: Start node; must be hashable.
        path_length: Maximum number of nodes in the returned walk.
        graph: Object exposing ``neighbors(node)``. Defaults to the
            module-level ``nx_g`` graph.

    Returns:
        List of visited nodes, beginning with ``node``; its length is between
        1 and ``path_length``.
    """
    if graph is None:
        graph = nx_g

    random_walk = [node]
    visited = {node}
    for _ in range(path_length - 1):
        # Filter in adjacency order instead of going through list(set(...)):
        # set iteration order for string nodes can change between interpreter
        # sessions, which would make walks irreproducible despite random.seed.
        candidates = [nbr for nbr in graph.neighbors(node) if nbr not in visited]
        if not candidates:
            break

        node = random.choice(candidates)
        random_walk.append(node)
        visited.add(node)
    return random_walk

# Start walk_per_node random walks from every node of the graph.
all_nodes = list(nx_g.nodes())
random_walks = []
for n in tqdm(all_nodes):
    for i in range(walk_per_node):
        # Tokens are stored as strings so that they match the str(node_id)
        # keys used when the embeddings are looked up afterwards.
        random_walks.append([str(step) for step in get_randomwalk(n, walk_length)])

# Skip-gram (sg=1) with negative sampling (hs=0, negative=10).
# min_count=1 keeps every node in the vocabulary regardless of walk_per_node;
# with gensim's default of 5, nodes seen in fewer than five walks would be
# dropped and later embedding look-ups for them would fail.
# NOTE(review): window is hard-coded to 4 although window_size is defined at
# the top of this cell -- confirm which value is intended.
model = Word2Vec(window = 4, sg = 1, hs = 0, negative = 10, alpha=0.03, min_alpha=0.0007, seed = 14,
                 min_count = 1)
model.build_vocab(random_walks, progress_per=2)
model.train(random_walks, total_examples = model.corpus_count, epochs=20, report_delay=1)

model.wv

100%|██████████| 1403/1403 [00:00<00:00, 57148.77it/s]


<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7faabd2a48d0>

Displaying Classification Report Function

In [None]:
def perf_report(identifier, y_true, y_pred, binary, print_enable=False):
    """Compute precision, recall, F1 and accuracy for a set of predictions.

    Args:
        identifier: Label used in the printed report header.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        binary: If true, scores use ``average='binary'``; otherwise they are
            macro-averaged over the classes.
        print_enable: If true, print the scores and the full scikit-learn
            classification report.

    Returns:
        Tuple ``(precision, recall, f1, accuracy)``.
    """
    if binary:
        print(">>> Binary Classification.")
        average = 'binary'
    else:
        print(">>> Multi-class Classification.")
        average = 'macro'
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average=average)

    # This used to be computed with average='binary'/'macro', which merely
    # repeated the F1 value above under a "micro" label. Note that for
    # single-label problems micro-averaged F1 coincides with accuracy.
    micro_f1 = f1_score(y_true, y_pred, average='micro')
    acc = accuracy_score(y_true, y_pred)

    if print_enable:
        print("\t*** {} performance reports: ***".format(str(identifier)))
        print("\t\tPrecision: %.3f \n\t\tRecall: %.3f \n\t\tF1-Score: %.3f" % (prec, rec, f1))
        print('\t\tMicro-Average F1-Score: %.3f' % micro_f1)
        print('\t\tAccuracy: %.3f' % acc)
        print(classification_report(y_true, y_pred))
    return prec, rec, f1, acc

Splitting Data Into Train and Test

In [None]:
def train_test_split(X, y, rnd_seed):
    """Split ``X`` and ``y`` into stratified train and test lists.

    A single stratified shuffle split is drawn over the sample positions,
    holding out the module-level ``test_size`` fraction and seeded with
    ``rnd_seed``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as plain Python lists.
    """
    positions = list(range(len(y)))
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=rnd_seed)
    train_idx, test_idx = next(splitter.split(positions, y))

    def pick(sequence, chosen):
        # Gather the elements of `sequence` at the chosen positions, in order.
        return [sequence[pos] for pos in chosen]

    return pick(X, train_idx), pick(X, test_idx), pick(y, train_idx), pick(y, test_idx)

Function To Fit Models and Get Metrics

In [None]:
def simple_classification(clf, clf_id, emb_flag, X_train, X_test, y_train, y_test,
                          binary, exp_id, print_enable=False):
    """Fit ``clf`` and collect its train/test metrics in a dictionary.

    Args:
        clf: Estimator exposing ``fit`` and ``predict`` (and ``predict_proba``
            when ``binary`` is true).
        clf_id: Classifier name; used in report headers and stored under
            ``'classifier'``.
        emb_flag: Embedding method identifier, stored under ``'emb_method'``.
        X_train, X_test: Feature collections for training and testing.
        y_train, y_test: Matching label collections.
        binary: Whether to score the task as binary classification.
        exp_id: Either ``"<index>;<name>"`` or just ``"<name>"`` (index 0).
        print_enable: Forwarded to ``perf_report``.

    Returns:
        Tuple ``(perf_dict, clf)`` where ``clf`` is the fitted estimator. The
        ``'train_auc'`` and ``'test_auc'`` entries are ``None`` when
        ``binary`` is false.

    Raises:
        ValueError: If ``exp_id`` contains more than one ``';'``.
    """
    # train the model
    clf.fit(X_train, y_train)

    # predict the labels of both sets
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)

    # evaluate the performance on the training and the test set
    tr_prec, tr_rec, tr_f1, tr_acc = perf_report(str(clf_id) + ' - Training Set', y_train, y_train_pred, binary, print_enable)
    ts_prec, ts_rec, ts_f1, ts_acc = perf_report(str(clf_id) + ' - Test Set', y_test, y_test_pred, binary, print_enable)

    # ROC-AUC is only computed for the binary case; default to None so that the
    # dictionary below can still be built otherwise (these names used to be
    # unbound on the non-binary path, which raised NameError).
    tr_roc_auc = None
    ts_roc_auc = None
    if binary:
        y_test_proba = clf.predict_proba(X_test)[:, 1]
        y_train_proba = clf.predict_proba(X_train)[:, 1]
        tr_roc_auc = roc_auc_score(y_train, y_train_proba)
        ts_roc_auc = roc_auc_score(y_test, y_test_proba)

    # Note: a parsed index stays a string, while the fallback index is int 0.
    exp_parts = exp_id.split(";")
    if len(exp_parts) == 2:
        index, exp_name = exp_parts
    elif len(exp_parts) == 1:
        index = 0
        exp_name = exp_parts[0]
    else:
        raise ValueError("Incorrect Experiment ID!")

    perf_dict = {
        'index': index,
        'exp_id': exp_name,
        'emb_method': str(emb_flag),
        'classifier': str(clf_id),

        'train_prec': tr_prec,
        'train_rec': tr_rec,
        'train_f1': tr_f1,
        'train_acc': tr_acc,
        'train_auc': tr_roc_auc,

        'test_prec': ts_prec,
        'test_rec': ts_rec,
        'test_f1': ts_f1,
        'test_acc': ts_acc,
        'test_auc': ts_roc_auc
    }

    return perf_dict, clf

Defining Classification Function

In [None]:
def rf_lr_classification(X_train, X_test, y_train, y_test, stats_file, flag,
                         binary, exp_id, print_report=False):
    """Train and evaluate a random forest and a logistic regression model.

    Args:
        X_train, X_test: Feature collections for training and testing.
        y_train, y_test: Matching label collections.
        stats_file: Currently unused; kept so existing callers keep working.
        flag: Embedding method identifier recorded in the results.
        binary: Whether to score the task as binary classification. Applied to
            both models.
        exp_id: Experiment identifier, forwarded to ``simple_classification``.
        print_report: If true, print the detailed performance reports.

    Returns:
        Tuple ``(rf_perf, rf_clf, lr_perf, lr_clf)``: metrics dictionary and
        fitted estimator for each of the two models.
    """
    rf_clf = RandomForestClassifier(n_estimators=50, max_features=10, max_depth=5, random_state=rnd_seed)
    # max_iter must be an integer: recent scikit-learn releases validate
    # constructor parameters and reject a float such as 1e5.
    lr_clf = LogisticRegression(penalty='l1', solver='liblinear', max_iter=100000, random_state=rnd_seed)

    rf_perf, rf_clf = simple_classification(rf_clf, 'Random Forest', flag, X_train, X_test, y_train, y_test,
                                            binary, exp_id, print_report)
    # The caller's `binary` flag is honoured for logistic regression as well;
    # it used to be forced to True here, mis-scoring multi-class runs.
    lr_perf, lr_clf = simple_classification(lr_clf, 'Logistic Regression', flag, X_train, X_test, y_train, y_test,
                                            binary, exp_id, print_report)

    return rf_perf, rf_clf, lr_perf, lr_clf

Main

In [None]:
# Load the node table; every row supplies a node id ('node') and its label ('isp').
nodes_df = pd.read_csv(node_list_file_name)
anchor_nodes_df = nodes_df

# Node ids are looked up in the trained model as strings.
node_list = list(map(str, anchor_nodes_df['node'].tolist()))
embeddings = list(map(model.wv.get_vector, node_list))
# Persist all learned vectors in word2vec text format.
model.wv.save_word2vec_format(embeddings_filename)
labels = anchor_nodes_df['isp'].tolist()

# Stratified split, then fit and report both classifiers; the returned tuple
# is left as the last expression so it is shown as the cell output.
rnd_seed = 42
binary = True
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, rnd_seed)
rf_lr_classification(X_train, X_test, y_train, y_test, stats_file, 'deepwalk', binary, '1;elliptic' , print_report=True)

>>> Binary Classification.
	*** Random Forest - Training Set performance reports: ***
		Precision: 1.000 
		Recall: 0.722 
		F1-Score: 0.839
		Micro-Average F1-Score: 0.839
		Accuracy: 0.996
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1104
           1       1.00      0.72      0.84        18

    accuracy                           1.00      1122
   macro avg       1.00      0.86      0.92      1122
weighted avg       1.00      1.00      1.00      1122

>>> Binary Classification.
	*** Random Forest - Test Set performance reports: ***
		Precision: 1.000 
		Recall: 0.200 
		F1-Score: 0.333
		Micro-Average F1-Score: 0.333
		Accuracy: 0.986
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       276
           1       1.00      0.20      0.33         5

    accuracy                           0.99       281
   macro avg       0.99      0.60      0.66       281
weighted avg       0.99

({'index': '1',
  'exp_id': 'elliptic',
  'emb_method': 'deepwalk',
  'classifier': 'Random Forest',
  'train_prec': 1.0,
  'train_rec': 0.7222222222222222,
  'train_f1': 0.8387096774193548,
  'train_acc': 0.9955436720142602,
  'train_auc': 0.9999496779388084,
  'test_prec': 1.0,
  'test_rec': 0.2,
  'test_f1': 0.33333333333333337,
  'test_acc': 0.9857651245551602,
  'test_auc': 0.9347826086956523},
 RandomForestClassifier(max_depth=5, max_features=10, n_estimators=50,
                        random_state=42),
 {'index': '1',
  'exp_id': 'elliptic',
  'emb_method': 'deepwalk',
  'classifier': 'Logistic Regression',
  'train_prec': 1.0,
  'train_rec': 0.05555555555555555,
  'train_f1': 0.10526315789473684,
  'train_acc': 0.9848484848484849,
  'train_auc': 0.9422302737520128,
  'test_prec': 0.0,
  'test_rec': 0.0,
  'test_f1': 0.0,
  'test_acc': 0.9822064056939501,
  'test_auc': 0.9572463768115942},
 LogisticRegression(max_iter=100000.0, penalty='l1', random_state=42,
                   