## Build simple ML model to understand vulnerabilities in empirical networks

Use logistic regression to build a classification model of node vulnerability (better off in all models: 0, worse off in all models: 1). The features are network measures:
* degree
* average neighbor degree
* clustering
* k-shell
* betweenness centrality
* eigenvector centrality
* closeness centrality
* eccentricity


In [None]:
import networkx as nx
import numpy as np
import pandas as pd
import os
import random
from collections import Counter, defaultdict
import operator
import pickle
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score

from core_functions import load_network

%matplotlib notebook

In [None]:
from matplotlib import rc
# Figure typography: sans-serif text with Helvetica as the preferred face.
# LaTeX text rendering is left switched off:
#rc('text', usetex=True)
plt.rcParams.update({'font.family': 'sans-serif', 'font.sans-serif': ['Helvetica']})
# 'truetype' is the font type used for text in PDF output
rc('pdf', fonttype='truetype')

### Define some parameters

In [None]:
# ids of the empirical networks to analyse; '<id>.txt' is read from network_path
networks = [
    'ia-infect-dublin',
    'polblogs',
    'UC_irvine',
    'facebook_combined',
    'ca-AstroPh',
]
network_path = '/Volumes/ExtremeSSD/last-mile/last_mile/networks_empirical/'

# folders holding the per-node activation count / activation time files
path_to_activation_count_data = 'some_path'
path_to_activation_time_data = 'some_path'

# which type of vulnerability the models are run for:
#   True  -> frequency
#   False -> recency
frequency = True

### do some precomputing of centrality measures for networks as they can be expensive to continually recompute

In [None]:
# First pre-compute the centrality metrics for every network and save them to
# disk - this speeds things up when doing the ML part, because the measures are
# loaded from file instead of being recomputed.
save_path = 'some_path/'

# file-name tag -> networkx function computing that measure (computed in this order)
centrality_functions = {
    'betweenness': nx.betweenness_centrality,
    'closeness': nx.closeness_centrality,
    'eccentricity': nx.eccentricity,
    'eigenvector': nx.eigenvector_centrality,
}

for network_id in networks:
    # presumably returns a networkx graph - load_network comes from core_functions
    G = load_network(network_path + network_id + '.txt')

    for centrality_type, compute_centrality in centrality_functions.items():
        C = compute_centrality(G)
        # written as '<save_path><network_id>_<centrality_type>.pkl';
        # the with-block closes the file even if pickling fails
        with open(save_path + '%s_%s.pkl' % (network_id, centrality_type), 'wb') as pkl_file:
            pickle.dump(C, pkl_file)

In [None]:
# function to later load the saved centrality files
def load_measure(save_path, network_id, centrality_type):
    '''
    Load a pre-computed centrality measure from its pickle file.

    The file read is save_path + '<network_id>_<centrality_type>.pkl', so
    save_path must already end with a path separator.

    save_path : folder prefix the pickle files were written to
    network_id : id of the network
    centrality_type : tag of the measure, e.g. 'betweenness'

    Returns the unpickled object.
    '''
    # NOTE: pickle can execute arbitrary code while loading - only read files
    # produced by this notebook or another trusted source.
    with open(save_path + '%s_%s.pkl' % (network_id,centrality_type),'rb') as pkl_file:
        # encoding='latin1' is kept from the original code; presumably it is
        # there so that pickles written under Python 2 can still be read
        centrality = pickle.load(pkl_file, encoding='latin1')

    return centrality

## Operationalize for multiple networks

In [None]:
def _count_worse_off(activation_, methods):
    '''
    Count, per node, in how many of `methods` the node's value is strictly
    lower than its value under 'random' seeding. Nodes missing from a method
    are treated as having value 0 there; only nodes present under 'random'
    are considered.
    '''
    baseline = activation_['random']
    worse_off = Counter()
    for method in methods:
        per_method = activation_[method]
        worse_off.update(n for n, base in baseline.items() if per_method.get(n,0) < base)
    return worse_off


def label_nodes(activation_count_,activation_time_,nodes=None):
    '''
    Label nodes, with 
    0 : better off in all models (never strictly worse than under random seeding)
    1 : worse off in all models
    Nodes that are worse off in some but not all models get no label and are
    therefore left out of the returned dicts.

    activation_count_ : dict method -> {node: value}, must contain 'random'
        and every seeding method compared against it
    activation_time_ : same layout; values are compared in the same direction
        as the counts (lower than random = worse off)
    nodes : optional iterable of nodes to label; when omitted, the nodes of
        the notebook-level graph G are used (the original behaviour)

    Returns (label_freq, label_rec), two dicts node -> 0/1.
    '''
    
    methods = ['coreHD', 'degreeDiscount', 'kcore', 'HD']

    if nodes is None:
        # backward compatible default: fall back to the graph of the calling cell
        nodes = G.nodes()

    # count for how many methods each node is worse off than with random seeds
    worse_off_freq = _count_worse_off(activation_count_, methods)
    worse_off_rec = _count_worse_off(activation_time_, methods)

    # infer nodes to include in ML dataset 
    label_freq = dict()
    label_rec = dict()
    for n in nodes:
        for worse_off, label in ((worse_off_freq, label_freq), (worse_off_rec, label_rec)):
            if n not in worse_off:
                label[n] = 0 # node always is better off
            elif worse_off[n] == len(methods):
                label[n] = 1 # node always is worse off
        
    return label_freq, label_rec

def run_logReg_ML(X_,Y_):
    '''
    Fit class-balanced, L2-penalised logistic regressions with 5-fold CV over
    a sweep of regularisation values and summarise the results.

    X_ : 2D numpy array of features (rows = nodes, columns = features)
    Y_ : 1D numpy array of labels, aligned with the rows of X_

    Returns (mean balanced accuracy, mean coefficient vector). Both means are
    taken over every fold of every value in the sweep - no single best value
    is selected. The folds are shuffled without a fixed seed, so results vary
    slightly between runs.
    '''
    # CV
    kf = KFold(n_splits=5,shuffle=True)
    
    # balanced accuracy and coefficients of every fitted model
    accuracy = []
    weights = []
    
    # sweep the regularisation parameter (passed to sklearn as C, the inverse
    # regularisation strength)
    for lambda_ in np.logspace(-5,5,201):
        for train_index, test_index in kf.split(X_):
            X_train, X_test = X_[train_index], X_[test_index]
            Y_train, Y_test = Y_[train_index], Y_[test_index]

            # standardize features using statistics of the training fold only
            mean = np.mean(X_train,axis=0)
            std = np.std(X_train,axis=0,ddof=1)
            # a feature that is constant within the training fold has std 0;
            # dividing by it would produce NaN and abort the fit, so leave
            # such a column merely centred
            std = np.where(std == 0, 1., std)
            X_train = (X_train - mean)/std
            X_test = (X_test - mean)/std

            model = LogisticRegression(penalty='l2', tol=0.0001, C=lambda_, fit_intercept=True, 
                                    max_iter=1000000, n_jobs=-1, class_weight = 'balanced')

            model.fit(X_train,Y_train)

            accuracy.append(balanced_accuracy_score(Y_test, model.predict(X_test)))
            weights.append(model.coef_)

    # coef_ has shape (1, n_features) for a binary problem; [0] drops that axis
    return np.mean(accuracy), np.mean(weights,axis=0)[0]

In [None]:
# Per-network results, filled by the loop below and used by the plotting cells
statistics = []  # (total nodes, label 0 nodes, label 1 nodes, network id)
ML_coeff = []    # (network id, coefficients scaled by the largest absolute one)
accuracy = []    # (network id, mean balanced accuracy)

# seed-selection methods for which activation data is loaded
methods = ['random','HD','coreHD','degreeDiscount','kcore']

# feature names, in the same order as the columns of the feature matrices
features = ['degree','k-shell','clustering','betweenness','closeness','eigenvector','eccentricity']

for idd_ in networks:
    print(idd_)
    # load network
    G = load_network(network_path + idd_ + '.txt')
    norm = 10.*G.number_of_nodes() # number of repetitions
    seeds = max(1,int(round(len(G)/100.,0))) # select ~1% of nodes

    # load activation_count for the various methods (lines are 'node,count')
    activation_count = dict()
    for method in methods:
        activation_count[method] = dict()
        with open(path_to_activation_count_data + '/%s_%s_seeds=%d_m=10N_p=pc.csv' % (idd_,method,seeds)) as f:
            for line in f:
                n,a = line.strip().split(',')
                activation_count[method][int(n)] = int(a)/norm

    # load activation_time for various methods (lines are 'node,t:nn,t:nn,...')
    activation_time = dict()
    for method in methods:
        activation_time[method] = dict()
        with open(path_to_activation_time_data + '/%s_%s_seeds=%d_m=10N_p=pc.csv' % (idd_,method,seeds)) as f:
            for line in f:
                dat = line.strip().split(',')
                # estimate inverse activation time
                tau = 0.
                for part_ in dat[1:]:
                    t,nn = part_.split(':') # nn = number of times an epidemic spent t times reaching node
                    tau += float(nn)/(float(t)+1)
                activation_time[method][int(dat[0])] = tau/norm
    
    label_freq, label_rec = label_nodes(activation_count,activation_time)
    
    # keep statistics for the vulnerability type that is being modelled
    node_labels = label_freq if frequency else label_rec
    n_label1 = sum(node_labels.values())
    statistics.append((len(G), # total nodes
                       len(node_labels)-n_label1, # label 0 nodes
                       n_label1, # label 1 nodes
                       idd_) # network id
                     )
        
    # build feature matrix; the centrality files are the ones written by the
    # pre-computation cell, so they are read from the same save_path
    degree = nx.degree(G)
    k_shell = nx.core_number(G)
    clustering = nx.clustering(G)
    betweenness = load_measure(save_path,idd_,'betweenness')
    closeness = load_measure(save_path,idd_,'closeness')
    eigenvector = load_measure(save_path,idd_,'eigenvector')
    eccentricity = load_measure(save_path,idd_,'eccentricity')

    # per-node lookups, in the same order as `features`
    measures = [degree,k_shell,clustering,betweenness,closeness,eigenvector,eccentricity]
    
    # create freq dataset (rows follow the sorted node ids)
    nodes_freq = sorted(label_freq.keys())
    X_freq = np.array([[measure[n] for measure in measures] for n in nodes_freq])
    Y_freq = np.array([label_freq[n] for n in nodes_freq])
    
    # create rec dataset
    nodes_rec = sorted(label_rec.keys())
    X_rec = np.array([[measure[n] for measure in measures] for n in nodes_rec])
    Y_rec = np.array([label_rec[n] for n in nodes_rec])
    
    # do some ML
    if frequency:
        accuracy_, coeff = run_logReg_ML(X_freq,Y_freq)
    else:
        accuracy_, coeff = run_logReg_ML(X_rec,Y_rec)

    ML_coeff.append((idd_, # network id
                    coeff/max(abs(coeff)))) # normalize with respect to largest value
    
    accuracy.append((idd_, # network id
                     accuracy_))

__Plot some label statistics__

In [None]:
# display names for the network ids
networks = ['ia-infect-dublin','URVemail','polblogs','UC_irvine','facebook_combined','ca-CondMat','ca-AstroPh']
network_labels = ['SocioPatterns','Emails (URV)','Political blogs','UCI messages','Facebook','CondMat','AstroPh']
labels = dict(zip(networks,network_labels))

# unpack data, ordered by network size
network_size, lab0, lab1, ids_ = zip(*sorted(statistics,key=operator.itemgetter(0)))

# fraction of all nodes with label 0, and with either label (top of the stack)
sizes = 1.0*np.array(network_size)
frac_lab0 = np.array(lab0)/sizes
frac_labelled = (np.array(lab0)+np.array(lab1))/sizes
x_positions = range(len(network_size))

# plot stuff
plt.figure(figsize=(3,3))

# label 0
plt.plot(frac_lab0,color='#282828')
plt.fill_between(x_positions,0,frac_lab0,color='#dcdcdc',alpha=1
                 ,label='Label 0',lw=0)

# label 1, stacked on top of label 0
plt.fill_between(x_positions,frac_lab0,frac_labelled,
                 color='orange',label='Label 1',lw=0)
plt.plot(frac_labelled,color='#282828')

plt.legend(loc=0,frameon=True,fontsize=8)
# span exactly the plotted networks instead of assuming there are five of them
plt.xlim(0,max(len(network_size)-1,1))
plt.ylim(0,1)
plt.xticks(x_positions,[labels.get(i,i) for i in ids_],fontsize=9,rotation=45,ha='right')
#plt.xlabel('Network id')
plt.ylabel('Fraction of nodes')
plt.tight_layout()
plt.show()

__Plot feature importance__

In [None]:
# display names for the network ids
networks = ['ia-infect-dublin','URVemail','polblogs','UC_irvine','facebook_combined','ca-CondMat','ca-AstroPh']
network_labels = ['SocioPatterns','Emails (URV)','Political blogs','UCI messages','Facebook','CondMat','AstroPh']
labels = dict(zip(networks,network_labels))

plt.figure(figsize=(4,3.5))
ids_, ml_coeff = zip(*ML_coeff)
# one column per network, one row per feature; the coefficients were scaled by
# their largest absolute value, so they lie in [-1, 1]
plt.imshow(np.array(ml_coeff).T,cmap=plt.cm.coolwarm,aspect='equal',vmin=-1,vmax=1) # LogReg
cbar = plt.colorbar(orientation='vertical',pad=0.02,fraction=0.06)
# fall back to the raw network id when no display name is defined, as the
# label-statistics plot does
plt.xticks(range(len(ids_)),[labels.get(f,f) for f in ids_],fontsize=10,rotation=45,ha='right')
plt.yticks(range(len(features)),features)
cbar.set_label('relative feature weight')
#plt.xlabel('Network id')
plt.tight_layout()
plt.show()