In [15]:
import pandas as pd 
import seaborn as sns
import numpy as np
import pickle
from matplotlib import pyplot as plt
from matplotlib.gridspec import GridSpec
import networkx as nx
import os
from tqdm.notebook import tqdm
# import warnings
# warnings.filterwarnings('ignore')

import ndlib.models.epidemics as ep
import ndlib.models.ModelConfig as mc

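We work with a collection of graph datasets; the paths below point to the raw files, their uniform copies, and the generated outputs and samples.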

In [135]:
# Directory layout used throughout the notebook (all paths relative to the notebook).
path_to_datasets = '../datasets/'  # raw edge lists, read by read_graph_data
path_to_uniform_data = '../data/'  # same graphs rewritten as header-less two-column CSV
path_to_output = '../output/'  # passed to gt.extract_motifs as the report directory
path_to_samples = '../samples/'  # sampled k-hop subgraphs are written here

## Reading datasets

In [126]:
def read_graph_data(filename, sep=',', header=None, skiprows=0):
    """
    Load one raw edge list and convert it to the notebook's uniform format.

    The file is read from ``path_to_datasets``, rewritten as a header-less,
    comma-separated two-column CSV under ``path_to_uniform_data`` (same file
    name) and returned as a networkx graph.

    Parameters
    ----------
    filename : str
        Name of the file inside ``path_to_datasets``.
    sep : str
        Column separator of the raw file.
    header : int or None
        Row number of the header line to discard, or None if there is none.
    skiprows : int
        Number of leading lines to skip before parsing.

    Returns
    -------
    networkx.Graph
        Graph built from the ``source`` / ``target`` columns.
    """
    edgelist = pd.read_csv(
        os.path.join(path_to_datasets, filename),
        sep=sep,
        skiprows=skiprows,
        header=header,
        names=['source', 'target'],
        # Keep only the first two columns. Without this, a file with extra
        # columns (weights, trailing separators) makes pandas move the first
        # column into the index, so the wrong columns would become the edges.
        usecols=[0, 1],
    )
    # The uniform-data directory may not exist on a fresh checkout.
    os.makedirs(path_to_uniform_data, exist_ok=True)
    edgelist.to_csv(os.path.join(path_to_uniform_data, filename), index=False, header=None)
    return nx.from_pandas_edgelist(edgelist)

In [127]:
# Load every raw dataset; as a side effect read_graph_data also writes a
# uniform copy of each file into path_to_uniform_data.
citeseer = read_graph_data('citeseer.cites', sep='\t')
deezer_europe = read_graph_data('deezer_europe_edges.csv', header=0)
lastfm_asia = read_graph_data('lastfm_asia_edges.csv', header=0)
cora = read_graph_data('cora.cites', sep='\t')
email_Eu_core = read_graph_data('email-Eu-core.txt', sep=' ')

# Facebook ego networks
fb_0 = read_graph_data('0.edges', sep=' ')
fb_1 = read_graph_data('107.edges', sep=' ')
fb_2 = read_graph_data('348.edges', sep=' ')
fb_3 = read_graph_data('414.edges', sep=' ')
fb_4 = read_graph_data('686.edges', sep=' ')
fb_5 = read_graph_data('698.edges', sep=' ')
fb_6 = read_graph_data('1684.edges', sep=' ')
fb_7 = read_graph_data('1912.edges', sep=' ')
fb_8 = read_graph_data('3437.edges', sep=' ')
fb_9 = read_graph_data('3980.edges', sep=' ')

email_univ = read_graph_data('email-univ.edges', sep=' ')

# Facebook page networks
fb_company = read_graph_data('fb-pages-company.edges', header=0)
fb_food = read_graph_data('fb-pages-food.edges')
fb_politician = read_graph_data('fb-pages-politician.edges')
# Previously this loaded the politician file a second time (copy-paste slip).
fb_public_figure = read_graph_data('fb-pages-public-figure.edges')
fb_tvshow = read_graph_data('fb-pages-tvshow.edges')

soc_anybeat = read_graph_data('soc-anybeat.edges', sep=' ')
soc_hamsterster = read_graph_data('soc-hamsterster.edges', sep=' ', skiprows=2)
soc_wiki_vote = read_graph_data('soc-wiki-Vote.mtx', sep=' ', skiprows=2)
cit_DBLP = read_graph_data('cit-DBLP.edges', sep=' ', skiprows=2)

# list_of_graphs = [citeseer, deezer_europe, lastfm_asia, cora, email_Eu_core,
#                   fb_0, fb_1, fb_2, fb_3, fb_4, fb_5, fb_6, fb_7, fb_8, fb_9,
#                   email_univ, fb_company, fb_food, fb_politician, fb_public_figure,
#                   fb_tvshow, soc_anybeat, soc_hamsterster, soc_wiki_vote, cit_DBLP]
# len(list_of_graphs)

25

In [130]:
# All graphs have been converted to one uniform format in path_to_uniform_data.
# Sort for a reproducible order and skip hidden entries (e.g. .DS_Store,
# .ipynb_checkpoints), which are not edge lists and would break pd.read_csv.
graphs_list = sorted(
    name for name in os.listdir(path_to_uniform_data) if not name.startswith('.')
)

In [None]:
# For each graph, use ndlib to count the iterations until the graph is infected.
# TODO: make it with multiprocessing
# NOTE(review): calc_iter already returns (name, mean), so every entry here is
# the nested pair (name, (name, mean)).
iters_list = [(graph_file, calc_iter(graph_file)) for graph_file in tqdm(graphs_list)]

In [None]:
# Compute the size-4 motif distribution for every graph.
motif_dirs = dict(path_to_graphs=path_to_uniform_data, path_to_output=path_to_output)
for graph_file in tqdm(graphs_list):
    gt.extract_motifs(graph_file, 4, **motif_dirs)

In [10]:
# For every graph, pick 20 random start nodes and save the subgraphs induced by
# the growing neighbourhood of each start node: one CSV per hop level (h2..h4)
# and per start node (s1..s20).
os.makedirs(path_to_samples, exist_ok=True)  # to_csv does not create directories

for graph in tqdm(graphs_list):

    G = nx.from_pandas_edgelist(
        pd.read_csv(os.path.join(path_to_uniform_data, graph), names=['source', 'target']))

    # Materialise the nodes once. Indexing a plain list keeps the node's native
    # type and avoids np.random.choice on a NodeView: numpy indexes it by
    # position, but a NodeView looks up by node key.
    node_list = list(G.nodes())
    n_nodes = len(node_list)
    if n_nodes == 0:
        continue  # nothing to sample from an empty graph

    for j in range(20):
        first_node = node_list[np.random.randint(n_nodes)]
        extra_hop = set(nx.neighbors(G, first_node))
        i = 1

        while (i < 4) and (len(extra_hop) < n_nodes):
            i += 1
            # Grow the node set by one hop.
            frontier = set()
            for node in extra_hop:
                frontier.update(nx.neighbors(G, node))
            extra_hop = extra_hop.union(frontier)

            # Save the subgraph induced by the current neighbourhood.
            nx.to_pandas_edgelist(nx.subgraph(G, extra_hop)).to_csv(
                os.path.join(path_to_samples, graph+'_h{}_s{}.csv'.format(i, j+1)),
                header=None,
                index=False)

In [None]:
# for each of the samples, compute the motif distributions and average them

In [None]:
# build the dataset and run cross-validation

In [129]:
def calc_iter(elem):
    """
    Estimate how many SI-model iterations it takes to infect a graph.

    The graph is read from ``path_to_uniform_data``. An ndlib SI model
    (beta=0.1, 1% initially infected) is run 10 times; each run stops once
    every node that can be reached from the initially infected nodes is
    infected, and the iteration number of that step is recorded.

    Parameters
    ----------
    elem : str
        File name of the edge list inside ``path_to_uniform_data``.

    Returns
    -------
    tuple
        ``(elem, mean_iterations)`` where ``mean_iterations`` is the mean of
        the recorded iteration numbers over the 10 runs.
    """
    g = nx.from_pandas_edgelist(
        pd.read_csv(os.path.join(path_to_uniform_data, elem),
                    names=['source', 'target']))

    len_nodes = len(g.nodes())

    list_of_iter = []

    for _ in range(10):
        model = ep.SIModel(g)
        cfg = mc.Configuration()
        cfg.add_model_parameter('beta', 0.1)
        cfg.add_model_parameter("percentage_infected", 0.01)
        model.set_initial_status(cfg)

        iteration = model.iteration()

        # In an SI model the infection can only spread inside connected
        # components that contain a seed. Waiting for *all* nodes, as before,
        # never terminates on a disconnected graph, so wait only for the nodes
        # reachable from the seeds. On a connected graph this is every node.
        initial_status = iteration.get('status') or {}
        seeds = [node for node, status in initial_status.items() if status == 1]
        if seeds:
            reachable = set()
            for seed in seeds:
                if seed not in reachable:
                    reachable.update(nx.node_connected_component(g, seed))
            target = len(reachable)
        else:
            # No per-node status reported: fall back to the original criterion.
            target = len_nodes

        while iteration['node_count'][1] < target:
            iteration = model.iteration()

        list_of_iter.append(iteration['iteration'])

    return elem, np.mean(list_of_iter)

In [None]:
def add_size_n_dens(mdf, path_to_gph='/Users/zaikoval/Downloads/graphs_5types/graphs'):
    """
    Add ``nodes``, ``edges`` and ``density`` columns to ``mdf`` in place.

    For every index value ``idx`` of ``mdf`` the edge list
    ``<path_to_gph>/-<idx>.csv`` is loaded and its node count, edge count and
    undirected density ``2*e / (n*(n-1))`` are stored in the matching row.

    Parameters
    ----------
    mdf : pandas.DataFrame
        Frame whose index identifies the graph files; it is modified in place.
    path_to_gph : str
        Directory holding the graph CSV files (defaults to the original
        hard-coded location).

    Returns
    -------
    pandas.DataFrame
        The same ``mdf`` object, for convenience.
    """
    for idx in mdf.index:
        edges = pd.read_csv(os.path.join(path_to_gph, '-'+str(idx)+'.csv'),
                            names=['source', 'target'])
        graph = nx.from_pandas_edgelist(edges)
        n = graph.number_of_nodes()
        e = graph.number_of_edges()
        mdf.loc[idx, 'nodes'] = n
        mdf.loc[idx, 'edges'] = e
        # Density is undefined for fewer than two nodes; report 0 instead of
        # raising ZeroDivisionError.
        mdf.loc[idx, 'density'] = 2*e/(n*(n-1)) if n > 1 else 0.0
    return mdf

In [134]:
class GTscanner:
    """
    Python wrapper for the GTScanner motif-counting command-line tool.

    ``extract_motifs`` prepares the tool's input file and prints the command
    line, ``extract_result`` parses one HTML report, and ``data_4`` turns a
    list of reports into normalised feature rows.
    """

    def extract_motifs(self, filename, size, random=10, algo='fase', threads=3, path_to_graphs='',
                       path_to_output='/Users/zaikoval/Documents/GitHub/network-motif-analysis/results/',
                       path_to_examples='/Users/zaikoval/Downloads/gtscanner/examples/sampling/',
                       executable='/home/zaikoval/Documents/Work/gtscanner/./GTScanner'):
        """
        Prepare a graph for GTScanner and print the command that analyses it.

        The edge list is read from ``path_to_graphs``, its node ids are
        relabelled to consecutive integers starting at 1, and the result is
        written space-separated to ``<path_to_examples>/<filename>.txt``.
        The command is only printed, not executed.

        filename - name of the graph file; ``<filename>.csv`` is tried first,
                   then ``filename`` exactly as given

        size - size of motif to extract

        random - number of random graph to generate (better 100+)

        algo - value passed to the ``-m`` option

        threads - value passed to the ``-th`` option

        path_to_graphs - directory containing the input edge list

        path_to_output - directory for the ``<filename>_<size>.html`` report

        path_to_examples - directory where the relabelled ``.txt`` is written

        executable - path of the GTScanner binary used in the command
        """
        csv_path = os.path.join(path_to_graphs, filename + '.csv')
        if not os.path.isfile(csv_path):
            # Callers that list a directory already pass the full file name.
            as_given = os.path.join(path_to_graphs, filename)
            if os.path.isfile(as_given):
                csv_path = as_given
        graph_data = pd.read_csv(csv_path, header=None)

        # Relabel every node id with its 1-based rank among the unique ids.
        # return_inverse gives that rank for each cell in one pass; reshape
        # restores the table layout whatever shape numpy returns.
        _, inverse = np.unique(graph_data.to_numpy(), return_inverse=True)
        relabelled = pd.DataFrame(np.asarray(inverse).reshape(graph_data.shape) + 1)

        txt_path = os.path.join(path_to_examples, filename + '.txt')
        relabelled.to_csv(txt_path, sep=' ', header=None, index=False)

        report_path = os.path.join(path_to_output, filename + '_' + str(size) + '.html')
        cmd = ' '.join([
            executable,
            '-s', str(size),
            '-m', algo,
            '-g', txt_path,
            '-f', 'simple',
            '-t', 'html',
            '-o', report_path,
            '-r', str(random),
            '-th', str(threads),
        ])
        #answer = os.popen(cmd).read()
        print(cmd)

    def extract_result(self, file='/home/zaikoval/Documents/Work/gtscanner/results/result.html'):
        """
        Parse one GTScanner HTML report.

        Every table row after the first is read as one motif: the cell with
        class ``pre`` holds the adjacency matrix (one row of digits per line),
        and the third and fourth cells hold the frequency and the z-score.

        Returns a list of ``(adjacency np.matrix, frequency, z_score)`` tuples.
        """
        from bs4 import BeautifulSoup

        # Close the report as soon as it has been read.
        with open(file) as report:
            soup = BeautifulSoup(report.read())

        results = []
        for motif in soup.find_all('tr')[1:]:
            matrix_text = motif.find('td', attrs={'class': 'pre'}).text
            # np.matrix is kept on purpose: data_4 uses str(matrix) as a key.
            adjacency = np.matrix([list(x) for x in matrix_text.split('\n')], dtype=int)
            stats = motif.find_all('td')[2:4]
            results.append((adjacency, float(stats[0].text), float(stats[1].text)))

        return results

    @staticmethod
    def _finite_or_zero(value):
        """Return ``value`` unless it is NaN or infinite, in which case return 0."""
        return value if np.isfinite(value) else 0

    def data_4(self, files=(), include_z_scores=False,
               path_to_dict="/Users/zaikoval/Documents/GitHub/network-motif-analysis/dict_4.pkl"):
        """
        Build one feature row per report file.

        Each report is parsed with ``extract_result``. A pickled template
        dictionary (motif key -> initial value) is copied for every report and
        filled with that report's frequencies; NaN and infinite values become
        0. The frequencies are then divided by their sum.

        files - paths of ``..._4.html`` reports; the last 7 characters of the
                file name are stripped to form the row label

        include_z_scores - when True, the z-scores (divided by their Euclidean
                           norm) are inserted before the frequencies

        path_to_dict - path of the pickled template dictionary

        Returns a list of rows ``[label, *features]``.
        """
        stats = [(item.split('/')[-1][:-7], self.extract_result(item)) for item in files]

        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(path_to_dict, 'rb') as pickle_file:
            dict_4 = pickle.load(pickle_file)

        box_list = []

        for label, motifs in stats:
            dict_4_z = dict(dict_4)
            dict_4_f = dict(dict_4)

            # for 4-motif
            for adjacency, freq, z_score in motifs:
                key = str(adjacency)
                dict_4_z[key] = self._finite_or_zero(z_score)
                # The frequency is validated on its own value; the old code
                # tested the z-score for NaN here.
                dict_4_f[key] = self._finite_or_zero(freq)

            freqs_4 = np.array(list(dict_4_f.values()), dtype=float)
            sum_of_freqs_4 = freqs_4.sum()
            # An all-zero vector stays zero instead of turning into NaN.
            normed_freqs_4 = freqs_4 / sum_of_freqs_4 if sum_of_freqs_4 else freqs_4

            row = [label]
            if include_z_scores:
                z_scores_4 = np.array(list(dict_4_z.values()), dtype=float)
                z_norm = np.sqrt(np.sum(z_scores_4 ** 2))
                row += list(z_scores_4 / z_norm if z_norm else z_scores_4)
            row += list(normed_freqs_4)

            box_list.append(row)

        return box_list
    
# Shared GTscanner instance; the motif-extraction cell calls gt.extract_motifs.
gt = GTscanner()