In [1]:
import numpy as np
import modin.pandas as mipd
import os
import time
import pandas as pd
from tqdm import tqdm
import networkx as nx
from networkx.algorithms import bipartite
import matplotlib.pyplot as plt
import numpy as np
import gc

from matplotlib import pyplot as plt


In [2]:
time_resolution = 1
c_path = './network_data'+str(time_resolution)+'/commits/'
e_path = './network_data'+str(time_resolution)+'/emails/'

all_graduated_name = np.load('./all_graduated.npy').tolist()
all_graduated = [x.lower() for x in all_graduated_name]
all_retired_name = np.load('./all_retired.npy').tolist()
all_retired = [x.lower() for x in all_retired_name]

## Netowork Description 
### Rewrite Likang's function

In [3]:
class Technet(object):

    def __init__(self, path):
        self.path = path
        self.get_tech_net()
    

    def get_tech_net(self):

        self.bipartite_G = nx.Graph()
        df = pd.read_csv(self.path, header=None, sep='##', engine='python')
        df.columns = ['file', 'dev', 'weight']

        ## Logic to add nodes and edges to graph with their metadata
        for _, row in df.iterrows():
            dev_node = row['dev']
            file_node = row['file'].replace('   (with props)', '')
            self.bipartite_G.add_node(dev_node, bipartite='dev')
            self.bipartite_G.add_node(file_node, bipartite='file')
            self.bipartite_G.add_edge(dev_node, file_node)
    
        return self

    def stats_tech_net(self):
        # check if file does not exist or empty
        if not os.path.exists(self.path) or os.stat(self.path).st_size == 0:
            return {'t_num_dev_nodes':0,\
                    't_num_file_nodes':0,\
                    't_num_edges': 0,\
                    't_num_dev_per_file':0,\
                    't_num_file_per_dev':0,\
                    't_graph_density':0,\
                    't_dev_nodes': set()}

        self.dev_nodes = {n for n, d in self.bipartite_G.nodes(data=True) if d["bipartite"] == 'dev'}
        self.file_nodes = {n for n, d in self.bipartite_G.nodes(data=True) if d["bipartite"] == 'file'}

        self.graph_density = bipartite.density(self.bipartite_G, self.dev_nodes)
        self.file_degrees, self.dev_degrees = bipartite.degrees(self.bipartite_G, self.dev_nodes)

        self.num_file_nodes = len(self.file_degrees)
        self.num_dev_nodes = len(self.dev_degrees)

        self.file_node_degree = np.average([degree for node, degree in self.file_degrees])
        self.dev_node_degree = np.average([degree for node, degree in self.dev_degrees])

        # return the features of tech net
        return {'t_num_dev_nodes':self.num_dev_nodes,\
                't_num_file_nodes':self.num_file_nodes,\
                't_num_edges': int(self.bipartite_G.size()),\
                't_num_dev_per_file':self.file_node_degree,\
                't_num_file_per_dev':self.dev_node_degree,\
                't_graph_density':self.graph_density,\
                't_dev_nodes': set(self.dev_nodes),\
                't_file_nodes': set(self.file_nodes)}
    
class Socialnet(object):

    def __init__(self, path):
        if os.path.exists(path):
            self.path = path
            self.get_social_net()
        else: 
            raise FileNotFoundError('Edgelist Do not Exist')
            return None
            print("File Not Found")

    
    def get_social_net(self):
        self.G = nx.read_edgelist(self.path, create_using=nx.DiGraph(), nodetype=str, comments='*', delimiter='##', data=(('weight', int),))
        return self.G

    # social nets are weighted
    def stats_social_net(self):
        # if no network data
        if not os.path.exists(self.path) or os.stat(self.path).st_size == 0:
            return {'s_num_nodes':0, \
                    's_dev_nodes':set(),\
                    's_num_edges': 0,\
                    's_weighted_mean_degree':0,\
                    's_num_component':0,\
                    's_avg_clustering_coef':0,\
                    's_largest_component':0,\
                    's_graph_density':0}

        # Processing features in social networks
        #self.G = nx.read_edgelist(self.path, create_using=nx.DiGraph(), nodetype=str, comments='*', delimiter='##', data=(('weight', int),))
        # all dev nodes
        self.dev_nodes = set(self.G.nodes)
        # num. of total nodes
        self.num_nodes = len(self.dev_nodes)
        # weighted mean degree
        self.degrees = self.G.degree(weight='weight')
        self.weighted_mean_degree = np.average([degree for node, degree in self.degrees])
        # average clustering coefficient
        self.avg_clustering_coef = nx.average_clustering(self.G)
        # betweenness = nx.betweenness_centrality(G, weight='weight')
        self.graph_density = nx.density(self.G)

        self.G = nx.read_edgelist(self.path, create_using=nx.Graph(), nodetype=str, comments='*', delimiter='##', data=(('weight', int),))
        # num. of dis-connected components
        self.num_component = nx.number_connected_components(self.G)
        # largest connected component
        self.largest_component = len(max(nx.connected_components(self.G), key=len))
        # num. of nodes in each component
        # num_nodes_component = [list(c) for c in list(nx.connected_components(G))]

        # return the features of the 
        return {'s_num_nodes': self.num_nodes,\
                's_dev_nodes': self.dev_nodes,\
                's_num_edges': int(self.G.size()),\
                's_weighted_mean_degree':self.weighted_mean_degree,\
                's_num_component':self.num_component,\
                's_avg_clustering_coef':self.avg_clustering_coef,\
                's_largest_component':self.largest_component,\
                's_graph_density':self.graph_density}


In [4]:
class ASFProject(object):

    def __init__(self, projid, the_path):
        self.projid = projid
        self.net_num_limit = 43
        self.time_resolution = 1
        self.c_path = the_path+str(self.time_resolution)+'/commits/'
        self.e_path = the_path+str(self.time_resolution)+'/emails/'
    
    def get_tech_nets(self):
        self.tech_networks = []
        self.tech_network_stats = []
        for seq_num in range(0, self.net_num_limit, self.time_resolution):
            #seq_num = 1
            this_f = self.c_path+self.projid+'__'+str(seq_num)+".edgelist"
            if os.path.exists(this_f):
                this_tnet = Technet(this_f)
                self.tech_networks.append(this_tnet)
                self.tech_network_stats.append(this_tnet.stats_tech_net())
            
                
    def get_social_nets(self):
        self.social_networks = []
        self.social_network_stats = []
        for seq_num in range(0, self.net_num_limit, self.time_resolution):
            #seq_num = 1
            this_f = self.e_path+self.projid+'__'+str(seq_num)+".edgelist"
            if os.path.exists(this_f):
                this_snet = Socialnet(this_f)
                self.social_networks.append(this_snet)
                self.social_network_stats.append(this_snet.stats_social_net())

    def stats_proj_net(self):
        
        return{ 'lifecycle': max(len(self.social_networks),len(self.tech_networks)),\
                't_dev_nodes': [x['t_num_dev_nodes'] for x in self.tech_network_stats],
                't_file_nodes': [x['t_num_file_nodes'] for x in self.tech_network_stats],
                't_edges': [x['t_num_edges'] for x in self.tech_network_stats],
                's_nodes': [x['s_num_nodes'] for x in self.social_network_stats],
                's_edges': [x['s_num_edges'] for x in self.social_network_stats],
                't_num_dev_per_file':[x['t_num_dev_per_file'] for x in self.tech_network_stats],\
                't_num_file_per_dev':[x['t_num_file_per_dev'] for x in self.tech_network_stats]
                }
    


In [5]:
import multiprocessing as mp

In [6]:
def run_proj_stats(projects,q):
    for this_proj in tqdm(projects):
        this_proj.get_tech_nets()
        this_proj.get_social_nets()
        q.put(this_proj.stats_proj_net())    

#def run__process(processnum=6):  # 这里是主进程



In [7]:

if __name__ == '__main__' :

    the_path = './network_data'
    projects = os.listdir(the_path+"1/commits/")
    projects2 = os.listdir(the_path+"1/emails/")
    project_names = [x.split('__')[0] for x in projects]
    project_names2 = [x.split('__')[0] for x in projects2]
    project_names.extend(project_names2)
    project_freqs = pd.Series(project_names)
    project_names = project_freqs.drop_duplicates().values

    all_projects = {}

    all_proj_stats = {}

    for projid in tqdm(project_names) :
        this_proj = ASFProject(projid, the_path)
        all_projects[projid] = this_proj
    
    all_proj = list(all_projects.values())
    q = mp.Queue()
    process = [mp.Process(target=run_proj_stats, args=(all_proj[:len(all_proj)//6],q)),
               mp.Process(target=run_proj_stats, args=(all_proj[len(all_proj)//6:2*len(all_proj)//6],q)), 
               mp.Process(target=run_proj_stats, args=(all_proj[2*len(all_proj)//6:3*len(all_proj)//6],q)),
               mp.Process(target=run_proj_stats, args=(all_proj[3*len(all_proj)//6:4*len(all_proj)//6],q)),
               mp.Process(target=run_proj_stats, args=(all_proj[4*len(all_proj)//6:5*len(all_proj)//6],q)),
               mp.Process(target=run_proj_stats, args=(all_proj[5*len(all_proj)//6:],q))]
    [p.start() for p in process]  # 开启了两个进程
    [p.join() for p in process]   # 等待两个进程依次结束
    result = [q.get() for proj in all_proj]



100%|██████████| 275/275 [00:00<00:00, 386409.92it/s]
  0%|          | 0/45 [00:00<?, ?it/s].63it/s]
Process Process-1:
Traceback (most recent call last):
  File "/home/wsl2forwin/.pyenv/versions/3.8.11/lib/python3.8/site-packages/pandas/io/parsers/python_parser.py", line 492, in _infer_columns
    line = self._buffered_line()
  File "/home/wsl2forwin/.pyenv/versions/3.8.11/lib/python3.8/site-packages/pandas/io/parsers/python_parser.py", line 584, in _buffered_line
    return self._next_line()
  File "/home/wsl2forwin/.pyenv/versions/3.8.11/lib/python3.8/site-packages/pandas/io/parsers/python_parser.py", line 681, in _next_line
    orig_line = self._next_iter_line(row_num=self.pos + 1)
  File "/home/wsl2forwin/.pyenv/versions/3.8.11/lib/python3.8/site-packages/pandas/io/parsers/python_parser.py", line 742, in _next_iter_line
    return next(self.data)
  4%|▍         | 2/46 [00:00<00:14,  3.05it/s]StopIteration

The above exception was the direct cause of the following exception:

Trace

KeyboardInterrupt: 

In [None]:
project_names2

['edgent',
 'hdt',
 'mnemonic',
 'juneau',
 'odf',
 'taverna',
 'zeta',
 'streams',
 'oozie',
 'roller',
 'odf',
 'nuvem',
 'synapse',
 'griffin',
 'guacamole',
 'wave',
 'kalumet',
 'jackrabbit',
 'wink',
 'deltacloud',
 'batchee',
 'apex',
 'falcon',
 'eagle',
 'odf',
 'roller',
 'hawq',
 'esme',
 'provisionr',
 'gora',
 'twill',
 'sling',
 'pulsar',
 'quickstep',
 'batchee',
 'wink',
 'ignite',
 'flex',
 'juneau',
 'photark',
 'tamaya',
 'omid',
 'gora',
 'tashi',
 's4',
 'phoenix',
 'river',
 'sis',
 'myriad',
 'ode',
 'samoa',
 'empire',
 'singa',
 'phoenix',
 'river',
 'datafu',
 'echarts',
 'oltu',
 'predictionio',
 'kafka',
 'datafu',
 'juneau',
 'openwhisk',
 'slider',
 'directory',
 'madlib',
 'sling',
 'olio',
 'streams',
 'weex',
 'ftpserver',
 'derby',
 'aurora',
 'superset',
 'nuvem',
 'plc4x',
 'openmeetings',
 'river',
 'pirk',
 'clerezza',
 'olio',
 'wink',
 'trafficcontrol',
 'empire',
 'oltu',
 'usergrid',
 'ctakes',
 'manifoldcf',
 'quickstep',
 'netbeans',
 'direct

In [None]:
all_projects['odf'].get_tech_nets()
all_projects['odf'].get_social_nets()
all_projects['odf'].stats_proj_net()

{'lifecycle': 0,
 't_dev_nodes': [],
 't_file_nodes': [],
 't_edges': [],
 's_nodes': [],
 's_edges': [],
 't_num_dev_per_file': [],
 't_num_file_per_dev': []}

In [None]:
all_projects['odf'].c_path

'./network_data1/commits/1/commits/'

In [None]:
all_proj

[<__main__.ASFProject at 0x7f55c3f845b0>,
 <__main__.ASFProject at 0x7f55c3f84610>,
 <__main__.ASFProject at 0x7f55c3f845e0>,
 <__main__.ASFProject at 0x7f55c3f84640>,
 <__main__.ASFProject at 0x7f55c3f84340>,
 <__main__.ASFProject at 0x7f55c3f84790>,
 <__main__.ASFProject at 0x7f55c3f84220>,
 <__main__.ASFProject at 0x7f55c3f84a90>,
 <__main__.ASFProject at 0x7f55c3f84af0>,
 <__main__.ASFProject at 0x7f55c3f84700>,
 <__main__.ASFProject at 0x7f55c3f84c40>,
 <__main__.ASFProject at 0x7f55c3f84400>,
 <__main__.ASFProject at 0x7f55c3f84430>,
 <__main__.ASFProject at 0x7f55c3f84d90>,
 <__main__.ASFProject at 0x7f55c3f84190>,
 <__main__.ASFProject at 0x7f55c3f84e80>,
 <__main__.ASFProject at 0x7f55c3f84f40>,
 <__main__.ASFProject at 0x7f55c3f84f70>,
 <__main__.ASFProject at 0x7f55c3f84f10>,
 <__main__.ASFProject at 0x7f55c3f84ca0>,
 <__main__.ASFProject at 0x7f55c3f84dc0>,
 <__main__.ASFProject at 0x7f55c3f84ee0>,
 <__main__.ASFProject at 0x7f55a2d46070>,
 <__main__.ASFProject at 0x7f55a2d

In [None]:
len(result)

265

In [None]:
result

[{'lifecycle': 0,
  't_dev_nodes': [],
  't_file_nodes': [],
  't_edges': [],
  's_nodes': [],
  's_edges': [],
  't_num_dev_per_file': [],
  't_num_file_per_dev': []},
 {'lifecycle': 0,
  't_dev_nodes': [],
  't_file_nodes': [],
  't_edges': [],
  's_nodes': [],
  's_edges': [],
  't_num_dev_per_file': [],
  't_num_file_per_dev': []},
 {'lifecycle': 0,
  't_dev_nodes': [],
  't_file_nodes': [],
  't_edges': [],
  's_nodes': [],
  's_edges': [],
  't_num_dev_per_file': [],
  't_num_file_per_dev': []},
 {'lifecycle': 0,
  't_dev_nodes': [],
  't_file_nodes': [],
  't_edges': [],
  's_nodes': [],
  's_edges': [],
  't_num_dev_per_file': [],
  't_num_file_per_dev': []},
 {'lifecycle': 0,
  't_dev_nodes': [],
  't_file_nodes': [],
  't_edges': [],
  's_nodes': [],
  's_edges': [],
  't_num_dev_per_file': [],
  't_num_file_per_dev': []},
 {'lifecycle': 0,
  't_dev_nodes': [],
  't_file_nodes': [],
  't_edges': [],
  's_nodes': [],
  's_edges': [],
  't_num_dev_per_file': [],
  't_num_file_p