In [41]:
import numpy as np
import modin.pandas as mipd
import os
import time
import pandas as pd
from tqdm import tqdm
import networkx as nx
from networkx.algorithms import bipartite
import matplotlib.pyplot as plt
import numpy as np
import gc

from matplotlib import pyplot as plt


In [42]:
# technical nets are unweighted
def get_tech_net(path):
    """Build the bipartite developer-file graph from a technical edge list.

    The file at ``path`` is read as a header-less table whose three fields are
    separated by ``##`` and are labelled ``file``, ``dev`` and ``weight``.
    The weight column is ignored, so the resulting graph is unweighted.

    Parameters
    ----------
    path : str
        Location of the ``##``-separated edge list.

    Returns
    -------
    networkx.Graph
        Undirected graph whose nodes carry a ``bipartite`` attribute equal to
        ``'dev'`` or ``'file'``, with one edge per developer-file pair.
    """
    bipartite_G = nx.Graph()
    df = pd.read_csv(path, header=None, sep='##', engine='python')
    df.columns = ['file', 'dev', 'weight']

    # Walk the two relevant columns in step; this avoids building a Series
    # object for every row as ``iterrows`` does.
    for file_name, dev_node in zip(df['file'], df['dev']):
        # Remove the '   (with props)' marker text from the file name.
        file_node = file_name.replace('   (with props)', '')
        # Nodes are added row by row, developer first: if one name shows up
        # both as a developer and as a file, the attribute written last wins.
        bipartite_G.add_node(dev_node, bipartite='dev')
        bipartite_G.add_node(file_node, bipartite='file')
        bipartite_G.add_edge(dev_node, file_node)

    return bipartite_G
def cal_tech_net(path):
    """Compute summary features of the technical (developer-file) network.

    Parameters
    ----------
    path : str
        Location of the technical edge list understood by ``get_tech_net``.

    Returns
    -------
    dict
        ``t_num_dev_nodes`` / ``t_num_file_nodes``: size of each node set;
        ``t_num_dev_per_file``: mean degree of the file nodes;
        ``t_num_file_per_dev``: mean degree of the developer nodes;
        ``t_graph_density``: bipartite density;
        ``t_dev_nodes``: set of developer node names.
        All numeric features are 0 and the set is empty when ``path`` does not
        exist or is an empty file.
    """
    # check if file does not exist or empty
    if not os.path.exists(path) or os.stat(path).st_size == 0:
        return {'t_num_dev_nodes': 0,
                't_num_file_nodes': 0,
                't_num_dev_per_file': 0,
                't_num_file_per_dev': 0,
                't_graph_density': 0,
                't_dev_nodes': set()}

    bipartite_G = get_tech_net(path)

    # get_tech_net returns only the graph, so the developer side has to be
    # recovered here from the 'bipartite' node attribute it sets.
    dev_nodes = {n for n, d in bipartite_G.nodes(data=True) if d["bipartite"] == 'dev'}

    graph_density = bipartite.density(bipartite_G, dev_nodes)
    # bipartite.degrees returns the degrees of the *other* node set first and
    # of the set passed in second.
    file_degrees, dev_degrees = bipartite.degrees(bipartite_G, dev_nodes)

    num_file_nodes = len(file_degrees)
    num_dev_nodes = len(dev_degrees)
    # Guard the means so a degenerate graph with an empty side yields 0
    # instead of raising ZeroDivisionError.
    file_node_degree = (sum(degree for _, degree in file_degrees) / num_file_nodes
                        if num_file_nodes else 0)
    dev_node_degree = (sum(degree for _, degree in dev_degrees) / num_dev_nodes
                       if num_dev_nodes else 0)

    # return the features of tech net
    return {'t_num_dev_nodes': num_dev_nodes,
            't_num_file_nodes': num_file_nodes,
            't_num_dev_per_file': file_node_degree,
            't_num_file_per_dev': dev_node_degree,
            't_graph_density': graph_density,
            't_dev_nodes': set(dev_nodes)}

def get_social_net(path):
    """Load a social-network edge list as a weighted directed graph.

    Each line of the file at ``path`` is split on ``##`` into source node,
    target node and an integer ``weight``; lines starting with ``*`` are
    treated as comments.  Node names are kept as strings.

    Returns
    -------
    networkx.DiGraph
        The directed graph with a ``weight`` attribute on every edge.
    """
    return nx.read_edgelist(
        path,
        create_using=nx.DiGraph(),
        nodetype=str,
        comments='*',
        delimiter='##',
        data=(('weight', int),),
    )

# social nets are weighted
def cal_social_net(path):
    """Compute summary features of the social (developer-developer) network.

    Parameters
    ----------
    path : str
        Location of the ``##``-separated weighted edge list.

    Returns
    -------
    dict
        ``s_num_nodes``: number of nodes;
        ``s_dev_nodes``: set of node names;
        ``s_weighted_mean_degree``: mean of the weighted (in + out) degrees;
        ``s_num_component``: number of connected components when edge
        direction is ignored;
        ``s_avg_clustering_coef``: average clustering of the directed graph;
        ``s_largest_component``: node count of the largest such component;
        ``s_graph_density``: density of the directed graph.
        All numeric features are 0 and the set is empty when the file is
        missing, empty, or contains no edges.
    """
    no_data = {'s_num_nodes': 0,
               's_dev_nodes': set(),
               's_weighted_mean_degree': 0,
               's_num_component': 0,
               's_avg_clustering_coef': 0,
               's_largest_component': 0,
               's_graph_density': 0}

    # if no network data
    if not os.path.exists(path) or os.stat(path).st_size == 0:
        return no_data

    # Parse the file once through the shared loader.
    G = get_social_net(path)
    # all dev nodes
    dev_nodes = set(G.nodes)
    # num. of total nodes
    num_nodes = len(dev_nodes)
    if num_nodes == 0:
        # A non-empty file can still hold no edges (e.g. only comment lines);
        # the averages below would otherwise divide by zero.
        return no_data

    # weighted mean degree
    degrees = G.degree(weight='weight')
    weighted_mean_degree = sum(degree for _, degree in degrees) / num_nodes
    # average clustering coefficient
    avg_clustering_coef = nx.average_clustering(G)
    graph_density = nx.density(G)

    # Weakly connected components of the directed graph are exactly the
    # connected components of its undirected version, so the file does not
    # have to be parsed a second time as an undirected graph.
    # num. of dis-connected components
    num_component = nx.number_weakly_connected_components(G)
    # largest connected component
    largest_component = len(max(nx.weakly_connected_components(G), key=len))

    # return the features of the social net
    return {'s_num_nodes': num_nodes,
            's_dev_nodes': dev_nodes,
            's_weighted_mean_degree': weighted_mean_degree,
            's_num_component': num_component,
            's_avg_clustering_coef': avg_clustering_coef,
            's_largest_component': largest_component,
            's_graph_density': graph_density}

In [43]:
def g_unite(net_tech, net_social, net_mix):
    """Merge a technical and a social edge list into one mixed edge-list file.

    Both inputs are text files with one ``a##b##weight`` record per line.
    Technical records are written with their first two fields swapped, social
    records are copied unchanged, and the result is written to ``net_mix``
    (technical edges first, then social edges).

    Parameters
    ----------
    net_tech : str
        Path of the technical edge list.
    net_social : str
        Path of the social edge list.
    net_mix : str
        Path of the merged edge list to write (overwritten if present).

    Returns
    -------
    list[list[str]] or None
        The merged ``[sender, receiver, weight]`` records, or ``None`` (and
        nothing is written) when either input file does not exist.
    """
    # Merge tech network and social network by edgelist files
    if not os.path.exists(net_tech):
        return None
    if not os.path.exists(net_social):
        return None

    merged_edges = []

    with open(net_tech, 'r') as f:
        lines = f.read().splitlines()
    for line in lines:
        if not line.strip():
            # Blank lines (e.g. a trailing empty line) carry no edge.
            continue
        sender, receiver, weight = line.split('##')
        # The first two fields are swapped for technical edges.
        # NOTE(review): presumably so the developer comes first, since
        # get_tech_net labels these columns file / dev -- confirm.
        merged_edges.append([receiver, sender, weight])

    with open(net_social, 'r') as f:
        lines = f.read().splitlines()
    for line in lines:
        if not line.strip():
            continue
        sender, receiver, weight = line.split('##')
        merged_edges.append([sender, receiver, weight])

    with open(net_mix, 'w') as f:
        for sender, receiver, weight in merged_edges:
            f.write(sender+"##"+receiver+"##"+weight+"\n")

    return merged_edges

In [44]:
def edge_continue_stats(netseries):
    # NOTE(review): unfinished draft -- it only creates an empty dict and
    # implicitly returns None.  The next cell defines a function with the same
    # name, which replaces this one when the notebook is run top to bottom.
    net_set = {}
    

In [45]:
def edge_continue_stats(netseries):
    """Histogram of the longest consecutive run of every edge in a file series.

    ``netseries`` is an ordered list of edge-list paths (one per time step),
    each line being ``sender##receiver##weight``.  For every distinct
    ``sender##receiver`` pair the function tracks how many consecutive
    existing files it appears in and remembers the longest such run.

    Files that do not exist are skipped entirely: they neither extend nor
    break a run, and never cause an early return.

    Parameters
    ----------
    netseries : list[str]
        Paths of the edge-list files in chronological order.

    Returns
    -------
    list[int]
        ``result[k]`` is the number of edges whose longest run is exactly
        ``k`` files, for ``k = 0 .. longest run`` (index 0 is always 0).
        Empty list when no edge was read.
    """
    longest_run = {}   # edge key -> longest run seen so far
    previous_run = {}  # edge key -> run length ending at the previous file
    for netname in netseries:
        if not os.path.exists(netname):
            # Don't return None here: a missing step is simply ignored.
            continue
        with open(netname, 'r') as f:
            lines = f.read().splitlines()
        current_run = {}
        for line in lines:
            if not line.strip():
                # Blank lines carry no edge.
                continue
            sender, receiver, weight = line.split('##')
            edgekey = sender+"##"+receiver
            # Extend the run if the edge was present in the previous file,
            # otherwise start a new run of length 1.
            run_length = previous_run.get(edgekey, 0) + 1
            current_run[edgekey] = run_length
            if run_length > longest_run.get(edgekey, 0):
                longest_run[edgekey] = run_length
        previous_run = current_run

    edge_counts = []
    if longest_run:
        max_count = max(longest_run.values())
        # One bucket per run length 0..max_count inclusive; the bucket for
        # max_count itself always holds at least one edge and must be kept.
        edge_counts = [0] * (max_count + 1)
        for run_length in longest_run.values():
            edge_counts[run_length] += 1

    gc.collect()
    return edge_counts

In [46]:
# Arrays saved with NumPy, converted with .tolist().
# NOTE(review): presumably the names of graduated / retired projects -- confirm.
all_graduated = np.load('all_graduated.npy').tolist()
all_retired = np.load('all_retired.npy').tolist()
# Per-project tables; later cells use them to look up "project_name" from
# "project_aliase".
df_proj_tech = pd.read_csv("./df_proj_tech.csv")
df_proj_social = pd.read_csv("./df_proj_social.csv")

In [53]:
# Step between consecutive network sequence numbers; it also selects the
# './network_data<N>/' input folder and the suffix of the exported CSV names.
time_resolution = 2

In [54]:
# Root folder of the network files for the selected time resolution.
data_path = './network_data'+str(time_resolution)+'/'

# Commit (technical) and e-mail (social) edge lists sit in sibling folders.
c_path = data_path+'commits/'
e_path = data_path+'emails/'

# Project aliases are the part of each file name before '__',
# de-duplicated in first-seen order.
projects = os.listdir(c_path)
project_names = pd.Series([x.split('__')[0] for x in projects]).drop_duplicates().values

# Output folder for the merged (technical + social) edge lists.
mix_path = data_path+"mix/"
if not os.path.exists(mix_path):
    os.makedirs(mix_path)

# Merge every commit network with the e-mail network of the same file name;
# g_unite writes nothing when either side is missing.
for g_file in os.listdir(c_path):
    net_mix_set = g_unite(c_path+g_file, e_path+g_file, mix_path+g_file)
    #print(net_mix_set)

In [55]:
# Edge-persistence histograms of the technical (commit) networks, one column
# per project, exported to ./tech_edges<time_resolution>.csv.
projects = os.listdir(c_path)
project_names = [x.split('__')[0] for x in projects]
project_names = pd.Series(project_names).drop_duplicates().values

# Histograms are collected first and the table is built in one step.
# Assigning Series one column at a time to an initially empty DataFrame pins
# the row index to the first non-empty column and silently truncates every
# longer histogram added afterwards; building from a dict of Series takes the
# union of the indexes and pads shorter columns with NaN instead.
tech_edge_columns = {}
the_path = c_path
for projid in tqdm(project_names):
    netlist = []
    # NOTE(review): 43 is a hard-coded upper bound on the sequence numbers --
    # confirm it covers the longest project.
    for seq_num in range(0, 43, time_resolution):
        this_fname = the_path+projid+'__'+str(seq_num)+".edgelist"
        netlist.append(this_fname)
    # Missing files in the series are skipped inside edge_continue_stats.
    proj_edge_stats = edge_continue_stats(netlist)
    projname = str(df_proj_tech.loc[df_proj_tech["project_aliase"] == projid, "project_name"].values[0])
    tech_edge_columns[projname] = pd.Series(proj_edge_stats)

df_tech_edges = pd.DataFrame(tech_edge_columns)
df_tech_edges.to_csv("./tech_edges"+str(time_resolution)+".csv", index=False, sep=',')
     

100%|██████████| 260/260 [00:11<00:00, 23.14it/s]


In [56]:
# Edge-persistence histograms of the social (e-mail) networks, one column per
# project, exported to ./social_edges<time_resolution>.csv.
projects = os.listdir(e_path)
project_names = [x.split('__')[0] for x in projects]
project_names = pd.Series(project_names).drop_duplicates().values

# Histograms are collected first and the table is built in one step.
# Assigning Series one column at a time to an initially empty DataFrame pins
# the row index to the first non-empty column and silently truncates every
# longer histogram added afterwards; building from a dict of Series takes the
# union of the indexes and pads shorter columns with NaN instead.
social_edge_columns = {}
the_path = e_path
for projid in tqdm(project_names):
    netlist = []
    # NOTE(review): 43 is a hard-coded upper bound on the sequence numbers --
    # confirm it covers the longest project.
    for seq_num in range(0, 43, time_resolution):
        this_fname = the_path+projid+'__'+str(seq_num)+".edgelist"
        netlist.append(this_fname)
    # Missing files in the series are skipped inside edge_continue_stats.
    proj_edge_stats = edge_continue_stats(netlist)
    try:
        projname = str(df_proj_social.loc[df_proj_social["project_aliase"] == projid, "project_name"].values[0])
    except IndexError:
        # No row for this alias in df_proj_social: fall back to the alias.
        # Only the "no match" case is handled; other errors (and
        # KeyboardInterrupt) are no longer swallowed.
        projname = projid
    social_edge_columns[projname] = pd.Series(proj_edge_stats)

df_social_edges = pd.DataFrame(social_edge_columns)
df_social_edges.to_csv("./social_edges"+str(time_resolution)+".csv", index=False, sep=',')

100%|██████████| 272/272 [00:12<00:00, 22.24it/s]


In [57]:
# Edge-persistence histograms of the merged (technical + social) networks,
# one column per project, exported to ./mix_edges<time_resolution>.csv.
projects = os.listdir('./network_data'+str(time_resolution)+'/mix/')
project_names = [x.split('__')[0] for x in projects]
project_names = pd.Series(project_names).drop_duplicates().values

# Histograms are collected first and the table is built in one step.
# Assigning Series one column at a time to an initially empty DataFrame pins
# the row index to the first non-empty column and silently truncates every
# longer histogram added afterwards; building from a dict of Series takes the
# union of the indexes and pads shorter columns with NaN instead.
mix_edge_columns = {}
the_path = './network_data'+str(time_resolution)+'/mix/'
for projid in tqdm(project_names):
    netlist = []
    # NOTE(review): 43 is a hard-coded upper bound on the sequence numbers --
    # confirm it covers the longest project.
    for seq_num in range(0, 43, time_resolution):
        this_fname = the_path+projid+'__'+str(seq_num)+".edgelist"
        netlist.append(this_fname)
    # Missing files in the series are skipped inside edge_continue_stats.
    proj_edge_stats = edge_continue_stats(netlist)
    projname = str(df_proj_tech.loc[df_proj_tech["project_aliase"] == projid, "project_name"].values[0])
    mix_edge_columns[projname] = pd.Series(proj_edge_stats)

df_mix_edges = pd.DataFrame(mix_edge_columns)
df_mix_edges.to_csv("./mix_edges"+str(time_resolution)+".csv", index=False, sep=',')
    

100%|██████████| 259/259 [00:11<00:00, 23.03it/s]
