In [101]:
import numpy as np
import modin.pandas as mipd
import os
import time
import pandas as pd
from tqdm import tqdm
import networkx as nx
from networkx.algorithms import bipartite
import matplotlib.pyplot as plt
import numpy as np

from matplotlib import pyplot as plt


In [102]:
# technical nets are unweighted
def get_tech_net(path):
    """Build the developer-file bipartite graph from a commit file.

    The file is read without a header as three '##'-separated columns, which
    are labelled ``file``, ``dev`` and ``weight``. Every row contributes a
    developer node (``bipartite='dev'``), a file node (``bipartite='file'``)
    and an unweighted edge between them; the ``weight`` column is read but
    not stored on the graph.

    Parameters
    ----------
    path : str
        Path of the '##'-separated commit file.

    Returns
    -------
    networkx.Graph
        Undirected graph whose nodes carry a ``bipartite`` attribute.
    """
    bipartite_G = nx.Graph()
    # A multi-character separator is interpreted as a regular expression,
    # which is only supported by the python parsing engine.
    df = pd.read_csv(path, header=None, sep='##', engine='python')
    df.columns = ['file', 'dev', 'weight']

    for file_name, dev_node in zip(df['file'], df['dev']):
        # Drop the literal '   (with props)' text so both spellings of a
        # file name map to the same node.
        file_node = file_name.replace('   (with props)', '')
        bipartite_G.add_node(dev_node, bipartite='dev')
        # NOTE(review): a name that appears both as developer and as file is
        # a single node and ends up tagged 'file' because this call runs last.
        bipartite_G.add_node(file_node, bipartite='file')
        bipartite_G.add_edge(dev_node, file_node)

    return bipartite_G
def cal_tech_net(path):
    """Compute summary features of the technical (developer-file) network.

    Parameters
    ----------
    path : str
        Path of the commit file understood by ``get_tech_net``.

    Returns
    -------
    dict
        Keys ``t_num_dev_nodes``, ``t_num_file_nodes``, ``t_num_dev_per_file``
        (mean degree of file nodes), ``t_num_file_per_dev`` (mean degree of
        developer nodes), ``t_graph_density`` (bipartite density) and
        ``t_dev_nodes`` (set of developer node names). All numeric values are
        0 and the set is empty when the file is missing, empty, or does not
        yield both developer and file nodes.
    """
    no_network = {'t_num_dev_nodes': 0,
                  't_num_file_nodes': 0,
                  't_num_dev_per_file': 0,
                  't_num_file_per_dev': 0,
                  't_graph_density': 0,
                  't_dev_nodes': set()}

    # check if file does not exist or empty
    if not os.path.exists(path) or os.stat(path).st_size == 0:
        return no_network

    bipartite_G = get_tech_net(path)

    # The developer side has to be recovered from the node attributes here;
    # get_tech_net only returns the graph.
    dev_nodes = {n for n, d in bipartite_G.nodes(data=True) if d["bipartite"] == 'dev'}

    # Without nodes on both sides the averages below would divide by zero.
    if not dev_nodes or len(dev_nodes) == bipartite_G.number_of_nodes():
        return no_network

    graph_density = bipartite.density(bipartite_G, dev_nodes)
    # bipartite.degrees returns (degrees of the *other* set, degrees of the
    # nodes passed in), hence files first and developers second.
    file_degrees, dev_degrees = bipartite.degrees(bipartite_G, dev_nodes)

    num_file_nodes = len(file_degrees)
    num_dev_nodes = len(dev_degrees)
    file_node_degree = sum(degree for _, degree in file_degrees) / num_file_nodes
    dev_node_degree = sum(degree for _, degree in dev_degrees) / num_dev_nodes

    # return the features of tech net
    return {'t_num_dev_nodes': num_dev_nodes,
            't_num_file_nodes': num_file_nodes,
            't_num_dev_per_file': file_node_degree,
            't_num_file_per_dev': dev_node_degree,
            't_graph_density': graph_density,
            't_dev_nodes': set(dev_nodes)}

def get_social_net(path):
    """Load a '##'-delimited edge list file as a weighted directed graph.

    Lines are parsed as ``source##target##weight`` with string node names and
    an integer ``weight`` edge attribute; '*' is the comment marker.
    """
    return nx.read_edgelist(
        path,
        create_using=nx.DiGraph(),
        nodetype=str,
        comments='*',
        delimiter='##',
        data=(('weight', int),),
    )

# social nets are weighted
def cal_social_net(path):
    """Compute summary features of the social (email) network.

    Parameters
    ----------
    path : str
        Path of a '##'-delimited ``source##target##weight`` edge list.

    Returns
    -------
    dict
        Keys ``s_num_nodes``, ``s_dev_nodes`` (set of node names),
        ``s_weighted_mean_degree``, ``s_num_component``,
        ``s_avg_clustering_coef``, ``s_largest_component`` (node count of the
        largest component) and ``s_graph_density``. All numeric values are 0
        and the set is empty when the file is missing, empty, or contains no
        edges.
    """
    no_network = {'s_num_nodes': 0,
                  's_dev_nodes': set(),
                  's_weighted_mean_degree': 0,
                  's_num_component': 0,
                  's_avg_clustering_coef': 0,
                  's_largest_component': 0,
                  's_graph_density': 0}

    # if no network data
    if not os.path.exists(path) or os.stat(path).st_size == 0:
        return no_network

    # Processing features in social networks (directed, weighted)
    G = get_social_net(path)
    # all dev nodes
    dev_nodes = set(G.nodes)
    # num. of total nodes
    num_nodes = len(dev_nodes)
    if num_nodes == 0:
        # e.g. a file holding only comment lines; avoids dividing by zero
        return no_network

    # weighted mean degree (in + out weights per node)
    degrees = G.degree(weight='weight')
    weighted_mean_degree = sum(degree for _, degree in degrees) / num_nodes
    # average clustering coefficient
    avg_clustering_coef = nx.average_clustering(G)
    graph_density = nx.density(G)

    # Weakly connected components of the directed graph are the connected
    # components of its undirected version, so the file need not be re-read.
    # num. of dis-connected components
    num_component = nx.number_weakly_connected_components(G)
    # largest connected component
    largest_component = len(max(nx.weakly_connected_components(G), key=len))

    # return the features of the social net
    return {'s_num_nodes': num_nodes,
            's_dev_nodes': dev_nodes,
            's_weighted_mean_degree': weighted_mean_degree,
            's_num_component': num_component,
            's_avg_clustering_coef': avg_clustering_coef,
            's_largest_component': largest_component,
            's_graph_density': graph_density}

In [103]:
# Project metadata tables, one per status; later cells read their
# "alias" and "project_name" columns.
df_incubating, df_graduated, df_retired = map(pd.read_csv, (
    '/mnt/data0/lkyin/incubating.csv',
    '/mnt/data0/lkyin/graduated.csv',
    '/mnt/data0/lkyin/retired.csv',
))

In [104]:
### Faster

#c_path = './network_data'+str(time_resolution)+'/commits/'
#c_path = './network_data/commits/'

#projects = os.listdir(c_path)
#project_names = [x.split('__')[0] for x in projects]
#project_names = pd.Series(project_names).drop_duplicates().values

### Description: How many projects are in the data?
    
#### 1. Technical Data:

In [105]:
c_path = '/mnt/data0/lkyin/monthly_data/commits/'
projects = os.listdir(c_path)
print("Total:", len(projects), "months")

# alias -> (status, display name), built once instead of scanning the three
# tables for every file. Tables are visited in priority order and setdefault
# keeps the first hit, matching an incubating/graduated/retired if-chain that
# takes the first matching row.
alias_info = {}
for status, table in (("incubating", df_incubating),
                      ("graduated", df_graduated),
                      ("retired", df_retired)):
    for alias, name in zip(table["alias"], table["project_name"]):
        alias_info.setdefault(alias, (status, str(name)))

proj_names = []
proj_ids = []
proj_status = []
for project in tqdm(projects):
    # File names look like '<alias>__<period>'; only the alias is needed, and
    # taking element 0 does not fail on names with extra or missing '__'.
    project_name = project.replace('.csv', '').split('__')[0]
    proj_ids.append(project_name)
    status, display_name = alias_info.get(project_name, ('unknown', 'unknown'))
    proj_status.append(status)
    proj_names.append(display_name)

Total: 4327 months


100%|██████████| 4327/4327 [00:01<00:00, 3385.26it/s]


In [106]:
# One entry per commit file: display name, alias and status of its project.
tech_proj_names = pd.Series(proj_names)
print(tech_proj_names.describe())

tech_proj_ids = pd.Series(proj_ids)

tech_proj_status = pd.Series(proj_status)
print(tech_proj_status.unique())

# Collapse the per-file rows to one row per distinct (name, alias, status).
# The column name 'project_aliase' is kept as is because later cells use it.
df_proj_tech = pd.DataFrame({
    'project_name': tech_proj_names,
    'project_aliase': tech_proj_ids,
    'project_status': tech_proj_status,
}).drop_duplicates()
df_proj_tech.describe()

count        4327
unique        263
top       JSPWiki
freq           63
dtype: object
['retired' 'graduated']


Unnamed: 0,project_name,project_aliase,project_status
count,263,263,263
unique,263,263,2
top,Taverna,taverna,graduated
freq,1,1,204


#### 2. Social Networks

In [107]:
e_path = '/mnt/data0/lkyin/monthly_data/emails/'
projects = os.listdir(e_path)
print("Total:", len(projects), "months")

# alias -> (status, display name), built once instead of scanning the three
# tables for every file. Tables are visited in priority order and setdefault
# keeps the first hit, matching an incubating/graduated/retired if-chain that
# takes the first matching row.
alias_info = {}
for status, table in (("incubating", df_incubating),
                      ("graduated", df_graduated),
                      ("retired", df_retired)):
    for alias, name in zip(table["alias"], table["project_name"]):
        alias_info.setdefault(alias, (status, str(name)))

proj_names = []
proj_ids = []
proj_status = []
for project in tqdm(projects):
    # File names look like '<alias>__<period>'; only the alias is needed, and
    # taking element 0 does not fail on names with extra or missing '__'.
    project_name = project.replace('.csv', '').split('__')[0]
    proj_ids.append(project_name)
    status, display_name = alias_info.get(project_name, ('unknown', 'unknown'))
    proj_status.append(status)
    proj_names.append(display_name)

Total: 6036 months


100%|██████████| 6036/6036 [00:01<00:00, 3849.82it/s]


In [108]:


# One entry per email file: display name, alias and status of its project.
social_proj_names = pd.Series(proj_names)
print(social_proj_names.describe())

social_proj_ids = pd.Series(proj_ids)

social_proj_status = pd.Series(proj_status)
print(social_proj_status.unique())

# Collapse the per-file rows to one row per distinct (name, alias, status).
# The column name 'project_aliase' is kept as is because later cells use it.
df_proj_social = pd.DataFrame({
    'project_name': social_proj_names,
    'project_aliase': social_proj_ids,
    'project_status': social_proj_status,
}).drop_duplicates()
df_proj_social.describe()

count            6036
unique            272
top       ODF Toolkit
freq               87
dtype: object
['retired' 'graduated']


Unnamed: 0,project_name,project_aliase,project_status
count,272,272,272
unique,272,272,2
top,Taverna,taverna,graduated
freq,1,1,211


## Network Description

## Overlap Calculation now:

In [109]:
def sets_jaccard_unweighted(net1_set, net2_set):
    """Return the Jaccard index |A ∩ B| / |A ∪ B| of two edge sets.

    Returns 0.0 when both sets are empty (the ratio is undefined there and
    would otherwise raise ZeroDivisionError).
    """
    shared = len(net1_set.intersection(net2_set))
    union_size = len(net1_set) + len(net2_set) - shared
    if union_size == 0:
        return 0.0
    return shared / union_size

def sets_overlap_unweighted(net1_set, net2_set):
    """Return the overlap coefficient |A ∩ B| / min(|A|, |B|) of two edge sets.

    Returns 0.0 when either set is empty (the ratio is undefined there and
    would otherwise raise ZeroDivisionError).
    """
    smaller_size = min(len(net1_set), len(net2_set))
    if smaller_size == 0:
        return 0.0
    return len(net1_set.intersection(net2_set)) / smaller_size

In [110]:
# Overlap calculation from Likang
# Overlap coefficient between two edge-list files
def get_net_overlap(net1, net2):
    """Return the overlap coefficient of the edge sets of two edge-list files.

    Each non-blank line must read ``sender##receiver##weight``; the weight is
    ignored and an edge is the ordered pair ``(sender, receiver)``.

    Parameters
    ----------
    net1, net2 : str
        Paths of the two edge-list files.

    Returns
    -------
    int or float
        0 if either file is missing or has no edges, otherwise the value of
        ``sets_overlap_unweighted`` for the two edge sets.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly three '##'-separated fields.
    """
    def _read_edges(path):
        # Collect the distinct (sender, receiver) pairs of one file.
        edges = set()
        with open(path, 'r') as f:
            lines = f.read().splitlines()
        for line in lines:
            if not line.strip():
                continue  # tolerate blank lines instead of failing the unpack
            sender, receiver, _weight = line.split('##')
            edges.add((sender, receiver))
        return edges

    if not os.path.exists(net1) or not os.path.exists(net2):
        return 0

    net1_set = _read_edges(net1)
    net2_set = _read_edges(net2)
    if not net1_set or not net2_set:
        return 0

    return sets_overlap_unweighted(net1_set, net2_set)

In [111]:
# Overlap calculation from Likang
# Jaccard Now
def get_net_jaccard(net1, net2):
    """Return the Jaccard index of the edge sets of two edge-list files.

    Each non-blank line must read ``sender##receiver##weight``; the weight is
    ignored and an edge is the ordered pair ``(sender, receiver)``.

    Parameters
    ----------
    net1, net2 : str
        Paths of the two edge-list files.

    Returns
    -------
    int or float
        0 if either file is missing or has no edges, otherwise the value of
        ``sets_jaccard_unweighted`` for the two edge sets.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly three '##'-separated fields.
    """
    def _read_edges(path):
        # Collect the distinct (sender, receiver) pairs of one file.
        edges = set()
        with open(path, 'r') as f:
            lines = f.read().splitlines()
        for line in lines:
            if not line.strip():
                continue  # tolerate blank lines instead of failing the unpack
            sender, receiver, _weight = line.split('##')
            edges.add((sender, receiver))
        return edges

    if not os.path.exists(net1) or not os.path.exists(net2):
        return 0

    net1_set = _read_edges(net1)
    net2_set = _read_edges(net2)
    if not net1_set or not net2_set:
        return 0

    return sets_jaccard_unweighted(net1_set, net2_set)

In [112]:


def get_netseries_overlap(netnameseries, timeintervals):
    """Overlap coefficients over sliding windows of consecutive networks.

    For each level ``k`` in ``1 .. timeintervals-1`` and each start index
    ``i``, the edge sets of networks ``i .. i+k`` are intersected and the size
    of that intersection is divided by the size of the smallest edge set in
    the window.

    Parameters
    ----------
    netnameseries : sequence of str
        Paths of edge-list files in time order. Each non-blank line must read
        ``sender##receiver##weight``. A missing file is reported on stdout and
        treated as an empty network.
    timeintervals : int
        One more than the largest window span (level) to evaluate.

    Returns
    -------
    list of list
        Element ``k-1`` holds the ``len(netnameseries) - k`` overlap values of
        level ``k``. A window containing an empty network scores 0.
    """
    netseries = []
    for netname in netnameseries:
        net_set = set()
        if not os.path.exists(netname):
            print("Not Exist File:"+netname)
        else:
            with open(netname, 'r') as f:
                lines = f.read().splitlines()
            for line in lines:
                if not line.strip():
                    continue  # tolerate blank lines
                sender, receiver, _weight = line.split('##')
                net_set.add((sender, receiver))
        # Always a set (never a dict) so .intersection works below.
        netseries.append(net_set)

    total = len(netseries)
    sizes = [len(net_set) for net_set in netseries]
    # window_min[i] is the smallest edge-set size in the current window at i.
    window_min = list(sizes)
    # nets[i] is the intersection of the previous level's window starting at i.
    nets = netseries

    overlaplevelseries = []
    for level in range(1, timeintervals):
        overlapseries = []
        interseries = []
        # A window spanning level+1 networks has total-level start positions.
        # This is counted from the original series length because `nets`
        # itself shrinks by one at every level.
        for i in range(max(total - level, 0)):
            window_min[i] = min(window_min[i], sizes[i + level])
            net_intersection = nets[i].intersection(nets[i + 1])
            interseries.append(net_intersection)
            if window_min[i] == 0:
                overlapseries.append(0)
            else:
                overlapseries.append(len(net_intersection) / window_min[i])

        overlaplevelseries.append(overlapseries)
        # for next round of for
        nets = interseries

    return overlaplevelseries

        

In [113]:
def g_unite(net_tech, net_social, net_mix):
    """Merge a technical and a social edge-list file into one file.

    Both inputs hold ``a##b##weight`` lines. Technical edges are written with
    their two endpoints swapped (``b##a##weight``); social edges are copied in
    their original orientation. Technical edges come first in the output.

    Parameters
    ----------
    net_tech : str
        Path of the technical edge list.
    net_social : str
        Path of the social edge list.
    net_mix : str
        Path of the merged edge list to write (overwritten if present).

    Returns
    -------
    list of [str, str, str] or None
        The merged ``[sender, receiver, weight]`` rows as written, or None
        (and no file written) if either input file does not exist.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly three '##'-separated fields.
    """
    # Merge tech network and social network by edgelist files
    if not os.path.exists(net_tech) or not os.path.exists(net_social):
        return None

    merged_edges = []

    with open(net_tech, 'r') as f:
        tech_lines = f.read().splitlines()
    for line in tech_lines:
        if not line.strip():
            continue  # tolerate blank lines
        first, second, weight = line.split('##')
        # endpoints are swapped for technical edges
        merged_edges.append([second, first, weight])

    with open(net_social, 'r') as f:
        social_lines = f.read().splitlines()
    for line in social_lines:
        if not line.strip():
            continue  # tolerate blank lines
        sender, receiver, weight = line.split('##')
        merged_edges.append([sender, receiver, weight])

    with open(net_mix, 'w') as f:
        for sender, receiver, weight in merged_edges:
            f.write(sender+"##"+receiver+"##"+weight+"\n")

    return merged_edges

In [126]:
# Step between consecutive period indices; also the suffix of the
# 'network_data<N>' folders and of the output CSV names in the cells below.
time_resolution = 1

In [127]:
# Root under which the merged ("mix") networks are written.
# NOTE(review): this root is absolute while c_path/e_path are relative to the
# working directory; confirm both point at the same tree.
data_path = '/mnt/data0/proj_osgeo/data_ASF_trial/network_data' + str(time_resolution) + '/'

# Per-resolution edge-list folders for commit and email networks.
c_path = './network_data' + str(time_resolution) + '/commits/'
e_path = './network_data' + str(time_resolution) + '/emails/'

# Distinct project aliases, taken from the part of each file name before '__'.
projects = os.listdir(c_path)
project_names = pd.Series(
    [fname.split('__')[0] for fname in projects]
).drop_duplicates().values

In [128]:
# Write one merged (tech + social) edge list per commit-network file.
# NOTE(review): files are written under data_path + "mix/", while the mix
# overlap cells read './network_data<N>/mix/'; confirm these are the same
# directory.
mix_path = data_path+"mix/"
# exist_ok avoids the separate existence check (and its race) before creation.
os.makedirs(mix_path, exist_ok=True)

for g_file in os.listdir(c_path):
    # g_unite returns None (and writes nothing) when the matching email
    # network file does not exist.
    net_mix_set = g_unite(c_path+g_file, e_path+g_file, mix_path+g_file)

In [129]:
# Overlap coefficient between consecutive technical (commit) networks,
# one column per project, written to ./tech_overlaps<N>.csv.
projects = os.listdir(c_path)
project_names = pd.Series([x.split('__')[0] for x in projects]).drop_duplicates().values

the_path = c_path
# alias -> display name; the first row per alias wins.
alias_to_name = (df_proj_tech.drop_duplicates('project_aliase')
                 .set_index('project_aliase')['project_name'].to_dict())

overlap_columns = {}
for projid in project_names:
    overlaps = []
    prev_fname = the_path+projid+'__'+str(0)+".edgelist"
    # 63 / 65: hard-coded scan bound and column length; presumably sized to
    # the longest commit history -- TODO confirm.
    for seq_num in range(time_resolution, 63, time_resolution):
        this_fname = the_path+projid+'__'+str(seq_num)+".edgelist"
        # A value is recorded only when the earlier period's file exists, so
        # row positions count recorded pairs, not period numbers.
        if os.path.exists(prev_fname):
            overlaps.append(get_net_overlap(prev_fname, this_fname))
        prev_fname = this_fname

    # -1.0 marks "no data": one terminator, then padding to 65 rows.
    overlaps.append(-1.0)
    overlaps.extend([-1.0] * (65 - len(overlaps)))
    # Fall back to the alias if the project is absent from df_proj_tech.
    projname = str(alias_to_name.get(projid, projid))
    overlap_columns[projname] = overlaps

# Build the frame once instead of inserting one column at a time.
df_tech_overlaps = pd.DataFrame(overlap_columns)
df_tech_overlaps.to_csv("./tech_overlaps"+str(time_resolution)+".csv", index=False, sep=',')
    



In [130]:
# Jaccard index between consecutive technical (commit) networks,
# one column per project, written to ./tech_jaccards<N>.csv.
projects = os.listdir(c_path)
project_names = pd.Series([x.split('__')[0] for x in projects]).drop_duplicates().values

the_path = c_path
# alias -> display name; the first row per alias wins.
alias_to_name = (df_proj_tech.drop_duplicates('project_aliase')
                 .set_index('project_aliase')['project_name'].to_dict())

overlap_columns = {}
for projid in project_names:
    overlaps = []
    prev_fname = the_path+projid+'__'+str(0)+".edgelist"
    # 63 / 65: hard-coded scan bound and column length; presumably sized to
    # the longest commit history -- TODO confirm.
    for seq_num in range(time_resolution, 63, time_resolution):
        this_fname = the_path+projid+'__'+str(seq_num)+".edgelist"
        # A value is recorded only when the earlier period's file exists, so
        # row positions count recorded pairs, not period numbers.
        if os.path.exists(prev_fname):
            overlaps.append(get_net_jaccard(prev_fname, this_fname))
        prev_fname = this_fname

    # -1.0 marks "no data": one terminator, then padding to 65 rows.
    overlaps.append(-1.0)
    overlaps.extend([-1.0] * (65 - len(overlaps)))
    # Fall back to the alias if the project is absent from df_proj_tech.
    projname = str(alias_to_name.get(projid, projid))
    overlap_columns[projname] = overlaps

# Build the frame once instead of inserting one column at a time.
df_tech_overlaps = pd.DataFrame(overlap_columns)
df_tech_overlaps.to_csv("./tech_jaccards"+str(time_resolution)+".csv", index=False, sep=',')
    



In [131]:
# Inspect the project columns of the most recently built df_tech_overlaps.
df_tech_overlaps.columns

Index(['Kitty', 'Tika', 'Lucene.NET', 'Pivot', 'Slider', 'Streams', 'Gossip',
       'Unomi', 'ODF Toolkit', 'Tephra',
       ...
       'Wicket', 'Geode', 'WebWork 2', 'Ignite', 'Crunch', 'CommonsRDF',
       'Directory', 'Kabuki', 'Concerted', 'Traffic Server'],
      dtype='object', length=263)

In [132]:
# Overlap coefficient between consecutive social (email) networks,
# one column per project, written to ./social_overlaps<N>.csv.
projects = os.listdir(e_path)
project_names = pd.Series([x.split('__')[0] for x in projects]).drop_duplicates().values

the_path = e_path
# alias -> display name; the first row per alias wins.
alias_to_name = (df_proj_social.drop_duplicates('project_aliase')
                 .set_index('project_aliase')['project_name'].to_dict())

overlap_columns = {}
for projid in project_names:
    overlaps = []
    prev_fname = the_path+projid+'__'+str(0)+".edgelist"
    # 87 / 89: hard-coded scan bound and column length; presumably sized to
    # the longest email history -- TODO confirm.
    for seq_num in range(time_resolution, 87, time_resolution):
        this_fname = the_path+projid+'__'+str(seq_num)+".edgelist"
        # A value is recorded only when the earlier period's file exists, so
        # row positions count recorded pairs, not period numbers.
        if os.path.exists(prev_fname):
            overlaps.append(get_net_overlap(prev_fname, this_fname))
        prev_fname = this_fname

    # -1.0 marks "no data": one terminator, then padding to 89 rows.
    overlaps.append(-1.0)
    overlaps.extend([-1.0] * (89 - len(overlaps)))
    # Fall back to the alias if the project is absent from df_proj_social.
    projname = str(alias_to_name.get(projid, projid))
    overlap_columns[projname] = overlaps

# Build the frame once instead of inserting one column at a time.
df_social_overlaps = pd.DataFrame(overlap_columns)
df_social_overlaps.to_csv("./social_overlaps"+str(time_resolution)+".csv", index=False, sep=',')



In [133]:
# Jaccard index between consecutive social (email) networks,
# one column per project, written to ./social_jaccards<N>.csv.
projects = os.listdir(e_path)
project_names = pd.Series([x.split('__')[0] for x in projects]).drop_duplicates().values

the_path = e_path
# alias -> display name; the first row per alias wins.
alias_to_name = (df_proj_social.drop_duplicates('project_aliase')
                 .set_index('project_aliase')['project_name'].to_dict())

overlap_columns = {}
for projid in project_names:
    overlaps = []
    prev_fname = the_path+projid+'__'+str(0)+".edgelist"
    # 87 / 89: hard-coded scan bound and column length; presumably sized to
    # the longest email history -- TODO confirm.
    for seq_num in range(time_resolution, 87, time_resolution):
        this_fname = the_path+projid+'__'+str(seq_num)+".edgelist"
        # A value is recorded only when the earlier period's file exists, so
        # row positions count recorded pairs, not period numbers.
        if os.path.exists(prev_fname):
            overlaps.append(get_net_jaccard(prev_fname, this_fname))
        prev_fname = this_fname

    # -1.0 marks "no data": one terminator, then padding to 89 rows.
    overlaps.append(-1.0)
    overlaps.extend([-1.0] * (89 - len(overlaps)))
    # Fall back to the alias if the project is absent from df_proj_social.
    projname = str(alias_to_name.get(projid, projid))
    overlap_columns[projname] = overlaps

# Build the frame once instead of inserting one column at a time.
df_social_overlaps = pd.DataFrame(overlap_columns)
df_social_overlaps.to_csv("./social_jaccards"+str(time_resolution)+".csv", index=False, sep=',')



In [134]:
# Overlap coefficient between consecutive merged (tech + social) networks,
# one column per project, written to ./mix_overlaps<N>.csv.
# NOTE(review): the result reuses the name df_tech_overlaps although it holds
# mix-network values; the name is kept for compatibility.
the_path = './network_data'+str(time_resolution)+'/mix/'
projects = os.listdir(the_path)
project_names = pd.Series([x.split('__')[0] for x in projects]).drop_duplicates().values

# alias -> display name; the first row per alias wins.
alias_to_name = (df_proj_tech.drop_duplicates('project_aliase')
                 .set_index('project_aliase')['project_name'].to_dict())

overlap_columns = {}
for projid in project_names:
    overlaps = []
    prev_fname = the_path+projid+'__'+str(0)+".edgelist"
    # 63 / 65: hard-coded scan bound and column length; presumably sized to
    # the longest commit history -- TODO confirm.
    for seq_num in range(time_resolution, 63, time_resolution):
        this_fname = the_path+projid+'__'+str(seq_num)+".edgelist"
        # A value is recorded only when the earlier period's file exists, so
        # row positions count recorded pairs, not period numbers.
        if os.path.exists(prev_fname):
            overlaps.append(get_net_overlap(prev_fname, this_fname))
        prev_fname = this_fname

    # -1.0 marks "no data": one terminator, then padding to 65 rows.
    overlaps.append(-1.0)
    overlaps.extend([-1.0] * (65 - len(overlaps)))
    # Fall back to the alias if the project is absent from df_proj_tech.
    projname = str(alias_to_name.get(projid, projid))
    overlap_columns[projname] = overlaps

# Build the frame once instead of inserting one column at a time.
df_tech_overlaps = pd.DataFrame(overlap_columns)
df_tech_overlaps.to_csv("./mix_overlaps"+str(time_resolution)+".csv", index=False, sep=',')
    



In [135]:
# Jaccard index between consecutive merged (tech + social) networks,
# one column per project, written to ./mix_jaccards<N>.csv.
# NOTE(review): the result reuses the name df_tech_overlaps although it holds
# mix-network values; the name is kept for compatibility.
the_path = './network_data'+str(time_resolution)+'/mix/'
projects = os.listdir(the_path)
project_names = pd.Series([x.split('__')[0] for x in projects]).drop_duplicates().values

# alias -> display name; the first row per alias wins.
alias_to_name = (df_proj_tech.drop_duplicates('project_aliase')
                 .set_index('project_aliase')['project_name'].to_dict())

overlap_columns = {}
for projid in project_names:
    overlaps = []
    prev_fname = the_path+projid+'__'+str(0)+".edgelist"
    # 63 / 65: hard-coded scan bound and column length; presumably sized to
    # the longest commit history -- TODO confirm.
    for seq_num in range(time_resolution, 63, time_resolution):
        this_fname = the_path+projid+'__'+str(seq_num)+".edgelist"
        # A value is recorded only when the earlier period's file exists, so
        # row positions count recorded pairs, not period numbers.
        if os.path.exists(prev_fname):
            overlaps.append(get_net_jaccard(prev_fname, this_fname))
        prev_fname = this_fname

    # -1.0 marks "no data": one terminator, then padding to 65 rows.
    overlaps.append(-1.0)
    overlaps.extend([-1.0] * (65 - len(overlaps)))
    # Fall back to the alias if the project is absent from df_proj_tech.
    projname = str(alias_to_name.get(projid, projid))
    overlap_columns[projname] = overlaps

# Build the frame once instead of inserting one column at a time.
df_tech_overlaps = pd.DataFrame(overlap_columns)
df_tech_overlaps.to_csv("./mix_jaccards"+str(time_resolution)+".csv", index=False, sep=',')
    



In [124]:
# Debug: show the alias left in the loop variable by the last per-project loop that ran.
projid

'warble'

In [125]:
# Debug: look up that alias in the technical project table.
df_proj_tech.loc[df_proj_tech["project_aliase"] == projid]

Unnamed: 0,project_name,project_aliase,project_status
2421,Warble,warble,retired
