In [37]:
import modin.pandas as mipd
import os
import time
import pandas as pd
from tqdm import tqdm
import re
import numpy as np
import json

In [8]:
from jellyfish import damerau_levenshtein_distance
from jellyfish import jaro_similarity
from jellyfish import jaro_winkler_similarity
from jellyfish import match_rating_comparison
from collections import defaultdict

## Preprocessing from Likang's

In [9]:
def partial_de_alising(author_name):
    return str(alias_dict.get(author_name, author_name)).lower().strip()

def preprocess_name(name):
    # remove nan
    if pd.isna(name):
        return name
    # if it is an email, take only the user domain
    name = name.split('@')[0]
    # remove text within brakets and parentheses
    name = re.sub(r"[\(\[].*?[\)\]]", "", name)
    # some names are of this pattern: "sg $ $date: 2008/10/07 10:18:51 $"
    if '$' in name:
        name = name.split('$')[0]
    # remove non-alphanumeric chars
    name = re.sub("[^a-zA-Z ]+", '', name)
    if ' via ' in name:
        name = name.split(' via ')[0]
    return name.strip()

# return True if it is sent by a bot
def is_bots(author_name):
    return bool(author_name in bots)

# return True if it is a coding file
def is_coding(file_path):
    if pd.isna(file_path):
        return False
    extension = '.' + file_path.split('/')[-1].split('.')[-1]
    return bool(extension in coding_extensions)

In [81]:
def indices_dict(lis):
    d = defaultdict(list)
    for i,(a,b) in enumerate(lis):
        d[a].append(i)
        d[b].append(i)
    return d

def disjoint_indices(lis):
    d = indices_dict(lis)
    sets = []
    while len(d):
        que = set(d.popitem()[1])
        ind = set()
        while len(que):
            ind |= que 
            que = set([y for i in que 
                         for x in lis[i] 
                         for y in d.pop(x, [])]) - ind
        sets += [ind]
    return sets

# union-find algo
def disjoint_sets(lis):
    return [list(set([x for i in s for x in lis[i]])) for s in disjoint_indices(lis)]


def process_name(name):
    print(name)
    # if it is an email, take only the user domain
    name = name.split('@')[0]
    # remove text within brakets and parentheses
    name = re.sub(r"[\(\[].*?[\)\]]", "", name)
    # remove non-alphanumeric chars
    name = re.sub("[^a-zA-Z ]+", '', name)
    if '$' in name:
        # some names are of this pattern: "sg $ $date: 2008/10/07 10:18:51 $"
        name = name.split('$')[0]
    return name.strip()


def check_segments(name1, name2):

    name_segs_1 = name1.split(' ')
    name_segs_2 = name2.split(' ')

    if len(name_segs_1) == len(name_segs_2) == 2:
        first_name_1, last_name_1 = name_segs_1
        first_name_2, last_name_2 = name_segs_2

        # option 1: first name 1 compare to first name 2, last name 1 compare to last name 2
        # e.g., "robert yates" v.s. "robert butts"
        first_name_score = jaro_winkler_similarity(first_name_1, first_name_2)
        last_name_score = jaro_winkler_similarity(last_name_1, last_name_2)
        if first_name_score < 0.8 or last_name_score < 0.8:
            return False
        # option 2: first name 1 compare to last name 2, last name 1 compare to first name 2
        # e.g., "yates robert" v.s. "robert butts"
        else:
            first_name_score = jaro_winkler_similarity(first_name_1, last_name_2)
            last_name_score = jaro_winkler_similarity(last_name_1, first_name_2)
            if first_name_score < 0.8 or last_name_score < 0.8:
                return False
    return True

In [47]:
df_aliase = pd.read_csv('./psql_aliase.csv')

In [48]:
#df_aliase['source'].isnull()

In [49]:
df_aliase

Unnamed: 0,aliase_id,mailaddress,person_id,personname,source
0,Matthias Pohl_m.pohl at m-click.de,m.pohl at m-click.de,,Matthias Pohl,
1,Dejung Gewissler_dejung.gewissler at oit.state...,dejung.gewissler at oit.state.nj.us,,Dejung Gewissler,
2,Richard didier_didier.richard at ign.fr,didier.richard at ign.fr,,Richard didier,
3,"Kenneth Skovhede, GEOGRAF A/S_ks at geograf.dk",ks at geograf.dk,,"Kenneth Skovhede, GEOGRAF A/S",
4,Stefano Bonnin_stefano.bonnin at comai.to,stefano.bonnin at comai.to,,Stefano Bonnin,
...,...,...,...,...,...
54193,Fengting Chen_fengting.chen@oracle.com,fengting.chen@oracle.com,,Fengting Chen,emails
54194,Benjamin Chartier_benjamin.chartier@cegetel.net,benjamin.chartier@cegetel.net,,Benjamin Chartier,emails
54195,=?ISO-8859-1?Q?Mohamed_Sa=E2d_HESSANE?=_saad.h...,saad.hessane@gmail.com,,=?ISO-8859-1?Q?Mohamed_Sa=E2d_HESSANE?=,emails
54196,rotulet,,,,Github


In [6]:
df_aliase.columns

Index(['aliase_id', 'mailaddress', 'person_id', 'personname', 'source'], dtype='object')

In [62]:
emails_df = df_aliase[df_aliase['source'] == 'emails']

In [63]:
emails_df

Unnamed: 0,aliase_id,mailaddress,person_id,personname,source
7790,Dimitris Kotzinos_kotzino@teiser.gr,kotzino@teiser.gr,,Dimitris Kotzinos,emails
8009,Le Jeune Yann_lj.yann@gmail.com,lj.yann@gmail.com,,Le Jeune Yann,emails
8015,philippe.gaudelet@libertysurf.fr_philippe.gaud...,philippe.gaudelet@libertysurf.fr,,philippe.gaudelet@libertysurf.fr,emails
8667,Eunice Lee_izeunice@gmail.com,izeunice@gmail.com,,Eunice Lee,emails
8774,Laurent Evrard_levrard@polytechnic.edu.na,levrard@polytechnic.edu.na,,Laurent Evrard,emails
...,...,...,...,...,...
54191,Allen Rongone_arongone@aer.com,arongone@aer.com,,Allen Rongone,emails
54192,Rob Iverson_rob.iverson@gmail.com,rob.iverson@gmail.com,,Rob Iverson,emails
54193,Fengting Chen_fengting.chen@oracle.com,fengting.chen@oracle.com,,Fengting Chen,emails
54194,Benjamin Chartier_benjamin.chartier@cegetel.net,benjamin.chartier@cegetel.net,,Benjamin Chartier,emails


In [64]:
emails_df.loc[emails_df['personname'].isnull(),'personname'] = emails_df[emails_df['personname'].isnull()]['mailaddress']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [65]:
emails_df['personname']

7790                           Dimitris Kotzinos
8009                               Le Jeune Yann
8015            philippe.gaudelet@libertysurf.fr
8667                                  Eunice Lee
8774                              Laurent Evrard
                          ...                   
54191                              Allen Rongone
54192                                Rob Iverson
54193                              Fengting Chen
54194                          Benjamin Chartier
54195    =?ISO-8859-1?Q?Mohamed_Sa=E2d_HESSANE?=
Name: personname, Length: 15441, dtype: object

In [66]:
commits_df = df_aliase[df_aliase['source'] == 'Github'].append(df_aliase[df_aliase['source'] == 'SVN'])


In [67]:
commits_df.loc[commits_df['personname'].isnull(),'personname'] = commits_df[commits_df['personname'].isnull()]['aliase_id']

In [68]:
commits_df

Unnamed: 0,aliase_id,mailaddress,person_id,personname,source
7791,normbetland,,,normbetland,Github
7792,eschou92,,,eschou92,Github
7807,sb4,,,sb4,Github
7808,ejb,mail@elliotbentley.com,,Elliot Bentley,Github
7810,MeTaNoV,pascal.gula@gmail.com,,Pascal Gula,Github
...,...,...,...,...,...
54086,vishalbangia,vishalbangia@svn.osgeo.org,,vishalbangia,SVN
54108,ychemin,ychemin@svn.osgeo.org,,ychemin,SVN
54145,gavincramer,gavincramer@svn.osgeo.org,,gavincramer,SVN
54184,caballero,caballero@devel.gvsig.org,,caballero,SVN


In [69]:
commits_dict = commits_df.to_dict('records')
emails_dict = emails_df.to_dict('records')

In [70]:
commits_dict

[{'aliase_id': 'normbetland',
  'mailaddress': nan,
  'person_id': nan,
  'personname': 'normbetland',
  'source': 'Github'},
 {'aliase_id': 'eschou92',
  'mailaddress': nan,
  'person_id': nan,
  'personname': 'eschou92',
  'source': 'Github'},
 {'aliase_id': 'sb4',
  'mailaddress': nan,
  'person_id': nan,
  'personname': 'sb4',
  'source': 'Github'},
 {'aliase_id': 'ejb',
  'mailaddress': 'mail@elliotbentley.com',
  'person_id': nan,
  'personname': 'Elliot Bentley',
  'source': 'Github'},
 {'aliase_id': 'MeTaNoV',
  'mailaddress': 'pascal.gula@gmail.com',
  'person_id': nan,
  'personname': 'Pascal Gula',
  'source': 'Github'},
 {'aliase_id': 'igeofr',
  'mailaddress': 'florian.boret@data-wax.com',
  'person_id': nan,
  'personname': 'Flø',
  'source': 'Github'},
 {'aliase_id': 'arc12',
  'mailaddress': nan,
  'person_id': nan,
  'personname': 'AdamCooper',
  'source': 'Github'},
 {'aliase_id': 'fuggii',
  'mailaddress': nan,
  'person_id': nan,
  'personname': 'fuggii',
  'source'

In [71]:
committers = {}
contributors = {}

for commit in commits_dict:
    project_name = commit['source']
    author_full_name = commit['personname']
    if project_name not in committers:
        committers[project_name] = set()
    committers[project_name].add(author_full_name)

for email in emails_dict:
    project_name = email['source']
    author_full_name = email['personname']
    if project_name not in contributors:
        contributors[project_name] = set()
    contributors[project_name].add(author_full_name)


In [84]:

project_alias_clustering = {}

clustering_pairs = []
committer_names = set(committers['Github']).union(set(committers['SVN']))
contributor_names = set(contributors['emails'])
developer_names = list(committer_names.union(contributor_names))

developer_names.remove(np.nan)
#contributor_names.remove(np.nan)
for i in tqdm(range(len(developer_names))):
    p1 = preprocess_name(developer_names[i])
    
    for j in range(i+1, len(developer_names)):
        # if it is an email, take only the user domain
        p2 = preprocess_name(developer_names[j])

        # reslove issues that two different devs sharing same first name, 
        # e.g., "robert ottaway", "robert sayre"
        if not check_segments(p1, p2):
            continue

        jaro_winkler_similarity_score = jaro_winkler_similarity(p1, p2)
        # sounding_match_score = any([match_rating_comparison(s1, s2) for s1 in name_segs_1 for s2 in name_segs_2])
        # sounding_match_score = any([sounding_match_score, match_rating_comparison(p1, p2)])

        # add pairs if:
        # (1) if the score fall down to 0.85 
        # (2) or if the score fall down to 0.82 then we use pronouncetion to help make decision
        if jaro_winkler_similarity_score > 0.85: # or (jaro_winkler_similarity_score > 0.82 and sounding_match_score):
            clustering_pairs.append([developer_names[i], developer_names[j]])

project_alias_clustering = disjoint_sets(clustering_pairs)

with open('./project_alias_clustering.json', 'w') as f:
    json.dump(project_alias_clustering, f, indent = 4)

with open('./project_alias_clustering.json', 'r') as f:
    project_alias_clustering = json.load(f)


  6%|▌         | 2027/33205 [03:54<57:11,  9.09it/s]