In [1]:
import ast
import json
import math
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import tabulate
from scipy import spatial

In [101]:
# Inject a CSS rule so the notebook container spans the whole browser window.
from IPython.display import HTML, display

display(
    HTML("<style>.container { width:100% !important; }</style>")
)

In [9]:
# Load the table of all repositories. low_memory=False asks pandas to parse the
# file in a single pass instead of chunk-wise, which avoids mixed-dtype columns.
repos = pd.read_csv(
    "all_repos.csv",
    low_memory=False,
)


Now take out all mirrors

1. take all those that mention "mailing" in the description
2. take all those that mention "sr.ht" or "sourcehut" in the description: sourcehut uses mailing lists
3. take all those that are marked as mirrors by GitHub (i.e. that have a `mirror_url`)
4. take all those that contain "mirror" in the description

In [5]:
# Case-insensitive keyword searches over the repository description.
# Missing descriptions (NaN) count as "no match" thanks to na=False.
# "sr.ht"/"sourcehut" are searched as well, as announced in the message below;
# the dot is escaped so it only matches a literal "." (and not e.g. "srXht").
mailing_desc = repos['description'].str.contains('(?i)mailing', regex=True, na=False)
mirror_desc = repos['description'].str.contains('(?i)mirror', regex=True, na=False)
sourcehut_desc = repos['description'].str.contains(r'(?i)sr\.ht|sourcehut', regex=True, na=False)

candidates = repos[mailing_desc | mirror_desc | sourcehut_desc]
print("Number of repos that contain mailing, mirror, sr.ht, sourcehut in the description: " + str(len(candidates.index)))

# Repos that carry a mirror_url value.
has_mirror = repos[repos['mirror_url'].notnull()]
print("Number of repos marked as mirror: " + str(len(has_mirror.index)))

# Union of both selections; rows selected by both criteria are kept once.
candidates = pd.concat([candidates, has_mirror]).drop_duplicates()
print("Union of the ones above (without duplicates): " + str(len(candidates.index)))

# Record why each repo was selected, to help the manual inspection.
candidates["has_mirror_url"] = candidates['mirror_url'].notnull()
candidates["has_mailing_desc"] = candidates['description'].str.contains('(?i)mailing', regex=True, na=False)
candidates["has_mirror_desc"] = candidates['description'].str.contains('(?i)mirror', regex=True, na=False)
candidates["has_sourcehut_desc"] = candidates['description'].str.contains(r'(?i)sr\.ht|sourcehut', regex=True, na=False)


# Candidates that mention gitlab either in the description or in the mirror URL.
gitlab_desc = candidates[candidates['description'].str.contains('(?i)gitlab', regex=True, na=False)]
gitlab_url = candidates[candidates['mirror_url'].str.contains('(?i)gitlab', regex=True, na=False)]
gitlab = pd.concat([gitlab_desc, gitlab_url]).drop_duplicates()
print("number of repos on gitlab: " + str(len(gitlab.index)))

# Candidates whose name starts with "apache/".
apache = candidates[candidates['name'].str.contains('(?i)^apache/', regex=True, na=False)]
print("Number of repos from apache: " + str(len(apache.index)))

# For reference only: gitlab mirror URLs across the whole dataset.
gitlab2 = repos[repos['mirror_url'].str.contains('(?i)gitlab', regex=True, na=False)]
print("Number of repos from gitlab url: " + str(len(gitlab2.index)))

# remove the ones that are on gitlab
candidates = candidates.drop(gitlab.index)
print("Number of repos that are not on gitlab: " + str(len(candidates.index)))
# `apache` was selected before the gitlab rows were removed, so a repo that is
# both from apache and on gitlab is already gone: errors="ignore" keeps the
# second drop from raising KeyError on such labels.
candidates = candidates.drop(apache.index, errors="ignore")
print("Number of repos that are not from apache and not on gitlab: " + str(len(candidates.index)))

# now write them to a file for manual inspection
#candidates[['name', 'has_mirror_url', 'has_mailing_desc', 'has_mirror_desc', 'has_sourcehut_desc']].to_csv('mailing_list_candidates.csv', index=False)

Number of repos that contain mailing, mirror, sr.ht, sourcehut in the description: 740
Number of repos marked as mirror: 52
Union of the ones above (without duplicates): 769
number of repos on gitlab: 95
Number of repos from apache: 60
Number of repos from gitlab url: 5
Number of repos that are not on gitlab: 674
Number of repos that are not from apache: 614


Now it's time for a manual check of the repositories: mailing_list_candidates will have one more column that describes whether the repo uses a mailing list (True) or not (False).

The following is a 

In [7]:
# load after manual check
mailing = pd.read_csv("manual_check.csv")
mailing = mailing[mailing['ml_type']=="ok"].reset_index(drop=True)
print(mailing)

                    name  has_mirror_url  has_mailing_desc  has_mirror_desc  \
0           bzg/org-mode           False             False             True   
1       guix-mirror/guix           False             False             True   
2       exg/rxvt-unicode           False             False             True   
3   zx2c4/password-store           False              True             True   
4       coreboot/seabios           False             False             True   
5              gpg/gnupg           False              True             True   
6        lwip-tcpip/lwip           False             False             True   
7         mirror/busybox           False             False             True   
8     emacs-mirror/emacs            True             False             True   
9    buildroot/buildroot           False             False             True   
10           nginx/nginx           False              True             True   

   has_ml   mirror direct_pull indirect_pull ml_typ

In [17]:
# Flag every repo whose name appears in the manually validated list ...
repos['mailinglist'] = repos['name'].isin(mailing['name'])
# ... then apply two explicit per-repo overrides on top of that list.
repos.loc[repos['name'].eq('buildroot/buildroot'), 'mailinglist'] = True
repos.loc[repos['name'].eq('michaelforney/cproc'), 'mailinglist'] = False
# Optional export of the flagged names (kept disabled):
#repos.query("mailinglist == True")['name'].to_csv('mailing_list_git_clone.csv', index=False)
selected_mailing = repos.loc[repos['mailinglist']].reset_index(drop=True)
print(selected_mailing)

                    name                                        description  \
0           bzg/org-mode             This is a MIRROR only, do not send PR.   
1       guix-mirror/guix  Read-only mirror of GNU Guix — pull requests a...   
2       exg/rxvt-unicode          git mirror of rxvt-unicode cvs repository   
3   zx2c4/password-store  Read-only mirror of https://git.zx2c4.com/pass...   
4       coreboot/seabios                                  mirror of seabios   
5              gpg/gnupg  The GNU Privacy Guard. NOTE: Maintainers are n...   
6        lwip-tcpip/lwip  lwIP mirror from http://git.savannah.gnu.org/c...   
7         mirror/busybox                                     BusyBox mirror   
8     emacs-mirror/emacs                                Mirror of GNU Emacs   
9    buildroot/buildroot  Buildroot, making embedded Linux easy. Note th...   
10           nginx/nginx  An official read-only mirror of http://hg.ngin...   

    is_fork  forks_count                           

In [18]:

# Reload the manual review and collect the names of the repos marked "ok".
mailing = (
    pd.read_csv("manual_check.csv", low_memory=False)
    .query('ml_type == "ok"')
    .reset_index(drop=True)
)
manual_check = mailing["name"].tolist()
print(manual_check)


['bzg/org-mode', 'guix-mirror/guix', 'exg/rxvt-unicode', 'zx2c4/password-store', 'coreboot/seabios', 'gpg/gnupg', 'lwip-tcpip/lwip', 'mirror/busybox', 'emacs-mirror/emacs', 'buildroot/buildroot', 'nginx/nginx']


Now we have to rebuild the dataset and add a column that flags whether each repo uses a mailing list (True vs. False)

In [19]:
def string_to_dict(s):
    """Parse the textual language breakdown of a repo into a dict.

    The text is first parsed as a Python literal (``ast.literal_eval``), which
    copes with any mix of single and double quotes, including keys that
    contain an apostrophe. If that fails, the previous behaviour is used as a
    fallback: single quotes are swapped for double quotes (with the two
    hard-coded apostrophe repairs) and the result is parsed as JSON.

    Parameters
    ----------
    s : str
        Text of the form ``{'C': 123, 'Shell': 4}``, or text containing the
        marker ``not found`` (treated as an empty dict). Non-string values,
        e.g. NaN for an empty cell, yield an empty dict.

    Returns
    -------
    dict
        The parsed mapping; ``{}`` when the parsed value is not a dict.

    Raises
    ------
    json.JSONDecodeError
        If the text can be parsed neither as a Python literal nor through the
        legacy JSON path.
    """
    if not isinstance(s, str):
        # Missing cell (NaN/None): there is nothing to parse.
        return {}

    text = s.replace('not found', '{}')
    try:
        parsed = ast.literal_eval(text.strip())
    except (ValueError, SyntaxError, TypeError):
        # Legacy path, kept so that every input accepted before still parses.
        legacy = text.replace("'", '"')
        legacy = legacy.replace('p"n', "p'n")
        legacy = legacy.replace('n"P', "n'P")
        parsed = json.loads(legacy)

    # A quoted marker such as "'not found'" parses to a string, not a dict.
    return parsed if isinstance(parsed, dict) else {}

def total_lines(langs):
    """Return the sum of all values of a language dict.

    Parameters
    ----------
    langs : dict
        Mapping whose values are summed. Anything that has no ``values()``
        method, or whose values cannot be added up, counts as 0.

    Returns
    -------
    int or float
        Sum of the values, or 0 when ``langs`` is not a usable mapping.
    """
    try:
        return sum(langs.values())
    except (AttributeError, TypeError):
        # AttributeError: not a mapping (e.g. a str or None).
        # TypeError: values that cannot be summed.
        # Unlike a bare ``except``, KeyboardInterrupt/SystemExit still propagate.
        return 0
# Parse the textual "languages" column into dicts, then sum each dict's values.
repos["lang_dict"] = repos["languages"].map(string_to_dict)
repos["total_lines"] = repos["lang_dict"].map(total_lines)


In [20]:
# assign an id to all languages
# The set starts with the empty string, so "" always receives an id as well.
dict_set = {""}
for d in repos["lang_dict"]:
    try:
        dict_set.update(d.keys())
    except AttributeError:
        continue  # entry is not a dict (the empty / "not found" case)
# Sort before numbering so that every language gets the same id on every run;
# plain set iteration order can change between kernel sessions.
mapping_l_index = {l: i for i, l in enumerate(sorted(dict_set))}

In [21]:
# Rank of every repo by total line count (pandas defaults spelled out:
# ascending order, tied repos share the average of their ranks).
repos["group"] = repos["total_lines"].rank(method="average", ascending=True)

Defining functions for the matching:

- `from_dict_to_vec`: the language dictionary of each repo becomes a vector in a multidimensional space
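- `neighbors`: we define the neighbors of each repo as the repos whose line-count rank lies within ±`len(df)/200` of its own rank (±0.5% of the dataset, i.e. roughly the 1% of repos with the most similar number of lines)
- `compute_dist`: the distance between two repos in the language space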
- `match`: the main function combining the different pieces

In [22]:
import datetime
def from_dict_to_vec(dic, mapping=None):
    """Turn a language dict into a dense vector.

    Parameters
    ----------
    dic : dict
        Maps a language name to a number.
    mapping : dict, optional
        Maps a language name to its position in the vector. Defaults to the
        notebook-level ``mapping_l_index``.

    Returns
    -------
    numpy.ndarray
        Vector of length ``len(mapping)``. If ``dic`` is not a dict, holds a
        language missing from ``mapping``, or holds a non-numeric value,
        "error" is printed and the vector filled so far is returned (best
        effort, as before).
    """
    if mapping is None:
        mapping = mapping_l_index
    v = np.zeros(len(mapping))
    try:
        for k, values in dic.items():
            v[mapping[k]] = values
    except (AttributeError, KeyError, TypeError, ValueError):
        # AttributeError: dic is not a mapping; KeyError: unknown language;
        # TypeError/ValueError: the value cannot be stored as a float.
        print("error")
    return v

def neighbors(name, df, window_divisor=200):
    """Select the repos whose size rank is close to that of ``name``.

    Parameters
    ----------
    name : str
        Value of the ``name`` column identifying the reference repo.
    df : pandas.DataFrame
        Must provide the columns ``name``, ``group``, ``mailinglist`` and
        ``contributors``.
    window_divisor : int, optional
        The half-width of the rank window is ``int(len(df) / window_divisor)``.
        The default of 200 keeps the original window.

    Returns
    -------
    pandas.DataFrame
        An independent copy (with a fresh 0..n-1 based index taken before
        filtering) of the rows whose ``group`` lies strictly inside the
        window, that are not flagged as mailing-list repos and that have more
        than one contributor.

    Raises
    ------
    KeyError
        If no row of ``df`` has the requested ``name``.
    """
    own_rank = df.loc[df["name"].eq(name), "group"]
    if own_rank.empty:
        raise KeyError(f"repository {name!r} not found in the 'name' column")
    percentile = own_rank.iloc[0]  # first match, in case the name is duplicated

    epsilon = int(len(df.index) / window_divisor)
    lb = percentile - epsilon
    ub = percentile + epsilon

    # Keep only repos that do NOT use a mailing list and that have more than
    # one contributor; both window bounds are exclusive.
    pool = df.reset_index(drop=True)
    keep = (
        pool["group"].gt(lb)
        & pool["group"].lt(ub)
        & pool["mailinglist"].eq(False)
        & pool["contributors"].gt(1)
    )
    # Return a copy so callers can add columns without touching `df` and
    # without triggering SettingWithCopyWarning.
    return pool.loc[keep].copy()

def compute_dist(dict1,dict2):
    """Cosine distance between the language vectors built from two dicts."""
    first_vec, second_vec = (from_dict_to_vec(langs) for langs in (dict1, dict2))
    return spatial.distance.cosine(first_vec, second_vec)


def match(ml_name, df, top_n=10):
    """Find the repos most similar to a mailing-list repo.

    Candidates come from ``neighbors`` (similar size rank). They are scored by
    the cosine distance between language vectors, and those at an angle of
    45 degrees or more from the reference repo are discarded.

    Parameters
    ----------
    ml_name : str
        Name of the reference (mailing-list) repo.
    df : pandas.DataFrame
        Full repo table; besides the columns needed by ``neighbors`` it must
        provide ``lang_dict``, ``total_lines``, ``last_change`` and the other
        columns listed in the return value.
    top_n : int, optional
        Maximum number of matches to return (default 10, the original value).

    Returns
    -------
    pandas.DataFrame
        The reference repo in the first row (its ``distance`` and
        ``lines_dist_prc`` are NaN), followed by up to ``top_n`` matches
        ordered by language distance, then by absolute difference in lines.
    """
    # first get the neighbors; work on a private copy because columns are added
    df_neigh = neighbors(ml_name, df).copy()

    # Look the reference repo up once instead of once per attribute.
    target = df.loc[df["name"].eq(ml_name)]
    ml_languages = target["lang_dict"].iloc[0]
    ml_lines = target["total_lines"].iloc[0]

    def _percent_diff(lines):
        # Relative size difference in percent; undefined for an empty target.
        if ml_lines == 0:
            return float("nan")
        return round(abs(lines - ml_lines) / ml_lines * 100)

    df_neigh["distance"] = df_neigh["lang_dict"].apply(lambda langs: compute_dist(ml_languages, langs))
    df_neigh["lines_dist"] = (df_neigh["total_lines"] - ml_lines).abs()
    df_neigh["lines_dist_prc"] = df_neigh["total_lines"].apply(_percent_diff)

    # remove all results for which there is more than a 45 deg angle in the vector
    # (cosine distance = 1 - cos(angle), and cos(45 deg) = sqrt(2)/2)
    max_distance = 1 - math.sqrt(2)/2
    df_neigh = df_neigh.loc[df_neigh["distance"] < max_distance]

    # now take the minimum distance
    result = df_neigh.sort_values(by=["distance", "lines_dist"]).head(top_n).reset_index(drop=True)

    # Reference repo first, followed by its matches.
    result = pd.concat([target, result])
    # Drop the time of day from "YYYY-MM-DD HH:MM:SS"; missing values stay missing.
    result["last_change"] = pd.to_datetime(result["last_change"], format="%Y-%m-%d %H:%M:%S").dt.strftime("%Y-%m-%d")

    return result[["name", "distance", "lines_dist_prc", "total_lines", "creation_date", "last_change", "commits", "contributors", "issues", "stars", "lang_dict"]]


In [23]:
# Names of the manually validated repos, ordered from smallest to largest.
manual_check_sorted_by_total_lines = (
    repos.loc[repos["name"].isin(manual_check)]
    .sort_values(["total_lines"])["name"]
    .values
)
# One org-mode style table per mailing-list repo, separated by blank lines.
for mailinglist in manual_check_sorted_by_total_lines:
    matches_table = match(mailinglist, repos).to_markdown(tablefmt="orgtbl", index=False)
    print(matches_table)
    print("\n\n")

| name                                        |    distance |   lines_dist_prc |   total_lines | creation_date   | last_change   |   commits |   contributors |   issues |   stars | lang_dict                                                                                                                                      |
|---------------------------------------------+-------------+------------------+---------------+-----------------+---------------+-----------+----------------+----------+---------+------------------------------------------------------------------------------------------------------------------------------------------------|
| zx2c4/password-store                        | nan         |              nan |        134003 | 2017-02-15      | 2022-12-26    |       516 |            103 |        3 |     538 | {'Shell': 65051, 'Python': 31707, 'Emacs Lisp': 14393, 'Ruby': 13414, 'Makefile': 3105, 'AppleScript': 3004, 'Perl': 1768, 'Vim Script': 1561} |
| pixelb/scripts      

| name                         |      distance |   lines_dist_prc |   total_lines | creation_date   | last_change   |   commits |   contributors |   issues |   stars | lang_dict                                                                                                |
|------------------------------+---------------+------------------+---------------+-----------------+---------------+-----------+----------------+----------+---------+----------------------------------------------------------------------------------------------------------|
| nginx/nginx                  | nan           |              nan |       5587530 | 2015-06-23      | 2023-04-11    |      7155 |             81 |       79 |   18284 | {'C': 5434680, 'Vim Script': 116017, 'XS': 25346, 'Perl': 6774, 'Makefile': 4121, 'C++': 592}            |
| openresty/openresty          |   0.000239718 |                5 |       5290412 | 2010-01-20      | 2023-03-30    |      1692 |             31 |      901 |   11247 | {'C': 5

| name         |   distance |   lines_dist_prc |   total_lines | creation_date   | last_change   |   commits |   contributors |   issues |   stars | lang_dict                                                                               |
|--------------+------------+------------------+---------------+-----------------+---------------+-----------+----------------+----------+---------+-----------------------------------------------------------------------------------------|
| bzg/org-mode |        nan |              nan |       6982855 | 2012-09-22      | 2022-11-03    |     25731 |            731 |        2 |     313 | {'Emacs Lisp': 6951013, 'Makefile': 20330, 'Perl': 10477, 'LilyPond': 1022, 'HTML': 13} |



| name                     |      distance |   lines_dist_prc |   total_lines | creation_date   | last_change   |   commits |   contributors |   issues |   stars | lang_dict                                                                                                           

| name             |   distance |   lines_dist_prc |   total_lines | creation_date   | last_change   |   commits |   contributors |   issues |   stars | lang_dict                                                                                                                                                                                                                |
|------------------+------------+------------------+---------------+-----------------+---------------+-----------+----------------+----------+---------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| guix-mirror/guix |        nan |              nan |      41258661 | 2015-12-12      | 2023-04-14    |    111060 |            876 |        2 |     220 | {'Scheme': 40280865, 'C++': 478005, 'Shell': 243263, 'Makefile': 145157, 'M4': 29654, 'C': 24824, 'Emac

Manually check the candidates: automatically open the candidates in firefox tabs and check them