In [1]:
import scipy.stats as stats
from pathlib import Path
import json
import numpy as np
from sklearn.metrics import classification_report
import pandas as pd

# Link type map
from model.linktypes import fine_linktype_map

In [2]:
# Build one classification report per BERT run directory and keep it in memory.
res_dir = Path.cwd() / 'data' / 'results' / 'bert-base-uncased'

REPOS = []      # repository names, in directory-iteration order
cls_rep_d = {}  # repository name -> classification report DataFrame

for run_dir in res_dir.iterdir():
    run_name = run_dir.name
    name_parts = run_name.split('_')

    # Only runs whose directory name contains both tokens are evaluated.
    if 'linktype' in name_parts and 'plus' in name_parts:
        repo = name_parts[0]
        print(repo)
        REPOS.append(repo)

        with (run_dir / 'run_config.json').open('r', encoding='utf8') as f:
            run_config = json.load(f)

        # np.load on an .npz returns a lazily-read archive that holds a file
        # handle; read what is needed and close it before moving on.
        with np.load(run_dir / 'test_output.npz') as test_output:
            labels = test_output['labels']
            predictions = test_output['logits'].argmax(axis=-1)

        cls_rep = classification_report(
            labels,
            predictions,
            target_names=run_config['label_names'],
            output_dict=True
        )

        # Rows: one per label plus the summary entries; columns: the metrics.
        cls_rep_df = pd.DataFrame(cls_rep).transpose()
        cls_rep_df.to_csv(run_dir / (repo + '_LT_plus_cls_report.csv'))

        cls_rep_d[repo] = cls_rep_df
    

JFrog
Sakai
Hyperledger
MariaDB
Jira
MongoDB
Apache
Mojang
Spring
Qt
JiraEcosystem
IntelDAOS
RedHat
SecondLife
Sonatype


In [3]:
# Build one classification report per TLDBERT run directory and keep it in memory.
tldbert_res_dir = Path.cwd() / 'data' / 'results' / 'tldbert'

tb_cls_rep_d = {}  # repository name -> classification report DataFrame

for run_dir in tldbert_res_dir.iterdir():
    run_name = run_dir.name
    name_parts = run_name.split('_')

    # Only runs whose directory name contains both tokens are evaluated.
    if 'linktype' in name_parts and 'plus' in name_parts:
        repo = name_parts[0]
        print(repo)

        with (run_dir / 'run_config.json').open('r', encoding='utf8') as f:
            run_config = json.load(f)

        # np.load on an .npz returns a lazily-read archive that holds a file
        # handle; read what is needed and close it before moving on.
        with np.load(run_dir / 'test_output.npz') as test_output:
            labels = test_output['labels']
            predictions = test_output['logits'].argmax(axis=-1)

        cls_rep = classification_report(
            labels,
            predictions,
            target_names=run_config['label_names'],
            output_dict=True
        )

        # Rows: one per label plus the summary entries; columns: the metrics.
        cls_rep_df = pd.DataFrame(cls_rep).transpose()
        cls_rep_df.to_csv(run_dir / (repo + '_LT_plus_cls_report.csv'))

        tb_cls_rep_d[repo] = cls_rep_df
        
        

JFrog
IntelDAOS
Hyperledger
Mojang
MariaDB
JiraEcosystem
Spring
Sonatype
SecondLife
Jira
RedHat
Apache
Sakai
MongoDB
Qt


In [4]:
# Put the repository names in ascending order, keeping the same list object
REPOS[:] = sorted(REPOS)
REPOS

['Apache',
 'Hyperledger',
 'IntelDAOS',
 'JFrog',
 'Jira',
 'JiraEcosystem',
 'MariaDB',
 'Mojang',
 'MongoDB',
 'Qt',
 'RedHat',
 'Sakai',
 'SecondLife',
 'Sonatype',
 'Spring']

In [5]:
def _add_fbeta_score(report_df):
    """Add an 'fbeta-score' column to a classification report frame, in place.

    For every row the weight is
    ``round(support of the 'macro avg' row / support of the row) ** 2``
    (this already is the squared beta of the F-beta formula), and the score is
    ``(1 + w) * P * R / (w * P + R)``. Undefined results (NaN) are stored as 0.
    """
    support = report_df['support']
    beta_sq = np.round(support.loc['macro avg'] / support, 0) ** 2
    pre = report_df['precision']
    rec = report_df['recall']
    fbeta = (1 + beta_sq) * (pre * rec) / ((beta_sq * pre) + rec)
    # Assign the filled Series back instead of calling fillna(inplace=True) on
    # the column selection: the chained in-place form does not update the
    # frame when pandas Copy-on-Write is active.
    report_df['fbeta-score'] = fbeta.fillna(0)


for repo in REPOS:
    _add_fbeta_score(cls_rep_d[repo])     # BERT
    _add_fbeta_score(tb_cls_rep_d[repo])  # TLDBERT

In [6]:
# Dump every BERT report as plain text, framed by separator lines.
print('TLD results of BERT')
for repo in REPOS:
    print(
        repo,
        '-------------------------------',
        cls_rep_d[repo],
        '-------------------------------\n',
        sep='\n',
    )

TLD results of BERT
Apache
-------------------------------
              precision    recall  f1-score       support  fbeta-score
Subtask        0.892710  0.928030  0.910027  16757.000000     0.924373
Relate         0.591191  0.704126  0.642735  14469.000000     0.696302
Duplicate      0.557914  0.420829  0.479771   5185.000000     0.421856
Block          0.529252  0.475130  0.500733   2875.000000     0.475280
Epic           0.955364  0.975610  0.965381   2501.000000     0.975563
Depend         0.539989  0.402400  0.461151   2500.000000     0.402632
Non-Link       0.766384  0.739355  0.752627   2325.000000     0.739404
Incorporate    0.589456  0.503594  0.543152   2087.000000     0.503711
Breaks         0.352795  0.326437  0.339104    870.000000     0.326444
Clone          0.677650  0.547454  0.605634    864.000000     0.547482
Require        0.528509  0.332873  0.408475    724.000000     0.332897
Supercede      0.366834  0.224615  0.278626    650.000000     0.224629
Cause          0.3

In [7]:
# Dump every TLDBERT report as plain text, framed by separator lines.
print('TLD results of TLDBERT')
for repo in REPOS:
    print(
        repo,
        '-------------------------------',
        tb_cls_rep_d[repo],
        '-------------------------------\n',
        sep='\n',
    )

TLD results of TLDBERT
Apache
-------------------------------
              precision    recall  f1-score       support  fbeta-score
Subtask        0.898092  0.932446  0.914947  16757.000000     0.928893
Relate         0.603103  0.714562  0.654119  14469.000000     0.706878
Duplicate      0.574593  0.435294  0.495336   5185.000000     0.436341
Block          0.583788  0.483478  0.528919   2875.000000     0.483734
Epic           0.958545  0.980008  0.969158   2501.000000     0.979958
Depend         0.531326  0.437600  0.479930   2500.000000     0.437775
Non-Link       0.822711  0.788387  0.805183   2325.000000     0.788449
Incorporate    0.619825  0.509344  0.559179   2087.000000     0.509489
Breaks         0.388498  0.380460  0.384437    870.000000     0.380462
Clone          0.682306  0.589120  0.632298    864.000000     0.589142
Require        0.505660  0.370166  0.427432    724.000000     0.370185
Supercede      0.422164  0.246154  0.310982    650.000000     0.246169
Cause          

In [8]:
# Repository overview table (first CSV column is the index), without the 'Mindville' row
repo_overview = pd.read_csv(
    './data/repo_overview.csv',
    encoding='UTF-8',
    low_memory=False,
    sep=',',
    index_col=0,
).drop(index=['Mindville'])
repo_overview

Unnamed: 0,Year,#Issues,#Comments,#Links,#Linktypes,#Projects,%IssuesWithLinks,%CrossProject
Apache,2000,1014926,4608221,255767,16,646,28.5,5.23
Hyperledger,2016,28146,44590,16304,8,32,54.9,4.62
IntelDAOS,2016,9474,32203,2599,11,2,30.8,3.27
JFrog,2006,15535,13152,3229,10,10,28.6,8.24
Jira,2002,274545,779104,99819,16,30,46.7,43.42
JiraEcosystem,2004,41866,68387,11398,14,101,33.0,6.77
MariaDB,2009,31229,0,14618,8,11,44.5,2.54
Mojang,2012,420819,933348,215527,5,8,53.7,5.43
MongoDB,2009,137172,368976,63821,14,27,45.2,19.09
Qt,2005,148579,421771,40105,11,21,30.2,6.92


In [9]:
# Different Types of Users in issue trackers
# 'Ages' is 2023 minus each row's 'Year' in repo_overview, aligned on the index.
user_numbers = (
    pd.read_csv(
        './data/user_numbers.csv',
        encoding='UTF-8',
        low_memory=False,
        sep=',',
        index_col=0,
    )
    .drop(index=['Mindville'])
    .assign(Ages=2023 - repo_overview['Year'])
)
user_numbers

Unnamed: 0,TotalUsers,Creators,Reporters,Assignees,LinkSetters,Ages
Apache,115394,112718,113477,17513,11020,23
Hyperledger,1805,1638,1643,913,329,7
IntelDAOS,85,77,77,76,51,7
JFrog,2713,2677,2664,182,303,17
Jira,50930,41325,50203,2143,6300,21
JiraEcosystem,4166,3918,4006,786,760,19
MariaDB,4229,4216,4220,133,490,14
Mojang,242589,242506,242240,79,339,11
MongoDB,7777,7655,7680,646,929,14
Qt,21386,21189,21173,1347,614,18


In [10]:
# Flip the overview so attributes are rows and repositories are columns.
# The name is rebound to its own transpose, so an unguarded re-run would flip
# it back; only transpose while 'Year' is still a column.
if 'Year' in repo_overview.columns:
    repo_overview = repo_overview.transpose()
repo_overview

Unnamed: 0,Apache,Hyperledger,IntelDAOS,JFrog,Jira,JiraEcosystem,MariaDB,Mojang,MongoDB,Qt,RedHat,Sakai,SecondLife,Sonatype,Spring
Year,2000.0,2016.0,2016.0,2006.0,2002.0,2004.0,2009.0,2012.0,2009.0,2005.0,2001.0,2004.0,2007.0,2008.0,2003.0
#Issues,1014926.0,28146.0,9474.0,15535.0,274545.0,41866.0,31229.0,420819.0,137172.0,148579.0,353000.0,50550.0,1867.0,87284.0,69156.0
#Comments,4608221.0,44590.0,32203.0,13152.0,779104.0,68387.0,0.0,933348.0,368976.0,421771.0,859880.0,180191.0,15728.0,339127.0,186077.0
#Links,255767.0,16304.0,2599.0,3229.0,99819.0,11398.0,14618.0,215527.0,63821.0,40105.0,119669.0,19803.0,631.0,4465.0,14462.0
#Linktypes,16.0,8.0,11.0,10.0,16.0,14.0,8.0,5.0,14.0,11.0,15.0,8.0,6.0,11.0,7.0
#Projects,646.0,32.0,2.0,10.0,30.0,101.0,11.0,8.0,27.0,21.0,241.0,53.0,2.0,5.0,80.0
%IssuesWithLinks,28.5,54.9,30.8,28.6,46.7,33.0,44.5,53.7,45.2,30.2,39.2,42.4,39.9,7.0,25.6
%CrossProject,5.23,4.62,3.27,8.24,43.42,6.77,2.54,5.43,19.09,6.92,23.54,1.37,2.38,1.5,9.96


In [11]:
# Transposed view of the user table: attributes as rows, repositories as columns
user_num = user_numbers.T
user_num

Unnamed: 0,Apache,Hyperledger,IntelDAOS,JFrog,Jira,JiraEcosystem,MariaDB,Mojang,MongoDB,Qt,RedHat,Sakai,SecondLife,Sonatype,Spring
TotalUsers,115394,1805,85,2713,50930,4166,4229,242589,7777,21386,17698,1584,341,45651,15222
Creators,112718,1638,77,2677,41325,3918,4216,242506,7655,21189,17288,1441,332,45595,15188
Reporters,113477,1643,77,2664,50203,4006,4220,242240,7680,21173,17365,1462,327,45626,15189
Assignees,17513,913,76,182,2143,786,133,79,646,1347,2616,696,95,90,260
LinkSetters,11020,329,51,303,6300,760,490,339,929,614,2685,378,194,1004,121
Ages,23,7,7,17,21,19,14,11,14,18,22,19,16,15,20


In [12]:
# Collect per-repository attributes into parallel lists, then build stat_df
repos = []
bert_acc, tb_acc = [], []   # accuracy: bert / tldbert
bert_mf, tb_mf = [], []     # macro f1-score: bert / tldbert

num_issues, num_comms, num_links = [], [], []
num_linktypes = []  # documented link types
num_predlts = []    # predicted link types
num_projs = []
coverage = []       # %Cov.
crossproject = []   # %CP

num_totalusers = []
age = []

# Row label in the transposed tables -> list that collects the value per repo
overview_sinks = {
    '#Issues': num_issues,
    '#Comments': num_comms,
    '#Links': num_links,
    '#Linktypes': num_linktypes,
    '#Projects': num_projs,
    '%IssuesWithLinks': coverage,
    '%CrossProject': crossproject,
}
user_sinks = {
    'TotalUsers': num_totalusers,
    'Ages': age,
}

for repo in REPOS:
    cls_rep_df = cls_rep_d[repo]
    tb_cls_rep_df = tb_cls_rep_d[repo]

    repos.append(repo)

    # Accuracy and macro average are both read from the 'f1-score' column
    bert_f1 = cls_rep_df['f1-score']
    tb_f1 = tb_cls_rep_df['f1-score']
    bert_acc.append(bert_f1.loc['accuracy'])
    tb_acc.append(tb_f1.loc['accuracy'])
    bert_mf.append(bert_f1.loc['macro avg'])
    tb_mf.append(tb_f1.loc['macro avg'])

    for row_label, sink in overview_sinks.items():
        sink.append(repo_overview.loc[row_label, repo])
    for row_label, sink in user_sinks.items():
        sink.append(user_num.loc[row_label, repo])

    filename = './data/joined/links_plus/' + repo + '.csv'
    links = pd.read_csv(
        filename,
        encoding='UTF-8',
        low_memory=False,
        sep=';',
        usecols=['linktype', 'issue_id_1', 'issue_id_2'],
    )
    links['mappedtype'] = links['linktype'].map(fine_linktype_map)

    # Keep mapped types whose count is at least 1% of all link rows
    type_counts = links['mappedtype'].value_counts()
    valid_types = set(type_counts[type_counts >= len(links) * 0.01].index)

    num_predlts.append(len(valid_types) + 1)  # add Non-Link type

# Project counts scaled by their L2 norm
projs_arr = np.array(num_projs)

stat_dict = {
    'repos': repos,
    'bert_acc': bert_acc,
    'tb_acc': tb_acc,
    'bert_mf': bert_mf,
    'tb_mf': tb_mf,
    'num_issues': num_issues,
    'num_comms': num_comms,
    'num_links': num_links,
    'num_linktypes': num_linktypes,
    'num_predlts': num_predlts,
    'num_projs': num_projs,
    'norm_projs': projs_arr / np.linalg.norm(projs_arr),
    'coverage': coverage,
    'crossproject': crossproject,
    'num_totalusers': num_totalusers,
    'age': age,
}

stat_df = pd.DataFrame(stat_dict)

In [13]:
# Issues per user for each repository
stat_df['issue_user_ratio'] = stat_df['num_issues'].div(stat_df['num_totalusers'])

# Absolute (*_delta) and relative (*_re) gain of TLDBERT over BERT,
# first for accuracy, then for macro f1-score
for metric in ('acc', 'mf'):
    baseline = stat_df['bert_' + metric]
    stat_df[metric + '_delta'] = stat_df['tb_' + metric] - baseline
    stat_df[metric + '_re'] = stat_df[metric + '_delta'] / baseline


In [14]:
# Show the statistics table ordered by repository name
stat_df.sort_values('repos')

Unnamed: 0,repos,bert_acc,tb_acc,bert_mf,tb_mf,num_issues,num_comms,num_links,num_linktypes,num_predlts,...,norm_projs,coverage,crossproject,num_totalusers,age,issue_user_ratio,acc_delta,acc_re,mf_delta,mf_re
0,Apache,0.702373,0.715496,0.554467,0.576988,1014926.0,4608221.0,255767.0,16.0,13,...,0.915213,28.5,5.23,115394,23,8.79531,0.013123,0.018683,0.022521,0.040618
1,Hyperledger,0.844469,0.854326,0.731951,0.743925,28146.0,44590.0,16304.0,8.0,7,...,0.045336,54.9,4.62,1805,7,15.593352,0.009858,0.011673,0.011974,0.016359
2,IntelDAOS,0.657609,0.701087,0.686222,0.720358,9474.0,32203.0,2599.0,11.0,8,...,0.002833,30.8,3.27,85,7,111.458824,0.043478,0.066116,0.034136,0.049745
3,JFrog,0.647564,0.666189,0.486652,0.488704,15535.0,13152.0,3229.0,10.0,7,...,0.014167,28.6,8.24,2713,17,5.726133,0.018625,0.028761,0.002053,0.004218
4,Jira,0.825086,0.832744,0.727519,0.743571,274545.0,779104.0,99819.0,16.0,8,...,0.042502,46.7,43.42,50930,21,5.390634,0.007658,0.009282,0.016052,0.022065
5,JiraEcosystem,0.717577,0.729096,0.536534,0.546523,41866.0,68387.0,11398.0,14.0,11,...,0.143091,33.0,6.77,4166,19,10.049448,0.011519,0.016052,0.009989,0.018618
6,MariaDB,0.730371,0.736458,0.697081,0.696774,31229.0,0.0,14618.0,8.0,8,...,0.015584,44.5,2.54,4229,14,7.384488,0.006086,0.008333,-0.000307,-0.00044
7,Mojang,0.94755,0.947473,0.876324,0.876892,420819.0,933348.0,215527.0,5.0,3,...,0.011334,53.7,5.43,242589,11,1.734699,-7.8e-05,-8.2e-05,0.000568,0.000649
8,MongoDB,0.730436,0.738575,0.718808,0.726535,137172.0,368976.0,63821.0,14.0,9,...,0.038252,45.2,19.09,7777,14,17.638164,0.008139,0.011143,0.007727,0.01075
9,Qt,0.714912,0.729455,0.667135,0.677373,148579.0,421771.0,40105.0,11.0,8,...,0.029752,30.2,6.92,21386,18,6.947489,0.014543,0.020342,0.010237,0.015345


In [15]:
# Related-samples t-test over the per-repository accuracies of both models
acc_ttest = stats.ttest_rel(stat_df['bert_acc'], stat_df['tb_acc'])
print('Statistical significance of accuracy increase')
print(acc_ttest)

Statistical significance of accuracy increase
TtestResult(statistic=-4.297568321369522, pvalue=0.0007370030015306789, df=14)


In [16]:
# Related-samples t-test over the per-repository macro F1-scores of both models
mf_ttest = stats.ttest_rel(stat_df['bert_mf'], stat_df['tb_mf'])
print('Statistical significance of macro F1-score increase')
print(mf_ttest)

Statistical significance of macro F1-score increase
TtestResult(statistic=-4.045957046265094, pvalue=0.0012027326106615836, df=14)


In [17]:
# Per-repository count of predicted link types, followed by the median
predlt_counts = stat_df['num_predlts']
print('Number and median of predicted link types (>1%)')
print(predlt_counts, predlt_counts.median())

Number and median of predicted link types (>1%)
0     13
1      7
2      8
3      7
4      8
5     11
6      8
7      3
8      9
9      8
10     9
11     7
12     7
13     9
14     7
Name: num_predlts, dtype: int64 8.0


In [18]:
# Render the relative gains as percentage strings with one decimal place
acc_re = stat_df['acc_re'].map('{:.1%}'.format)
mf_re = stat_df['mf_re'].map('{:.1%}'.format)

In [19]:
# Heading and the formatted accuracy gains, one per line
print('Relative increase of accuracy', acc_re, sep='\n')

Relative increase of accuracy
0      1.9%
1      1.2%
2      6.6%
3      2.9%
4      0.9%
5      1.6%
6      0.8%
7     -0.0%
8      1.1%
9      2.0%
10     0.4%
11    -0.4%
12     3.5%
13     3.3%
14     1.4%
Name: acc_re, dtype: object


In [20]:
# Heading and the formatted macro F1-score gains, one per line
print('Relative increase of macro F1-score', mf_re, sep='\n')

Relative increase of macro F1-score
0      4.1%
1      1.6%
2      5.0%
3      0.4%
4      2.2%
5      1.9%
6     -0.0%
7      0.1%
8      1.1%
9      1.5%
10     1.6%
11     0.3%
12     0.9%
13     9.7%
14     4.1%
Name: mf_re, dtype: object


In [21]:
# Pearson correlation of each gain column with each repository attribute;
# every result is printed rounded to four decimals.
gain_columns = ('acc_delta', 'mf_delta')
attribute_columns = (
    'num_issues', 'num_comms', 'num_links', 'num_linktypes', 'num_predlts',
    'num_projs', 'norm_projs', 'coverage', 'crossproject', 'num_totalusers',
    'issue_user_ratio', 'age',
)

print('Caculate Pearson correlation')
for gain in gain_columns:
    print(gain)
    for attribute in attribute_columns:
        print(attribute)
        correlation = stats.pearsonr(stat_df[gain], stat_df[attribute])
        print(np.round(correlation, 4))
    print('-------------------------------------')

Caculate Pearson correlation
acc_delta
num_issues
[-0.2447  0.3793]
num_comms
[-0.1243  0.659 ]
num_links
[-0.3811  0.161 ]
num_linktypes
[0.0091 0.9744]
num_predlts
[0.1629 0.5619]
num_projs
[-0.1353  0.6308]
norm_projs
[-0.1353  0.6308]
coverage
[-0.4771  0.0722]
crossproject
[-0.2706  0.3293]
num_totalusers
[-0.3051  0.2688]
issue_user_ratio
[0.5935 0.0197]
age
[-0.3808  0.1614]
-------------------------------------
mf_delta
num_issues
[0.0861 0.7602]
num_comms
[0.1686 0.5481]
num_links
[-0.0911  0.7468]
num_linktypes
[0.2578 0.3535]
num_predlts
[0.3502 0.2006]
num_projs
[0.1533 0.5855]
norm_projs
[0.1533 0.5855]
coverage
[-0.7093  0.0031]
crossproject
[-0.0401  0.8871]
num_totalusers
[-0.0603  0.8308]
issue_user_ratio
[0.3325 0.226 ]
age
[-0.0598  0.8323]
-------------------------------------
