In [1]:
import pandas as pd
from tqdm import tqdm
import random
import os

In [2]:
tqdm.pandas()

In [3]:
REPOS = ['Apache', 'Hyperledger', 'IntelDAOS', 'JFrog', 'Jira', 
         'JiraEcosystem', 'MariaDB', 'Mojang', 'MongoDB', 'Qt', 
         'RedHat', 'Sakai', 'SecondLife', 'Sonatype', 'Spring']

In [4]:
CLEANED_ISSUE_DIR = '../data/processed/cleaned_issues/'
LINK_DIR = '../data/raw/links/'

issue_paths = []
for root, dirs, file_names in os.walk(CLEANED_ISSUE_DIR):
    for file_name in file_names:
        issue_paths.append(os.path.join(root, file_name))

In [5]:
def load_issues(repo: str):
    issue_path_ls = [issue_path for issue_path in issue_paths if repo in issue_path]
    issue_df = pd.DataFrame()
    for issue_path in issue_path_ls:
        tmp_df = pd.read_csv(issue_path, sep=';', encoding='utf-8', low_memory=False, 
                             usecols=lambda x : x not in ['created', 'updated', 'comments', 'components'], index_col=['issue_id'])
        issue_df = pd.concat([issue_df, tmp_df])
    return issue_df

In [6]:
def load_links(repo: str):
    link_path = LINK_DIR + repo + '.csv'
    link_df = pd.read_csv(link_path, sep=';', encoding="UTF-8", low_memory=False).drop_duplicates()
    return link_df

In [7]:
# This function is to identify doubled issue pairs
def add_linked_issues_to_df(df):
    df['issues'] = ''
    for i in tqdm(range(len(df))):
        df['issues'].iloc[i] = str(sorted(set([df.iloc[i]['issue_id_1'], df.iloc[i]['issue_id_2']])))

In [8]:
def clean_links(link_df, issue_df):    

    # Remove links with uncrawled and filtered issues
    link_df = link_df[link_df[['issue_id_1', 'issue_id_2']].isin(issue_df.index.values).all(axis=1)]
    print(f'Left with {len(link_df)} links after removing half-private links')
    
    # Cleanup links, only allow one link per issue-pair
    link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
    print(f'Left with {len(link_df)} links after removing issue-pairs with multiple links between them')

    # In case the name is the otherway around, like issue-1_issue-2 and issue-2_issue-1
    doublelinks = (link_df.issues.value_counts() > 1).rename_axis('doubles').reset_index(name='valid')
    valid_doubles = set(doublelinks[doublelinks['valid'] == True]['doubles'])

    for i in tqdm(valid_doubles):
        if len(set(link_df[link_df['issues'] == i]['linktype'])) > 1:
            link_df = link_df[link_df.issues != i]
    print(f'Left with {len(link_df)} links after removing issue-pairs with multiple link types between them')

    # Multiple links complete remove
    link_df.drop_duplicates(subset=['issues'], inplace=True)
    print(f'Left with {len(link_df)} links after removing issue-pairs with multiple entries')

    link_df.reset_index(inplace=True, drop=True)
    
    return link_df

In [9]:
# Create non-links from randomly selected issues
def create_non_links(link_df, issue_df, linked_pairs):
   
    non_link_issues = set(issue_df[~issue_df['resolution'].isin(['Duplicate'])].index)
    
    cols = ['name', 'linktype', 'issue_id_1', 'issue_id_2', 'issues']
    non_link_df = pd.DataFrame(columns=cols)

    for i in tqdm(range(int(link_df.linktype.value_counts().mean()))):
        sample = random.sample(non_link_issues, 2)
        if not set([sample[0], sample[1]]) in (linked_pairs):
            name = str(sample[0]) + "_" + str(sample[1])
            non_link = pd.DataFrame([{
                "name": name,
                "linktype": "Non-Link",
                "issue_id_1":  sample[0],
                "issue_id_2":  sample[1],
                "issues": str(sorted(set([sample[0], sample[1]])))
            }])
            non_link_df = pd.concat([non_link_df, non_link], ignore_index=True)
        else:
            print('hi')

    link_plus_df = pd.concat([link_df, non_link_df], ignore_index=True)
    
    return link_plus_df

In [10]:
# Join links and issues
def joined_links(link_df, issue_df):
    link_df = link_df[['linktype', 'issue_id_1', 'issue_id_2']]
    joined_df = link_df \
        .join(issue_df.add_suffix('_1'), on='issue_id_1') \
        .join(issue_df.add_suffix('_2'), on='issue_id_2')
    return joined_df

In [11]:
for repo in REPOS:
    issue_df = load_issues(repo)
    link_df = load_links(repo)
    print(f'Loaded {len(issue_df)} issues and {len(link_df)} links for {repo}')
    
    add_linked_issues_to_df(link_df)
    link_df = clean_links(link_df, issue_df)

    print(f'Cleaned {len(link_df)} links for {repo}')
    print(link_df.linktype.value_counts())

    linked_pairs = set(link_df['issues'])
    link_plus_df = create_non_links(link_df, issue_df, linked_pairs)
    
    print(f'Total cleaned links plus non_links is {len(link_plus_df)} for {repo}')

    link_df = joined_links(link_df, issue_df)
    link_df.to_csv('./data/joined/links/' + repo + '.csv', sep=';', encoding='utf-8', index=True)
    
    link_plus_df = joined_links(link_plus_df, issue_df)
    link_plus_df.to_csv('./data/joined/links_plus/' + repo + '.csv', sep=';', encoding='utf-8', index=True)

    print("----------------------------\n")

Loaded 1014926 issues and 264107 links for Apache


100%|██████████| 264107/264107 [00:47<00:00, 5551.72it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 263647 links after removing half-private links
Left with 259717 links after removing issue-pairs with multiple links between them


100%|██████████| 2218/2218 [00:52<00:00, 41.85it/s]


Left with 256253 links after removing issue-pairs with multiple link types between them
Left with 255767 links after removing issue-pairs with multiple entries
Cleaned 255767 links for Apache
Subtask             83783
Reference           68973
Duplicate           25925
Blocker             14377
Epic-Relation       12506
dependent           12498
Incorporates         6923
Regression           4350
Cloners              4321
Required             3620
Container            3508
Related              3369
Supercedes           3248
Problem/Incident     3106
Child-Issue          2508
Blocked              1172
Completes             914
Dependent             399
Dependency            112
Testing                79
Parent Feature         50
Issue split            26
Name: linktype, dtype: int64


100%|██████████| 11625/11625 [06:39<00:00, 29.13it/s]


Total cleaned links plus non_links is 267392 for Apache
----------------------------

Loaded 28146 issues and 16846 links for Hyperledger


100%|██████████| 16846/16846 [00:03<00:00, 5256.78it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 16733 links after removing half-private links
Left with 16565 links after removing issue-pairs with multiple links between them


100%|██████████| 141/141 [00:00<00:00, 420.75it/s]


Left with 16325 links after removing issue-pairs with multiple link types between them
Left with 16304 links after removing issue-pairs with multiple entries
Cleaned 16304 links for Hyperledger
Epic-Relation      6459
Subtask            4494
Relates            2807
Blocks             1344
Duplicate           638
Cloners             477
Issue split          78
Git Code Review       7
Name: linktype, dtype: int64


100%|██████████| 2038/2038 [00:01<00:00, 1173.10it/s]


Total cleaned links plus non_links is 18342 for Hyperledger
----------------------------

Loaded 9474 issues and 2667 links for IntelDAOS


100%|██████████| 2667/2667 [00:00<00:00, 5110.52it/s]


Left with 2667 links after removing half-private links
Left with 2641 links after removing issue-pairs with multiple links between them


100%|██████████| 24/24 [00:00<00:00, 1872.32it/s]


Left with 2605 links after removing issue-pairs with multiple link types between them
Left with 2599 links after removing issue-pairs with multiple entries
Cleaned 2599 links for IntelDAOS
Related                 1016
Blocker                  663
Subtask                  274
Duplicate                252
Cloners (migrated)       176
Gantt End to Start       100
Verify                    39
Cloners                   38
Implement                 17
Gantt End to End          16
Relates                    6
Blocks                     1
Gantt Start to Start       1
Name: linktype, dtype: int64


100%|██████████| 199/199 [00:00<00:00, 2313.31it/s]


Total cleaned links plus non_links is 2798 for IntelDAOS
----------------------------

Loaded 15535 issues and 3303 links for JFrog


100%|██████████| 3303/3303 [00:00<00:00, 5086.18it/s]


Left with 3303 links after removing half-private links
Left with 3273 links after removing issue-pairs with multiple links between them


100%|██████████| 24/24 [00:00<00:00, 1857.50it/s]


Left with 3233 links after removing issue-pairs with multiple link types between them
Left with 3229 links after removing issue-pairs with multiple entries
Cleaned 3229 links for JFrog
Subtask                             1164
Relationship                         884
Duplicate                            643
Dependency                           256
Trigger                              202
Contains(WBSGantt)                    44
Cloners                               27
Gantt End to End                       4
Gantt End to Start                     3
Finish-to-Finish link (WBSGantt)       1
Gantt Start to Start                   1
Name: linktype, dtype: int64


100%|██████████| 293/293 [00:00<00:00, 2005.80it/s]


Total cleaned links plus non_links is 3522 for JFrog
----------------------------

Loaded 316411 issues and 110507 links for Jira


100%|██████████| 110507/110507 [00:20<00:00, 5441.07it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 102574 links after removing half-private links
Left with 101181 links after removing issue-pairs with multiple links between them


100%|██████████| 819/819 [00:07<00:00, 113.27it/s]


Left with 100097 links after removing issue-pairs with multiple link types between them
Left with 99820 links after removing issue-pairs with multiple entries
Cleaned 99820 links for Jira
Reference          63348
Duplicate          21685
Cloners             2866
Subtask             2473
Part                2449
Detail              1870
Cause               1784
Blocker              987
Derived              518
Supersession         476
Regression           378
Relate               262
Bonfire Testing      227
Split                171
Depends              165
Resolve               64
Follows               49
Related               38
Issue split           10
Name: linktype, dtype: int64


100%|██████████| 5253/5253 [00:44<00:00, 119.36it/s]


Total cleaned links plus non_links is 105073 for Jira
----------------------------

Loaded 41866 issues and 12439 links for JiraEcosystem


100%|██████████| 12439/12439 [00:02<00:00, 5478.09it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 11598 links after removing half-private links
Left with 11514 links after removing issue-pairs with multiple links between them


100%|██████████| 66/66 [00:00<00:00, 888.62it/s]


Left with 11414 links after removing issue-pairs with multiple link types between them
Left with 11398 links after removing issue-pairs with multiple entries
Cleaned 11398 links for JiraEcosystem
Epic-Relation                   2743
Relate                          2468
Subtask                         2284
Duplicate                       1741
Blocker                          676
Cause                            440
Part                             204
Cloners                          201
Reference                        145
Depends                          127
Split                            120
Follows                           99
Bonfire testing                   56
Bonfire Testing                   46
Epic                              21
Issue split                       16
Preceded By                        7
Blocks                             2
Polaris datapoint issue link       1
Polaris issue link                 1
Name: linktype, dtype: int64


100%|██████████| 569/569 [00:00<00:00, 949.86it/s] 


Total cleaned links plus non_links is 11967 for JiraEcosystem
----------------------------

Loaded 31229 issues and 14950 links for MariaDB


100%|██████████| 14950/14950 [00:02<00:00, 5380.22it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 14929 links after removing half-private links
Left with 14773 links after removing issue-pairs with multiple links between them


100%|██████████| 98/98 [00:00<00:00, 737.01it/s]


Left with 14659 links after removing issue-pairs with multiple link types between them
Left with 14618 links after removing issue-pairs with multiple entries
Cleaned 14618 links for MariaDB
Relates             7464
Blocks              1899
Duplicate           1374
PartOf              1154
Epic-Relation        942
Subtask              891
Problem/Incident     872
Issue split           22
Name: linktype, dtype: int64


100%|██████████| 1827/1827 [00:01<00:00, 1235.51it/s]


Total cleaned links plus non_links is 16445 for MariaDB
----------------------------

Loaded 420819 issues and 215821 links for Mojang


100%|██████████| 215821/215821 [00:39<00:00, 5443.21it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 215807 links after removing half-private links
Left with 215663 links after removing issue-pairs with multiple links between them


100%|██████████| 73/73 [00:01<00:00, 49.19it/s]


Left with 215547 links after removing issue-pairs with multiple link types between them
Left with 215532 links after removing issue-pairs with multiple entries
Cleaned 215532 links for Mojang
Duplicate          193993
Relates             20378
Cloners               650
Bonfire Testing       272
Blocks                239
Name: linktype, dtype: int64


100%|██████████| 43106/43106 [06:25<00:00, 111.92it/s]


Total cleaned links plus non_links is 258638 for Mojang
----------------------------

Loaded 137172 issues and 92362 links for MongoDB


100%|██████████| 92362/92362 [00:17<00:00, 5399.12it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 65240 links after removing half-private links
Left with 64537 links after removing issue-pairs with multiple links between them


100%|██████████| 389/389 [00:03<00:00, 108.05it/s]


Left with 63883 links after removing issue-pairs with multiple link types between them
Left with 63821 links after removing issue-pairs with multiple entries
Cleaned 63821 links for MongoDB
Related                 25471
Depends                 13933
Epic-Relation           10149
Duplicate                8587
Documented               1825
Problem/Incident         1082
Subtask                   888
Issue split               752
Gantt Dependency          657
Cloners                   202
Backports                 170
Tested                     62
Gantt End to End           41
Gantt Start to Start        1
Initiative                  1
Name: linktype, dtype: int64


100%|██████████| 4254/4254 [00:11<00:00, 370.78it/s]


Total cleaned links plus non_links is 68075 for MongoDB
----------------------------

Loaded 148579 issues and 41426 links for Qt


100%|██████████| 41426/41426 [00:07<00:00, 5449.95it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 40646 links after removing half-private links
Left with 40424 links after removing issue-pairs with multiple links between them


100%|██████████| 171/171 [00:00<00:00, 184.35it/s]


Left with 40128 links after removing issue-pairs with multiple link types between them
Left with 40105 links after removing issue-pairs with multiple entries
Cleaned 40105 links for Qt
Subtask           9804
Relates           8990
Dependency        6260
Epic-Relation     5428
Duplicate         4243
Work Breakdown    2667
Replacement       2582
Test                50
Cloners             34
Issue split         31
Blocks              14
Covered              2
Name: linktype, dtype: int64


100%|██████████| 3342/3342 [00:12<00:00, 273.65it/s]


Total cleaned links plus non_links is 43447 for Qt
----------------------------

Loaded 353000 issues and 127369 links for RedHat


100%|██████████| 127369/127369 [00:24<00:00, 5237.09it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 123000 links after removing half-private links
Left with 121612 links after removing issue-pairs with multiple links between them


100%|██████████| 1205/1205 [00:14<00:00, 84.16it/s]


Left with 120136 links after removing issue-pairs with multiple link types between them
Left with 119669 links after removing issue-pairs with multiple entries
Cleaned 119669 links for RedHat
Related                          31006
Subtask                          24928
Blocks                           18186
Cloners                          16969
Superset                         10661
Duplicate                         5913
Sequence                          5129
Causality                         3168
Cloners (old)                     1504
Documentation                      670
Parent-Relation                    621
multi-level hierarchy [GANTT]      251
finish-start [GANTT]               184
Account                            166
Issue split                        162
Gantt: finish-start                 46
Gantt: start-finish                 40
finish-finish [GANTT]               40
Gantt: finish-finish                20
start-finish [GANTT]                 4
Gantt: start-start          

100%|██████████| 5698/5698 [01:12<00:00, 78.39it/s]


Total cleaned links plus non_links is 125367 for RedHat
----------------------------

Loaded 50550 issues and 20292 links for Sakai


100%|██████████| 20292/20292 [00:03<00:00, 5557.42it/s]


Left with 20292 links after removing half-private links
Left with 20040 links after removing issue-pairs with multiple links between them


100%|██████████| 143/143 [00:00<00:00, 510.59it/s]


Left with 19852 links after removing issue-pairs with multiple link types between them
Left with 19803 links after removing issue-pairs with multiple entries
Cleaned 19803 links for Sakai
1 - Relate             9711
Subtask                3373
5 - Depend             2578
3 - Duplicate          1851
4 - Incorporate        1334
2 - Cloned              949
6 - Blocks                6
7 - Git Code Review       1
Name: linktype, dtype: int64


100%|██████████| 2475/2475 [00:02<00:00, 857.29it/s]


Total cleaned links plus non_links is 22278 for Sakai
----------------------------

Loaded 1867 issues and 674 links for SecondLife


100%|██████████| 674/674 [00:00<00:00, 5850.78it/s]


Left with 674 links after removing half-private links
Left with 662 links after removing issue-pairs with multiple links between them


100%|██████████| 17/17 [00:00<00:00, 3079.92it/s]


Left with 634 links after removing issue-pairs with multiple link types between them
Left with 631 links after removing issue-pairs with multiple entries
Cleaned 631 links for SecondLife
Subtask         314
Relates         186
Cloners          48
Parent/Child     41
Depends          28
Collection       14
Name: linktype, dtype: int64


100%|██████████| 105/105 [00:00<00:00, 3036.46it/s]


Total cleaned links plus non_links is 736 for SecondLife
----------------------------

Loaded 87284 issues and 4975 links for Sonatype


100%|██████████| 4975/4975 [00:00<00:00, 5402.10it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 4534 links after removing half-private links
Left with 4498 links after removing issue-pairs with multiple links between them


100%|██████████| 17/17 [00:00<00:00, 1228.64it/s]


Left with 4466 links after removing issue-pairs with multiple link types between them
Left with 4465 links after removing issue-pairs with multiple entries
Cleaned 4465 links for Sonatype
Relates            1785
Subtask            1343
Bonfire Testing     361
Duplicate           342
Caused              235
dependent           162
Supercedes          108
Fixes               103
Implements           11
Epic-Relation         9
Issue split           6
Name: linktype, dtype: int64


100%|██████████| 405/405 [00:00<00:00, 574.29it/s]


Total cleaned links plus non_links is 4870 for Sonatype
----------------------------

Loaded 69156 issues and 14716 links for Spring


100%|██████████| 14716/14716 [00:02<00:00, 5392.18it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 14616 links after removing half-private links
Left with 14550 links after removing issue-pairs with multiple links between them


100%|██████████| 52/52 [00:00<00:00, 722.61it/s]


Left with 14478 links after removing issue-pairs with multiple link types between them
Left with 14462 links after removing issue-pairs with multiple entries
Cleaned 14462 links for Spring
Relate           5909
Subtask          1941
Duplicate        1745
Epic-Relation    1635
Depend           1259
Related           993
Depends           488
Supersede         478
Cloners            14
Name: linktype, dtype: int64


100%|██████████| 1606/1606 [00:02<00:00, 767.14it/s]


Total cleaned links plus non_links is 16068 for Spring
----------------------------

