In [1]:
import keras
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
# Put the parent of the current working directory on the import path
# (once), so modules located one level above the notebook can be imported.
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)
    
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Dataset selector; every path below is derived from it.
DOMAIN = 'eclipse'
# Directory path for processed artefacts of this domain.
DIR = 'data/processed/{}'.format(DOMAIN)
# Directory holding the normalized CSV files of this domain.
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
# Normalized bug-report table: <DIR_PAIRS>/<DOMAIN>.csv
DATASET = os.path.join(DIR_PAIRS, '{}.csv'.format(DOMAIN))

In [3]:
# Load the pair file and the bug-report table for the configured DOMAIN.
# File names are derived from DOMAIN / DATASET instead of being hard-coded,
# so switching DOMAIN in the config cell switches both inputs.
# NOTE(review): assumes other domains follow the '<domain>_pairs.csv' naming — confirm.
df_train_pair = pd.read_csv(os.path.join(DIR_PAIRS, '{}_pairs.csv'.format(DOMAIN)))
df = pd.read_csv(DATASET)

In [30]:
def create_bucket(data):
    """Group bug reports into buckets of duplicates keyed by their master report.

    A report whose ``dup_id`` is the literal string ``'[]'`` is a master: it
    opens a bucket named after its ``bug_id`` and is itself the first member.
    Every other report is a duplicate; its ``dup_id`` is followed from
    duplicate to duplicate until a master is reached, and the report is then
    added to that master's bucket.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``bug_id`` and ``dup_id``. ``dup_id`` is
        ``'[]'`` for masters and otherwise a value accepted by ``int()``.

    Returns
    -------
    dict
        ``{master bug_id: set of the bug_ids in that bucket}``.

    Raises
    ------
    ValueError
        If a duplicate chain loops back on itself, or points at a report that
        is not present in ``data``.
    """
    print("Creating the buckets...")
    buckets = {}

    # Reading the buckets: every master report seeds its own bucket.
    master_ids = data.loc[data['dup_id'] == '[]', 'bug_id']
    loop = tqdm(total=master_ids.shape[0])
    try:
        for name in master_ids:
            buckets[name] = {name}
            loop.update(1)
    finally:
        # Close the progress bar even if the loop fails.
        loop.close()

    # Fill the buckets
    df_duplicates = data[data['dup_id'] != '[]']
    dup_pairs = list(zip(df_duplicates['bug_id'], df_duplicates['dup_id']))
    # bug_id -> id it was marked a duplicate of; gives O(1) hops along a chain
    # instead of filtering the whole duplicates frame for every hop.
    parent_of = {bug_id: int(dup_id) for bug_id, dup_id in dup_pairs}
    loop = tqdm(total=df_duplicates.shape[0])
    try:
        for bug_id, dup_id in dup_pairs:
            bucket_name = int(dup_id)
            # Ids already walked for this report; seeing one again means the
            # chain is cyclic and would otherwise never terminate.
            visited = {bug_id}
            while bucket_name not in buckets:
                if bucket_name in visited:
                    raise ValueError(
                        "Cyclic duplicate chain detected while resolving "
                        "bug {} (revisited {})".format(bug_id, bucket_name))
                if bucket_name not in parent_of:
                    raise ValueError(
                        "Bug {} is a duplicate of {}, which is not in the "
                        "dataset".format(bug_id, bucket_name))
                visited.add(bucket_name)
                bucket_name = parent_of[bucket_name]
            buckets[bucket_name].add(bug_id)
            loop.update(1)
    finally:
        loop.close()
    return buckets

In [31]:
# Group every report in `df` under the master report it ultimately duplicates.
buckets = create_bucket(df)

Creating the buckets...


HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




HBox(children=(IntProgress(value=0, max=39523), HTML(value='')))




In [42]:
# Reverse index: issue id -> id of the bucket (master report) containing it.
issues_by_buckets = {}
for bucket, members in tqdm(buckets.items()):
    # The bucket id always maps to itself, whether or not it is listed as a member.
    issues_by_buckets[bucket] = bucket
    # `members` is a set, so it is iterated directly; wrapping it in
    # np.array(...).tolist() only returned the same set.
    for issue in members:
        issues_by_buckets[issue] = bucket

HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




In [35]:
# Split bucket ids by size. Note the threshold is two *members* in total:
# a bucket qualifies for the first list as soon as it holds more than one id.
buckets_at_least_2_dups = [
    bucket for bucket, members in buckets.items() if len(members) >= 2
]
buckets_alone = [
    bucket for bucket, members in buckets.items() if len(members) < 2
]

In [36]:
# Report how many buckets fall into each size group.
for label, group in (("buckets_at_least_2_dups:", buckets_at_least_2_dups),
                     ("buckets_alone:", buckets_alone)):
    print(label, len(group))

buckets_at_least_2_dups: 24413
buckets_alone: 297070


In [49]:
# Spot check: the bucket id that issue 214181 maps to, and that bucket's members.
issues_by_buckets[214181], buckets[issues_by_buckets[214181]]

(214181, {214181})

#### Example 1

In [10]:
# Members of the bucket keyed by report 210304.
buckets[210304]

{214411, 216663, 216725, 218239}

In [11]:
# Rows for report 210304 and the four reports listed alongside it
# (each id given once, in ascending order).
df[df['bug_id'].isin([210304, 214411, 216663, 216725, 218239])]

Unnamed: 0,bug_id,bug_severity,bug_status,component,creation_ts,delta_ts,description,dup_id,priority,product,resolution,short_desc,version
194058,210304,normal,VERIFIED,IDE,2007-11-19 16:31:00 -0500,2008-02-07 16:11:08 -0500,+++ This bug was initially created as a clone ...,[],P3,Platform,FIXED,[Navigator] resourceFilters incorrectly read f...,3.3.1
197589,214411,normal,RESOLVED,IDE,2008-01-04 17:47:00 -0500,2009-10-01 09:36:04 -0400,I just started using Eclipse version 3.3.1.1 (...,210304,P3,Platform,DUPLICATE,ResourceNavigator filters appear to be totally...,3.3.1
199547,216663,normal,RESOLVED,IDE,2008-01-25 20:49:00 -0500,2009-10-01 09:36:04 -0400,Build ID: M20071023-1652\n\nSteps To Reproduce...,210304,P3,Platform,DUPLICATE,Navigator Filters dialog is empty,3.3.1
199591,216725,normal,RESOLVED,IDE,2008-01-28 04:06:00 -0500,2009-10-01 09:36:04 -0400,Build ID: M20071023-1652\n\nSteps To Reproduce...,210304,P3,Platform,DUPLICATE,resourceFilters extension point doesn't work,3.3.1
200918,218239,normal,RESOLVED,UI,2008-02-07 14:54:00 -0500,2008-02-07 16:11:08 -0500,Build ID: M20071023-1652\n\nSteps To Reproduce...,210304,P3,Platform,DUPLICATE,[Navigator View] Filters dialog is empty when...,3.3.1


#### Example 2

In [12]:
# Members of the bucket keyed by report 119056.
buckets[119056]

{119285, 119619, 119712, 119845, 120183}

In [13]:
# Rows of `df` for every id stored in bucket 119056.
df[df['bug_id'].isin(buckets[119056])]

Unnamed: 0,bug_id,bug_severity,bug_status,component,creation_ts,delta_ts,description,dup_id,priority,product,resolution,short_desc,version
112427,119285,normal,RESOLVED,Team,2005-12-05 13:38:00 -0500,2005-12-07 15:12:06 -0500,I20051130-1215\n*I have a java class in my wor...,119056,P3,Platform,DUPLICATE,Replace with uncommitted changes dialog shows ...,3.2
112736,119619,normal,RESOLVED,Team,2005-12-07 07:30:00 -0500,2005-12-09 09:07:33 -0500,I20051206\n\n1. Check out a Java project from ...,119056,P3,Platform,DUPLICATE,Replace with latest propose the same .java fil...,3.1
112812,119712,normal,RESOLVED,Team,2005-12-07 14:26:00 -0500,2005-12-07 15:12:06 -0500,Build: I-20051206 (and even the one from the w...,119285,P3,Platform,DUPLICATE,seeing double,3.1
112940,119845,normal,RESOLVED,Team,2005-12-08 07:01:00 -0500,2005-12-08 10:13:29 -0500,Take a project with some local changes you wan...,119056,P3,Platform,DUPLICATE,Replace with dialog shows projects multiple times,3.2
113262,120183,normal,RESOLVED,Team,2005-12-09 16:39:00 -0500,2005-12-09 16:52:38 -0500,See attached screenshot.\n\nThe dialog title i...,119056,P3,Platform,DUPLICATE,"Duplicate entries in ""Replace with uncommitted...",3.2


#### Example 3

In [14]:
# Members of the bucket keyed by report 288347.
buckets[288347]

{213528, 214971}

In [15]:
# Rows of `df` for reports 213528, 214971 and 288347.
df[df['bug_id'].isin([214971, 288347, 213528])]

Unnamed: 0,bug_id,bug_severity,bug_status,component,creation_ts,delta_ts,description,dup_id,priority,product,resolution,short_desc,version
196813,213528,enhancement,RESOLVED,Jira,2007-12-19 15:58:00 -0500,2009-09-02 18:37:37 -0400,-- Created from Comment --\nURL: https://bugs....,288347,P2,Mylyn,DUPLICATE,only submit comment if none of the attributes ...,unspecified
198079,214971,normal,RESOLVED,Jira,2008-01-10 17:52:00 -0500,2009-09-17 19:45:23 -0400,Is unable to reassign a tasks through the acti...,288347,P3,Mylyn,DUPLICATE,unable to reassign tasks if no permissions to ...,2.2
259921,288347,enhancement,RESOLVED,Jira,2009-09-02 08:19:00 -0400,2009-09-24 05:37:43 -0400,User-Agent: Mozilla/5.0 (Macintosh; U; I...,[],P2,Mylyn,FIXED,only update comment or reassign if this is the...,3.2
