## Database statistics

In [41]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

%matplotlib inline

In [110]:
def create_bucket(df):
    """Group bug reports into duplicate buckets keyed by master bug id.

    A row whose ``dup_id`` equals the literal string ``'[]'`` is treated as a
    master report and seeds a bucket that contains only itself. Every other
    row is a duplicate whose ``dup_id`` names the report it duplicates; the
    chain of ``dup_id`` references is followed until a master is reached, and
    the duplicate is then added to that master's bucket.

    Duplicates whose chain ends at a report missing from ``df``, or whose
    chain loops back on itself without reaching a master, are left out.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``bug_id`` and ``dup_id``.

    Returns
    -------
    dict
        Maps each master ``bug_id`` to the set of bug ids in its bucket
        (the master itself included).

    Raises
    ------
    ValueError, TypeError
        Propagated from ``int()`` when a ``dup_id`` other than ``'[]'``
        cannot be converted to an integer.
    """
    print("Creating the buckets...")
    buckets = {}

    # Reading the buckets: every master report seeds its own bucket.
    df_buckets = df[df['dup_id'] == '[]']
    loop = tqdm(total=df_buckets.shape[0])
    try:
        for name in df_buckets['bug_id'].tolist():
            buckets[name] = {name}
            loop.update(1)
    finally:
        # Close the progress bar even if the loop is interrupted.
        loop.close()

    # Fill the buckets
    df_duplicates = df[df['dup_id'] != '[]']
    duplicate_rows = [
        (bug_id, int(dup_id))
        for bug_id, dup_id in zip(df_duplicates['bug_id'].tolist(),
                                  df_duplicates['dup_id'].tolist())
    ]

    # bug_id -> id of the report it duplicates. A dict lookup replaces a full
    # DataFrame scan per hop; if a bug_id occurs on several rows the first
    # occurrence is used when following a chain through it.
    parent_of = {}
    for bug_id, parent in duplicate_rows:
        parent_of.setdefault(bug_id, parent)

    loop = tqdm(total=len(duplicate_rows))
    try:
        for dup_id, bucket_name in duplicate_rows:
            # Ids already seen on this chain, so a cycle cannot loop forever.
            visited = {bucket_name}
            while bucket_name not in buckets:
                next_name = parent_of.get(bucket_name)
                if next_name is None or next_name in visited:
                    break
                visited.add(next_name)
                bucket_name = next_name
            # Some bugs duplicates point to one master that does not exist
            # in the dataset, like openoffice master=152778.
            if bucket_name in buckets:
                buckets[bucket_name].add(dup_id)
            loop.update(1)
    finally:
        loop.close()
    return buckets

def read_pairs(file_path):
    """Count the pairs stored in a space-separated text file.

    Each line is split on single spaces and contributes half of its token
    count, rounded down, to the total.

    Parameters
    ----------
    file_path : str
        Path of the file to read (decoded as UTF-8, undecodable bytes are
        ignored).

    Returns
    -------
    int
        Sum over all lines of ``len(tokens) // 2``.
    """
    with open(file_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        return sum(len(line.split(' ')) // 2 for line in handle)

def getting_pairs(array):
    """Expand buckets into every unordered pair of their members.

    Parameters
    ----------
    array : iterable
        Items of the form ``(key, members)``; the key is ignored.

    Returns
    -------
    list
        Two-element lists ``[a, b]`` where ``a`` precedes ``b`` in the
        bucket's member sequence, emitted bucket by bucket.
    """
    pairs = []
    for _, members in array:
        members = list(members)
        # Pair each member with every member that comes after it.
        for position, head in enumerate(members):
            pairs.extend([head, other] for other in members[position + 1:])
    return pairs

In [113]:
# Dataset names to process; each one is substituted into the
# 'data/processed/<name>' and 'data/normalized/<name>/<name>.csv' paths below.
domains = ['eclipse', 'netbeans', 'openoffice']

In [38]:
# Summarise, for every domain, how many pairs the existing train/test files
# hold and how many bug reports the normalized dataset contains.
rows = []

for DOMAIN in domains:
    METHOD = 'baseline'
    DIR = 'data/processed/{}'.format(DOMAIN)
    DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
    TRAIN = os.path.join(DIR, 'train.txt')
    TEST = os.path.join(DIR, 'test.txt')

    dataset = pd.read_csv(DATASET)

    n_train = read_pairs(TRAIN)
    n_test = read_pairs(TEST)

    rows.append({ 'domain' : DOMAIN, 'train_pairs' : n_train, 'test_pairs' : n_test, 'total_bugs' :  dataset.shape[0] })

# Build the frame once from the collected records. DataFrame.append returned
# a new frame (so `df` itself stayed empty) and no longer exists in pandas 2.x.
df = pd.DataFrame(rows, columns=['domain', 'train_pairs', 'test_pairs', 'total_bugs'])
df

Unnamed: 0,domain,train_pairs,test_pairs,total_bugs
0,eclipse,79073,7591,361006
1,netbeans,87543,7930,216715
2,openoffice,53740,4549,98070


### Creating split 90% train and 10% test

In [118]:
# For every domain: bucket the bug reports, expand the buckets into duplicate
# pairs, write the first 90% of the pairs to the train file and the rest,
# grouped by first bug id, to the test file. Pair counts are collected in `df`.
# NOTE(review): pairs keep the order produced by getting_pairs; nothing in this
# cell sorts by date despite the "chronological" file names - confirm intent.
rows = []

for DOMAIN in domains:
    METHOD = 'baseline'
    DIR = 'data/processed/{}'.format(DOMAIN)
    DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))

    dataset = pd.read_csv(DATASET)
    buckets = create_bucket(dataset)
    # Only buckets with more than one member can produce pairs.
    bucket_dups = [
        [bucket_key, list(members)]
        for bucket_key, members in buckets.items()
        if len(members) > 1
    ]

    pairs = getting_pairs(bucket_dups)

    VALIDATION_SPLIT = 0.9
    split_idx = int(len(pairs) * VALIDATION_SPLIT)
    train_pairs = pairs[:split_idx]
    held_out_pairs = pairs[split_idx:]

    # Train file: one "a b" pair per line.
    with open(os.path.join(DIR, 'train_chronological.txt'), 'w') as f:
        f.writelines("{} {}\n".format(first, second) for first, second in train_pairs)

    # Test file: one line per first bug id, followed by all ids paired with it.
    test_data = {}
    for first, second in held_out_pairs:
        first_id, second_id = int(first), int(second)
        test_data.setdefault(first_id, set()).add(second_id)
    with open(os.path.join(DIR, 'test_chronological.txt'), 'w') as f:
        for bug, paired_ids in test_data.items():
            f.write("{} {}\n".format(bug, ' '.join(map(str, paired_ids))))
    print('Train and test created')

    n_train = len(train_pairs)
    n_test = len(held_out_pairs)

    rows.append({ 'domain' : DOMAIN, 'train_pairs' : n_train, 'test_pairs' : n_test, 'total_bugs' :  dataset.shape[0] })

df = pd.DataFrame(data=rows, columns=['domain', 'train_pairs', 'test_pairs', 'total_bugs'])

Creating the buckets...


HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




HBox(children=(IntProgress(value=0, max=39523), HTML(value='')))


Train and test created
Creating the buckets...


HBox(children=(IntProgress(value=0, max=180483), HTML(value='')))




HBox(children=(IntProgress(value=0, max=36232), HTML(value='')))


Train and test created
Creating the buckets...


HBox(children=(IntProgress(value=0, max=83503), HTML(value='')))




HBox(children=(IntProgress(value=0, max=14567), HTML(value='')))


Train and test created


In [119]:
df

Unnamed: 0,domain,train_pairs,test_pairs,total_bugs
0,eclipse,78182,8687,361006
1,netbeans,85355,9484,216715
2,openoffice,52020,5781,98070


### Visualize a single split

In [102]:
# Pick one domain to inspect step by step and load its normalized CSV.
DOMAIN = 'netbeans'
# Input CSV: data/normalized/<DOMAIN>/<DOMAIN>.csv
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
# Directory the train/test split files for this domain are written to.
DIR = 'data/processed/{}'.format(DOMAIN)
dataset = pd.read_csv(DATASET)

In [94]:
# Group the loaded `dataset` into duplicate buckets
# (dict: master bug id -> set of bug ids in that bucket).
buckets = create_bucket(dataset)

Creating the buckets...


HBox(children=(IntProgress(value=0, max=180483), HTML(value='')))




HBox(children=(IntProgress(value=0, max=36232), HTML(value='')))




In [95]:
len(buckets)

180483

In [96]:
# Keep only the buckets with more than one member, as [key, member list] items.
bucket_dups = [
    [key, list(members)]
    for key, members in buckets.items()
    if len(members) > 1
]

In [97]:
len(bucket_dups)

18602

In [99]:
# Expand every multi-member bucket into all unordered pairs of its members.
pairs = getting_pairs(bucket_dups)

In [100]:
pairs[:10]

[[90024, 1289],
 [1408, 6256],
 [1787, 14975],
 [166804, 2020],
 [2337, 31362],
 [2337, 46020],
 [2337, 15205],
 [2337, 32942],
 [2337, 35023],
 [2337, 57495]]

In [101]:
len(pairs)

94839

In [108]:
# Recompute the split point from the current `pairs` so this cell does not
# rely on a `split_idx` left in the kernel by a different cell (which may
# belong to another domain when the notebook is run top to bottom).
VALIDATION_SPLIT = 0.9
split_idx = int(len(pairs) * VALIDATION_SPLIT)
print("Train: {}".format(len(pairs[:split_idx])))
print("Test: {}".format(len(pairs[split_idx:])))

Train: 85355
Test: 9484


In [107]:
# Write the split for the selected domain: the first 90% of `pairs` go to the
# train file (one pair per line); the remainder is grouped by first bug id and
# written to the test file (one line per first bug id).
VALIDATION_SPLIT = 0.9
split_idx = int(len(pairs) * VALIDATION_SPLIT)
train_pairs = pairs[:split_idx]
held_out_pairs = pairs[split_idx:]

with open(os.path.join(DIR, 'train_chronological.txt'), 'w') as f:
    f.writelines("{} {}\n".format(first, second) for first, second in train_pairs)

test_data = {}
for first, second in held_out_pairs:
    first_id, second_id = int(first), int(second)
    test_data.setdefault(first_id, set()).add(second_id)
with open(os.path.join(DIR, 'test_chronological.txt'), 'w') as f:
    for bug, paired_ids in test_data.items():
        f.write("{} {}\n".format(bug, ' '.join(map(str, paired_ids))))
print('Train and test created')

Train and test created
