In [None]:
import os, glob, json, csv, subprocess, sys, re, operator
from git import *
from subprocess import Popen, PIPE
from os import path
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import plot, init_notebook_mode, iplot

%matplotlib inline

# Defining repository

In [None]:
# Both working directories are rooted at the current user's home directory.
userhome = os.path.expanduser('~')
repository = ''.join([userhome, r'/different-diff/dataset/hbase/'])
analyze_dir = ''.join([userhome, r'/different-diff/analyze/analyze_hbase/'])

# Counting the number of buglines across the project

In [None]:
# Diff algorithms under comparison. Each name is used both as a directory
# component when globbing input files and as a value of the `algorithm`
# column in the generated CSV files.
algorithm = [
    'histogram',
    'minimal',
    'myers',
    'patience',
]

In [None]:
# Count the bug lines stored in every per-algorithm "buggylines" file and
# write one CSV row per file:
#   filename, parent_id, commit_id, bug_id, algorithm, number_of_buglines
# The output file is opened with newline='' as the csv module requires, so
# that no extra carriage returns are emitted on platforms that translate '\n'.
with open(analyze_dir + 'step_3/disagreement_ratio/total_buglines.csv', 'w', newline='') as myfile:
    header = ['filename','parent_id','commit_id','bug_id','algorithm','number_of_buglines']
    writers = csv.writer(myfile)
    writers.writerow(header)
    for x, algo in enumerate(algorithm):
        # Raw strings keep the regex escapes (\w, \d) intact instead of relying
        # on Python passing unknown string escapes through unchanged.
        name_pattern = re.compile(
            r'(?P<filename>\w+[-]?\w+[.]\w+|\w+)_(?P<parent_id>\w+)-(?P<commit_id>\w+)'
            r'_(?P<bug_id>\w+[-]\d+)_(?P<algorithm>' + algo + r')'
        )
        temp = []
        buggy_files = glob.iglob(analyze_dir + 'file-diff/buggylines/' + algo + '/*', recursive=True)
        for f, filename in enumerate(buggy_files, 1):
            sys.stdout.write("\r{} Counting number of buglines in file: {}".format(x+1,f))
            sys.stdout.flush()

            pattern = name_pattern.search(filename)
            if pattern is None:
                # A name that does not follow the expected layout cannot be
                # split into CSV fields; report it and move on instead of
                # failing with AttributeError on `None.groups()`.
                sys.stderr.write("\nSkipping file with unexpected name: {}\n".format(filename))
                continue

            # The context manager closes the handle even if reading fails.
            with open(filename) as bugfile:
                bugline = bugfile.read().split('\n')

            # split('\n') leaves one trailing empty element when the file ends
            # with a newline (assumed here — TODO confirm), hence the -1.
            # A first line containing "--- /dev" is not counted, hence the -2.
            if "--- /dev" in bugline[0]:
                num = str(len(bugline)-2)
            else:
                num = str(len(bugline)-1)

            # Row = the five captured name fields plus the bug-line count.
            temp.append(pattern.groups() + (num,))

        # Sort by algorithm (field 4), then filename (field 0), then bug_id (field 3).
        temp = sorted(temp, key=operator.itemgetter(4, 0, 3))

        writers.writerows(temp)

print("\nCounting total number of buglines is complete")

In [None]:
# Load the per-file bug-line counts and total them for each diff algorithm.
dftotal = pd.read_csv(analyze_dir + 'step_3/disagreement_ratio/total_buglines.csv')
dfbugtotalhist = dftotal.loc[dftotal['algorithm'] == 'histogram', 'number_of_buglines'].sum()
dfbugtotalmin = dftotal.loc[dftotal['algorithm'] == 'minimal', 'number_of_buglines'].sum()
dfbugtotalmyers = dftotal.loc[dftotal['algorithm'] == 'myers', 'number_of_buglines'].sum()
dfbugtotalpat = dftotal.loc[dftotal['algorithm'] == 'patience', 'number_of_buglines'].sum()

# Visualizing the total number of buglines for each algorithm

In [None]:
# Bar chart with one bar per diff algorithm; each bar is labelled with its
# bug-line total.
x = ['histogram', 'minimal', 'myers', 'patience']
y = [dfbugtotalhist, dfbugtotalmin, dfbugtotalmyers, dfbugtotalpat]

bar_outline = {'color': 'rgb(8,48,107)', 'width': 1.5}
bar_marker = {'color': 'rgb(150,255,200)', 'line': bar_outline}

trace = [
    go.Bar(x=x, y=y, text=y, textposition='auto', marker=bar_marker, opacity=0.6)
]

layout = go.Layout(title='Total number of buglines found for each algorithm in HBASE project')

init_notebook_mode(connected=True)
fig = go.Figure(data=trace, layout=layout)
iplot(fig, show_link=False)

In [None]:
# Collapse the per-file rows into one row per algorithm holding the summed
# bug-line count; `algorithm` stays a regular column (as_index=False).
dftotal = dftotal.groupby('algorithm', as_index=False)[['number_of_buglines']].sum()
dftotal

In [None]:
# Persist the per-algorithm totals as (algorithm, number_of_buglines) rows.
# The first two columns are taken by position through DataFrame.iloc, which
# avoids applying an integer key to a row Series whose index holds column
# labels (pandas deprecated that positional fallback).
temporary = [list(row) for row in dftotal.iloc[:, :2].itertuples(index=False, name=None)]

#save into csv file
# newline='' is what the csv module expects of the file object it writes to.
with open(analyze_dir + 'step_3/disagreement_ratio/totalbuglines_of_each_algorithm.csv', 'w', newline='') as bugfile:
    cols = ['algorithm', 'number_of_buglines']
    writers = csv.writer(bugfile)
    writers.writerow(cols)
    writers.writerows(temporary)

# Comparing the number of bug classes for each algorithm

In [None]:
# Base directory of the comparison files; each algorithm's files are globbed
# from its "<algorithm>_comparison" sub-folder underneath it.
filepath = ''.join([analyze_dir, 'step_3/diff-file_blame-file_comparison/'])

# Histogram

In [None]:
# Sanity check: list the histogram comparison files and report how many exist.
histogram_glob = filepath + 'histogram_comparison/*'
file_list = glob.glob(histogram_glob)
file_count = len(file_list)
print ("Found " + str(file_count) + " files")

In [None]:
# For every diff algorithm, tally the bug classes recorded in its comparison
# CSV files. `result` receives one row per algorithm, in `algorithm` order:
#   [#bug-introducing change, #incorrect, #unknown, total counted lines]
result = []

for ii, alg in enumerate(algorithm):
    bug_intro_qty = incorrect_qty = unknown_qty = sumln = 0
    i = 1
    for fname in glob.iglob(filepath + alg + '_comparison/*', recursive=True):
        sys.stdout.write('\r%i ' %(ii+1) + 'Counting the number of bug-class in file %i' % i)
        sys.stdout.flush()
        i += 1

        dframe = pd.read_csv(fname)
        # Read the raw text as well to count lines; the context manager makes
        # sure the handle is closed instead of leaking one per file.
        with open(fname, 'r') as op:
            lines = op.read().split('\n')

        # lines[0] is the CSV header row. A second line containing "-- /dev"
        # (presumably a /dev/null diff marker — TODO confirm) only adds to the
        # line total and contributes nothing to the class counters.
        if "-- /dev" not in lines[1]:
            sumln = (len(lines)-2) + sumln
            bug_class = dframe['bug_class']
            bug_intro_qty = bug_class[bug_class == 'bug-introducing change'].count() + bug_intro_qty
            incorrect_qty = bug_class[bug_class == 'incorrect'].count() + incorrect_qty
            unknown_qty = bug_class[bug_class == 'unknown'].count() + unknown_qty
        else:
            sumln = (len(lines)-3) + sumln

    result.append([bug_intro_qty, incorrect_qty, unknown_qty, sumln])
print ('\nCounting the number of bug-introducing change is complete')

In [None]:
# Show the tallies: one [bug-introducing, incorrect, unknown, total] row per algorithm.
result

In [None]:
# Transpose `result` so that each row holds one quantity across all
# algorithms, then put the matching row label in front of every row.
result_matrix = list(map(list, zip(*result)))
label = ['#bug-introducing_change', '#incorrect', '#unknown', 'total']
for s in range(len(label)):
    lbl = label[s]
    result_matrix[s][0:0] = [lbl]
result_matrix

In [None]:
# Write the labelled bug-class matrix: a header row followed by one row per
# class. newline='' is what the csv module expects of the file object, so no
# extra carriage returns appear on platforms that translate '\n'.
with open(analyze_dir + 'step_3/disagreement_ratio/quantity_of_bug_class.csv','w', newline='') as myfile:
    header = ['class', 'histogram', 'minimal', 'myers', 'patience']
    writers = csv.writer(myfile)
    writers.writerow(header)
    writers.writerows(result_matrix)

In [None]:
# Reload the bug-class table. The column order is spelled out here rather than
# taken from the global `header`, because another cell in this notebook assigns
# `header` a different column list and the selection would then fail depending
# on which cell ran last.
df_bugclass = pd.read_csv(analyze_dir + 'step_3/disagreement_ratio/quantity_of_bug_class.csv')
df_bugclass = df_bugclass[['class', 'histogram', 'minimal', 'myers', 'patience']]
df_bugclass

In [None]:
def _bug_class_bar(column, shown_rows=2):
    """Build the bar trace of one diff algorithm for the grouped chart.

    Parameters
    ----------
    column : str
        Name of the algorithm column in ``df_bugclass``.
    shown_rows : int
        Number of leading ``df_bugclass`` rows to plot. The default of 2
        keeps the first two class rows and leaves out the rest.

    Returns
    -------
    plotly.graph_objs.Bar
        Trace whose x, y and text all cover the same ``shown_rows`` rows, so
        every bar label lines up with the bar it annotates.
    """
    return go.Bar(
        x = df_bugclass['class'][:shown_rows],
        y = df_bugclass[column][:shown_rows],
        name = 'diff-algorithm = ' + column,
        text = df_bugclass[column][:shown_rows],
        textposition = 'auto'
    )


trace_hist = _bug_class_bar('histogram')
trace_min = _bug_class_bar('minimal')
trace_myers = _bug_class_bar('myers')
trace_pat = _bug_class_bar('patience')

data = [trace_hist, trace_min, trace_myers, trace_pat]

layout = go.Layout(
    barmode = 'group',
    title = 'Number of bug-class for each diff algorithm in HBase Project'
)

init_notebook_mode(connected=True)
fig = go.Figure(data=data, layout=layout)
iplot(fig, show_link=False)

# Computing the disagreement ratio (R) for the project

In [None]:
# Show the bug-class table; its '#incorrect' row supplies D in the ratio computed below.
df_bugclass

In [None]:
# Show the per-algorithm bug-line totals; they supply B in the ratio computed below.
dftotal

In [None]:
# Disagreement ratio per algorithm: R = D / B, where
#   D = number of lines classified as '#incorrect' for the algorithm
#   B = total number of bug lines found by the algorithm
ratio_details = []
incorrect_rows = df_bugclass[df_bugclass['class'] == '#incorrect']

for algor in algorithm:
    D = incorrect_rows[algor].iloc[0]
    B = dftotal.loc[dftotal['algorithm'] == algor, 'number_of_buglines'].iloc[0]
    R = D/B
    ratio_details.append([algor, D, B, R])

In [None]:
# Write one row per algorithm: name, #incorrect, total bug lines and ratio.
# newline='' is what the csv module expects of the file object, so no extra
# carriage returns appear on platforms that translate '\n'.
with open(analyze_dir + 'step_3/disagreement_ratio/disagreement_ratio.csv', 'w', newline='') as ratiofile:
    columns = ['algorithm', 'number_of_incorrect', 'total_buglines', 'disagreement_ratio']
    writers = csv.writer(ratiofile)
    writers.writerow(columns)
    writers.writerows(ratio_details)

In [None]:
# Reload the ratio table with its columns in a fixed, explicit order.
df_ratio = pd.read_csv(
    analyze_dir + 'step_3/disagreement_ratio/disagreement_ratio.csv'
)[['algorithm', 'number_of_incorrect', 'total_buglines', 'disagreement_ratio']]
df_ratio

In [None]:
# Bar chart with one bar per diff algorithm; each bar is labelled with its
# disagreement ratio.
bar_outline = {'color': 'rgb(8,48,107)', 'width': 1.5}
bar_marker = {'color': 'rgb(150,255,200)', 'line': bar_outline}

ratio_values = df_ratio['disagreement_ratio']
trace = go.Bar(
    x=df_ratio['algorithm'],
    y=ratio_values,
    text=ratio_values,
    textposition='auto',
    marker=bar_marker,
    opacity=0.6,
)

layout = go.Layout(title='Disagreement ratio of each diff algorithm in HBase Project')

init_notebook_mode(connected=True)
fig = go.Figure(data=[trace], layout=layout)
iplot(fig, show_link=False)