# Importing libraries

In [None]:
import os, glob, csv, sys, re
from os import path
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import plot, init_notebook_mode, iplot

%matplotlib inline

# Defining repository and directories

In [None]:
# Root of the OpenJPA analysis tree, located under the current user's home directory.
userhome = path.expanduser('~')
analyze_dir = userhome + '/different-diff/analyze/analyze_openjpa/'

# Load dataset of validated bug-intro commit id

In [None]:
# Folder holding the validated bug-intro commit-ID CSVs read and written below.
# analyze_dir already ends with '/', so join() simply appends the sub-path.
valid_bugintro_dir = path.join(analyze_dir, '05_finding_versions/03_filtering/01_valid_bugintro_commitid/')

In [None]:
# Columns kept from every per-algorithm CSV.
header = [
    'bugintro_commitID',
    'totalnumber_of_buggyline',
]

In [None]:
# Read the validated bug-intro commit list produced for each diff algorithm
# (histogram, minimal, myers, patience) and keep only the `header` columns.
df_bugintrocid_hist = pd.read_csv(valid_bugintro_dir + 'histogram_valid_bugintro_commitid.csv')[header]
df_bugintrocid_min = pd.read_csv(valid_bugintro_dir + 'minimal_valid_bugintro_commitid.csv')[header]
df_bugintrocid_myers = pd.read_csv(valid_bugintro_dir + 'myers_valid_bugintro_commitid.csv')[header]
df_bugintrocid_pat = pd.read_csv(valid_bugintro_dir + 'patience_valid_bugintro_commitid.csv')[header]

Sort each dataset by bugintro_commitID

In [None]:
# Order every per-algorithm frame by commit ID in one pass.
df_bugintrocid_hist, df_bugintrocid_min, df_bugintrocid_myers, df_bugintrocid_pat = [
    frame.sort_values(by='bugintro_commitID')
    for frame in (df_bugintrocid_hist, df_bugintrocid_min,
                  df_bugintrocid_myers, df_bugintrocid_pat)
]

In [None]:
# Build one table with a buggy-line count column per diff algorithm.
# Outer joins are used so a commit ID reported by only some algorithms still gets a row.

#Merge dataset histogram and minimal
datamerge1 = df_bugintrocid_hist.merge(df_bugintrocid_min, on=['bugintro_commitID'],
                                       how="outer", suffixes=('_histogram','_minimal'))
#Merge dataset myers and patience
datamerge2 = df_bugintrocid_myers.merge(df_bugintrocid_pat, on=['bugintro_commitID'],
                                        how="outer", suffixes=('_myers','_patience'))
#Merge datamerge1 and datamerge2
datamerge3 = datamerge1.merge(datamerge2, on=['bugintro_commitID'], how="outer")

#Fill NAs with 0 (a commit ID absent from one algorithm's list gets a count of 0 there)
#and sort by the histogram buggy-line count
datamerge3 = datamerge3.fillna(0)
datamerge3 = datamerge3.sort_values(by='totalnumber_of_buggyline_histogram').reset_index(drop=True)
cols = ['bugintro_commitID','totalnumber_of_buggyline_histogram','totalnumber_of_buggyline_minimal',
        'totalnumber_of_buggyline_myers','totalnumber_of_buggyline_patience']
datamerge3 = datamerge3[cols]

#save to CSV file; index=False keeps the positional index out of the file, so reading it
#back does not produce a spurious "Unnamed: 0" column
datamerge3.to_csv(valid_bugintro_dir + 'joining_validated_bugintro_commitID.csv', index=False)

In [None]:
# Re-load the joined table from disk and display its named columns.
join_validbugintro = pd.read_csv(valid_bugintro_dir + 'joining_validated_bugintro_commitID.csv')
join_validbugintro.loc[:, cols]

# Keeping only the commits whose number of buglines differs across algorithms

In [None]:
#Remove data having same number of buglines: a row is kept only when its last four
#columns (the per-algorithm counts) hold more than one distinct value
df_diffnumofbugline = datamerge3[datamerge3.iloc[:, -4:].nunique(axis=1).gt(1)]
df_diffnumofbugline = df_diffnumofbugline.sort_values('totalnumber_of_buggyline_histogram', ascending=True)

#Save to CSV file; index=False keeps the positional index out of the file, so reading it
#back does not produce a spurious "Unnamed: 0" column
df_diffnumofbugline.to_csv(valid_bugintro_dir + 'bugintrocid_with_different_numberofbugline.csv', index=False)

In [None]:
# Re-load the filtered table (rows whose counts differ) and display its named columns.
data_diff = pd.read_csv(valid_bugintro_dir + 'bugintrocid_with_different_numberofbugline.csv')
data_diff.loc[:, cols]

In [None]:
# Number of rows whose per-algorithm counts differ.
data_diff.shape[0]

In [None]:
# Number of rows in the full joined table.
join_validbugintro.shape[0]

# Percentage of bug-intro commit IDs having a different number of buglines

In [None]:
# Share (in percent) of joined rows whose counts differ between algorithms;
# the_rest is the complementary share with identical counts.
percentage = len(data_diff) / len(join_validbugintro) * 100
the_rest = 100 - percentage
print("{0:.2f}%".format(percentage))

In [None]:
# Pie chart of the two shares computed above (differing vs. identical counts).
labels = ['having different number of buglines', 'having same number of buglines']
values = [percentage, the_rest]
colors = ['#E1396C', '#96D38C']

# Slices are labelled with their raw value; hovering shows the label and percent.
trace = go.Pie(
    labels=labels,
    values=values,
    hoverinfo='label+percent',
    textinfo='value',
    textfont={'size': 15},
    marker={'colors': colors, 'line': {'color': '#000000', 'width': 2}},
)

data = [trace]
layout = go.Layout(title="The percentage of validated bug-intro commit id for all algorithms in OPENJPA Project")

# Enable plotly's notebook rendering, then draw the figure inline.
init_notebook_mode(connected=True)
fig = go.Figure(data=data, layout=layout)
iplot(fig, show_link=False)

# MYERS VS HISTOGRAM

Load dataset

In [None]:
# Reload the joined table, keeping only the commit ID with its Histogram and Myers counts.
headcols = [
    'bugintro_commitID',
    'totalnumber_of_buggyline_histogram',
    'totalnumber_of_buggyline_myers',
]
validbugintro_myershist = pd.read_csv(valid_bugintro_dir + 'joining_validated_bugintro_commitID.csv')[headcols]
validbugintro_myershist

Keeping only the commits whose number of buglines differs between Myers and Histogram

In [None]:
#Remove data having same number of buglines: a row is kept only when its last two
#columns (the Histogram and Myers counts) hold different values
df_diffbugline_myershist = validbugintro_myershist[validbugintro_myershist.iloc[:, -2:].nunique(axis=1).gt(1)]
df_diffbugline_myershist = df_diffbugline_myershist.sort_values('totalnumber_of_buggyline_histogram', ascending=True)

#Save to CSV file; index=False keeps the positional index out of the file, so reading it
#back does not produce a spurious "Unnamed: 0" column
df_diffbugline_myershist.to_csv(valid_bugintro_dir + '01_myers_vs_histogram/bugintrocid_with_different_numberofbugline_myershist.csv', index=False)

In [None]:
# Re-load the Myers-vs-Histogram filtered table and display its named columns.
diff_myershist = pd.read_csv(valid_bugintro_dir + '01_myers_vs_histogram/bugintrocid_with_different_numberofbugline_myershist.csv')
diff_myershist.loc[:, headcols]

Percentage of bug-intro commit IDs having a different number of buglines

In [None]:
# Share (in percent) of rows whose Histogram and Myers counts differ;
# rest_myershist is the complementary share with identical counts.
percentage_myershist = len(diff_myershist) / len(validbugintro_myershist) * 100
rest_myershist = 100 - percentage_myershist
print("{0:.2f}%".format(percentage_myershist))

In [None]:
# Pie chart of the Myers-vs-Histogram shares computed above (differing vs. identical counts).
labels = ['having different number of buglines', 'having same number of buglines']
values = [percentage_myershist, rest_myershist]
colors = ['#E1396C', '#96D38C']

# Slices are labelled with their raw value; hovering shows the label and percent.
trace = go.Pie(
    labels=labels,
    values=values,
    hoverinfo='label+percent',
    textinfo='value',
    textfont={'size': 15},
    marker={'colors': colors, 'line': {'color': '#000000', 'width': 2}},
)

data = [trace]
layout = go.Layout(title="The percentage of validated bug-intro commit id for Myers vs Histogram in OPENJPA Project")

# Enable plotly's notebook rendering, then draw the figure inline.
init_notebook_mode(connected=True)
fig = go.Figure(data=data, layout=layout)
iplot(fig, show_link=False)