# Import libraries

In [None]:
import os, csv
import pandas as pd
from os import path
import plotly.graph_objs as go
from plotly.offline import plot, init_notebook_mode, iplot

%matplotlib inline

# Configure directory

In [None]:
# Build the project-specific paths from the identity file under the user's home.
userhome = os.path.expanduser('~')
# The first line of the identity file is used as the project name and the
# second line as the project key. The `with` block guarantees the file handle
# is closed once it has been read.
with open(userhome + r"/DifferentDiffAlgorithms/SZZ/code_document/project_identity.txt", "r") as txt_file:
    pid = txt_file.read().split('\n')
project = pid[0]
bugidentifier = pid[1]
# Upper-cased project name, used later for report output.
proj = project.upper()
# Root directory of this project's analysis files (ends with a slash so that
# relative paths can be appended directly).
analyze_dir = userhome + r'/DifferentDiffAlgorithms/SZZ/projects_analyses/' + project + '/'

print ("Project name = %s" % project)
print ("Project key = %s" % bugidentifier)

# Load Dataset

In [None]:
# Columns kept from each algorithm's CSV: four shared identifying columns
# followed by that algorithm's own "#validbugline" column.
key_columns = ['bug_id', 'bugfix_commitID', 'parent_id', 'filepath']
mcolumns = [*key_columns, 'myers_#validbugline']
hcolumns = [*key_columns, 'histogram_#validbugline']

In [None]:
# Read each algorithm's valid-files CSV and keep only the selected columns,
# in the order given by the corresponding column list.
myers_csv = analyze_dir + "05_validation/02_validfiles/myers_valid_files.csv"
hist_csv = analyze_dir + "05_validation/02_validfiles/histogram_valid_files.csv"
ds_myers = pd.read_csv(myers_csv)[mcolumns]
ds_hist = pd.read_csv(hist_csv)[hcolumns]

In [None]:
# Bare expression: the notebook renders the histogram dataset as the cell output.
ds_hist

In [None]:
# Bare expression: the notebook renders the Myers dataset as the cell output.
ds_myers

# Merge the two datasets

In [None]:
# Outer-join the two datasets on their identifying columns so that rows present
# in only one of them are kept; the resulting missing values are replaced by 0.
merge_keys = ['bug_id','bugfix_commitID','parent_id','filepath']
data_merge = (
    ds_hist.merge(ds_myers, on=merge_keys, how='outer')
    .fillna(0)
    .reset_index(drop=True)
)
data_merge

# Capture only the data that differ

Number of valid files having a different number of bug-related lines

In [None]:
# Keep the rows whose last two columns hold more than one distinct value,
# i.e. rows where the two trailing count columns disagree.
trailing_counts = data_merge.iloc[:, -2:]
counts_differ = trailing_counts.nunique(axis=1) > 1
df_diffcid = data_merge[counts_differ]

# Write the differing rows to a CSV file (index column omitted)
diffcid_csv = analyze_dir + '05_validation/02_validfiles/validfiles_with_different_numberofbugline.csv'
df_diffcid.to_csv(diffcid_csv, index=False)
df_diffcid

Number of bug-fix commit IDs having a different number of bug-related lines

In [None]:
# Sum both line-count columns per bug-fix commit ID; the commit ID stays a
# regular column because of as_index=False.
summed_columns = ["histogram_#validbugline", "myers_#validbugline"]
commits = data_merge.groupby('bugfix_commitID', as_index=False)
df_bugfix = commits.agg({column: "sum" for column in summed_columns})
df_bugfix

In [None]:
# Keep the commits whose last two columns (the two summed counts) hold more
# than one distinct value, i.e. the sums disagree.
summed_counts = df_bugfix.iloc[:, -2:]
sums_differ = summed_counts.nunique(axis=1) > 1
df_diffbugfix = df_bugfix[sums_differ]

# Write the differing commits to a CSV file (index column omitted)
diffbugfix_csv = analyze_dir + '05_validation/03_validbugfixcommitid/validbugfixcid_with_different_numberofbugline.csv'
df_diffbugfix.to_csv(diffbugfix_csv, index=False)
df_diffbugfix

# Counting the percentage and visualizing the result

In [None]:
# Share of bug-fix commit ids whose summed counts differ, expressed as a
# percentage of all valid bug-fix commit ids, plus the complementary share.
total_bugfix = len(df_bugfix)
different_bugfix = len(df_diffbugfix)
# With no valid bug-fix commit ids the division would raise ZeroDivisionError;
# report 0% different in that case instead of aborting the notebook.
percent = (different_bugfix / total_bugfix) * 100 if total_bugfix else 0.0
rest = 100 - percent
print ("Project name: {}".format(proj))
print ("Number of valid bug-fix commit ids having different result: {}".format(different_bugfix))
print ("Number of valid bug-fix commit ids having same result: {}".format(total_bugfix - different_bugfix))
print ("Total valid bug-fix commit ids: {}".format(total_bugfix))
print ("The percentage of different valid bug-fix commit id: {0:.2f}%".format(percent))

In [None]:
# Render a two-slice pie chart (different vs. same share) inline in the notebook.
init_notebook_mode(connected=True)

labels = [
    'different number of valid bug-fix commit ids',
    'same number of valid bug-fix commit ids',
]
values = [percent, rest]
colors = ['#E1396C', '#96D38C']

# Black outline drawn around each slice.
slice_outline = dict(color='#000000', width=2)
trace = go.Pie(
    labels=labels,
    values=values,
    hoverinfo='label+percent',
    textinfo='value',
    textfont=dict(size=15),
    marker=dict(colors=colors, line=slice_outline),
)

data = [trace]
layout = go.Layout(
    title="The percentage of valid bug-fix commit id based on the different number of diffs in " + proj + " Project"
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, show_link=False)