# Importing libraries

In [None]:
import os, glob, json, csv, subprocess, sys, re, operator
from git import *
from subprocess import Popen, PIPE
from os import path
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import plot, init_notebook_mode, iplot

%matplotlib inline

# Defining repository and directories

In [None]:
# Home directory of the current user; both project paths below are rooted in it.
userhome = os.path.expanduser('~')
# Dataset directory and analysis-output directory (both keep their trailing slash,
# because later cells build file paths by plain string concatenation).
repository = ''.join([userhome, '/different-diff/dataset/openjpa/'])
analyze_dir = ''.join([userhome, '/different-diff/analyze/analyze_openjpa/'])

# Loading the number-of-files dataset for each diff algorithm

In [None]:
# Diff algorithm names; later cells use each name as a file-name prefix.
algorithms = [
    'histogram',
    'minimal',
    'myers',
    'patience',
]

Adding a `file_key` column to both CSV files so that they can be joined on it

In [None]:
# Copy each algorithm's bug-line CSV and append a 'file_key' column built from
# columns 0, 1, 2 (first 10 characters only) and 4 plus the 1-based data-row number.
# Rows are streamed straight to the output rather than collected in a list called
# `all`, which shadowed the built-in all() for the rest of the kernel session.
for alg in algorithms:
    bugline_in = analyze_dir + "03_file-diff/03_number_of_files/" + alg + "_total_bugline.csv"
    bugline_out = analyze_dir + "04_annotate/05_matching_affectedfiles/temp_" + alg + "_bugline.csv"
    # newline='' leaves newline handling to the csv module, as its documentation requires.
    with open(bugline_in, 'r', newline='') as csvinput, open(bugline_out, 'w', newline='') as csvoutput:
        reader = csv.reader(csvinput)
        writer = csv.writer(csvoutput, lineterminator='\n')

        # The first row is the header: extend it with the new column name.
        header = next(reader)
        header.append('file_key')
        writer.writerow(header)

        # The row number makes the key unique per row; it only matches the key in the
        # affected-files CSV if both files list their rows in the same order
        # -- NOTE(review): confirm that ordering assumption holds for the inputs.
        for num, row in enumerate(reader, start=1):
            row.append(row[0] + "_" + row[1] + "-" + row[2][:10] + "_" + row[4] + "_" + str(num))
            writer.writerow(row)

In [None]:
# Copy each algorithm's affected-files CSV and append a 'file_key' column built from
# columns 1, 3, 4 and 2 plus the 1-based data-row number.
# Rows are streamed straight to the output rather than collected in a list called
# `all`, which shadowed the built-in all() for the rest of the kernel session.
for alg in algorithms:
    aff_in = analyze_dir + "04_annotate/04_grouping_affected_non-affected_files/" + alg + "_affected_files.csv"
    aff_out = analyze_dir + "04_annotate/05_matching_affectedfiles/temp_" + alg + "_aff_files.csv"
    # newline='' leaves newline handling to the csv module, as its documentation requires.
    with open(aff_in, 'r', newline='') as csvinput, open(aff_out, 'w', newline='') as csvoutput:
        reader = csv.reader(csvinput)
        writer = csv.writer(csvoutput, lineterminator='\n')

        # The first row is the header: extend it with the new column name.
        header = next(reader)
        header.append('file_key')
        writer.writerow(header)

        # The row number makes the key unique per row; it only matches the key in the
        # bug-line CSV if both files list their rows in the same order
        # -- NOTE(review): confirm that ordering assumption holds for the inputs.
        for num, row in enumerate(reader, start=1):
            row.append(row[1] + "_" + row[3] + "-" + row[4] + "_" + row[2] + "_" + str(num))
            writer.writerow(row)

# Joining bugline files and affected files

In [None]:
# For each algorithm, attach 'filenumber' and 'affect_version?' from the affected-files
# table to the bug-line table via 'file_key', drop rows whose buggy-line count is 0,
# and save the result. `details` keeps the table of the last algorithm processed.
match_dir = analyze_dir + '04_annotate/05_matching_affectedfiles/temp_'
for algo in algorithms:
    buglines = pd.read_csv(match_dir + algo + '_bugline.csv')
    affected = pd.read_csv(match_dir + algo + '_aff_files.csv')

    # Lookup table keyed by file_key, carrying only the two columns to attach.
    aff_lookup = affected.set_index('file_key')[['filenumber','affect_version?']]
    joined = buglines.join(aff_lookup, on='file_key')

    details = joined[joined['number_of_buggyline'] != 0]
    details.to_csv(analyze_dir + '05_finding_versions/02_files_affect/' + algo + '_joining_files.csv')

In [None]:
# Display the joined table left in `details` by the last iteration of the loop above
# (i.e. for the final entry of `algorithms`).
details

# Separating affected and non-affected files

In [None]:
# Columns kept, in this order, when the joined table is split into affected and
# non-affected files.
headname = [
    'bug_id',
    'bugintro_commitID',
    'parent_id',
    'filepath',
    'filename',
    'number_of_buggyline',
    'file_key',
    'filenumber',
    'affect_version?',
]

In [None]:
# Split each algorithm's joined table by the value of 'affect_version?' and write the
# two subsets (restricted to the `headname` columns) to their own CSV files.
files_affect_dir = analyze_dir + "05_finding_versions/02_files_affect/"
for algor in algorithms:
    ds = pd.read_csv(files_affect_dir + algor + "_joining_files.csv")
    verdict = ds['affect_version?']

    # Row mask and column list are applied in a single .loc selection.
    ds_affect = ds.loc[verdict == 'Affect the version', headname]
    ds_nonaffect = ds.loc[verdict == 'Does not affect the version', headname]

    ds_affect.to_csv(files_affect_dir + "01_affected_files/" + algor + "_affected_files.csv")
    ds_nonaffect.to_csv(files_affect_dir + "02_non-affected_files/" + algor + "_non-affected_files.csv")

# Defining validated file as a bug-introducing change file

In [None]:
# For each algorithm, total the buggy lines per file path in the affected-files table
# and save the files ordered from most to fewest buggy lines.
for algrt in algorithms:
    affected = pd.read_csv(analyze_dir + "05_finding_versions/02_files_affect/01_affected_files/" + algrt + "_affected_files.csv")

    # Counts that cannot be parsed as numbers become NaN before grouping.
    buggy_lines = pd.to_numeric(affected['number_of_buggyline'], errors='coerce')
    per_file = (buggy_lines.groupby(affected['filepath'])
                           .sum()
                           .rename('totalnumber_of_buggyline')
                           .reset_index())

    # NOTE(review): sum() skips NaN by default, so this dropna looks like a no-op;
    # confirm whether all-NaN groups were meant to be removed.
    valid_file = (per_file.dropna(subset=['totalnumber_of_buggyline'])
                          .sort_values('totalnumber_of_buggyline', ascending=False)
                          [['filepath','totalnumber_of_buggyline']])
    valid_file.to_csv(analyze_dir + "05_finding_versions/03_filtering/02_valid_files/" + algrt + "_valid_files.csv", index=False)

In [None]:
# Read the histogram algorithm's validated-files CSV back from disk and display it.
uniq_hist_file = pd.read_csv(analyze_dir + "05_finding_versions/03_filtering/02_valid_files/histogram_valid_files.csv")
uniq_hist_file

In [None]:
# Build one [algorithm, total buggy lines, number of files] entry per algorithm
# from the validated-files CSVs.
total_bugline = []
for algorit in algorithms:
    valid_files = pd.read_csv(analyze_dir + "05_finding_versions/03_filtering/02_valid_files/" + algorit + "_valid_files.csv")
    valid_files = valid_files[['filepath','totalnumber_of_buggyline']]
    filenum = valid_files['filepath'].count()
    # Sum the column by name. The previous row-by-row loop read `file.iloc[z][1]`,
    # i.e. an integer key on a label-indexed Series, a positional fallback that
    # pandas has deprecated.
    bugnumber = valid_files['totalnumber_of_buggyline'].sum()
    total_bugline.append([algorit, bugnumber, filenum])

In [None]:
# Display the per-algorithm [algorithm, total buggy lines, number of files] entries.
total_bugline

# Finding validated bug-introducing change commit id

In [None]:
# For each algorithm, total the buggy lines per bug-introducing commit id in the
# affected-files table and save the commits ordered from most to fewest buggy lines.
for algrt in algorithms:
    affected = pd.read_csv(analyze_dir + "05_finding_versions/02_files_affect/01_affected_files/" + algrt + "_affected_files.csv")

    # Counts that cannot be parsed as numbers become NaN before grouping.
    buggy_lines = pd.to_numeric(affected['number_of_buggyline'], errors='coerce')
    per_commit = (buggy_lines.groupby(affected['bugintro_commitID'])
                             .sum()
                             .rename('totalnumber_of_buggyline')
                             .reset_index())

    # NOTE(review): sum() skips NaN by default, so this dropna looks like a no-op;
    # confirm whether all-NaN groups were meant to be removed.
    valid_bugintro = (per_commit.dropna(subset=['totalnumber_of_buggyline'])
                                .sort_values('totalnumber_of_buggyline', ascending=False)
                                [['bugintro_commitID','totalnumber_of_buggyline']])
    valid_bugintro.to_csv(analyze_dir + "05_finding_versions/03_filtering/01_valid_bugintro_commitid/" + algrt + "_valid_bugintro_commitid.csv", index=False)

In [None]:
# Count the non-null bug-introducing commit ids saved for each algorithm, giving one
# [algorithm, count] pair per algorithm; the list is displayed at the end of the cell.
bugintro_dir = analyze_dir + "05_finding_versions/03_filtering/01_valid_bugintro_commitid/"
numof_bugintro = []
for algorit in algorithms:
    bugintro = pd.read_csv(bugintro_dir + algorit + "_valid_bugintro_commitid.csv")
    commit_ids = bugintro[['bugintro_commitID','totalnumber_of_buggyline']]['bugintro_commitID']
    numof_bugintro.append([algorit, commit_ids.count()])
numof_bugintro

# Finding validated candidate bugfix commit id

In [None]:
# Load the CSV of files in candidate bug-fix commits and display it.
candidate_bugfix = pd.read_csv(analyze_dir + "02_extracting_commitid/file_in_candidatebugfix_commitid.csv")
candidate_bugfix

Filter validated bug ids and files

In [None]:
# Filter valid bug ids: for each algorithm, total the buggy lines per bug id in the
# affected-files table and save the ids ordered from most to fewest buggy lines.
for algo in algorithms:
    affected = pd.read_csv(analyze_dir + "05_finding_versions/02_files_affect/01_affected_files/" + algo + "_affected_files.csv")

    # Counts that cannot be parsed as numbers become NaN before grouping.
    buggy_lines = pd.to_numeric(affected['number_of_buggyline'], errors='coerce')
    per_bug = (buggy_lines.groupby(affected['bug_id'])
                          .sum()
                          .rename('totalnumber_of_buggyline')
                          .reset_index())

    # NOTE(review): sum() skips NaN by default, so this dropna looks like a no-op;
    # confirm whether all-NaN groups were meant to be removed.
    valid_bugid = (per_bug.dropna(subset=['totalnumber_of_buggyline'])
                          .sort_values('totalnumber_of_buggyline', ascending=False)
                          [['bug_id','totalnumber_of_buggyline']])
    valid_bugid.to_csv(analyze_dir + "05_finding_versions/03_filtering/03_valid_bugfix_commitid/valid_bug_id/" + algo + "_valid_bugid.csv", index=False)

In [None]:
# Filter validated files and save them into a txt file: one sorted file path per line,
# with no trailing newline, for each algorithm.
for alg in algorithms:
    uniq_file = pd.read_csv(analyze_dir + "05_finding_versions/03_filtering/02_valid_files/" + alg + "_valid_files.csv")
    # Take the whole first column at once. The previous loop used `uniq_file.iloc[i][0]`,
    # an integer key on a label-indexed row Series, which pandas has deprecated.
    valid_file = sorted(uniq_file.iloc[:, 0].tolist())

    with open(analyze_dir + "05_finding_versions/03_filtering/02_valid_files/txt/" + alg + "valid_file.txt", mode="wt", encoding="utf-8") as myfile:
        myfile.write('\n'.join(valid_file))

In [None]:
# Filter validated bug ids and save them into a txt file: one sorted bug id per line,
# with no trailing newline, for each algorithm.
for alg in algorithms:
    uniq_bugid = pd.read_csv(analyze_dir + "05_finding_versions/03_filtering/03_valid_bugfix_commitid/valid_bug_id/" + alg + "_valid_bugid.csv")
    # Take the whole first column at once. The previous loop used `uniq_bugid.iloc[i][0]`,
    # an integer key on a label-indexed row Series, which pandas has deprecated.
    valid_bugid = sorted(uniq_bugid.iloc[:, 0].tolist())

    with open(analyze_dir + "05_finding_versions/03_filtering/03_valid_bugfix_commitid/valid_bug_id/txt/" + alg + "valid_bugid.txt", mode="wt", encoding="utf-8") as myfile:
        # str() is a no-op for textual ids and prevents a TypeError in join()
        # should pandas have parsed the id column as numbers.
        myfile.write('\n'.join(str(bug_id) for bug_id in valid_bugid))

In [None]:
# For each algorithm, copy to a new CSV those candidate bug-fix rows whose first column
# is a validated bug id and whose third column is a validated file.
filtering_dir = analyze_dir + "05_finding_versions/03_filtering/"
for algor in algorithms:
    # Read both allow-lists up front, inside `with` so the handles are closed, and as
    # UTF-8 because that is the encoding the txt files were written with. Sets give
    # constant-time membership tests in the row loop below.
    with open(filtering_dir + "03_valid_bugfix_commitid/valid_bug_id/txt/" + algor + "valid_bugid.txt", encoding="utf-8") as bugid_txt:
        val_bugid = set(bugid_txt.read().split("\n"))
    with open(filtering_dir + "02_valid_files/txt/" + algor + "valid_file.txt", encoding="utf-8") as file_txt:
        val_file = set(file_txt.read().split("\n"))

    # newline='' leaves newline handling to the csv module, as its documentation requires.
    with open(analyze_dir + "02_extracting_commitid/file_in_candidatebugfix_commitid.csv", 'r', newline='') as fin, \
         open(filtering_dir + "03_valid_bugfix_commitid/" + algor + "_validbugfix_commitid.csv", 'w', newline='') as fout:
        writers = csv.writer(fout, delimiter=',')
        c_name = ['validated_bug_id','validated_bugfix_commitid','validated_file']
        writers.writerow(c_name)

        for row in csv.reader(fin, delimiter=','):
            # Blank or short rows cannot match and would otherwise raise IndexError.
            if len(row) < 3:
                continue
            if row[0] in val_bugid and row[2] in val_file:
                writers.writerow(row)

Removing duplicate bugfix_commit_id

In [None]:
# For each algorithm, write the distinct values of the second column of the validated
# bug-fix CSV (the bug-fix commit ids) to a txt file, one per line, in order of first
# appearance.
bugfix_dir = analyze_dir + "05_finding_versions/03_filtering/03_valid_bugfix_commitid/"
for algrt in algorithms:
    valid_bugfix = pd.read_csv(bugfix_dir + algrt + "_validbugfix_commitid.csv")
    # dict.fromkeys de-duplicates in linear time while keeping insertion order. It replaces
    # a quadratic `not in list` scan that also relied on the deprecated integer-key
    # access `valid_bugfix.iloc[row][1]`.
    valid = list(dict.fromkeys(valid_bugfix.iloc[:, 1].tolist()))
    with open(bugfix_dir + "distinct_bugfix_commitid/" + algrt + "valid_bugfix_commitid.txt", mode="wt", encoding="utf-8") as myfile:
        myfile.write('\n'.join(valid))

In [None]:
# Read the histogram algorithm's distinct bug-fix commit ids back as a list of lines and
# display it. The file is opened in a `with` block so the handle is closed, and decoded
# as UTF-8 to match the encoding it was written with.
with open(analyze_dir + "05_finding_versions/03_filtering/03_valid_bugfix_commitid/distinct_bugfix_commitid/histogramvalid_bugfix_commitid.txt", encoding="utf-8") as bugfix_txt:
    val_bugfix = bugfix_txt.read().split("\n")
val_bugfix

In [None]:
# Count the distinct bug-fix commit ids stored for each algorithm, giving one
# [algorithm, count] pair per algorithm; the list is displayed at the end of the cell.
numof_bugfixcid = []
for algrt in algorithms:
    # `with` closes the handle; UTF-8 matches the encoding the file was written with.
    with open(analyze_dir + "05_finding_versions/03_filtering/03_valid_bugfix_commitid/distinct_bugfix_commitid/" + algrt + "valid_bugfix_commitid.txt", encoding="utf-8") as txtfile:
        # Ignore empty lines: ''.split("\n") is [''], so an empty file used to be
        # counted as one commit id instead of zero.
        commit_ids = [line for line in txtfile.read().split("\n") if line]
    numof_bugfixcid.append([algrt, len(commit_ids)])
numof_bugfixcid

In [None]:
# Merge total_bugline with numof_bugintro: append each algorithm's bug-intro commit
# count to its [algorithm, buglines, files] entry (looked up by algorithm name).
bugintro_by_algo = {name: count for name, count in numof_bugintro}
totbug_numbugintro = []
for stats in total_bugline:
    totbug_numbugintro.append(stats + [bugintro_by_algo[stats[0]]])
totbug_numbugintro

In [None]:
# Merge totbug_numbugintro with numof_bugfixcid: append each algorithm's bug-fix commit
# count to its existing entry (looked up by algorithm name).
bugfix_by_algo = {name: count for name, count in numof_bugfixcid}
join_list = []
for stats in totbug_numbugintro:
    join_list.append(stats + [bugfix_by_algo[stats[0]]])
join_list

In [None]:
# Transpose join_list so that each row holds one statistic across all algorithms,
# then put that statistic's label in front of its row.
label = ['algorithm','#validated_buglines', '#validated_files', '#validated_bug-intro_commitID', '#validated_bugfix_commitID']
result_matrix = list(map(list, zip(*join_list)))
for position in range(len(label)):
    result_matrix[position][:0] = [label[position]]
result_matrix

In [None]:
# Save the statistics table: a fixed header row, then every row of result_matrix except
# the first (the 'algorithm' row, which the header replaces).
with open(analyze_dir + "06_statistics/final_stats.csv","w") as finalfile:
    stats_writer = csv.writer(finalfile, delimiter=",")
    stats_writer.writerow(['item','histogram', 'minimal','myers','patience'])
    stats_writer.writerows(result_matrix[1:])

In [None]:
# Load the final statistics CSV into `data_stat` (used by the plotting cells) and display it.
data_stat = pd.read_csv(analyze_dir + "06_statistics/final_stats.csv")
data_stat

# Visualizing number of validated buglines in graphs

In [None]:
# Bar chart of the first row of `data_stat` (the validated bug-line totals), one bar
# per diff algorithm.
algo_names = ['histogram','minimal','myers','patience']
# Pick the algorithm columns by label and the first row by position. The previous
# `data_stat.iloc[0][1]` ... `[4]` lookups used integer keys on a label-indexed Series,
# a positional fallback that pandas has deprecated, and repeated the same four
# lookups for both `y` and `text`.
bugline_counts = data_stat[algo_names].iloc[0].tolist()

hist = go.Bar(
    x = algo_names,
    y = bugline_counts,
    text = bugline_counts,
    textposition = 'auto',
    marker = dict(
        color = 'rgb(150,255,200)',
        line = dict(
            color='rgb(8,48,107)',
            width=1.5),
    ),
    opacity=0.6
)

data = [hist]
layout = go.Layout(
    title = "Number of validated buglines in OPENJPA Project"
)

init_notebook_mode(connected=True)
fig = go.Figure(data=data, layout=layout)
iplot(fig, show_link=False)

# Visualizing number of validated files, bug-intro and bug-fix commit in graphs

In [None]:
# Display the statistics table again as a reference for the chart below.
data_stat

In [None]:
# Grouped bar chart: one trace per diff algorithm, each showing rows 1.. of that
# algorithm's `data_stat` column (validated files, bug-intro commits, bug-fix commits).
stat_labels = ['validated_files','validated_bug-intro_commitID','validated_bugfix_commitID']


def _count_trace(column, legend_name):
    """Return the bar trace for one `data_stat` column, skipping its first (bug-line) row."""
    counts = data_stat[column].iloc[1:]
    return go.Bar(
        x = list(stat_labels),
        y = counts,
        text = counts,
        textposition = 'auto',
        name = legend_name
    )


# (data_stat column, legend name) for each trace, in plotting order.
trace_specs = [
    ('histogram', 'Histogram'),
    ('minimal', 'minimal'),
    ('myers', 'Myers'),
    ('patience', 'Patience'),
]
numhist, nummin, nummyers, numpat = [_count_trace(column, legend) for column, legend in trace_specs]

data = [numhist, nummin, nummyers, numpat]
layout = go.Layout(
    title = "Number of validated files, bug-intro commit id and bugfix commit id in OPENJPA Project"
)

init_notebook_mode(connected=True)
fig = go.Figure(data=data, layout=layout)
iplot(fig, show_link=False)