# Importing libraries

In [None]:
import csv
import operator
import os
import re
import shlex
import subprocess
import sys
import urllib.request
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup

# Configure repository and directories

In [None]:
# Read the project name and project key from the identity file, then derive
# the repository and analysis directories from the project name.
userhome = os.path.expanduser('~')
# The context manager closes the handle as soon as the content has been read.
with open(userhome + r"/DifferentDiffAlgorithms/SZZ/code_document/project_identity.txt", "r") as txt_file:
    pid = txt_file.read().split('\n')
project = pid[0]         # first line of the identity file
bugidentifier = pid[1]   # second line of the identity file
repository = userhome + r'/DifferentDiffAlgorithms/SZZ/datasource/' + project + '/'
analyze_dir = userhome + r'/DifferentDiffAlgorithms/SZZ/projects_analyses/' + project + '/'

print ("Project name = %s" % project)
print ("Project key = %s" % bugidentifier)

# Load the text file containing the bug ids

In [None]:
# Read the candidate bug ids, one per line.
with open(analyze_dir + "01_bug_ids_extraction/candidate_bug_ids.txt", "r") as txtfile:
    # Drop blank lines (e.g. the one produced by a trailing newline) so that
    # an empty string is never counted, or later requested, as a bug id.
    bug_links = [line.strip() for line in txtfile if line.strip()]
print ("Found " + str(len(bug_links)) + " bug_ids")

# Finding affected versions by bug ids

In [None]:
# For every bug id, download its issue page and read the versions listed in
# the span with id 'versions-val'. Each result is stored as
# [bug_id, version, version, ...]; ids that could not be processed are
# collected in error_links so they can be retried.
error_links = []
affected_version = []
for a,b in enumerate(bug_links):
    link = "https://issues.apache.org/jira/browse/" + b
    sys.stdout.write("\r%i " %(a+1) + "Extracting: " + b)
    sys.stdout.flush()
    try:
        # Close the HTTP response as soon as the page has been parsed.
        with urllib.request.urlopen(link) as page:
            soup = BeautifulSoup(page, 'html.parser')
        aff_version = soup.find('span', attrs={'id':'versions-val'}).text.replace("\n",'').replace(" M",'-M').replace(" ",'').replace(".x",'.').split(",")
        # NOTE(review): this is a plain string sort, so "1.10" orders before
        # "1.2" -- confirm this is acceptable for picking the earliest version.
        aff_version = sorted(aff_version)
        aff_version.insert(0,b)
        affected_version.append(aff_version)
    except Exception:
        # Download failed or the page has no versions field. Catching
        # Exception (instead of a bare except) keeps Ctrl-C working during
        # this long-running loop.
        error_links.append(b)

print("\nExtraction has been completed.")

In [None]:
# Display the contents of error_links
print(error_links)

In [None]:
# Retry the bug ids whose affected versions could not be captured in the
# first pass (for example because of network problems). Ids that fail again
# end up in errorlinks. Iterating an empty error_links is a no-op, so no
# explicit emptiness check is needed.
errorlinks = []
for c,d in enumerate(error_links):
    link = "https://issues.apache.org/jira/browse/" + d
    sys.stdout.write("\r%i " %(c+1) + "Extracting: " + d)
    sys.stdout.flush()
    try:
        # Close the HTTP response as soon as the page has been parsed.
        with urllib.request.urlopen(link) as page:
            soup = BeautifulSoup(page, 'html.parser')
        types = soup.find('span', attrs={'id':'versions-val'}).text.replace("\n",'').replace(" M",'-M').replace(" ",'').replace(".x",'.').split(",")
        types = sorted(types)
        types.insert(0, d)
        affected_version.append(types)
    except Exception:
        # Catch Exception rather than everything so that Ctrl-C still
        # interrupts the loop.
        errorlinks.append(d)

print ("\nExtraction is complete")

In [None]:
# Display the contents of errorlinks
print(errorlinks)

In [None]:
# Sort the collected records in place (each record is a list of strings
# that starts with the bug id).
affected_version.sort()

# Keep only the first two fields of every record -- the bug id and the first
# version of its sorted version list -- then print them in sorted order.
earliest_version = sorted(record[:2] for record in affected_version)
for pair in earliest_version:
    print(pair)

# Defining the function for git command

In [None]:
def execute_command(cmd, work_dir):
    """Run a shell command in a subprocess and wait until it has completed.

    Args:
        cmd: Command line, passed to the shell as a single string.
        work_dir: Directory in which the command is executed.

    Returns:
        A tuple ``(out, error)`` holding the raw bytes the command wrote to
        standard output and standard error.
    """
    pipe = subprocess.Popen(cmd, shell=True, cwd=work_dir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() reads both streams to the end and waits for the process
    # to exit, so no separate wait() call is required.
    out, error = pipe.communicate()
    return out, error

# Finding the versions related to the earliest version

In [None]:
# For every [bug_id, version] pair, list the git tags whose name contains the
# version string. Each result is stored as [bug_id, tag, tag, ...]; versions
# without a "." are recorded as [bug_id, 'None'].
related_version = []
for item in earliest_version:
    if "." in item[1]:
        # The version text was scraped from a web page and is handed to a
        # shell, so quote it instead of pasting it into the command verbatim.
        git_cmd = 'git tag -l ' + shlex.quote('*' + item[1] + '*')
        # Decode stdout directly rather than parsing the repr of the
        # (stdout, stderr) tuple, which breaks on error output and on tag
        # names containing parentheses or quotes.
        tag_out, tag_err = execute_command(git_cmd, repository)
        temp = tag_out.decode('utf-8', errors='replace').splitlines()
        if temp == []:
            # No matching tag: fall back to the version string itself.
            temp = [item[1].replace("Java-SCA-","")]
    else:
        temp = ['None']

    temp.insert(0, item[0])
    related_version.append(temp)

for xx in related_version:
    print (xx)

# Finding the release date of each affected version

In [None]:
# For every tag related to a bug, ask git for the date of the commit the tag
# points to and collect rows of the form [bug_id, tag, date, time, tz].
date_release = []
for n, item in enumerate(related_version):
    sys.stdout.write("\rFinding datetime for version {}: {}".format(n+1, item[0]))
    sys.stdout.flush()
    if item[1] == "None":
        # No usable version was found for this bug.
        continue
    for tag in item[1:]:
        # Quote the tag name because the command is run through a shell.
        git_cmd = "git log -1 --format=%ai " + shlex.quote(tag)
        log_out, log_err = execute_command(git_cmd, repository)
        lines = log_out.decode('utf-8', errors='replace').splitlines()
        if not lines or not lines[0].strip():
            # git printed nothing on stdout (e.g. unknown revision): skip.
            continue
        date_release.append([item[0], tag] + lines[0].split(" "))

# Order by bug id and date once, after the loop, instead of re-sorting the
# whole list after every append.
date_release.sort(key=operator.itemgetter(0, 2))
# Stable sort: bug ids in descending order, dates still ascending per bug.
date_release = sorted(date_release, key=operator.itemgetter(0), reverse=True)
print ("\nThe process is finish")

In [None]:
# Save the collected rows in a CSV file.
# newline='' lets the csv module control the line endings itself; without it
# an extra blank line appears after every row on platforms that translate
# '\n' on write.
with open(analyze_dir + '04_affected_versions/affected_version.csv', 'w', newline='') as csvfile:
    writers = csv.writer(csvfile)
    writers.writerow(['bug_id','earliest_affected_version','date_release','time_release','tz'])
    writers.writerows(date_release)

In [None]:
# Load the affected-version CSV into a DataFrame and display it.
affected_csv = analyze_dir + '04_affected_versions/affected_version.csv'
df = pd.read_csv(affected_csv)
df

In [None]:
# Take the first row of every bug_id group, order the result by release date,
# release time and version, write it to disk and display it.
earliest_vers = (
    df.groupby('bug_id', as_index=False)
    .first()
    .sort_values(['date_release', 'time_release', 'earliest_affected_version'], ascending=True)
)
earliest_csv = analyze_dir + '04_affected_versions/earliest_version.csv'
earliest_vers.to_csv(earliest_csv, index=False)
earliest_vers

# Joining 2 csv files: list of annotated files and earliest affected versions

In [None]:
# Columns taken from the diff/blame combination file.
colname = [
    'bug_id',
    'bugfix_commitID',
    'parent_id',
    'filepath',
    'diff_myers_file',
    'diff_histogram_file',
    'blame_myers_file',
    'blame_histogram_file',
    '#deletions_myers',
    '#deletions_histogram',
]
combination_csv = analyze_dir + '03_annotate/01_annotated_files/listof_diff_n_annotated_files/diff_n_blame_combination_files.csv'
filedata = pd.read_csv(combination_csv)[colname]

# Attach the earliest affected version and its release date to every file
# row, matching on bug_id, then write the combined table to disk.
version_lookup = earliest_vers.set_index('bug_id')[['earliest_affected_version','date_release']]
details = filedata.join(version_lookup, on='bug_id')
details.to_csv(analyze_dir + '04_affected_versions/affected_version_for_identified_files.csv', index=False)

print ("Affected version for identified files has been created")