# Lucene - Data Collection

## Finding the release with the most bugs

Read issue files.

In [138]:
import json
from pprint import pprint
from os import listdir
from os.path import isfile, join
import operator

# Directory holding one JSON document per issue.
mypath = "issue_LUCENE"

onlyfiles = [ f for f in listdir(mypath) if isfile(join(mypath,f)) and f.endswith(".json") ]
version_bugs = {}      # affected version name -> keys of the matching bugs
bug_fixed_dates = {}   # issue key -> "resolutiondate" value of that issue

for f in onlyfiles:
    with open(join(mypath,f)) as data_file:
        data = json.load(data_file)

    fields = data["fields"]
    # The resolution may be absent or null (presumably for unresolved issues);
    # fall back to an empty dict so such an issue is skipped instead of raising.
    resolution = fields.get("resolution") or {}

    # Only bugs that were closed as fixed are counted.
    if (fields["issuetype"]["name"] == "Bug" and
        fields["status"]["name"] == "Closed" and
        resolution.get("name") == "Fixed"):

        bug_fixed_dates[data["key"]] = fields["resolutiondate"]

        # Register the bug under every version listed in "versions".
        # setdefault replaces the deprecated dict.has_key() test.
        for v in fields["versions"]:
            version_bugs.setdefault(v["name"], []).append(data["key"])

From the dictionary of bugs associated with each release, we extract the buggiest version and its bugs.

In [139]:
most_bugs = 0
buggy_version = ""

for k,v in version_bugs.iteritems():
    if len(v) > most_bugs:
        most_bugs = len(v)
        buggy_version = k
        
bugs = {}

for b in version_bugs[buggy_version]:
    bugs[b] = []

print "Version with most closed bugs:", buggy_version , "with", len (version_bugs[buggy_version]), "bugs."

Version with most closed bugs: 4.0-ALPHA with 179 bugs.


Find the date of the most recent bug fix that affects the chosen version.

In [140]:
import datetime

latest_date = "1999"   # date for the most recent bug fix

for b in bugs:
    latest_date = (bug_fixed_dates[b] if bug_fixed_dates[b] > latest_date else latest_date)

latest_date = latest_date[:10]

print "The day of the most recent bug fix is", latest_date

The day of the most recent bug fix is 2014-01-24


For some reason `git shortlog -s -n` was not working, so the contributors of each file are counted from the output of `git log` instead.

In [141]:
import sh
import re
import os

if not os.path.exists("lucene-solr"):
    sh.git.clone("https://github.com/apache/lucene-solr.git")
    
git = sh.git.bake(_cwd='lucene-solr')

git.checkout("tags/lucene_solr_4_0_0_ALPHA")


files = filter(None,git("ls-files").split("\n"))

print "Reading repository..."

#git shortlog -s -n

table = {}
#count = 0 #TODO remove

for f in files:
    contributors_data = filter(None, sh.uniq(sh.sort(git.log("--format=format:%an", f)), "-c").split("\n"))
    contributors = []
    total = 0
    max_ownership = 0
    minor = 0
    major = 0
    
    for a in contributors_data:
        num = int(re.search("[0-9]+", a).group(0))
        name = re.search("([A-z]+\s*)+", a).group(0)
        total += num
        max_ownership = num if num > max_ownership else max_ownership
        contributors.append((name,num))
    
    for a in contributors:

        if a[1] * 1.0 / total >= 0.05:
            major += 1
        else:
            minor += 1
   
    #count += 1 #TODO delete
    #if count >= 100: #TODO delete
    #    break #TODO delete
        
    table[f]= {"minor": minor, "major": major, "total": minor + major, "ownership": (float("{0:.2f}".format(max_ownership * 1.0 / total * 100))), "num_of_bugs":0}

print "...finished reading"


Reading repository...
...finished reading


## Finding commits that fix bugs of the chosen release

In [142]:
#git rev-list --topo-order HEAD..towards | tail -1
#git rev-parse trunk

#last_commit_hash = git("rev-parse", "trunk")
#print last_commit_hash

shaLatest = (git("rev-list","-n 1","--before=\"" + latest_date + "23:59\"","trunk")).stdout[:-1]
print "Last bug fix date detected:", shaLatest

commit_list = filter(None, git("rev-list", "--topo-order", "HEAD.." + shaLatest).split("\n"))
print "We have", len(commit_list), "commits to analyse."

# git log --format=%B -n 1 7bfa4fd6505e86c7481526bca7b157055a7b4ead

for commit in commit_list:
    message = str(git.log("--format=%B", "-n 1", commit))
    match = re.search("LUCENE-[0-9]+", message)

    if match:
        key = match.group(0).strip()
        
        if key in bugs: 
            #print key, " ",
            files_changed = filter(None, git("diff-tree", "--no-commit-id", "--name-only", "-r", commit).split("\n"))
            bugs[key] = files_changed
   
print "Identified fixes for", len ({k for (k,v) in bugs.iteritems() if len(v) > 0}) ,"out of", str(len(bugs)) + "."

Last bug fix date detected: ba560c7484c1df260ae78b414749fa81af998231
We have 7342 commits to analyse.
Identified fixes for 32 out of 179.


## Assign number of bugs to each file

Count number of bugs in each file by checking the files modified in the correction of each bug.

In [143]:
# Start from zero so that re-running this cell does not add to the counts
# left by a previous run.
for stats in table.itervalues():
    stats["num_of_bugs"] = 0

# A file gets one point for every bug whose fix touched it; set() ensures a
# bug counts at most once per file.
for key, value in bugs.iteritems():
    for path in set(value):
        if path in table:
            table[path]["num_of_bugs"] += 1

## Write CSV

In [144]:
f = open("data.csv", "w")
f.write("file_name, minor, major, total, ownership, num_of_bugs\n")

for k,v in table.iteritems():
    f.write(k + "," + str(v["minor"]) + ", " + str(v["major"]) + ", " + str(v["total"]) + ", " + str(v["ownership"]) + "%, " + str(v["num_of_bugs"]) +"\n")

f.close()

print "CSV file saved as data.csv"

CSV file saved as data.csv
