# Data Cleaning
Delete every file in the manually labeled data (**release-level-data** from the replication kit https://zenodo.org/record/5675024#.Ya-B8fHML0o) that is not related to bug fixes, i.e. every file whose name does not end with `bug_fixes.json`.

In [56]:
import os

# Keep only the "*bug_fixes.json" files of the replication kit; every other
# entry of ./release-level-data is deleted.
for filename in os.listdir('./release-level-data'):
    if not filename.endswith("bug_fixes.json"):
        os.remove('./release-level-data/' + filename)
print("data cleaning done")

data cleaning done


# Extract traditional smells
Returns a dictionary of lists: each key is a traditional smell and each value is the list of classes that contain that smell.

In [2]:
import os
import subprocess
import configparser
import pandas as pd
import re
import shutil
import time
import sys

def extract_traditional_smells(project_name):
    """Run the DECOR detector on a project and group the smelly classes by smell.

    ``./DECOR_JAVA.jar`` is invoked through ``java -jar`` and every file found in
    ``../TEMP/<project_name>`` is then parsed with ``configparser`` (under a
    synthetic section header). Entries whose key has the form
    ``<digits>.<digits>.<letters>-<digit>`` are treated as smelly classes; the
    class name is the last dot-separated component of the entry's value.

    Args:
        project_name: Name passed (twice) to DECOR and used as the name of the
            result sub-directory under ``../TEMP/``.

    Returns:
        A dict mapping a smell name (last space-separated token of the result
        file name, without its extension) to the list of class names found in
        that file, or ``None`` when the result directory does not exist.

    Side effects:
        Prints progress messages and removes the whole ``../TEMP/`` directory
        after parsing (a failed removal is reported, not raised).
    """
    print("extracting traditional smells from repository: " + project_name)
    subprocess.call(['java', '-jar', './DECOR_JAVA.jar', project_name, project_name, ""])

    # check if the result of detection is generated
    result_files_path = "../TEMP/" + project_name
    if not os.path.isdir(result_files_path):
        print("ERROR: extracting traditionial smells failed")
        return None

    # Raw string: "\." and "\d" are invalid escapes in a normal string literal.
    smell_key_pattern = re.compile(r"^[0-9]+\.[0-9]+\.[a-zA-Z]+-\d$")

    # key: traditional smell, value: list of classes that contain the smell
    smell_classes_dict = {}
    for filename in os.listdir(result_files_path):
        file_path = result_files_path + "/" + filename
        # The result files have no section header, so a dummy one is prepended.
        with open(file_path) as file:
            file_content = '[dummy_section]\n' + file.read()

        # interpolation=None: values are plain text, so a literal '%' in any
        # value must not be treated as configparser interpolation syntax.
        config = configparser.ConfigParser(interpolation=None)
        config.read_string(file_content)

        smelly_classes = [
            value.split(".")[-1]
            for key, value in config.items('dummy_section')
            if smell_key_pattern.match(key)
        ]
        # map smell and classes containing the smell
        if smelly_classes:
            smell_name = filename.split(" ")[-1].split(".")[0]
            smell_classes_dict[smell_name] = smelly_classes

    # delete result of detection (best effort)
    try:
        shutil.rmtree("../TEMP/")
    except OSError as e:
        print("Error: %s - %s." % (e.filename, e.strerror))

    # Returning here instead of inside `finally` lets unexpected exceptions
    # propagate rather than being silently discarded.
    print("Traditional smells extracted for project: ", project_name)
    return smell_classes_dict

# Extract traditional smells test


In [49]:
# Smoke test: run the extractor on the 'ant-ivy' project and print the
# smell -> classes mapping it returns (None if the detection produced no output).
smell_classes_dict = extract_traditional_smells('ant-ivy')
print(smell_classes_dict)

extracting traditional smells from repository: ant-ivy
Traditional smells extracted for project:  ant-ivy
{'LazyClass': ['IvyWebdavFileSystem', 'IvyWebdavFileSystem'], 'LongMethod': ['VfsResourceTest', 'XmlModuleUpdaterTest', 'IvyCachePathTest', 'TestPerformance', 'IvyCacheTask', 'ModuleRevisionId', 'IvyVar', 'ResolveTest', 'CredentialsUtil', 'IBiblioResolverTest', 'SortTest', 'IvyArtifactReport', 'IvyInstall', 'VfsRepository', 'IvyTask', 'JarJarDependencyAnalyser', 'IvyWebdavClientFactory', 'VersionRangeMatcher', 'FileUtil', 'MRIDTransformationRuleTest', 'AbstractSshBasedRepository', 'SshRepository', 'Main', 'AntBuildTrigger', 'IvyReport', 'IvyBuildNumber', 'IvyCacheFileset', 'DualResolverTest', 'IvyPublishTest', 'ConfiguratorTest', 'IvyWebdavConnectionManager', 'ApacheURLLister', 'Message', 'JarModuleFinder', 'ChainResolverTest', 'IvyConfigure', 'DefaultModuleDescriptor', 'RepositoryResolver', 'IvyListModules', 'IvyInfo', 'SFTPRepository', 'VfsRepositoryTest', 'BasicURLHandler', 'IBi

# Extract traditional smells 2
Returns the set of classes that contain at least one traditional smell.

In [3]:
import os
import subprocess
import configparser
import pandas as pd
import re
import shutil
import time
import sys

def extract_traditional_smells2(project_name):
    """Run the DECOR detector on a project and collect every smelly class.

    ``./DECOR_JAVA.jar`` is invoked through ``java -jar`` and every file found in
    ``../TEMP/<project_name>`` is then parsed with ``configparser`` (under a
    synthetic section header). Entries whose key has the form
    ``<digits>.<digits>.<letters>-<digit>`` are treated as smelly classes; the
    class name is the last dot-separated component of the entry's value.

    Args:
        project_name: Name passed (twice) to DECOR and used as the name of the
            result sub-directory under ``../TEMP/``.

    Returns:
        The set of class names found across all result files, or ``None`` when
        the result directory does not exist.

    Side effects:
        Prints progress messages and removes the whole ``../TEMP/`` directory
        after parsing (a failed removal is reported, not raised).
    """
    print("extracting traditional smells from repository: " + project_name)
    subprocess.call(['java', '-jar', './DECOR_JAVA.jar', project_name, project_name, ""])

    # check if the result of detection is generated
    result_files_path = "../TEMP/" + project_name
    if not os.path.isdir(result_files_path):
        print("ERROR: extracting traditionial smells failed")
        return None

    # Raw string: "\." and "\d" are invalid escapes in a normal string literal.
    smell_key_pattern = re.compile(r"^[0-9]+\.[0-9]+\.[a-zA-Z]+-\d$")

    # set of classes that contain at least one smell
    smell_classes_set = set()
    for filename in os.listdir(result_files_path):
        file_path = result_files_path + "/" + filename
        # The result files have no section header, so a dummy one is prepended.
        with open(file_path) as file:
            file_content = '[dummy_section]\n' + file.read()

        # interpolation=None: values are plain text, so a literal '%' in any
        # value must not be treated as configparser interpolation syntax.
        config = configparser.ConfigParser(interpolation=None)
        config.read_string(file_content)

        smell_classes_set.update(
            value.split(".")[-1]
            for key, value in config.items('dummy_section')
            if smell_key_pattern.match(key)
        )

    # delete result of detection (best effort)
    try:
        shutil.rmtree("../TEMP/")
    except OSError as e:
        print("Error: %s - %s." % (e.filename, e.strerror))

    # Returning here instead of inside `finally` lets unexpected exceptions
    # propagate rather than being silently discarded.
    print("Traditional smells extracted for project: ", project_name)
    return smell_classes_set

# OpenSZZ
Run OpenSZZ on the releases of the selected projects, extract the bug-fixing files, and save them as JSON files in the same format as the manually validated data.

In [3]:
import os
import pandas as pd
from git import Repo
import subprocess
import json
import glob

# Output directory for the OpenSZZ-derived labels. exist_ok keeps the cell
# re-runnable (os.mkdir raises if the directory is already there).
os.makedirs('release-level-data-OpenSZZ', exist_ok=True)

df = pd.read_csv('projects.csv')
for index, row in df.iterrows():
    # Reuse an existing clone so a second run of the cell does not fail in
    # clone_from because the target directory already exists.
    if os.path.isdir(os.path.join(row['project'], '.git')):
        repo = Repo(row['project'])
    else:
        repo = Repo.clone_from(row['url'], row['project'])

    # create key value pairs: key - release number, value - release tag (for git checkout purpose)
    releases_number = row['releases'].split(', ')
    releases_tags = row['releases_tags'].split(', ')
    releases_dict = dict(zip(releases_number, releases_tags))

    for release, release_tag in releases_dict.items():
        repo.git.checkout(release_tag)

        print("extracting bug fixing files using OpenSZZ from repository: " + row['project'] + " at release " + release)
        subprocess.call(['java', '-jar', 'openszz.jar', '-all', row['url'], row['jira'], row['key']])
        print("extraction done")
        output_file_name = row['key'] + '_BugInducingCommits.csv'

        # The presence of the CSV is the only success signal that is checked.
        if not os.path.isfile(output_file_name):
            print("ERROR: extracting bug fixing files using OpenSZZ failed")
            continue

        # 1. extract bug fixing files
        df1 = pd.read_csv(output_file_name, sep=';')
        # key - bug fixing file, value - list of bug-fix records for that file
        bug_fixing_file_dict = dict()
        # `_` instead of `index`, so the outer loop variable is not overwritten
        for _, row1 in df1.iterrows():
            if row1["issueType"] == "Bug":
                bug_fixing_info = {
                    "bugfix_commit": row1["bugFixingId"],
                    "bugfix_commit_date": row1["bugFixingTs"],
                    "type": row1["issueType"]
                }
                bug_fixing_file_dict.setdefault(row1["bugFixingfileChanged"], []).append(bug_fixing_info)

        # 2. save bug fixing files as JSON in same format as manual validated data:
        #    the manually validated file is used as the template and only its
        #    'bug_fixes' entries are replaced.
        json_file_name = row["project"] + "-" + release + "_bug_fixes.json"
        with open("./release-level-data/" + json_file_name, "r") as json_file:
            data = json.load(json_file)

        for line in data:
            line['bug_fixes'] = bug_fixing_file_dict.get(line['file'], [])

        with open("./release-level-data-OpenSZZ/" + json_file_name, "w") as outfile:
            json.dump(data, outfile, indent=4)
        print("release level data generated with OpenSZZ for project: " + row['project'] + " at release " + release + "\n")

        # 3. clean up files generated by OpenSZZ
        os.remove(row['project'] + ".txt")
        os.remove(row['key'] + "-log.txt")

        fileList = glob.glob(row['key'] + "_*.csv")
        for filePath in fileList:
            try:
                os.remove(filePath)
            except OSError:
                # Only filesystem errors are expected here; anything else should surface.
                print("Error while deleting file : ", filePath)

# Fisher exact test
Compute Fisher's exact test and return the odds ratio and the p-value.

In [18]:
from scipy.stats import fisher_exact

def fisher(smelly_classes, faulty_classes_file_path):
    """Run Fisher's exact test on smelliness vs. bug-fix presence of classes.

    Each entry of the JSON file is reduced to a bare class name (base name of
    its ``'file'`` path up to the first ``.``) and counted into a 2x2
    contingency table according to whether its ``'bug_fixes'`` value is
    non-empty and whether the class name is in ``smelly_classes``.

    Args:
        smelly_classes: Container of class names that have at least one smell
            (only membership tests are performed on it).
        faulty_classes_file_path: Path of a JSON file holding a list of objects
            with the keys ``'file'`` and ``'bug_fixes'``.

    Returns:
        A tuple ``(oddsratio, pvalue)``: the odds ratio rounded to 2 decimals
        and converted to ``str``, and the p-value rounded to 4 decimals.
    """
    # Local import: the function must not depend on another cell having
    # imported json first.
    import json

    # number of classes with at least one fixing and with at least one anti-pattern
    w_fix_w_anti = 0
    # number of classes with at least one fixing and without any anti-patterns
    w_fix_wo_anti = 0
    # number of classes without any fixing and with at least one anti-pattern
    wo_fix_w_anti = 0
    # number of classes without any fixing and without any anti-patterns
    wo_fix_wo_anti = 0

    # Context manager: the handle is closed even when json.load raises.
    with open(faulty_classes_file_path) as f:
        entries = json.load(f)

    for line in entries:
        filename = line['file'].split("/")[-1].split(".")[0]
        is_smelly = filename in smelly_classes
        if line['bug_fixes']:
            if is_smelly:
                w_fix_w_anti += 1
            else:
                w_fix_wo_anti += 1
        else:
            if is_smelly:
                wo_fix_w_anti += 1
            else:
                wo_fix_wo_anti += 1

    # compute fisher exact test (rows: with / without anti-pattern,
    # columns: with / without fixing)
    contingency_table = [[w_fix_w_anti, wo_fix_w_anti],
                         [w_fix_wo_anti, wo_fix_wo_anti]]

    oddsratio, pvalue = fisher_exact(contingency_table)

    return str(round(oddsratio, 2)), round(pvalue, 4)

# RQ1

In [17]:
from git import Repo
import json

import os

# Result CSVs are written here; exist_ok keeps the cell re-runnable and removes
# the need to create the directory by hand before the first run.
os.makedirs('RQ1-results', exist_ok=True)

df = pd.read_csv('projects.csv')
for index, row in df.iterrows():
    project_name = row['project']
    # These three projects are excluded from this run.
    if project_name in ("ant-ivy", "archiva", "calcite"):
        continue

    repo = Repo(project_name)

    # create key value pairs: key - release number, value - release tag (for git checkout purpose)
    releases_number = row['releases'].split(', ')
    releases_tags = row['releases_tags'].split(', ')
    releases_dict = dict(zip(releases_number, releases_tags))

    list_release, list_oddsratio, list_oddsratio_szz = [], [], []

    for release, release_tag in releases_dict.items():
        repo.git.checkout(release_tag)

        smelly_classes = extract_traditional_smells2(project_name)
        if smelly_classes is None:
            print("Unable to extract traditionial smells for project: " + project_name)
            # Remaining releases are skipped; results gathered so far are still saved below.
            break

        # 1. compute fisher using manually validated data
        json_file_name = row["project"] + "-" + release + "_bug_fixes.json"
        manually_validated_data_path = "./release-level-data/" + json_file_name
        oddsratio, pvalue = fisher(smelly_classes, manually_validated_data_path)

        # mark odds ratio that is not statistically significant (p-value >= 0.05)
        if pvalue >= 0.05:
            oddsratio += "*"

        print('\nFisher exact test using manually validated data for project ' + project_name + ' on release ' + release)
        print('odds ratio: ' + str(oddsratio))

        # 2. compute fisher using szz generated data
        szz_generated_data_path = "./release-level-data-OpenSZZ/" + json_file_name
        oddsratio_szz, pvalue_szz = fisher(smelly_classes, szz_generated_data_path)

        # mark odds ratio that is not statistically significant (p-value >= 0.05)
        if pvalue_szz >= 0.05:
            oddsratio_szz += "*"

        print('\nFisher exact test using szz generated data for project ' + project_name + ' on release ' + release)
        print('odds ratio: ' + str(oddsratio_szz))

        # 3. append results to corresponding list
        list_release.append(release)
        list_oddsratio.append(oddsratio)
        list_oddsratio_szz.append(oddsratio_szz)

    # create dataframe from lists of results and save to csv.
    # A separate name is used so the project table `df` that is being iterated
    # is not overwritten by the per-project results.
    results_df = pd.DataFrame({
        'Releases': list_release,
        'Odds ratios (manually validated data)': list_oddsratio,
        'Odds ratios (szz generated data)': list_oddsratio_szz,
    })
    print(results_df)

    results_df.to_csv("RQ1-results/" + project_name + ".csv", index=False)

extracting traditional smells from repository: cayenne
Traditional smells extracted for project:  cayenne

Fisher exact test using manually validated data for project cayenne on release 3.0.0
odds ratio: 15.06

Fisher exact test using szz generated data for project cayenne on release 3.0.0
odds ratio: 4.76
extracting traditional smells from repository: cayenne
Traditional smells extracted for project:  cayenne

Fisher exact test using manually validated data for project cayenne on release 3.1.0
odds ratio: 13.51

Fisher exact test using szz generated data for project cayenne on release 3.1.0
odds ratio: 4.46
  Releases Odds ratios (manually validated data)  \
0    3.0.0                                 15.06   
1    3.1.0                                 13.51   

  Odds ratios (szz generated data)  
0                             4.76  
1                             4.46  
extracting traditional smells from repository: commons-bcel
ERROR: extracting traditionial smells failed
Unable to e

extracting traditional smells from repository: commons-collections
Traditional smells extracted for project:  commons-collections

Fisher exact test using manually validated data for project commons-collections on release 4.0
odds ratio: 39.0

Fisher exact test using szz generated data for project commons-collections on release 4.0
odds ratio: 17.63
extracting traditional smells from repository: commons-collections
Traditional smells extracted for project:  commons-collections

Fisher exact test using manually validated data for project commons-collections on release 4.1
odds ratio: inf*

Fisher exact test using szz generated data for project commons-collections on release 4.1
odds ratio: 13.58
  Releases Odds ratios (manually validated data)  \
0      1.0                                  inf*   
1      2.0                                  inf*   
2      2.1                                  inf*   
3      3.0                                  7.75   
4      3.1                          

Traditional smells extracted for project:  commons-dbcp

Fisher exact test using manually validated data for project commons-dbcp on release 1.3
odds ratio: 3.5*

Fisher exact test using szz generated data for project commons-dbcp on release 1.3
odds ratio: 1.12*
extracting traditional smells from repository: commons-dbcp
Traditional smells extracted for project:  commons-dbcp

Fisher exact test using manually validated data for project commons-dbcp on release 1.4
odds ratio: 3.15*

Fisher exact test using szz generated data for project commons-dbcp on release 1.4
odds ratio: 1.01*
extracting traditional smells from repository: commons-dbcp
Traditional smells extracted for project:  commons-dbcp

Fisher exact test using manually validated data for project commons-dbcp on release 2.0
odds ratio: 4.58*

Fisher exact test using szz generated data for project commons-dbcp on release 2.0
odds ratio: 7.08
extracting traditional smells from repository: commons-dbcp
Traditional smells extracte