# Data Cleaning
Delete the files in the manually labeled data (**release-level-data** from the replication kit https://zenodo.org/record/5675024#.Ya-B8fHML0o) that are not related to bug fixes.

In [56]:
import os

# Keep only the "*bug_fixes.json" files of the replication kit; every other
# regular file in the data directory is deleted.
data_dir = './release-level-data'
for filename in os.listdir(data_dir):
    file_path = os.path.join(data_dir, filename)
    # os.remove() cannot delete directories, so anything that is not a
    # regular file is left alone instead of aborting the clean-up.
    if filename.endswith("bug_fixes.json") or not os.path.isfile(file_path):
        continue
    os.remove(file_path)
print("data cleaning done")

data cleaning done


# Extract traditional smells
Returns a dictionary of lists, where each key is a traditional smell and each value is the list of classes that contain that smell.

In [2]:
import os
import subprocess
import configparser
import pandas as pd
import re
import shutil
import time
import sys

def extract_traditional_smells(project_name):
    """Run the DECOR_JAVA.jar detector and group smelly classes by smell.

    The jar is invoked with ``project_name`` (passed twice) and its result
    files are read from ``../TEMP/<project_name>``. After parsing, the whole
    ``../TEMP/`` directory is removed.

    Args:
        project_name: name handed to the detector; also the name of the
            result directory expected under ``../TEMP/``.

    Returns:
        A dict mapping a smell name (last space-separated token of the result
        file name, up to its first dot) to the list of simple class names
        found in that file. Result files without any matching entry are
        omitted. Returns ``None`` when the result directory does not exist.
    """
    print("extracting traditional smells from repository: " + project_name)
    subprocess.call(['java', '-jar', './DECOR_JAVA.jar', project_name, project_name, ""])

    # check if the result of detection is generated
    result_files_path = "../TEMP/" + project_name
    if not os.path.isdir(result_files_path):
        print("ERROR: extracting traditionial smells failed")
        return None

    # Keys of the form "<digits>.<digits>.<letters>-<digit>" carry a class name.
    # Raw string so that "\." and "\d" are regex escapes, not (invalid) string escapes.
    class_entry_key = re.compile(r"^[0-9]+\.[0-9]+\.[a-zA-Z]+-\d$")

    # key: traditional smell, value: list of classes that contain the smell
    smell_classes_dict = {}
    for filename in os.listdir(result_files_path):
        # The result files have no section header, so a dummy one is
        # prepended to make them parseable by configparser.
        with open(os.path.join(result_files_path, filename)) as file:
            file_content = '[dummy_section]\n' + file.read()
        config = configparser.ConfigParser()
        config.read_string(file_content)

        # keep only the simple class name (text after the last dot)
        smelly_classes = [
            value.split(".")[-1]
            for key, value in config.items('dummy_section')
            if class_entry_key.match(key)
        ]
        if smelly_classes:
            smell_name = filename.split(" ")[-1].split(".")[0]
            smell_classes_dict[smell_name] = smelly_classes

    # Delete the detection results. The return statement is deliberately kept
    # out of a ``finally`` clause: returning from ``finally`` would silently
    # discard any unexpected exception.
    try:
        shutil.rmtree("../TEMP/")
    except OSError as e:
        print("Error: %s - %s." % (e.filename, e.strerror))

    print("Traditional smells extracted for project: ", project_name)
    return smell_classes_dict

# Extract traditional smells 2
returns a set of classes that contain a smell

In [3]:
import os
import subprocess
import configparser
import pandas as pd
import re
import shutil
import time
import sys

def extract_traditional_smells2(project_name):
    """Run the DECOR_JAVA.jar detector and collect every smelly class.

    The jar is invoked with ``project_name`` (passed twice) and its result
    files are read from ``../TEMP/<project_name>``. After parsing, the whole
    ``../TEMP/`` directory is removed.

    Args:
        project_name: name handed to the detector; also the name of the
            result directory expected under ``../TEMP/``.

    Returns:
        The set of simple class names that appear in any result file,
        regardless of the smell. Returns ``None`` when the result directory
        does not exist.
    """
    print("extracting traditional smells from repository: " + project_name)
    subprocess.call(['java', '-jar', './DECOR_JAVA.jar', project_name, project_name, ""])

    # check if the result of detection is generated
    result_files_path = "../TEMP/" + project_name
    if not os.path.isdir(result_files_path):
        print("ERROR: extracting traditionial smells failed")
        return None

    # Keys of the form "<digits>.<digits>.<letters>-<digit>" carry a class name.
    # Raw string so that "\." and "\d" are regex escapes, not (invalid) string escapes.
    class_entry_key = re.compile(r"^[0-9]+\.[0-9]+\.[a-zA-Z]+-\d$")

    # set of classes that contain at least one smell
    smell_classes_set = set()
    for filename in os.listdir(result_files_path):
        # The result files have no section header, so a dummy one is
        # prepended to make them parseable by configparser.
        with open(os.path.join(result_files_path, filename)) as file:
            file_content = '[dummy_section]\n' + file.read()
        config = configparser.ConfigParser()
        config.read_string(file_content)

        # keep only the simple class name (text after the last dot)
        smell_classes_set.update(
            value.split(".")[-1]
            for key, value in config.items('dummy_section')
            if class_entry_key.match(key)
        )

    # Delete the detection results. The return statement is deliberately kept
    # out of a ``finally`` clause: returning from ``finally`` would silently
    # discard any unexpected exception.
    try:
        shutil.rmtree("../TEMP/")
    except OSError as e:
        print("Error: %s - %s." % (e.filename, e.strerror))

    print("Traditional smells extracted for project: ", project_name)
    return smell_classes_set

# OpenSZZ
Run OpenSZZ on the releases of the selected projects, extract the bug-fixing files, and save them as JSON files in the same format as the manually validated data.

In [3]:
import os
import pandas as pd
from git import Repo
import subprocess
import json
import glob

# Output directory for the OpenSZZ-based release-level data; exist_ok lets
# the cell be re-run without failing on the already existing directory.
os.makedirs('release-level-data-OpenSZZ', exist_ok=True)

df = pd.read_csv('projects.csv')
for index, row in df.iterrows():
    repo = Repo.clone_from(row['url'], row['project'])

    # create key value pairs: key - release number, value - release tag (for git checkout purpose)
    releases_number = row['releases'].split(', ')
    releases_tags = row['releases_tags'].split(', ')
    releases_dict = dict(zip(releases_number, releases_tags))

    for release, release_tag in releases_dict.items():
        repo.git.checkout(release_tag)

        print("extracting bug fixing files using OpenSZZ from repository: " + row['project'] + " at release " + release)
        subprocess.call(['java', '-jar', 'openszz.jar', '-all', row['url'], row['jira'], row['key']])
        print("extraction done")
        output_file_name = row['key'] + '_BugInducingCommits.csv'

        if not os.path.isfile(output_file_name):
            print("ERROR: extracting bug fixing files using OpenSZZ failed")
            continue

        # 1. extract bug fixing files
        df1 = pd.read_csv(output_file_name, sep=';')
        # key - bug fixing file, value - list with information on each bug fix
        bug_fixing_file_dict = dict()
        for _, row1 in df1.iterrows():
            if row1["issueType"] != "Bug":
                continue
            bug_fixing_info = {
                "bugfix_commit": row1["bugFixingId"],
                "bugfix_commit_date": row1["bugFixingTs"],
                "type": row1["issueType"]
            }
            bug_fixing_file_dict.setdefault(row1["bugFixingfileChanged"], []).append(bug_fixing_info)

        # 2. save bug fixing files as JSON in same format as manually validated data:
        #    the manual file is used as a template and only 'bug_fixes' is replaced
        json_file_name = row["project"] + "-" + release + "_bug_fixes.json"
        with open("./release-level-data/" + json_file_name, "r") as json_file:
            data = json.load(json_file)

        for line in data:
            line['bug_fixes'] = bug_fixing_file_dict.get(line['file'], [])

        with open("./release-level-data-OpenSZZ/" + json_file_name, "w") as outfile:
            json.dump(data, outfile, indent=4)
        print("release level data generated with OpenSZZ for project: " + row['project'] + " at release " + release + "\n")

        # 3. clean up the log and csv files generated by OpenSZZ; a file that
        #    cannot be deleted is reported but does not abort the run
        leftovers = [row['project'] + ".txt", row['key'] + "-log.txt"]
        leftovers.extend(glob.glob(row['key'] + "_*.csv"))
        for filePath in leftovers:
            try:
                os.remove(filePath)
            except OSError:
                print("Error while deleting file : ", filePath)

# Fisher exact test
Compute Fisher's exact test and return the odds ratio and the p-value.

In [18]:
from scipy.stats import fisher_exact

def fisher(smelly_classes, faulty_classes_file_path):
    """Run Fisher's exact test on bug fixes versus smells for one release.

    Each entry of the JSON file is classified by whether its ``bug_fixes``
    list is non-empty and whether its class name (base name of ``file``
    up to the first dot) is contained in ``smelly_classes``.

    Args:
        smelly_classes: container of simple class names that have a smell.
        faulty_classes_file_path: path to a JSON file holding a list of
            objects with the keys ``file`` and ``bug_fixes``.

    Returns:
        A tuple ``(oddsratio, pvalue)``: the odds ratio rounded to 2 decimals
        and converted to ``str``, and the p-value rounded to 4 decimals.
    """
    # Local import: the cell defining this function does not import json itself.
    import json

    # Read everything up front so the file is closed even if parsing fails.
    with open(faulty_classes_file_path) as f:
        data = json.load(f)

    # number of classes with at least one fixing and with at least one anti-pattern
    w_fix_w_anti = 0
    # number of classes with at least one fixing and without any anti-patterns
    w_fix_wo_anti = 0
    # number of classes without any fixing and with at least one anti-pattern
    wo_fix_w_anti = 0
    # number of classes without any fixing and without any anti-patterns
    wo_fix_wo_anti = 0

    for line in data:
        filename = line['file'].split("/")[-1].split(".")[0]
        has_fix = bool(line['bug_fixes'])
        is_smelly = filename in smelly_classes
        if has_fix and is_smelly:
            w_fix_w_anti += 1
        elif has_fix:
            w_fix_wo_anti += 1
        elif is_smelly:
            wo_fix_w_anti += 1
        else:
            wo_fix_wo_anti += 1

    # compute Fisher's exact test on the 2x2 contingency table
    contingency_table = [[w_fix_w_anti, wo_fix_w_anti],
                         [w_fix_wo_anti, wo_fix_wo_anti]]
    oddsratio, pvalue = fisher_exact(contingency_table)

    return str(round(oddsratio, 2)), round(pvalue, 4)

# RQ1

In [17]:
from git import Repo
import json

import os

# One CSV with the odds ratios is written per project into this directory;
# exist_ok lets the cell be re-run without failing.
os.makedirs('RQ1-results', exist_ok=True)

df = pd.read_csv('projects.csv')
for index, row in df.iterrows():
    project_name = row['project']
    repo = Repo(project_name)

    # create key value pairs: key - release number, value - release tag (for git checkout purpose)
    releases_number = row['releases'].split(', ')
    releases_tags = row['releases_tags'].split(', ')
    releases_dict = dict(zip(releases_number, releases_tags))

    list_release, list_oddsratio, list_oddsratio_szz = [], [], []

    for release, release_tag in releases_dict.items():
        repo.git.checkout(release_tag)

        smelly_classes = extract_traditional_smells2(project_name)
        if smelly_classes is None:
            # stop with this project; the releases processed so far are still saved below
            print("Unable to extract traditionial smells for project: " + project_name)
            break

        # 1. compute fisher using manually validated data
        json_file_name = row["project"] + "-" + release + "_bug_fixes.json"
        manually_validated_data_path = "./release-level-data/" + json_file_name
        oddsratio, pvalue = fisher(smelly_classes, manually_validated_data_path)

        # mark odds ratio that is not statistically significant (p-value >= 0.05)
        if pvalue >= 0.05:
            oddsratio += "*"

        print('\nFisher exact test using manually validated data for project ' + project_name + ' on release ' + release)
        print('odds ratio: ' + str(oddsratio))

        # 2. compute fisher using szz generated data
        szz_generated_data_path = "./release-level-data-OpenSZZ/" + json_file_name
        oddsratio_szz, pvalue_szz = fisher(smelly_classes, szz_generated_data_path)

        # mark odds ratio that is not statistically significant (p-value >= 0.05)
        if pvalue_szz >= 0.05:
            oddsratio_szz += "*"

        print('\nFisher exact test using szz generated data for project ' + project_name + ' on release ' + release)
        print('odds ratio: ' + str(oddsratio_szz))

        # 3. append results to corresponding list
        list_release.append(release)
        list_oddsratio.append(oddsratio)
        list_oddsratio_szz.append(oddsratio_szz)

    # create dataframe from lists of results and save to csv
    # (a separate name keeps the projects table `df` from being overwritten)
    results_df = pd.DataFrame({
        'Releases': list_release,
        'Odds ratios (manually validated data)': list_oddsratio,
        'Odds ratios (szz generated data)': list_oddsratio_szz,
    })
    print(results_df)

    results_df.to_csv("RQ1-results/" + project_name + ".csv", index=False)

extracting traditional smells from repository: cayenne
Traditional smells extracted for project:  cayenne

Fisher exact test using manually validated data for project cayenne on release 3.0.0
odds ratio: 15.06

Fisher exact test using szz generated data for project cayenne on release 3.0.0
odds ratio: 4.76
extracting traditional smells from repository: cayenne
Traditional smells extracted for project:  cayenne

Fisher exact test using manually validated data for project cayenne on release 3.1.0
odds ratio: 13.51

Fisher exact test using szz generated data for project cayenne on release 3.1.0
odds ratio: 4.46
  Releases Odds ratios (manually validated data)  \
0    3.0.0                                 15.06   
1    3.1.0                                 13.51   

  Odds ratios (szz generated data)  
0                             4.76  
1                             4.46  
extracting traditional smells from repository: commons-bcel
ERROR: extracting traditionial smells failed
Unable to e

extracting traditional smells from repository: commons-collections
Traditional smells extracted for project:  commons-collections

Fisher exact test using manually validated data for project commons-collections on release 4.0
odds ratio: 39.0

Fisher exact test using szz generated data for project commons-collections on release 4.0
odds ratio: 17.63
extracting traditional smells from repository: commons-collections
Traditional smells extracted for project:  commons-collections

Fisher exact test using manually validated data for project commons-collections on release 4.1
odds ratio: inf*

Fisher exact test using szz generated data for project commons-collections on release 4.1
odds ratio: 13.58
  Releases Odds ratios (manually validated data)  \
0      1.0                                  inf*   
1      2.0                                  inf*   
2      2.1                                  inf*   
3      3.0                                  7.75   
4      3.1                          

Traditional smells extracted for project:  commons-dbcp

Fisher exact test using manually validated data for project commons-dbcp on release 1.3
odds ratio: 3.5*

Fisher exact test using szz generated data for project commons-dbcp on release 1.3
odds ratio: 1.12*
extracting traditional smells from repository: commons-dbcp
Traditional smells extracted for project:  commons-dbcp

Fisher exact test using manually validated data for project commons-dbcp on release 1.4
odds ratio: 3.15*

Fisher exact test using szz generated data for project commons-dbcp on release 1.4
odds ratio: 1.01*
extracting traditional smells from repository: commons-dbcp
Traditional smells extracted for project:  commons-dbcp

Fisher exact test using manually validated data for project commons-dbcp on release 2.0
odds ratio: 4.58*

Fisher exact test using szz generated data for project commons-dbcp on release 2.0
odds ratio: 7.08
extracting traditional smells from repository: commons-dbcp
Traditional smells extracte

# RQ2

In [None]:
def getProjects():
    """Return the studied projects mapped to their GitHub repository URLs.

    Returns:
        A new dict on every call; keys are project names, values are the
        corresponding ``https://github.com/apache/...`` URLs.
    """
    project_urls = {
        "ant-ivy": "https://github.com/apache/ant-ivy",
        "archiva": "https://github.com/apache/archiva",
        "calcite": "https://github.com/apache/calcite",
        "cayenne": "https://github.com/apache/cayenne",
        "commons-bcel": "https://github.com/apache/commons-bcel",
        "commons-beanutils": "https://github.com/apache/commons-beanutils",
        "commons-codec": "https://github.com/apache/commons-codec",
        "commons-collections": "https://github.com/apache/commons-collections",
        "commons-compress": "https://github.com/apache/commons-compress",
        "commons-configuration": "https://github.com/apache/commons-configuration",
        "commons-dbcp": "https://github.com/apache/commons-dbcp",
        "commons-digester": "https://github.com/apache/commons-digester",
        "commons-io": "https://github.com/apache/commons-io",
        "commons-jcs": "https://github.com/apache/commons-jcs",
        "commons-jexl": "https://github.com/apache/commons-jexl",
        "commons-lang": "https://github.com/apache/commons-lang",
        "commons-math": "https://github.com/apache/commons-math",
        "commons-net": "https://github.com/apache/commons-net",
        "commons-scxml": "https://github.com/apache/commons-scxml",
        "commons-validator": "https://github.com/apache/commons-validator",
        "commons-vfs": "https://github.com/apache/commons-vfs",
        "deltaspike": "https://github.com/apache/deltaspike",
        "eagle": "https://github.com/apache/eagle",
        "giraph": "https://github.com/apache/giraph",
        "gora": "https://github.com/apache/gora",
        "jspwiki": "https://github.com/apache/jspwiki",
        "knox": "https://github.com/apache/knox",
        "kylin": "https://github.com/apache/kylin",
        "lens": "https://github.com/apache/lens",
        "mahout": "https://github.com/apache/mahout",
        "manifoldcf": "https://github.com/apache/manifoldcf",
        "nutch": "https://github.com/apache/nutch",
        "opennlp": "https://github.com/apache/opennlp",
        "parquet-mr": "https://github.com/apache/parquet-mr",
        "santuario-java": "https://github.com/apache/santuario-java",
        # the next two repositories are named differently from the project key
        "systemml": "https://github.com/apache/systemds",
        "tika": "https://github.com/apache/tika",
        "wss4j": "https://github.com/apache/ws-wss4j",
    }
    return project_urls

In [None]:
def getRelease(isCorrespondance = False):
    """Return the studied releases of every project.

    The two tables are position-parallel: entry ``i`` of a project's tag list
    and entry ``i`` of its release-number list denote the same release
    (``getReleaseCorr`` relies on this).

    Args:
        isCorrespondance: when false (default) return the git tag names used
            for ``git checkout``; when true return the plain release numbers
            used in the ``<project>-<release>_bug_fixes.json`` file names.

    Returns:
        A new dict mapping project name to a list of tags or release numbers.
    """
    if isCorrespondance:
        return {
            "ant-ivy": ["1.4.1", "2.0.0", "2.1.0", "2.2.0", "2.3.0", "2.4.0"],
            "archiva": ["1.0", "1.1", "1.2", "1.3", "2.0.0", "2.1.0", "2.2.0"],
            "calcite": [
                "1.8.0", "1.9.0", "1.10.0", "1.11.0", "1.12.0", "1.13.0", "1.14.0", "1.15.0",
            ],
            "cayenne": ["3.0.0", "3.1.0"],
            "commons-bcel": ["5.2", "6.0", "6.1", "6.2"],
            "commons-beanutils": ["1.7.0", "1.8.0", "1.9.0"],
            "commons-codec": ["1.2", "1.3", "1.5", "1.6", "1.7", "1.8", "1.9", "1.10"],
            "commons-collections": [
                "1.0", "2.0", "2.1", "3.0", "3.1", "3.2", "3.3", "4.0", "4.1",
            ],
            "commons-compress": ["1.9", "1.10", "1.11", "1.12", "1.13", "1.14", "1.15", "1.16"],
            "commons-configuration": ["1.4", "1.5", "1.6", "1.7", "1.8", "1.9", "1.10", "2.2"],
            "commons-dbcp": ["1.3", "1.4", "2.0", "2.1", "2.2.0", "2.3.0", "2.4.0", "2.5.0"],
            "commons-digester": ["1.4", "1.5", "1.6", "1.7", "1.8", "3.0", "3.1", "3.2"],
            "commons-io": [
                "1.0", "1.1", "1.2", "1.3", "1.4",
                "2.0", "2.1", "2.2", "2.3", "2.4", "2.5",
            ],
            "commons-jcs": ["1.0", "1.1", "1.3", "2.0", "2.1", "2.2"],
            "commons-jexl": ["2.0", "2.1"],
            "commons-lang": [
                "2.0", "2.1", "2.2", "2.3", "2.4", "2.5", "2.6",
                "3.0", "3.1", "3.2", "3.3", "3.4", "3.5", "3.6", "3.7",
            ],
            "commons-math": [
                "1.0", "1.1", "1.2",
                "2.0", "2.1", "2.2",
                "3.0", "3.1", "3.2", "3.3", "3.4", "3.5",
            ],
            "commons-net": [
                "1.0.0", "1.1.0", "1.2.0", "1.3.0", "1.4.0",
                "2.0", "2.1", "2.2",
                "3.0", "3.1", "3.2", "3.3", "3.4", "3.5", "3.6",
            ],
            "commons-scxml": ["0.5", "0.6", "0.7", "0.8", "0.9"],
            "commons-validator": ["1.1.0", "1.2.0", "1.3.0", "1.4.0", "1.5.0", "1.6.0"],
            "commons-vfs": ["1.0", "2.0", "2.1", "2.2"],
            "deltaspike": [
                "0.1", "0.2", "0.3", "0.4", "0.5", "0.6", "0.7",
                "1.0.0", "1.1.0", "1.2.0", "1.3.0", "1.4.0", "1.5.0", "1.6.0", "1.7.0", "1.8.0",
            ],
            "eagle": ["0.3.0", "0.4.0"],
            "giraph": ["1.0.0", "1.1.0"],
            "gora": ["0.1", "0.2", "0.3", "0.4", "0.5", "0.6"],
            "jspwiki": [
                "1.4.0", "1.5.0", "1.6.0", "1.7.0", "1.8.0",
                "2.0.36", "2.2.19", "2.4.56", "2.6.0", "2.8.0", "2.9.0", "2.10.0",
            ],
            "knox": [
                "0.3.0", "0.4.0", "0.5.0", "0.6.0", "0.7.0", "0.8.0", "0.9.0",
                "0.10.0", "0.11.0", "0.12.0", "0.13.0", "0.14.0",
                "1.0.0",
            ],
            "kylin": [
                "0.6.1", "0.7.1",
                "1.0", "1.1", "1.2", "1.3", "1.5.0", "1.6.0",
                "2.0.0", "2.1.0", "2.2.0",
            ],
            "lens": ["2.6.0", "2.7.0"],
            "mahout": [
                "0.1", "0.2", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9",
                "0.10.0", "0.11.0", "0.12.0",
            ],
            "manifoldcf": [
                "0.1", "0.2", "0.3", "0.4", "0.5", "0.6",
                "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "1.8", "1.9", "1.10",
                "2.0", "2.1", "2.2", "2.3", "2.4", "2.5", "2.6", "2.7", "2.8", "2.9", "2.10",
            ],
            "nutch": [
                "0.7", "0.8", "0.9",
                "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "1.8", "1.9",
                "1.10", "1.11", "1.12", "1.13", "1.14",
                "2.0", "2.1", "2.2", "2.3",
            ],
            "opennlp": ["1.7.0", "1.8.0"],
            "parquet-mr": [
                "1.0.0", "1.1.0", "1.2.0", "1.3.0", "1.4.0",
                "1.5.0", "1.6.0", "1.7.0", "1.8.0", "1.9.0",
            ],
            "santuario-java": ["1.4.5", "1.5.9", "2.0.0", "2.1.0"],
            "systemml": [
                "0.9", "0.10", "0.11", "0.12", "0.13", "0.14", "0.15",
                "1.0.0", "1.1.0",
            ],
            "tika": [
                "0.1", "0.2", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9", "0.10",
                "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "1.8", "1.9",
                "1.10", "1.11", "1.12", "1.13", "1.14", "1.15", "1.16", "1.17",
            ],
            "wss4j": ["1.5.0", "1.6.0", "2.0.0", "2.1.0"],
        }

    return {
        "ant-ivy": ["1.4.1", "2.0.0", "2.1.0", "2.2.0", "2.3.0", "2.4.0"],
        "archiva": [
            "archiva-1.0", "archiva-1.1", "archiva-1.2", "archiva-1.3",
            "archiva-2.0.0", "archiva-2.1.0", "archiva-2.2.0",
        ],
        "calcite": [
            "calcite-1.8.0", "calcite-1.9.0", "calcite-1.10.0", "calcite-1.11.0",
            "calcite-1.12.0", "calcite-1.13.0", "calcite-1.14.0", "calcite-1.15.0",
        ],
        "cayenne": ["STABLE-3.0", "STABLE-3.1"],
        "commons-bcel": ["BCEL_5_2", "BCEL_6_0", "BCEL_6_1", "BCEL_6_2"],
        "commons-beanutils": ["BEANUTILS_1_7_0", "BEANUTILS_1_8_0", "BEANUTILS_1_9_0"],
        "commons-codec": [
            "CODEC_1_2", "CODEC_1_3", "commons-codec-1.5", "1_6",
            "1.7", "1.8", "1.9", "1.10",
        ],
        "commons-collections": [
            "collections-1.0", "collections-2.0", "collections-2.1",
            "collections-3.0", "collections-3.1", "collections-3.2", "collections-3.3",
            "collections-4.0", "collections-4.1",
        ],
        "commons-compress": [
            "rel/1.9", "rel/1.10", "rel/1.11", "rel/1.12",
            "rel/1.13", "rel/1.14", "rel/1.15", "rel/1.16",
        ],
        "commons-configuration": [
            "CONFIGURATION_1_4", "CONFIGURATION_1_5", "CONFIGURATION_1_6",
            "CONFIGURATION_1_7", "CONFIGURATION_1_8", "CONFIGURATION_1_9",
            "CONFIGURATION_1_10", "CONFIGURATION_2_2",
        ],
        "commons-dbcp": [
            "DBCP_1_3", "DBCP_1_4", "DBCP_2_0", "DBCP_2_1", "DBCP_2_2_0", "DBCP_2_3_0",
            "commons-dbcp-2.4.0", "commons-dbcp-2.5.0",
        ],
        "commons-digester": [
            "DIGESTER_1_4", "DIGESTER_1_5", "DIGESTER_1_6", "DIGESTER_1_7", "DIGESTER_1_8",
            "DIGESTER3_3_0", "DIGESTER3_3_1", "DIGESTER3_3_2",
        ],
        "commons-io": [
            "IO_1_0", "IO_1_1", "IO_1_2", "IO_1_3",
            "commons-io-1.4", "commons-io-2.0", "commons-io-2.1",
            "2.2", "2.3", "2.4", "2.5",
        ],
        "commons-jcs": [
            "JCS_1_0", "jcs_1_1_dev", "JCS_1_3",
            "commons-jcs-2.0", "commons-jcs-2.1", "commons-jcs-2.2",
        ],
        "commons-jexl": ["2.0", "2.1"],
        "commons-lang": [
            "LANG_2_0", "LANG_2_1", "LANG_2_2", "LANG_2_3", "LANG_2_4", "LANG_2_5", "LANG_2_6",
            "LANG_3_0", "LANG_3_1", "LANG_3_2", "LANG_3_3", "LANG_3_4", "LANG_3_5",
            # LANG_3_6 was missing, which shifted LANG_3_7 onto release number "3.6".
            # NOTE(review): confirm the tag name against the commons-lang repository.
            "LANG_3_6",
            "LANG_3_7",
        ],
        "commons-math": [
            "MATH_1_0", "MATH_1_1", "MATH_1_2",
            "MATH_2_0", "MATH_2_1", "MATH_2_2",
            "MATH_3_0", "MATH_3_1", "MATH_3_2", "MATH_3_3", "MATH_3_4", "MATH_3_5",
        ],
        "commons-net": [
            "NET_1_0_0", "NET_1_1_0", "NET_1_2_0", "NET_1_3_0", "NET_1_4_0",
            "NET_2_0", "commons-net-2.1", "NET_2_2",
            "NET_3_0", "NET_3_1", "NET_3_2", "NET_3_3", "NET_3_4", "NET_3_5", "NET_3_6",
        ],
        "commons-scxml": ["SCXML_0_5", "SCXML_0_6", "SCXML_0_7", "SCXML_0_8", "SCXML_0_9"],
        "commons-validator": [
            "VALIDATOR_1_1_0", "VALIDATOR_1_2_0", "VALIDATOR_1_3_0",
            "VALIDATOR_1_4_0", "VALIDATOR_1_5_0", "VALIDATOR_1_6",
        ],
        "commons-vfs": [
            "rel/commons-vfs-1.0", "rel/commons-vfs-2.0",
            "rel/commons-vfs-2.1", "rel/commons-vfs-2.2",
        ],
        "deltaspike": [
            "deltaspike-project-0.1-incubating",
            "deltaspike-project-0.2-incubating",
            "deltaspike-project-0.3-incubating",
            "deltaspike-project-0.4",
            "deltaspike-project-0.5",
            "deltaspike-project-0.6",
            "deltaspike-project-0.7",
            "deltaspike-project-1.0.0",
            "deltaspike-project-1.1.0",
            "deltaspike-project-1.2.0",
            "deltaspike-root-1.3.0",
            "deltaspike-1.4.0",
            "deltaspike-1.5.0",
            "deltaspike-1.6.0",
            "deltaspike-1.7.0",
            "deltaspike-1.8.0",
        ],
        "eagle": ["v0.3.0-incubating", "v0.4.0-incubating"],
        "giraph": ["release-1.0.0-RC3", "release-1.1.0"],
        "gora": [
            "0.1-incubating", "gora-0.2",
            "apache-gora-0.3", "apache-gora-0.4", "apache-gora-0.5", "apache-gora-0.6",
        ],
        "jspwiki": [
            "jspwiki_1_4_0", "jspwiki_1_5_0", "jspwiki_1_6_0", "jspwiki_1_7_0", "jspwiki_1_8_0",
            "jspwiki_2_0_36", "jspwiki_2_2_19", "jspwiki_2_4_56",
            "jspwiki_2_6_0", "jspwiki_2_8_0", "jspwiki_2_9_0_rc1", "jspwiki_2_10_0",
        ],
        "knox": [
            "v0.3.0-final", "v0.4.0-release", "v0.5.0-rc2", "v0.6.0-release",
            "v0.7.0-release", "v0.8.0-release", "v0.9.0-release", "v0.10.0-release",
            "v0.11.0-release", "v0.12.0-release", "v0.13.0-release", "v0.14.0-release",
            "v1.0.0-release",
        ],
        "kylin": [
            "v0.6.1",
            "kylin-0.7.1-incubating", "kylin-1.0-incubating", "kylin-1.1-incubating",
            "kylin-1.2", "kylin-1.3", "kylin-1.5.0", "kylin-1.6.0",
            "kylin-2.0.0", "kylin-2.1.0", "kylin-2.2.0",
        ],
        "lens": ["apache-lens-2.6.0", "apache-lens-2.7.0"],
        "mahout": [
            "mahout-0.1", "mahout-0.2", "mahout-0.3", "mahout-0.4", "mahout-0.5",
            "mahout-0.6", "mahout-0.7", "mahout-0.8", "mahout-0.9",
            "mahout-0.10.0", "mahout-0.11.0", "mahout-0.12.0",
        ],
        "manifoldcf": [
            "release-0.1-incubating", "release-0.2-incubating", "release-0.3-incubating",
            "release-0.4-incubating", "release-0.5-incubating", "release-0.6",
            "release-1.0", "release-1.1", "release-1.2", "release-1.3", "release-1.4",
            "release-1.5", "release-1.6", "release-1.7", "release-1.8", "release-1.9",
            "release-1.10",
            "release-2.0", "release-2.1", "release-2.2", "release-2.3", "release-2.4-RC0",
            "release-2.5", "release-2.6", "release-2.7", "release-2.8", "release-2.9",
            "release-2.10",
        ],
        "nutch": [
            "release-0.7", "release-0.8", "release-0.9",
            "release-1.0", "relase-1.1", "release-1.2", "release-1.3", "release-1.4",
            "release-1.5", "release-1.6", "release-1.7", "release-1.8", "release-1.9",
            "release-1.10", "release-1.11-rc2", "release-1.12", "release-1.13", "release-1.14",
            "release-2.0", "release-2.1", "release-2.2", "release-2.3",
        ],
        "opennlp": ["opennlp-1.7.0", "opennlp-1.8.0"],
        "parquet-mr": [
            "parquet-1.0.0", "parquet-1.1.0", "parquet-1.2.0",
            "parquet-1.3.0", "parquet-1.4.0", "parquet-1.5.0",
            "apache-parquet-mr-1.6.0-incubating",
            "apache-parquet-1.7.0", "apache-parquet-1.8.0", "apache-parquet-1.9.0",
        ],
        "santuario-java": ["1.4.5", "xmlsec-1.5.8", "xmlsec-2.0.0", "xmlsec-2.1.0"],
        "systemml": [
            "v0.9.0-rc1",
            "0.10.0-incubating-rc1",
            "v0.11.0-incubating-rc1",
            "v0.12.0-incubating-rc1",
            "v0.13.0-incubating-rc1",
            "v0.14.0-incubating-rc1",
            "v0.15.0-rc1",
            "v1.0.0-rc1",
            "v1.1.0-rc1",
        ],
        "tika": [
            "0.1-incubating", "0.2", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9", "0.10",
            "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "1.8", "1.9-rc2",
            "1.10", "1.11", "1.12", "1.13", "1.14", "1.15", "1.16", "1.17",
        ],
        "wss4j": ["1_5_0", "1_6_0", "wss4j-2.0.0", "wss4j-2.1.0"],
    }


In [None]:
def getReleaseCorr(projectName, release):
    """Translate a git tag of a project into the matching release number.

    The tag is looked up in the tag list returned by ``getRelease()`` and the
    entry at the same position of ``getRelease(True)`` is returned.

    Raises:
        KeyError: if ``projectName`` is unknown.
        ValueError: if ``release`` is not one of the project's tags.
    """
    tag_position = getRelease()[projectName].index(release)
    release_numbers = getRelease(True)
    return release_numbers[projectName][tag_position]

# Données automatiques

In [None]:
from git import Repo
from math import exp
import json


# clone repo
# repo = Repo.clone_from('https://github.com/apache/ant-ivy', projetName)

# Threshold the logistic value of a smell must exceed in a release to be counted
seuil = 0.75

# Write one block per project (name, then counts) to the result file; the
# context manager closes the file even if a checkout or detection fails.
with open('result_b.txt', 'w') as f:
    for project_name in getProjects():
        print('\n')
        print("===+> projet name", project_name)
        print('\n')
        repo = Repo(project_name)

        # Write project name in file
        f.write(project_name)
        f.write('\n')

        # per smell: number of releases in which its value exceeded the threshold
        smell_correlation_dict = {}

        tags = getRelease()[project_name]
        for tag in tags:
            print('\n')
            print(tag)

            # checkout to specific release
            repo.git.checkout(tag)

            smell_classes_dict = extract_traditional_smells(project_name)
            if smell_classes_dict is None:
                # detection failed (already reported by the extractor); skip this release
                continue

            for smell, classes in smell_classes_dict.items():
                size = len(classes)
                # Logistic function of the number of smelly classes. Written as
                # 1 / (1 + e^-size) instead of e^size / (1 + e^size) because
                # exp(size) raises OverflowError once size exceeds 709.
                correlation_value = 1 / (1 + exp(-size))

                if correlation_value > seuil:
                    smell_correlation_dict[smell] = smell_correlation_dict.get(smell, 0) + 1

        f.write(str(smell_correlation_dict))
        f.write('\n')
        print(smell_correlation_dict)

print("... Done!")

# Données manuelles et OpenSZZ

In [None]:
from git import Repo



# This cell uses json and exp but did not import them itself.
import json
from math import exp

# Threshold below which a smell is not counted as correlated
seuil = 0.75

# Write one block per project (name, then counts) to the result file; the
# context manager closes the file even if a checkout or detection fails.
with open('result2_b.txt', 'w') as ff:
    for project_name in getProjects():

        # per smell: number of releases in which its value exceeded the threshold
        smell_correlation_dict = {}

        print('\n')
        print("===+> projet name", project_name)
        print('\n')
        repo = Repo(project_name)

        # Write project name in file
        ff.write(project_name)
        ff.write('\n')

        tags = getRelease()[project_name]
        for tag in tags:

            # per smell: classes of this release that have the smell and at least one bug fix
            manu_smell_dict = {}

            tagCorr = getReleaseCorr(project_name, tag)
            print('\n')
            print('tag : ', tag)
            print('tag corr : ', tagCorr)

            # checkout to specific release
            repo.git.checkout(tag)

            smelly_classes = extract_traditional_smells(project_name)
            if smelly_classes is None:
                # detection failed (already reported by the extractor); skip this release
                continue

            # read the manually validated data of this release and close the file right away
            with open('./release-level-data/'+project_name+'-'+tagCorr+'_bug_fixes.json') as f:
                data = json.load(f)

            for line in data:
                if not line['bug_fixes']:
                    continue
                filename = line['file'].split("/")[-1].split(".")[0]
                for smell, classes in smelly_classes.items():
                    if filename in classes:
                        manu_smell_dict.setdefault(smell, []).append(filename)

            print(manu_smell_dict)

            for smell, classes in manu_smell_dict.items():
                size = len(classes)
                print("size : ",size)
                # Logistic function of the number of fixed smelly classes. Written
                # as 1 / (1 + e^-size) instead of e^size / (1 + e^size) because
                # exp(size) raises OverflowError once size exceeds 709.
                correlation_value = 1 / (1 + exp(-size))

                if correlation_value > seuil:
                    print(smell, " : ", correlation_value)
                    smell_correlation_dict[smell] = smell_correlation_dict.get(smell, 0) + 1

        print('\n')
        print('\n')
        print('--- Resultats ---')
        ff.write(str(smell_correlation_dict))
        ff.write('\n')
        print(smell_correlation_dict)

print("... Done!")

# Nettoyage de données

In [None]:
import os
import shutil

# Delete the local clone of every studied project, printing the name of each
# project whose directory is removed.
for project_name in getProjects():
    clone_dir = "./"+project_name
    if not os.path.isdir(clone_dir):
        continue
    print(project_name)
    shutil.rmtree(clone_dir)
print("... Done!")