In [1]:
import git
import re

In [2]:
#git diff between two commit
def get_diff(repo_path, commit_A, commit_B):
    repo = git.Repo(repo_path)
    diff = repo.git.diff(commit_A, commit_B, '-U0', '--histogram')
    return diff

In [3]:
#get the dictionary where the key is the file path and the value is a list of numbers of the changed lines
def generate_changes_dict(diff_output):
    file_path_pattern = re.compile(r'^\+\+\+ b/(.*)$')
    line_number_pattern = re.compile(r'^@@ -(\d+)(,(\d+))? \+(\d+)(,(\d+))? @@')

    result_dict = {}
    current_file_path = None
    numbers_list = []

    diff_lines = diff_output.split('\n')

    for line in diff_lines:

        file_path_match = file_path_pattern.match(line)
        line_number_match = line_number_pattern.match(line)

        if file_path_match:
            if current_file_path and numbers_list:
                result_dict[current_file_path] = numbers_list
                numbers_list = []

            current_file_path = file_path_match.group(1)
        elif line_number_match:
            start_line = int(line_number_match.group(1))
            num_lines = 1 if line_number_match.group(3) is None else int(line_number_match.group(3))
            
            # Aggiungi le linee modificate solo se non sono commenti
            if not match_comment(line):
                numbers_list.extend(range(start_line, start_line + num_lines))

    if current_file_path and numbers_list:
        result_dict[current_file_path] = numbers_list

    return result_dict


Funzione per non considerare le modifiche che riguardano commenti( cattura quelli su singola linea, le linee che iniziano per /* o <!– e quelle che finiscono per –!> o */

In [4]:
def match_comment(line):
    comment_pattern = re.compile(r'^\s*(#|//|<!--|/\*)|(?:.*?--!>|.*?\*/)\s*$')
    
    return comment_pattern.match(line[1:])  # Ignora il primo carattere perchè le linee iniziano per '-'

In [5]:
def get_candidate_commits(blame_result, file_path, changes_dict):
    # Definisci il pattern delle espressioni regolari
    pattern = re.compile(r'([a-f0-9]+)\s+(\d+)\s+(\d+)?(?:\s+(\d+))?\nauthor\s+([^\n]+)')

    # Inizializza il set di commit
    commit_set = set()

    # Trova tutte le corrispondenze nel testo di output
    matches = pattern.findall(blame_result)

    # Estrai le informazioni desiderate
    for match in matches:
        commit_hash, first_number, second_number, third_number, author = match
        
        # Controlla se il secondo numero è nella lista associata al percorso del file
        if int(second_number) in changes_dict.get(file_path, []):
            # Aggiungi le informazioni richieste al set
            commit_set.add((commit_hash, author))

    # Restituisci il set di commit
    return commit_set

In [6]:
def get_all_candidate_commits(repo, parent_commit, changes_dict):
    all_candidate_commits = set()

    for file_path, line_numbers in changes_dict.items():
        blame_result = repo.git.blame(parent_commit, file_path, "--line-porcelain")
        candidate_commits = get_candidate_commits(blame_result, file_path, changes_dict)
        all_candidate_commits = all_candidate_commits.union(candidate_commits)
    
    return all_candidate_commits

In [24]:
#retrieve commit from the repo
repository_url = "/Users/guido/Documents/Progetto/tensorflow"
repo = git.Repo(repository_url)
commits = repo.iter_commits()

In [25]:
#retrieve bug fix commit
bug_fix_commits = []

for commit in commits:
    commit_message = commit.message.lower()
    if 'bug' in commit_message and 'fix' in commit_message:
        bug_fix_commits.append(commit)

In [None]:
#retrieve only one commit and his parent
bug_fix_commit = bug_fix_commits[0]
parent_commit = bug_fix_commit.parents[0]

In [None]:
diff = get_diff(repository_url, bug_fix_commit, parent_commit)
print(diff)

In [None]:
changes_dict = generate_changes_dict(diff)
print(changes_dict)

In [14]:
all_candidate_commits = get_all_candidate_commits(repo, parent_commit, changes_dict)

In [17]:
print(bug_fix_commit)
print("Candidate commits: ")
print(all_candidate_commits)

60e827df64a757493b29756dd9cb71224a7c34bb
Candidate commits: 
{('cf1b0378f428fedb5194083f6ba9aa708388a58d', 'Adrian Kuegel'), ('f4529e80ab30a51207901b74b438980ac8b3ceaf', 'Adrian Kuegel'), ('2f60589a4f35f09576d055ed5fa3c3df0625fc62', 'Adrian Kuegel'), ('85ac1c6ddc93d4f53ff5b2c5c1c7bac7a8a44030', 'Sergey Kozub'), ('8651f9b1a81f2568e364aa8beb969d9162cd21aa', 'Adrian Kuegel')}
