In [2]:
import datetime
import difflib
import json
import os
import subprocess
import tempfile
import time

from git import Commit
from git import IndexFile
from git import Repo
from tqdm import tqdm


def get_merge_conflict_and_resolution(repo_name):
    """Find merge commits whose committed tree differs from an automatic merge.

    Every commit of the repository at ``cases/<repo_name>`` is visited. For each
    commit with exactly two parents, an index is built from the three trees
    (merge base, parent 0, parent 1) and compared, via ``compare_index_files``,
    with the index built from the merge commit itself. Commits for which the
    comparison fails are recorded.

    Side effects:
        Prints the number of recorded commits and the elapsed time, and writes
        the recorded commits (hash and message) to
        ``conflict_commits_adjust.json`` in the current working directory.

    Args:
        repo_name: Name of the repository directory under ``cases/``.

    Returns:
        None.
    """
    # Open the Git repository.
    repo = Repo(f"cases/{repo_name}")
    conflict_commits = []

    # Walk over all commits, timing the whole scan.
    start = time.time()

    for commit in tqdm(repo.iter_commits(), desc="Commits"):
        # Only two-parent merge commits are analysed.
        if len(commit.parents) != 2:
            continue
        try:
            merge_bases = repo.merge_base(commit.parents[0], commit.parents[1])
        except Exception:
            # Best effort: skip merges whose base cannot be computed. Catching
            # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
            continue
        if not merge_bases:
            # The parents share no common ancestor; nothing to merge against.
            continue
        base = merge_bases[0]

        virtual_merge = IndexFile.from_tree(repo, base, commit.parents[0], commit.parents[1])
        actual_merge = IndexFile.from_tree(repo, commit)

        if not compare_index_files(virtual_merge.entries, actual_merge.entries):
            conflict_commits.append({
                "commit_hash": commit.hexsha,
                "commit_message": commit.message
            })

    end = time.time()
    print(f"Total conflict commits: {len(conflict_commits)}")
    print(f"Total extract time: {end-start} s")
    with open("conflict_commits_adjust.json", "w", encoding="utf-8") as f:
        json.dump(conflict_commits, f, indent=2)


def get_conflict_lines_trend(repo_path, conflict_commit_hash):
    """Sample the conflict-line count between two branches over time.

    The span between the merge base and the merge commit is divided into
    ``len(commits1) + len(commits2) + 1`` equal intervals. At the end of each
    interval the newest commit reached so far on each branch is merged against
    the other with ``get_conflict_lines_count``.

    Args:
        repo_path: Path of the repository to open.
        conflict_commit_hash: Hash of the merge commit to analyse.

    Returns:
        ``(segments, counts)``. ``segments`` holds the sample datetimes;
        ``counts`` starts with a leading ``0`` followed by one value per
        sample, so it is one element longer than ``segments``.
    """
    repo = Repo(repo_path)
    conflict_commit = repo.commit(conflict_commit_hash)
    base_commit = repo.merge_base(conflict_commit.parents[0], conflict_commit.parents[1])[0]
    commits1, commits2 = get_commit_lists(conflict_commit, base_commit)

    total_duration = conflict_commit.committed_datetime - base_commit.committed_datetime
    num_intervals = len(commits1) + len(commits2) + 1
    # Length of one sampling interval.
    interval = total_duration / num_intervals

    # Sample points: the end of every interval.
    segments = [base_commit.committed_datetime + (i + 1) * interval for i in range(num_intervals)]
    counts = [0]

    # The branch heads must persist across sample points. Re-initialising them
    # inside the loop would snap a branch back to the merge base at every
    # sample point that has no new commit on that branch.
    cur_commit_branch1 = base_commit
    cur_commit_branch2 = base_commit
    for seg in segments:
        # The commit lists are consumed from the end with pop(), advancing each
        # branch to the last commit whose timestamp is not after the sample point.
        while commits1 and commits1[-1]["time"] <= seg:
            cur_commit_branch1 = commits1.pop()["commit"]

        while commits2 and commits2[-1]["time"] <= seg:
            cur_commit_branch2 = commits2.pop()["commit"]

        # Conflict-line count at this sample point.
        conflict_lines_count = get_conflict_lines_count(repo, cur_commit_branch1, cur_commit_branch2, base_commit)
        counts.append(conflict_lines_count)

    print(segments)
    print(counts)
    return segments, counts


def get_conflict_lines_count(repo: Repo, commit1: Commit, commit2: Commit, base: Commit):
    """Count the lines inside conflict hunks when merging two commits.

    The files returned by ``get_conflict_files`` for the two commits' indexes
    are each merged with ``merge_files``. Every region from a line starting
    with ``<<<<<<<`` to the next line starting with ``>>>>>>>`` contributes its
    length, both marker lines included.

    Args:
        repo: Repository holding the commits.
        commit1: Head of the first branch.
        commit2: Head of the second branch.
        base: Common ancestor passed to the per-file merge.

    Returns:
        Total number of lines inside conflict hunks across all examined files;
        ``0`` when no merge output contains a complete hunk.
    """
    index1 = IndexFile.from_tree(repo, commit1).entries
    index2 = IndexFile.from_tree(repo, commit2).entries
    conflict_files = get_conflict_files(index1, index2)
    conflict_lines_count = 0

    for f in conflict_files:
        merge_lines = merge_files(repo, commit1, commit2, base, f)

        # Sum every hunk separately. Remembering only the last pair of markers
        # would ignore earlier hunks, and a file without markers must add 0.
        hunk_start = None
        for index, line in enumerate(merge_lines):
            if line.startswith('<<<<<<<'):
                hunk_start = index
            elif line.startswith('>>>>>>>') and hunk_start is not None:
                conflict_lines_count += index - hunk_start + 1
                hunk_start = None
        # NOTE(review): ordinary file content that happens to start with these
        # marker strings is indistinguishable from real markers here.

    return conflict_lines_count


def merge_files(repo, commit1, commit2, base, file_path):
    """Three-way merge one file with ``git merge-file`` and return the result lines.

    The content of ``file_path`` at ``base``, ``commit1`` and ``commit2`` is
    obtained with ``repo.git.show`` and written to scratch files in a temporary
    directory, which is removed again before returning.

    Args:
        repo: Object exposing ``git.show(<rev>:<path>)``.
        commit1: Revision used as the "local" side.
        commit2: Revision used as the "remote" side.
        base: Revision used as the common ancestor.
        file_path: Path of the file inside the repository.

    Returns:
        The stdout of ``git merge-file -p`` split into lines.
    """
    # (scratch file name, revision whose content goes into it), in write order.
    sources = (
        ('base_file', base),
        ('local_file', commit1),
        ('remote_file', commit2),
    )

    with tempfile.TemporaryDirectory() as scratch_dir:
        scratch_paths = {}
        for label, revision in sources:
            target = os.path.join(scratch_dir, label)
            # Dump the file content at this revision into its scratch file.
            with open(target, 'w') as handle:
                handle.write(repo.git.show(f'{revision}:{file_path}'))
            scratch_paths[label] = target

        # git merge-file expects: <current> <base> <other>; -p prints to stdout.
        command = [
            'git', 'merge-file', '-p',
            scratch_paths['local_file'],
            scratch_paths['base_file'],
            scratch_paths['remote_file'],
        ]
        completed = subprocess.run(
            command,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )

    return completed.stdout.splitlines()


def get_conflict_files(index1, index2):
    """List the paths whose blob hash differs between two index mappings.

    Only keys present in both mappings are considered. Each key is indexed
    with ``[0]`` to obtain the file path (GitPython keys index entries by
    ``(path, stage)`` -- TODO confirm for the GitPython version in use).

    NOTE: another cell of this notebook also defines ``get_conflict_files``;
    whichever definition was executed last is the one in effect.

    Args:
        index1: Mapping from entry key to an object with a ``hexsha`` attribute.
        index2: Second mapping of the same shape.

    Returns:
        List of first key components (paths) whose ``hexsha`` values differ.
    """
    shared_keys = set(index1.keys()) & set(index2.keys())
    return [
        key[0]
        for key in shared_keys
        if index1[key].hexsha != index2[key].hexsha
    ]


def get_commit_lists(conflict_commit, base_commit) -> tuple[list, list]:
    """Collect the commits on each side of a merge, back to the common base.

    Starting from each of the two parents of ``conflict_commit``, the history
    is followed through ``parents[0]`` until ``base_commit`` is reached. The
    base itself is not included.

    Args:
        conflict_commit: Merge commit with (at least) two parents.
        base_commit: Commit at which each walk stops.

    Returns:
        Two lists, one per parent, ordered from the branch tip towards the
        base. Each item is a dict with the commit under ``"commit"`` and its
        commit time converted to UTC under ``"time"``.
    """
    def walk_to_base(tip):
        # Follow the first-parent chain from `tip` until the base is hit.
        chain = []
        cursor = tip
        while cursor != base_commit:
            utc_time = cursor.committed_datetime.astimezone(datetime.timezone.utc)
            chain.append({"commit": cursor, "time": utc_time})
            cursor = cursor.parents[0]
        return chain

    branch_tips = conflict_commit.parents
    return walk_to_base(branch_tips[0]), walk_to_base(branch_tips[1])


def compare_index_files(index1, index2):
    """Tell whether two index entry mappings describe the same content.

    The key sets must match, and for every key the ``path``, ``mode`` and
    ``hexsha`` attributes of the two entries must be equal. The first
    mismatch found is printed and ends the comparison.

    Args:
        index1: Mapping from entry key to entry object.
        index2: Second mapping of the same shape.

    Returns:
        ``True`` when no difference is found, ``False`` otherwise.
    """
    # Different key sets: no point comparing individual entries.
    if set(index1.keys()) != set(index2.keys()):
        print("index keys difference")
        return False

    # Compare entries key by key, attribute by attribute, in a fixed order.
    checked_fields = ("path", "mode", "hexsha")
    for key, left_entry in index1.items():
        right_entry = index2[key]
        for field in checked_fields:
            if getattr(left_entry, field) != getattr(right_entry, field):
                print(f"{field} difference: {key}")
                return False

    return True


In [17]:
# Inputs for the manual walkthrough below: one repository and one merge commit.
# These names are kernel globals that the following cells read.
repo_path = "../cases/rails"
conflict_commit_hash = "8898a0ae2a0d5c8826fe08bbf4ccd10745802f9e"
repo = Repo(repo_path)
conflict_commit = repo.commit(conflict_commit_hash)
# First merge base reported for the two parents of the merge commit.
base_commit = repo.merge_base(conflict_commit.parents[0], conflict_commit.parents[1])[0]

# <git.Commit "3ec5aab87c18df2e79e336124657f8aef9074095">
# <git.Commit "0657d1ef9e0a4a4e3974028a3a7c6426330c4c2b">

In [18]:
# Index built from three trees: merge base, parent 0, parent 1.
# Entries with a non-zero stage are treated as conflicted further down.
virtual_merge = IndexFile.from_tree(repo, base_commit, conflict_commit.parents[0], conflict_commit.parents[1])

In [19]:
def get_conflict_files(conflict_index_files: "IndexFile | dict", index2=None):
    """Report conflicted files, from a merged index or from two plain indexes.

    This name is defined twice in the notebook. A single-argument definition
    here would shadow the earlier two-argument one and make its callers fail
    with ``TypeError``, so both call forms are supported:

    * ``get_conflict_files(index_file)`` -- ``index_file`` exposes
      ``.entries``; every entry whose ``stage`` is non-zero is grouped by path.
    * ``get_conflict_files(index1, index2)`` -- both arguments are entry
      mappings; the paths whose ``hexsha`` differs between them are listed.

    Args:
        conflict_index_files: Object with an ``entries`` mapping (one-argument
            form) or the first entry mapping (two-argument form).
        index2: Second entry mapping; ``None`` selects the one-argument form.

    Returns:
        One-argument form: ``{path: {stage: hexsha}}`` for non-zero stages.
        Two-argument form: list of paths (first key component) present in both
        mappings whose ``hexsha`` values differ.
    """
    if index2 is not None:
        # Two-index form, matching the definition from the first cell.
        index1 = conflict_index_files
        shared_keys = set(index1.keys()) & set(index2.keys())
        return [key[0] for key in shared_keys if index1[key].hexsha != index2[key].hexsha]

    conflicted_files = {}
    for entry in conflict_index_files.entries.values():
        # Stage 0 entries are skipped; only the other stages are collected.
        if entry.stage == 0:
            continue
        conflicted_files.setdefault(entry.path, {})[entry.stage] = entry.hexsha
    return conflicted_files
# Non-zero-stage entries of the three-tree index, grouped by path; later cells index this by path and stage.
conflict_files = get_conflict_files(virtual_merge)

In [20]:
conflict_files

{'activerecord/lib/active_record/reflection.rb': {1: 'e1a3c59f08e470b0a7e92513bc766631b83de991',
  2: '8af0d06ecb1f25f0c56c47be43b03457a3d3b71d',
  3: '5bfc6727d80751c73449151ffe47ef505bd957a5'},
 'railties/lib/rails/command.rb': {1: '6065e78fd11dc63a1e9bbd727dca5697f89a9938',
  2: 'ddb953543f6ec420b75a6714d4df6fb1ae910312',
  3: 'e8497579ae60842d334a3975d222b9aa73ee86a9'},
 'railties/lib/rails/engine.rb': {1: 'e56f6159ad332631b9e7aa68432dc465134b50f5',
  2: '12b4f06c27eb47bcb42c3fb9a5c70bc7f7dd6b10',
  3: '53334489aad1519f8a14a9c3c7a61463455047a4'},
 'railties/lib/rails/generators.rb': {1: '67037106e513a11f74894bfcc222b3e395ef6b6a',
  2: 'e1980a42ad6f4b0675f6220090fe7601a52d0508',
  3: 'c5b7caaf2129770a9f95e13f600e84683f835271'}}

In [37]:
# Dump the stage-1 object of command.rb to ../tmp/base (used as the base input of merge-file below).
with open("../tmp/base", "w") as f:
    f.write(repo.git.show(f'{conflict_files["railties/lib/rails/command.rb"][1]}'))

In [38]:
# Dump the stage-2 object of command.rb to ../tmp/local (used as the local input of merge-file below).
with open("../tmp/local", "w") as f:
    f.write(repo.git.show(f'{conflict_files["railties/lib/rails/command.rb"][2]}'))

In [39]:
# Dump the stage-3 object of command.rb to ../tmp/remote (used as the remote input of merge-file below).
with open("../tmp/remote", "w") as f:
    f.write(repo.git.show(f'{conflict_files["railties/lib/rails/command.rb"][3]}'))

In [40]:
# Three-way merge of the dumped files. Arguments are passed in the order
# local, base, remote; -p sends the result to stdout and --diff3 selects
# diff3-style conflict output. The exit status in `result` is not inspected.
result = subprocess.run(
    ['git', 'merge-file', '-p', "--diff3", "../tmp/local", "../tmp/base", "../tmp/remote"],
    stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
)
# Merge output as a list of lines without line terminators.
merge_lines = result.stdout.splitlines()

In [41]:
# Save the merge output for inspection; lines are re-joined with "\n" and no trailing newline is written.
with open("../tmp/merge", "w") as f:
    f.write("\n".join(merge_lines))

In [42]:
merge_lines

['require "active_support"',
 'require "active_support/dependencies/autoload"',
 'require "active_support/core_ext/enumerable"',
 'require "active_support/core_ext/object/blank"',
 'require "active_support/core_ext/hash/transform_values"',
 '',
 'require "thor"',
 '',
 'module Rails',
 '  module Command',
 '    extend ActiveSupport::Autoload',
 '',
 '    autoload :Behavior',
 '    autoload :Base',
 '',
 '    include Behavior',
 '',
 '    class << self',
 '      def hidden_commands # :nodoc:',
 '        @hidden_commands ||= []',
 '      end',
 '',
 '      def environment # :nodoc:',
 '        ENV["RAILS_ENV"] || ENV["RACK_ENV"] || "development"',
 '      end',
 '',
 '      # Receives a namespace, arguments and the behavior to invoke the command.',
 '      def invoke(namespace, args = [], **config)',
 '        namespace = namespace.to_s',
 '        namespace = "help" if namespace.blank? || Thor::HELP_MAPPINGS.include?(namespace)',
 '        namespace = "version" if %w( -v --version ).inc

In [80]:
# Print the 0-based line index of every conflict-hunk opening and closing marker in the merge output.
for index, line in enumerate(merge_lines):
    if line.startswith('<<<<<<<'):
        print(f"Start: {index}")
    elif line.startswith('>>>>>>>'):
        print(f"End: {index}")

In [42]:

# Unified diff from text `c` to text `b`.
# NOTE(review): `c` and `b` are not defined in any visible cell -- they must
# come from earlier kernel state; confirm before relying on this cell.
# The inputs are split without line endings, so lineterm="" keeps the diff's
# header lines newline-free too, and the lines are joined with a real newline.
diff = difflib.unified_diff(c.splitlines(), b.splitlines(), lineterm="")
"\n".join(diff)



In [45]:
# Load previously extracted commit records (dicts with a "commit_message" key).
commits = []
with open("../conflict_commits_rails_bac.json", "r") as f:
    commits = json.load(f)

# Keep only the commits whose message contains a "Conflicts:" line.
filtered_commits = [c for c in commits if "Conflicts:\n" in c["commit_message"]]

# Write the filtered records to a separate JSON file.
with open("../conflict_commits_rails_filtered.json", "w") as f:
    json.dump(filtered_commits, f, indent=2)