In [25]:
#Source : #https://folivora.tistory.com/67, https://gagadi.tistory.com/9

import os 
import hashlib 
import collections
import shutil

def compute_partial_hash_of(file_path, seek_pos, chunk_size): 
    with open(file_path, "rb") as f: 
        f.seek(seek_pos) 
        data = f.read(chunk_size) 
        if not data: 
            return # no more yield 
        
        hashes = hashlib.sha256()
        hashes.update(data)
        yield "_".join(map(lambda y: y.hexdigest(), hashes)) 
        
def make_challenge(file_path): 
    chunk_size = 1048576 #2**20
    file_size = os.path.getsize(file_path) 
    yield file_size
    yield from compute_partial_hash_of(file_path, 0, chunk_size)
    ## 1 MB = 1000000 bytes 의 hash 만을 가지고 비교
    yield from compute_partial_hash_of(file_path, file_size - chunk_size, chunk_size)
#     print(file_size - chunk_size)

def fuzzy_group_files(target_directory): 
    target_files = [] 
    for root, _, files in os.walk(target_directory): 
        target_files.extend(map(lambda x: os.path.join(root, x), files)) 
        
    groups, challenge, challenge_result = init_challenge(target_files) 
    
    found_challenge = True
    
    while found_challenge: 
        found_challenge = False 
        for _, v in groups.items(): 
            if len(v) > 1: 
                for file_path in v: 
                    if challenge[file_path] is not None: 
                        try: 
                            challenge_result[file_path].append(next(challenge[file_path])) 
                            found_challenge = True
                        except StopIteration: 
                            challenge[file_path] = None 
                            pass 
                
                groups = regroup(challenge_result) 
                print("중복 파일 ({x : y)},",collections.Counter(map(lambda x: len(x), groups.values()))) 
            return groups 
    
def regroup(challenge_result): 
    groups = {} # Regroup items 
    for file_path, v in challenge_result.items(): 
        key = "/".join(map(lambda x: str(x), v)) 
        if key not in groups: 
            groups[key] = [] 
        groups[key].append(file_path) 
    return groups 
    
def init_challenge(target_files): 
    groups = {"": []} 
    challenge = {} 
    challenge_result = {} 
    for file_path in target_files: 
        challenge[file_path] = make_challenge(file_path) 
        challenge_result[file_path] = [] 
        groups[""].append(file_path) 
    return groups, challenge, challenge_result 


def read_all_file(path):
    output = os.listdir(path) 
    file_list = [] 
    
    for i in output: 
        if os.path.isdir(path + "/" + i) and path.endswith('.exe'): 
            file_list.extend(read_all_file(path + "/" +i)) 
        elif os.path.isfile(path + "/" +i):
            file_list.append(path + "/" +i) 

    return file_list 

def copy_all_file(file_list, new_path): 
    for src_path in file_list: 
        file = src_path.split("/")[-1] 
        shutil.copyfile(src_path, new_path+"/"+file) 
        print("파일 {} 이동 완료".format(file)) # 작업한 파일명 출력

def main():
    
    src_path = "C:\\Users\\teddy\\files1" # 기존 폴더 경로
    new_path = "C:\\Users\\teddy\\files2" # 옮길 폴더 경로
    
    file_list = read_all_file(src_path) 
    copy_all_file(file_list, new_path)
    
    groups = fuzzy_group_files(new_path) 
    for _, v in groups.items(): 
        if len(v) > 1: 
            print("$$$$$ 중복 파일 출력 $$$$$") 
            print("\n".join(v))

        elif len(v) < 1:
            print("중복 파일 없음")
                
if __name__ == "__main__": 
    main()

    
    

파일 123  123.exe 이동 완료
파일 32rrfsdfa.exe 이동 완료
파일 sisinfo.exe 이동 완료
중복 파일 ({x : y)}, Counter({1: 3, 4: 1})
$$$$$ 중복 파일 출력 $$$$$
C:\Users\teddy\files2\123  123.exe
C:\Users\teddy\files2\123123.exe
C:\Users\teddy\files2\32rrfsdfa.exe
C:\Users\teddy\files2\sisinfo.exe
