In [8]:
import pandas as pd
import shutil
from tqdm.notebook import trange, tqdm

import imagehash




In [13]:
def get_media_file_meta_df(df_meta_files):
    df_meta_files = df_meta_files[df_meta_files['is_file']==True]
    df_meta_files['ext_lower'] = df_meta_files['ext'].str.lower()
    # print(df_meta_files.ext_lower.value_counts())
    selected_ext = [
        ".jpg", 
        ".heic",
        ".png", 
        ".mov", 
        ".avi", 
        ".aae", 
        ".bmp", 
        ".mp4", 
        ".mts", 
        ".gif", 
        ".3gp", 
        ".mpg", 
        ".mp3", 
        ".3gp", 
        ".jpeg",
        ".flv",
        ".wmv",
        ".wav",
        ".webp",
        ".m4v",
    ]

    df_meta_file_media = df_meta_files[df_meta_files["ext_lower"].isin(set(selected_ext))]
    total_media_size = df_meta_file_media['size'].sum()
    total_file_size = df_meta_files['size'].sum()
    print('total media size:\t', total_media_size)
    print('total file size:\t', total_file_size)
    print('media-size / file-size:\t', total_media_size/total_file_size)
    
    return df_meta_file_media.to_dict("records")



def get_dict(records):
    d = {}
    for re in records:
        fname = re['fname']
        if fname not in d.keys():
            d[fname] = []
        d[fname].append(re)
    
    for key in d.keys():
        d[key] = sorted(d[key], key=lambda x: x['mtime'])
    
    return d


def is_same(record1, record2):
    # is two file the same?
    is_fname_same = record1['fname'] == record2['fname']
    # is_size_same = record1['size'] == record2['size']
    is_size_same = (abs(record1['size'] - record2['size']) <= 10)
    
    # is_ctime_yyyy_same = record1['ctime_YYYY'] == record2['ctime_YYYY']
    # is_ctime_mm_same = record1['ctime_MM'] == record2['ctime_MM']
    # is_ctime_dd_same = record1['ctime_DD'] == record2['ctime_DD']
    # is_ctime_same = (is_ctime_yyyy_same and is_ctime_mm_same and is_ctime_dd_same)
    
    # is_ctime_same = record1['ctime'] == record2['ctime']
    # is_mtime_same = record1['mtime'] == record2['mtime']

    is_mtime_exact_same = record1['mtime'] == record2['mtime']
    
    is_mtime_yyyy_same = record1['mtime_YYYY'] == record2['mtime_YYYY']
    is_mtime_mm_same = record1['mtime_MM'] == record2['mtime_MM']
    is_mtime_dd_same = record1['mtime_DD'] == record2['mtime_DD']
    is_mtime_same = (is_mtime_yyyy_same and is_mtime_mm_same and is_mtime_dd_same)
    is_img_hash_similar = False
    
    img_hash_str_1 = record1['img_hash_str']
    img_hash_str_2 = record2['img_hash_str']
    if img_hash_str_1 == "" or img_hash_str_2 == "":
        is_img_hash_similar =  True
    else:
        # if info1['Image Height'] == info2['Image Height'] and info1['Image Width'] == info2['Image Width']:
        #     is_metadata_same = True
        if (imagehash.hex_to_hash(img_hash_str_1)  - imagehash.hex_to_hash(img_hash_str_2) )< 5: # bit hamming distance
            is_img_hash_similar = True
    
    return is_fname_same and (is_mtime_exact_same or is_img_hash_similar or is_size_same) 
    

def resolve_fname_conflict(records):
    total_records = len(records)
    if total_records == 1:
        return records
        
    status = [-1]*total_records
    status[0] = 1 # 1: keep # -1: not determine #0: remove
    
    
    for i in range(total_records):
        for j in range(i+1,total_records):
            record_i = records[i]
            record_j = records[j]
            
            if status[j]==0:
                # j is deleted, nothing to compare, do not make change
                continue
            
            is_same_val = is_same(record_i, record_j)
            # print(record_i['size'],record_j['size'],is_same_val)
            
            if is_same_val:
                # delete
                status[j] = 0
            else:
                # keep so far
                status[j] = 1
    
    deduplicate_records = []
    
    for keep, record in zip(status, records):
        if keep == 1:
            deduplicate_records.append(record)
        else:
            continue

    return deduplicate_records


def merge_dict(dict_old, dict_google):
    # merge two into one big dictionary, then resolve fname
    len_google_key = len(dict_google.keys())
    len_not_same_name = 0
    len_same_name = 0
    for old_key in tqdm(dict_old.keys()):
        if old_key in dict_google.keys():
            old_records = dict_old[old_key]
            dict_google[old_key]+=old_records
            len_same_name+=1
        else:
            dict_google[old_key]=dict_old[old_key]
            len_not_same_name+=1
        prev_records = dict_google[old_key]
        dict_google[old_key] = resolve_fname_conflict(prev_records)
    
    len_google_key_new = len(dict_google.keys())
    print(f'#google key: old:{len_google_key}, new: {len_google_key_new}, added: {len_google_key_new - len_google_key}')
    print('len_not_same_name', len_not_same_name)
    print('len_same_name', len_same_name)
    return dict_google


def merge_files(rcd_old, rcd_google):
    # get the dict of list
    dict_old = get_dict(rcd_old)
    dict_google = get_dict(rcd_google)
    print('old #fname', len(dict_old.keys()))
    old_records = sum([len(dict_old[key]) for key in dict_old.keys()])
    print('old #rcrds', old_records)
    
    print('google #fname', len(dict_google.keys()))
    google_records = sum([len(dict_google[key]) for key in dict_google.keys()])
    print('google #rcrds', google_records)
    
    print('total #rcrd', old_records+google_records)

    # self-deduplication
    dedup_num_rcds_old = 0 
    dedup_num_rcds_google = 0
    for key in tqdm(dict_old.keys()):
        prev_records = dict_old[key]
        prev_num_rcds = len(prev_records)
        dict_old[key] = resolve_fname_conflict(prev_records)
        curt_num_rcds = len(dict_old[key])
        dedup_num_rcds_old += (prev_num_rcds - curt_num_rcds)
    for key in tqdm(dict_google.keys()):
        prev_records = dict_google[key]
        prev_num_rcds = len(prev_records)
        dict_google[key] = resolve_fname_conflict(dict_google[key])
        curt_num_rcds = len(dict_google[key])
        dedup_num_rcds_google += (prev_num_rcds - curt_num_rcds)
        
    # for the same size.... how to remove them ? 
    print('self-dedup', dedup_num_rcds_old, dedup_num_rcds_google)
    # merge two data
    dict_google = merge_dict(dict_old, dict_google)
    return dict_google



In [14]:

df_meta_old = pd.read_json("/Volumes/ssd-0/old-photo-organize/MyPhotoVideoAudio-meta.json")
rcrd_meta_old_media = get_media_file_meta_df(df_meta_old)

df_meta_google = pd.read_json("/Volumes/ssd-0/old-photo-organize/google-photo-dump-20230220-meta.json")
rcrd_meta_google_media = get_media_file_meta_df(df_meta_google)

dict_all = merge_files(rcrd_meta_old_media, rcrd_meta_google_media)

total_records = sum([len(dict_all[key]) for key in dict_all.keys()])
print(total_records)


total media size:	 339501557
total file size:	 339501557
media-size / file-size:	 1.0
total media size:	 2910934
total file size:	 2910934
media-size / file-size:	 1.0
old #fname 8
old #rcrds 8
google #fname 3
google #rcrds 3
total #rcrd 11


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta_files['ext_lower'] = df_meta_files['ext'].str.lower()


  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

self-dedup 0 0


  0%|          | 0/8 [00:00<?, ?it/s]

#google key: old:3, new: 11, added: 8
len_not_same_name 8
len_same_name 0
11


### Proba duplicate imgs

In [15]:
key_records = [(key, len(dict_all[key])) for key in dict_all.keys()]
key_records = sorted(key_records, key=lambda x:x[1], reverse=True)
print(key_records[:100])
#71380
#68984

[('Wallace_at_University_of_Alabama_edit2.jpg', 1), ('DSC07994.JPG', 1), ('IMG_0867.MOV', 1), ('P5201477.AVI', 1), ('PB030168.JPG', 1), ('P5201458.AVI', 1), ('P1250290.AVI', 1), ('PA280004.JPG', 1), ('P1250286.AVI', 1), ('PA290079.JPG', 1), ('PA290077.JPG', 1)]


In [12]:
dict_all['IMG_0430.JPG']

KeyError: 'IMG_0430.JPG'