In [98]:
from tqdm.auto import tqdm
import hashlib
from datetime import datetime
import exifread
import pandas as pd
from pathlib import Path
import os
import shutil
tqdm.pandas(desc="progress-bar")

In [99]:
# Build the table of source media files. The year/month columns start empty
# and are filled in once each file's capture date is known.

src_path=r'C:\Users\vigne\Desktop\Takeout'
dst_path=r'C:\Users\vigne\Pictures\New Photos'
formats = ['*.jpg','*.gif', '*.jpeg', '*.png','*.mp4','*.avi','*.3gp','*.mkv']

# One recursive search per pattern, flattened into a single list of paths.
all_files = [
    found
    for pattern in formats
    for found in Path(src_path).rglob(pattern)
]

df = pd.DataFrame(columns=['filename','year','month'])
df['filename'] = all_files

def capturedate(filename):
    """Return the ``(year, month)`` strings a media file should be filed under.

    The EXIF ``DateTimeOriginal`` tag is preferred. If reading or parsing it
    fails for any reason, the file's last-modified time is used instead.
    The month is zero-padded to two digits.
    """
    with open(filename, 'rb') as media:
        try:
            tags = exifread.process_file(media)
            taken = datetime.strptime(
                str(tags['EXIF DateTimeOriginal']), "%Y:%m:%d %H:%M:%S"
            )
        except Exception:
            # Best effort: no usable EXIF date, so fall back to the mtime.
            modified = datetime.fromtimestamp(os.path.getmtime(filename))
            return modified.strftime('%Y'), modified.strftime('%m')
        return str(taken.year), f"{taken.month:02d}"

# Fill in year/month for every file, then store the paths as plain strings.
year_month = df.progress_apply(
    lambda row: capturedate(row['filename']),
    axis=1,
    result_type="expand",
)
df[['year','month']] = year_month
df['filename'] = df['filename'].astype(str)
def create_new_path(row, dst_root=None):
    """Build the destination path for one file.

    Files are placed under ``<root>/[category/]<year>/<month>/<basename>``.
    The category is chosen in this order:

    * ``Videos``     - the extension is one of .mp4/.avi/.3gp/.mkv
    * ``Screenshot`` - the name contains "screenshot"
    * ``Whatsapp``   - the name contains "whatsapp", or a "WA" marker followed
      by digits at the start of the name or after ``-``, ``_`` or a space
      (for example ``IMG-20200101-WA0001.jpg``)
    * no category    - everything else

    Parameters
    ----------
    row : mapping with 'filename', 'year' and 'month' entries (strings).
    dst_root : optional root folder; defaults to the notebook-level
        ``dst_path`` when omitted.

    Returns
    -------
    str : the destination path.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    videos = ['.mp4','.avi','.3gp','.mkv']
    root = dst_path if dst_root is None else dst_root
    fname = os.path.basename(row['filename'])
    lowered = fname.lower()

    if Path(fname).suffix.lower() in videos:
        category = ('Videos',)
    elif 'screenshot' in lowered:
        category = ('Screenshot',)
    # A bare "'wa' in name" test also matches wallpaper.jpg, hawaii.jpg, ...
    # so require an explicit WhatsApp marker instead.
    elif 'whatsapp' in lowered or re.search(r'(?:^|[-_ ])wa\d+', lowered):
        category = ('Whatsapp',)
    else:
        category = ()
    return os.path.join(root, *category, row['year'], row['month'], fname)

# Work out the destination path of every file, then report how many files there are.
df['new_path'] = df.progress_apply(create_new_path, axis=1)
print(df.shape[0])

progress-bar: 100%|██████████| 260/260 [00:04<00:00, 63.98it/s] 
progress-bar: 100%|██████████| 260/260 [00:00<00:00, 23550.78it/s]

260





In [100]:
# create hash for new files and compare
def create_hash(filename):
    """Return the BLAKE2s hex digest of a file's contents.

    The file is read in 8192-byte blocks, so it is never loaded into memory
    as a whole.
    """
    digest = hashlib.blake2s()
    with open(filename, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()

# Hash every candidate file, then keep only files whose content is not already
# recorded in the destination folder's index (photo_info.pq).
df['hash'] = df['filename'].progress_apply(create_hash)
pq_file = os.path.join(dst_path, 'photo_info.pq')
if os.path.exists(pq_file):
    destination_content = pd.read_parquet(pq_file)
else:
    # No index yet (first run against this destination): nothing is present.
    destination_content = pd.DataFrame(columns=['filename', 'hash'])
orignal_len = len(df)
df = df[~df['hash'].isin(destination_content['hash'])]
# The same content can occur several times inside the source folder itself;
# copy it only once so the index never receives duplicate hashes.
df = df.drop_duplicates(subset='hash')
print(f'will copy {df.shape[0]}/{orignal_len} files')


progress-bar: 100%|██████████| 260/260 [00:12<00:00, 20.81it/s] 

will copy 234/260 files





In [101]:
# Copy files to their new path and record their hashes in the index.

copied_records = []
try:
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        target = row['new_path']
        if os.path.exists(target):
            # Only files whose hash is not yet indexed are copied, so a file
            # already sitting at this path is presumably a different photo
            # with the same name. Add a hash suffix instead of overwriting it.
            stem, extension = os.path.splitext(target)
            target = f"{stem}_{row['hash'][:8]}{extension}"
        os.makedirs(os.path.dirname(target), exist_ok=True)
        shutil.copy(row['filename'], target)
        copied_records.append(
            {'filename': str(target).replace(dst_path, ''), 'hash': row['hash']}
        )
finally:
    # Write back the new hashes, even if a copy failed part-way, so the index
    # always covers every file that actually reached the destination.
    # Rows are collected in a list and concatenated once: DataFrame.append was
    # removed in pandas 2.0 and was quadratic when called inside a loop.
    if copied_records:
        destination_content = pd.concat(
            [destination_content, pd.DataFrame(copied_records)],
            ignore_index=True,
        )
    destination_content.to_parquet(pq_file)

100%|██████████| 234/234 [00:09<00:00, 25.77it/s] 


# Utilities and first-time setup

In [97]:
# Remove duplicates within a folder: for every hash keep the first indexed
# file, delete the others, and bring the index back in line with the folder.
folder=r'C:\Users\vigne\Pictures\New Photos'
index_file = os.path.join(folder,'photo_info.pq')
contents = pd.read_parquet(index_file)
is_duplicate = contents['hash'].duplicated()
# Never delete a path that a kept row points at: two index rows can share the
# same filename, and removing it would destroy the only remaining copy.
kept_files = set(contents.loc[~is_duplicate, 'filename'])
duplicated_files = [
    name
    for name in contents.loc[is_duplicate, 'filename'].to_list()
    if name not in kept_files
]
print("Duplicates : ",len(duplicated_files))
for relative_name in duplicated_files:
    try:
        os.remove(os.path.join(folder, relative_name.strip('\\')))
    except FileNotFoundError:
        # Already removed (for example by an earlier run); nothing left to do.
        pass
# Drop the duplicate rows from the index so this cell can be re-run safely.
contents[~is_duplicate].reset_index(drop=True).to_parquet(index_file)


Duplicates :  756


In [None]:
# Use this cell only when creating a new folder from scratch; otherwise use the
# copy cell above. Afterwards, run the hash-generation cell (below) for that folder.
# Copies every file to its new path.

copy_plan = zip(df['filename'], df['new_path'])
with tqdm(total=len(df)) as progress:
    for source_file, target_file in copy_plan:
        progress.update(1)
        os.makedirs(os.path.dirname(target_file), exist_ok=True)
        shutil.copy(source_file, target_file)

In [72]:
# Generate the hash index (photo_info.pq) for all files in a folder.
# Only needed the first time a folder is set up.

path=r'C:\Users\vigne\Pictures\New Photos'
index_file = os.path.join(path,'photo_info.pq')

# Walk the tree once. Skip directories (a folder name may contain a dot and
# would match '*.*') and skip the index file itself.
candidates = [
    entry
    for entry in Path(path).rglob('*.*')
    if entry.is_file() and entry != Path(index_file)
]

records = []
for file in tqdm(candidates):
    file_hash = hashlib.blake2s()
    with open(file, "rb") as f:
        while chunk := f.read(8192):
            file_hash.update(chunk)
    records.append({'filename':str(file).replace(path,''),'hash':file_hash.hexdigest()})

# Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
# was quadratic when called once per file.
hash_df = pd.DataFrame(records, columns=['filename','hash'])
hash_df.to_parquet(index_file)

100%|██████████| 9307/9307 [01:51<00:00, 83.36it/s] 
