In [132]:
import os
import pickle
import shutil
import tempfile
import traceback
from contextlib import contextmanager
from datetime import datetime, timedelta

import browser_cookie3
import google_auth_httplib2
import imagehash
import progressbar
import requests
from google.auth.exceptions import RefreshError
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.http import build_http
from PIL import Image, UnidentifiedImageError
from pprint import pprint
from requests.cookies import MockRequest

SCOPES = ["https://www.googleapis.com/auth/photoslibrary.readonly"]

LIST = []
TOKEN = None

LOAD_LIST = False
LOAD_DUPL = False

COOKIES = browser_cookie3.chrome()

In [157]:
class UrlError(Exception):
    pass


@contextmanager
def tempinput(data):
    temp = tempfile.NamedTemporaryFile(delete=False)
    temp.write(data)
    temp.close()
    try:
        yield temp.name
    finally:
        os.unlink(temp.name)


def get_image(photo):
    response = requests.get(photo["baseUrl"])
    return response.content


def hash_diff(a, b):
    return abs(a - b) <= 500_000_000_000  # 200


class Hash:
    def __init__(self, hash):
        self.hash = hash
    def __hash__(self):
        return 1  # self.hash
    def __eq__(self, other):
        return hash_diff(other.hash, self.hash)
    def __str__(self):
        return str(self.hash)
    def __repr__(self):
        return self.__str__()

    
def get_hash(image):
    with tempinput(image) as fp:
        return Hash(int("0x" + str(imagehash.phash(Image.open(fp))), base=16))

    
def get_50(iterable):
    result = []
    for item in iterable:
        result.append(item)
        if len(result) == 50:
            yield result
            result.clear()
    yield result

In [136]:
if os.path.exists("photos_list_error.pkl"):
    with open("photos_list_error.pkl", "rb") as fp:
        TOKEN = pickle.load(fp)
        LIST = pickle.load(fp)

creds = None

if not os.path.exists("credentials.pkl"):
    # Set up Google Photos API credentials
    flow = InstalledAppFlow.from_client_secrets_file("credentials4.json", SCOPES)
    creds = flow.run_local_server(port=0)
    with open("credentials.pkl", "wb") as fp:
        pickle.dump(creds, fp)
else:
    with open("credentials.pkl", "rb") as fp:
        creds = pickle.load(fp)

# Set up the Google Photos API client
service = build("photoslibrary", "v1", credentials=creds, static_discovery=False)
http = google_auth_httplib2.AuthorizedHttp(creds, http=build_http())

In [140]:
# Download image metadata and binary image file

if LOAD_LIST:
    with open("photos_list.pkl", "rb") as infile:
        LIST = pickle.load(infile)
else:
    bar = progressbar.ProgressBar(max_value=22706)

    while True:
        if not creds.valid:
            creds.refresh(Request())

        try:
            results = service.mediaItems().list(pageSize=100, pageToken=TOKEN).execute()
        except HttpError as err:
            if err.status_code == 429:
                if os.path.exists("photos_list_error.pkl"):
                    os.remove("photos_list_error.pkl")
                with open("photos_list_error.pkl", "wb") as fp:
                    pickle.dump(TOKEN, fp)
                    pickle.dump(LIST, fp)
            raise

        if "nextPageToken" not in results:
            break

        TOKEN = results["nextPageToken"]

        items = results.get("mediaItems", [])
        for item in items:
            if "image" not in item["mimeType"] or "gif" in item["mimeType"]:
                continue
            if "image" not in item:
                item["image"] = get_image(item)
            LIST.append(item)
            bar.update(len(LIST))

In [141]:
squares = []
for item in progressbar.ProgressBar()(LIST):
    metadata = item["mediaMetadata"]
    if (
        "width" in metadata
        and "height" in metadata
        and metadata["width"] == metadata["height"]
    ):
        squares.append(item)

100% (22706 of 22706) |##################| Elapsed Time: 0:00:00 Time:  0:00:00


In [142]:
print(f"{len(LIST)} photos, {len(squares)} square ({len(squares)/len(LIST)*100:.1f}%)")

22706 photos, 155 square (0.7%)


In [143]:
with open("photos_squares.out", "w") as fp:
    for i, item in enumerate(squares):
        fp.write(item["productUrl"])
        if (i + 1) % 100 == 0:
            fp.write("\n")
        else:
            fp.write(", ")

In [158]:
# Get hashes
for item in progressbar.ProgressBar()(LIST):
    try:
        item["hash"] = get_hash(item["image"])
    except (KeyError, OSError):
        pass
    except UnidentifiedImageError:
        if not creds.valid:
            creds.refresh(Request())
        try:
            result = service.mediaItems().get(mediaItemId=item["id"]).execute()
            item["image"] = get_image(result)
            item["hash"] = get_hash(item["image"])
        except HttpError:
            pass

100% (22706 of 22706) |##################| Elapsed Time: 0:05:10 Time:  0:05:10


In [159]:
if not LOAD_LIST:
    with open("photos_list.pkl", "wb") as fp:
        pickle.dump(LIST, fp)

In [160]:
# Find duplicate hashes
if LOAD_DUPL:
    with open("photos_duplicates.pkl", "rb") as infile:
        DUPL = pickle.load(infile)
else:
    hashes = {}
    DUPL = {}
    for item in progressbar.ProgressBar()(LIST):
        try:
            hash = item["hash"]
        except KeyError:
            pass
        if hash in hashes:
            if item["id"] != hashes[hash]["id"]:
                DUPL.setdefault(hash, [hashes[hash]]).append(item)
        else:
            hashes[hash] = item

100% (22706 of 22706) |##################| Elapsed Time: 0:03:56 Time:  0:03:56


In [161]:
# Sort duplicates
for key, val in progressbar.ProgressBar()(DUPL.items()):
    val.sort(
        key=lambda x: (
            -int(x["mediaMetadata"]["width"] if "width" in x["mediaMetadata"] else 0),
            datetime.fromisoformat(x["mediaMetadata"]["creationTime"].strip("Z")),
        )
    )

100% (472 of 472) |######################| Elapsed Time: 0:00:00 Time:  0:00:00


In [162]:
num_dupl = sum(len(val[1:]) for val in DUPL.values())
print(f"{num_dupl} photos before prune")

597 photos before prune


In [163]:
# Remove real duplicate locations from DUPL
for key, val in progressbar.ProgressBar()(DUPL.items()):
    for item in val[1:]:
        if item["id"] == val[0]["id"]:
            val.remove(item)

100% (472 of 472) |######################| Elapsed Time: 0:00:00 Time:  0:00:00


In [164]:
num_dupl = sum(len(val[1:]) for val in DUPL.values())
print(f"{num_dupl} photos after prune")

530 photos after prune


In [165]:
if not LOAD_DUPL:
    with open("photos_duplicates.pkl", "wb") as fp:
        pickle.dump(DUPL, fp)

In [166]:
num_dupl = sum(len(val[1:]) for val in DUPL.values())
print(f"{len(LIST)} photos, {num_dupl} duplicates ({num_dupl/len(LIST)*100:.1f}%)")

22706 photos, 530 duplicates (2.3%)


In [167]:
# Make duplicate photo directories

shutil.rmtree("photos", ignore_errors=True)

os.mkdir("photos")

for key, val in progressbar.ProgressBar()(DUPL.items()):
    keydir = os.path.join("photos", str(key))
    os.mkdir(keydir)
    orig = True
    for item in val:
        if "image" not in item:
            continue
        mime, ext = item["mimeType"].split("/")
        if mime != "image":
            continue
        filename = f"{item['id']}.{ext}"
        if orig:
            filename = f"!original_{filename}"
            orig = False
        with open(os.path.join(keydir, filename), "wb") as fp:
            fp.write(item["image"])

100% (472 of 472) |######################| Elapsed Time: 0:00:01 Time:  0:00:01


In [168]:
with open("photos_duplicates.out", "w") as fp:
    for key, val in progressbar.ProgressBar()(DUPL.items()):
        print(f"Hash: {key}", file=fp)
        for item in val:
            print(f"    {item['productUrl']}", file=fp)

100% (472 of 472) |######################| Elapsed Time: 0:00:00 Time:  0:00:00


In [169]:
# Create list of items to be deleted
batch = dict(
    sorted(
        DUPL.items(),
        key=lambda x: datetime.fromisoformat(
            x[1][0]["mediaMetadata"]["creationTime"].strip("Z")
        ),
    )
)
batch = [item["productUrl"] for val in batch.values() for item in val[1:]]

In [170]:
with open("photos_tabs.out", "w") as fp:
    for i, url in enumerate(batch):
        fp.write(url)
        if (i + 1) % 100 == 0:
            fp.write("\n")
        else:
            fp.write(", ")

In [22]:
# Create list of dates to be changed
dates = []
for key, val in progressbar.ProgressBar()(DUPL.items()):
    url = val[0]["productUrl"]
    orig_date = datetime.fromisoformat(
        val[0]["mediaMetadata"]["creationTime"].strip("Z")
    )
    min_date = min(
        [
            datetime.fromisoformat(entry["mediaMetadata"]["creationTime"].strip("Z"))
            for entry in val
        ]
    )
    if orig_date - min_date > timedelta(days=1):
        dates.append([min_date, url])

dates.sort(key=lambda x: x[0])

for i, date in enumerate(dates):
    dates[i][0] = date[0].strftime("%Y %m %d %I:%M %p")

100% (9588 of 9588) |####################| Elapsed Time: 0:00:00 Time:  0:00:00


In [23]:
with open("photos_dates.out", "w") as fp:
    for date, url in progressbar.ProgressBar()(dates):
        print(date, url, file=fp)

100% (513 of 513) |######################| Elapsed Time: 0:00:00 Time:  0:00:00


In [None]:
# STOP HERE UNTIL DELETION COMPLETE

In [25]:
# Detect any existing squares
bar = progressbar.ProgressBar(max_value=len(squares))

existing = []
for items in get_50(squares):
    ids = list(set([item["id"] for item in items]))
    results = service.mediaItems().batchGet(mediaItemIds=ids).execute()
    existing += [item["mediaItem"]["productUrl"] for item in results["mediaItemResults"] if "mediaItem" in item]
    bar.update(bar.value + len(items))

print(f"{len(existing)} squares are still there!")

with open("photos_missed_squares.out", "w") as fp:
    for i, url in enumerate(existing):
        fp.write(url)
        if (i + 1) % 100 == 0:
            fp.write("\n")
        else:
            fp.write(", ")

100% (5953 of 5953) |####################| Elapsed Time: 0:00:36 ETA:  00:00:00

305 squares are still there!


In [33]:
# Detect any existing duplicates
duplicates = [item for val in DUPL.values() for item in val[1:]]

bar = progressbar.ProgressBar(max_value=len(duplicates))

existing = []
for items in get_50(duplicates):
    ids = list(set(item["id"] for item in items))
    results = service.mediaItems().batchGet(mediaItemIds=ids).execute()
    existing += [item["mediaItem"]["productUrl"] for item in results["mediaItemResults"] if "mediaItem" in item]
    bar.update(bar.value + len(items))

print(f"{len(existing)} duplicates are still there!")

with open("photos_missed.out", "w") as fp:
    for i, url in enumerate(existing):
        fp.write(url)
        if (i + 1) % 100 == 0:
            fp.write("\n")
        else:
            fp.write(", ")

100% (26358 of 26358) |##################| Elapsed Time: 0:00:00 ETA:  00:00:00
  5% (1350 of 26358) |                   | Elapsed Time: 0:00:11 ETA:   0:03:47

KeyboardInterrupt: 

In [40]:
# Detect any missing originals
originals = [val[0] for val in DUPL.values()]

bar = progressbar.ProgressBar(max_value=len(originals))

missing = []
for items in get_50(originals):
    ids = list(set(item["id"] for item in items))
    results = service.mediaItems().batchGet(mediaItemIds=ids).execute()
    missing += [items[i] for i, result in enumerate(results["mediaItemResults"]) if "status" in result]
    bar.update(bar.value + len(items))

print(f"{len(missing)} originals are missing!")

shutil.rmtree("missing", ignore_errors=True)

os.mkdir("missing")

for item in progressbar.ProgressBar()(missing):
    mime, ext = item["mimeType"].split("/")
    filename = f"{item['id']}.{ext}"
    if "image" in item:
        with open(os.path.join("missing", filename), "wb") as fp:
            fp.write(item["image"])

100% (9588 of 9588) |####################| Elapsed Time: 0:01:21 ETA:  00:00:00

1613 originals are missing!


100% (1613 of 1613) |####################| Elapsed Time: 0:00:01 Time:  0:00:01


In [75]:
LIST[0]["hash"].__hash__()

17050226287

In [76]:
LIST[0]["hash"]

17050226287182450803

In [78]:
for i, item in progressbar.ProgressBar(max_value=len(LIST))(enumerate(LIST)):
    if item["productUrl"] in batch:
        del LIST[i]

100% (23451 of 23451) |##################| Elapsed Time: 0:00:00 Time:  0:00:00


In [125]:
for item in progressbar.ProgressBar()(LIST):
    try:
        item["hash"] = Hash(item["hash"].hash)
    except (KeyError, OSError):
        pass

100% (23061 of 23061) |##################| Elapsed Time: 0:00:00 Time:  0:00:00


In [126]:
LIST[0]["hash"].__hash__()

17050226287182450803