In [None]:
# imports and settings
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm
import cv2
import glob
import numpy.typing as npt
from math import log10, floor

# Route DataFrame.plot() calls through the plotly plotting backend.
pd.set_option("plotting.backend", "plotly")

In [None]:
# Load MNIST and merge the train/test splits into a single float32 image stack.
mnist_train, mnist_test = tf.keras.datasets.mnist.load_data()
train_images, train_labels = mnist_train
test_images, test_labels = mnist_test
images = np.concatenate((train_images, test_images), axis=0)
# Add a trailing channel axis so every image has shape (28, 28, 1).
images = images.reshape(len(images), 28, 28, 1).astype(np.float32)

In [None]:
# Reference (scalar) implementation: DCT every image, then extract the first
# significant digit (FSD) of each non-zero coefficient one value at a time.
dcts = np.array([cv2.dct(image) for image in images]).flatten()
dcts = dcts[dcts != 0]  # math.log10 rejects 0, so zero coefficients are dropped

fsd = []
for coeff in tqdm(dcts):
    if coeff == 0:
        fsd.append(0)
        continue
    exponent = int(floor(log10(abs(coeff))))
    # Shift the decimal point so the leading digit sits in the ones place,
    # then truncate to keep only that digit.
    fsd.append(int(abs(coeff * (10 ** -exponent))))
fsd = np.array(fsd)
print(f"Shape: {fsd.shape}")
print(f"Sum: {np.sum(fsd)}")

In [None]:
# NUMPY: vectorised DCT -> first-significant-digit (FSD) extraction.
# Everything stays inside NumPy; the result is never round-tripped through a
# Python list, which would box every one of the coefficients.
dcts = np.array([cv2.dct(image) for image in images])
coeffs = dcts.flatten()
coeffs = coeffs[coeffs != 0]  # log10(0) is -inf; zero coefficients have no leading digit
# Work in float64 so the power-of-ten scaling is done at full precision.
magnitudes = np.abs(coeffs).astype("float64")
exponents = np.floor(np.log10(magnitudes))
# Shift the decimal point so the leading digit sits in the ones place, then
# truncate towards zero to keep only that digit.
fsd_fast = (magnitudes * np.power(10.0, -exponents)).astype("int")
print(f"Shape: {fsd_fast.shape}")
print(f"Sum: {np.sum(fsd_fast)}")

In [None]:
# Cross-check the loop-based and the vectorised FSD arrays.
# A length mismatch is reported explicitly instead of raising IndexError or
# silently ignoring the tail of the longer array.
if fsd.shape != fsd_fast.shape:
    print(f"shape mismatch: {fsd.shape} != {fsd_fast.shape}")
else:
    # Vectorised search: only the differing positions are visited in Python.
    for i in np.flatnonzero(fsd != fsd_fast):
        print(f"different at index {i}, {fsd[i]} != {fsd_fast[i]}")
print(np.sum(fsd) == np.sum(fsd_fast))

In [None]:
# Relative frequency of each first significant digit (1-9), computed for both
# the loop-based and the vectorised FSD arrays.
count = np.array([np.count_nonzero(fsd == digit) for digit in range(1, 10)])
count = count / count.sum()

count_fast = np.array([np.count_nonzero(fsd_fast == digit) for digit in range(1, 10)])
count_fast = count_fast / count_fast.sum()

print(count)
print(count_fast)

In [None]:
# Ground truth: Benford's law, P(d) = log10(1 + 1/d) for leading digits d = 1..9.
bf_law = [log10(1 + (1 / digit)) for digit in range(1, 10)]
bf_law

In [None]:
# Plot the measured MNIST FSD distribution against Benford's law.
df = pd.DataFrame({
    "digit": list(range(1, 10)),
    "MNIST FSD count": count,
    "Benfords Law (ground truth)": bf_law,
})

axis_labels = {
    "digit" : "First Significant Digit (FSD)",
    "value" : "Probability"
}
df.plot(x="digit", y=["MNIST FSD count", "Benfords Law (ground truth)"], labels=axis_labels)

In [None]:
horses = np.array([cv2.imread(file, cv2.IMREAD_GRAYSCALE) for file in glob.glob("horses/000000/*.png")]).astype("float32")
print(horses.shape)

fsd = []
for horse in tqdm(horses):
    for dct in cv2.dct(horse):
        for n in dct:
            num = int(abs(n * (10 ** -int(floor(log10(abs(n))))))) if n != 0 else 0
            fsd.append(num)
fsd = np.array(fsd)

# count fds
count = []
for i in range(1,10):
    count.append(np.count_nonzero(fsd == i))
count = count / np.sum(count)
print(count)

bf_law = []
for i in range(1,10):
    bf_law.append(log10(1 + (1 / i)))

df = pd.DataFrame()
df["digit"] = [1,2,3,4,5,6,7,8,9]
df["MNIST FSD count"] = count
df["Benfords Law (ground truth)"] = bf_law

df.plot(x="digit", y=["MNIST FSD count", "Benfords Law (ground truth)"],
        labels={
            "digit" : "First Significant Digit (FSD)",
            "value" : "Probability"
        })
