In [None]:
# imports and settings
import tensorflow as tf
import numpy as np
import pandas as pd
import tqdm
import cv2
from math import log10, floor

# Route every DataFrame.plot() call in this notebook through plotly.
pd.set_option("plotting.backend", "plotly")

In [None]:
# Load MNIST and merge the train and test splits into a single float32 stack.
mnist_train, mnist_test = tf.keras.datasets.mnist.load_data()
train_images, train_labels = mnist_train
test_images, test_labels = mnist_test

all_images = np.concatenate((train_images, test_images), axis=0)
# Reshape to (N, 28, 28, 1) so each image carries an explicit channel axis.
images = all_images.reshape((len(all_images), 28, 28, 1)).astype(np.float32)

In [None]:
# Run the DCT on every image and gather the first significant digit (FSD)
# of each coefficient.
def _first_significant_digits(values):
    """Return the first significant digit of every element of ``values``.

    Parameters
    ----------
    values : array-like
        Real numbers of any shape.

    Returns
    -------
    numpy.ndarray
        1-D ``int64`` array, in row-major order of ``values``, holding a digit
        in 1..9 for each finite non-zero element and 0 for zeros and
        non-finite entries.
    """
    magnitudes = np.abs(np.asarray(values, dtype=np.float64)).ravel()
    digits = np.zeros(magnitudes.shape, dtype=np.int64)

    # log10 is undefined at 0 and meaningless for inf/nan: leave those at 0.
    valid = np.isfinite(magnitudes) & (magnitudes > 0)
    mags = magnitudes[valid]

    # Scale each value into [1, 10) and truncate to get its leading digit.
    exponents = np.floor(np.log10(mags))
    leading = np.floor(mags * np.power(10.0, -exponents)).astype(np.int64)

    # log10 can round across a power of ten, which lands the scaled value just
    # outside [1, 10). A result below 1 means the value sits just under a power
    # of ten (true digit 9); a result of 10 means it sits on one (true digit 1).
    leading[leading < 1] = 9
    leading[leading > 9] = 1

    digits[valid] = leading
    return digits


# One vectorised pass per image instead of a Python-level loop per coefficient.
per_image_digits = [
    _first_significant_digits(cv2.dct(image)) for image in tqdm.tqdm(images)
]
fsd = (
    np.concatenate(per_image_digits)
    if per_image_digits
    else np.array([], dtype=np.int64)
)

In [None]:
# Count how often each digit 1..9 occurs as an FSD, then normalise the counts
# so they sum to 1 (zeros stored in fsd are not part of the distribution).
digit_totals = [np.count_nonzero(fsd == digit) for digit in range(1, 10)]
count = digit_totals / np.sum(digit_totals)

In [None]:
# Benford's law reference distribution: P(d) = log10(1 + 1/d) for d = 1..9.
bf_law = [log10(1 + (1 / digit)) for digit in range(1, 10)]

In [None]:
# Plot the measured FSD distribution against Benford's law for comparison.
df = pd.DataFrame(
    {
        "digit": [1, 2, 3, 4, 5, 6, 7, 8, 9],
        "MNIST FSD count": count,
        "Benfords Law (ground truth)": bf_law,
    }
)

axis_titles = {
    "digit": "First Significant Digit (FSD)",
    "value": "Probability",
}
df.plot(
    x="digit",
    y=["MNIST FSD count", "Benfords Law (ground truth)"],
    labels=axis_titles,
)