In [1]:
# Install the plotting and progress-bar dependencies into the kernel's environment.
# NOTE(review): polars is imported further down but is not installed here —
# confirm the environment already provides it.
%pip install matplotlib tqdm

Collecting matplotlib
  Obtaining dependency information for matplotlib from https://files.pythonhosted.org/packages/4f/d7/3303f11188122f66c940056f162d030992e7fbc9c702869bab163e85156b/matplotlib-3.7.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading matplotlib-3.7.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Obtaining dependency information for contourpy>=1.0.1 from https://files.pythonhosted.org/packages/d8/23/8d968922459b1c8a2c6ffca28fac00324b06b3a0633be2a39b0b1c3f84ab/contourpy-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading contourpy-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.7 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Obtaining dependency information for fonttools>=4.22.0 from https://files

In [2]:
import random
from pathlib import Path

import matplotlib.pyplot as plt
import polars as pl
from tqdm import tqdm

from src.datarules.dataset_builder import DatasetBuilder
from src.datarules.base_rules import Rule

In [3]:
# Read the file database from disk and display the resulting frame.
# `filename` is reused by the save cell further down.
filename = "filedb.feather"
df = pl.read_ipc(filename)
df

path,mtime,size,width,height,channels
str,datetime[ms],i64,i64,i64,i64
"""/mnt/Toshiba/.…",2022-02-14 06:45:03,251546,1600,1600,3
"""/mnt/Toshiba/.…",2010-05-22 07:14:45,58654,396,592,3
"""/mnt/Toshiba/.…",2016-08-13 13:36:10,474472,1200,784,3
"""/mnt/Toshiba/.…",2022-05-07 10:34:11,216346,1600,1600,3
"""/mnt/Toshiba/.…",2020-10-24 10:47:58,137881,1280,1280,3
"""/mnt/Toshiba/.…",2022-07-04 12:05:17,1818210,2240,1280,3
"""/mnt/Toshiba/.…",2017-02-03 20:24:34,648837,826,1000,3
"""/mnt/Toshiba/.…",2018-04-20 07:56:20,1038371,2448,3264,3
"""/mnt/Toshiba/.…",2021-05-21 12:48:32,697842,1466,1232,3
"""/mnt/Toshiba/.…",2020-10-30 07:57:49,808173,800,1091,3


In [4]:
# shuffle dataframe
def shuffle(df: pl.DataFrame, rand_col="rnd") -> pl.DataFrame:
    """Return ``df`` with its rows in random order.

    A temporary column of ``random.random()`` draws is attached, the frame is
    sorted by it, and the temporary column is removed again.

    Args:
        df: Frame whose rows should be reordered.
        rand_col: Preferred name for the temporary sort-key column.

    Returns:
        A frame with the same columns as ``df`` and its rows sorted by the
        random key.
    """
    # `with_columns` replaces an existing column of the same name, and the
    # final `drop` would then delete it. Pick a key name that is guaranteed
    # not to collide with a real column so no user data is lost.
    sort_key = rand_col
    while sort_key in df.columns:
        sort_key = f"_{sort_key}"

    keys = pl.Series(sort_key, [random.random() for _ in range(len(df))])
    return df.with_columns(keys).sort(sort_key).drop(sort_key)


df = shuffle(df)

In [6]:
# delete random items in dataframe
# NOTE(review): no visible cell reads this module-level `thresh`; `drop_rand` has its
# own `thresh` parameter and is called with 0.5 below — confirm this is still needed.
thresh = 0.95
# NOTE(review): `random` is already imported in the imports cell; this repeat is redundant.
import random


def rnd(_):
    """Return a fresh ``random.random()`` draw; the argument is ignored.

    The unused parameter lets this be passed as a per-value callback.
    """
    draw = random.random()
    return draw


def drop_rand(df: pl.DataFrame, exclude: list[str], thresh: float = 0.9) -> pl.DataFrame:
    """Randomly blank out values in every column not listed in ``exclude``.

    For each value in the non-excluded columns a random number is drawn via
    ``rnd``; the value is kept when the draw is below ``thresh``. Because the
    when/then expression has no ``otherwise`` branch, the remaining values are
    expected to become null (polars default — confirm for the installed version).

    Args:
        df: Source frame.
        exclude: Column names that are carried over untouched.
        thresh: Upper bound a draw must stay below for the value to be kept.

    Returns:
        A frame with the same column order as ``df``.
    """
    candidates = pl.all().exclude(*exclude)
    keep_mask = candidates.apply(rnd, skip_nulls=False) < thresh
    thinned = df.select(pl.when(keep_mask).then(pl.all()))
    untouched = df.select(*exclude)
    # Re-attach the excluded columns, then restore the original column order.
    return thinned.with_columns(untouched).select(df.columns)


# Keep each value with a 0.5 threshold in every column except "path", which is left intact.
df = drop_rand(df, exclude=["path"], thresh=0.5)

In [7]:
# save
# NOTE: this writes to the same `filename` the frame was loaded from, so the
# shuffled / randomly thinned frame replaces the file's previous contents.
df.rechunk().write_ipc(filename)

In [None]:
# Produce a boolean column marking the row(s) whose "modifiedtime" equals the column maximum.
# NOTE(review): the frame displayed after loading has the columns path, mtime, size,
# width, height, channels — there is no "modifiedtime". Confirm the schema this cell expects.
from polars import col
df.select(col("modifiedtime") == col("modifiedtime").max())

### Plot resolution vs. modified time

In [None]:
# Scatter the summed "resolution" of each row against "modifiedtime", coloured by "checkedtime".
# NOTE(review): "modifiedtime", "resolution" and "checkedtime" do not appear in the frame
# displayed after loading (path, mtime, size, width, height, channels) — confirm the
# schema this cell expects before running it.
plt.figure(figsize=(5, 10), dpi=300)
plt.scatter(
    x=df.select("modifiedtime"),
    # Sum the elements of each "resolution" entry (presumably a [width, height] list — TODO confirm).
    y=df.with_columns(sum_res=pl.col("resolution").apply(lambda lst: sum(lst))).select("sum_res"),
    s=0.05,
    c=df.select("checkedtime"),
    alpha=0.5
)
plt.xlabel("modifiedtime")
plt.ylabel("sum resolution")

### Link a list of files based on data from the database

In [None]:
# Folder that the run cell scans recursively for image files.
input_folder = Path("/mnt/Toshiba/.Grabber/")
# Links are written to a sibling folder named "<input folder name>-linked".
output = input_folder.with_name(f"{input_folder.name}-linked")
# Database column whose value is prefixed to every link name.
category = "hash"
# NOTE(review): not read by any visible cell — confirm it is still needed.
config_path = Path("database_config.toml")
# When True, links that already exist in `output` are replaced.
overwrite = True

populate=False # set to True to add `rule_list` to the builder and populate the database; needs at least one rule below
# Define rules here.
# from dataset_filters.external_filters import HashRule
rule_list: list[Rule] = [
#     HashFilter()
    ]
# NOTE(review): the commented-out import names `HashRule` while the commented-out
# entry uses `HashFilter` — confirm which name is correct.
# ^^ These rules do not change the output size. They only determine which columns
# are available, for when the column you want is not already present.

In [None]:
# Run: symlink every image under `input_folder` that has a row in the database
# into `output`, prefixing each link name with the row's `category` value.
output.mkdir(exist_ok=True)
exts = [".jpg", ".jpeg", ".png", ".webp"]
filelist = [i for i in input_folder.rglob("*") if i.suffix in exts]
db = DatasetBuilder(Path("filedb.feather"))
##### add filters here #####
if populate:
    db.add_rules(rule_list)
    if db.rules:
        db.populate_df(map(str, filelist))
############################
assert category in db.df.columns, f"selected category is not in {db.df.columns}"
# Only rows whose path was found on disk are linked.
file_data = db.df.filter(pl.col("path").is_in(list(map(str, filelist))))

with tqdm(file_data.iter_rows(named=True), total=len(file_data)) as t:
    for data in t:
        pth = Path(data["path"])
        hash_ = str(data[category])
        # Append the full original file name instead of `stem` + `with_suffix`:
        # `with_suffix` replaces everything after the last dot, so a dotted stem
        # (or a key containing a dot) would be silently truncated.
        new_path: Path = output / f"{hash_}_{pth.name}"
        if overwrite or not new_path.exists():
            # `missing_ok=True` keeps the first run from raising FileNotFoundError
            # when there is no link yet; it also clears a dangling symlink, which
            # `exists()` reports as missing.
            new_path.unlink(missing_ok=True)
            new_path.symlink_to(pth)
            t.set_description_str(hash_)
