In [None]:
# Install the plotting and progress-bar packages imported below into the kernel's environment.
# NOTE(review): polars is imported by this notebook as well but is not installed here —
# presumably it comes with the project environment; confirm.
%pip install matplotlib tqdm

In [None]:
import random
from pathlib import Path

import matplotlib.pyplot as plt
import polars as pl
from tqdm import tqdm

from src.datafilters.dataset_builder import DatasetBuilder
from src.datafilters.base_filters import DataFilter

In [None]:
# Read the file database (Arrow IPC / Feather format) into a polars DataFrame.
filename = "filedb.feather"
df = pl.read_ipc(filename)
# Bare expression so the notebook renders the loaded frame as this cell's output.
df

In [None]:
# shuffle dataframe
def shuffle(df: pl.DataFrame, rand_col="rnd") -> pl.DataFrame:
    """Return a new frame holding the rows of ``df`` in random order.

    A temporary column of ``random.random()`` draws is attached, the frame is
    sorted by that column, and the column is dropped again.

    Args:
        df: Frame whose rows should be shuffled. It is not modified.
        rand_col: Name of the temporary sort-key column. It must not be the
            name of a column already present in ``df``.

    Returns:
        A frame with the same columns as ``df`` and its rows reordered.

    Raises:
        ValueError: If ``rand_col`` is already a column of ``df``.
    """
    # with_columns() replaces a same-named column and drop() would then remove
    # it, so a name clash would silently delete real data from the result.
    if rand_col in df.columns:
        raise ValueError(
            f"temporary column name {rand_col!r} clashes with an existing column; "
            "pass a different rand_col"
        )
    sort_keys = pl.Series(rand_col, [random.random() for _ in range(len(df))])
    return df.with_columns(sort_keys).sort(rand_col).drop(rand_col)
# Rebind df to the shuffled frame; the order is random, so re-running this cell reshuffles.
df = shuffle(df)

In [None]:
# delete random items in dataframe
# NOTE(review): this module-level `thresh` is not read by `drop_rand` (its own
# `thresh` parameter shadows it) and the call at the end of this cell passes
# 0.5 explicitly, so the value below currently has no effect.
thresh = 0.9
# NOTE(review): `random` is already imported in the notebook's import cell;
# this repeated import is redundant.
import random


def rnd(_):
    """Return a fresh ``random.random()`` draw, ignoring the argument.

    The unused positional parameter lets this function be used as a
    per-element callback that receives a value it does not need.
    """
    return random.random()


def drop_rand(df: pl.DataFrame, exclude: list[str], thresh: float = 0.9) -> pl.DataFrame:
    """Randomly null out cells of ``df`` while leaving the ``exclude`` columns intact.

    One random number is drawn (via ``rnd``) for every cell of every column
    not listed in ``exclude``. A cell keeps its value when its draw is below
    ``thresh`` and is replaced by null otherwise, so ``thresh`` is the
    per-cell probability of *keeping* a value.

    Args:
        df: Source frame. It is not modified.
        exclude: Names of columns that are copied over unchanged.
        thresh: Draws below this value keep the cell.

    Returns:
        A frame with the same columns, in the same order, as ``df``.
    """
    # skip_nulls=False: cells that are already null still get their own draw.
    keep_mask = pl.all().exclude(*exclude).apply(rnd, skip_nulls=False) < thresh
    # No otherwise() branch, so cells failing the mask come out as null.
    thinned = df.select(pl.when(keep_mask).then(pl.all()))
    untouched = df.select(*exclude)
    # Re-attach the excluded columns, then restore the original column order.
    return thinned.with_columns(untouched).select(df.columns)


# Keep each cell with probability 0.5 (the rest become null); the "path" and
# "checkedtime" columns are excluded and therefore left untouched.
df = drop_rand(df, ["path", "checkedtime"], 0.5)

In [None]:
# save
# NOTE: `filename` is the same path the frame was loaded from, so this
# overwrites the original database file with the shuffled / thinned frame.
df.rechunk().write_ipc(filename)

In [None]:
# Display the modified frame as this cell's output.
df

### Plot resolution vs modified time

In [None]:
# Scatter the sum of each row's `resolution` entries against `modifiedtime`,
# colouring every point by its `checkedtime`.
plt.figure(figsize=(5, 10), dpi=300)
modified = df.select("modifiedtime")
summed_resolution = df.with_columns(
    sum_res=pl.col("resolution").apply(lambda lst: sum(lst))
).select("sum_res")
checked = df.select("checkedtime")
plt.scatter(x=modified, y=summed_resolution, s=0.05, c=checked, alpha=0.5)
plt.xlabel("modifiedtime")
plt.ylabel("sum resolution")

### Link a list of files based on data from the database

In [None]:
# Folder that is searched recursively for image files to link.
input_folder = Path("/mnt/Toshiba/.Grabber/")
# Links go into a sibling folder named "<input folder name>-linked".
output = input_folder.with_name(f"{input_folder.name}-linked")
# Database column whose value is used as the prefix of every link name.
category = "hash"
# Configuration file handed to DatasetBuilder.
config_path = Path("database_config.toml")
# When True, links are (re)created even if the target path already exists.
overwrite = True

populate=False # you'll need to specify a filter to add to the builder if you use this
# define filters here
# NOTE(review): the commented-out import below uses `dataset_filters`, while this
# notebook's live imports come from `src.datafilters` — the path may be stale; confirm.
# from dataset_filters.external_filters import HashFilter
filter_list: list[DataFilter] = [
#     HashFilter()
    ]
# ^^ These filters do not change the output size; they only determine which columns
# are available, in case the column you want is not already present.

In [None]:
# Run
# Create the output folder, look up every discovered image in the database and
# symlink it into the output folder as "<category value>_<original file name>".
output.mkdir(exist_ok=True)
exts = [".jpg", ".jpeg", ".png", ".webp"]
# Suffix comparison is case-sensitive: only lower-case extensions match.
filelist = [i for i in input_folder.rglob("*") if i.suffix in exts]
db = DatasetBuilder("filedb.feather", config_path)
##### add filters here ##### 
if populate:
    db.add_filters(*filter_list)
    if db.filters:
        db.populate_df(filelist)
############################
assert category in db.df.columns, f"selected category is not in {db.df.columns}"
# Only database rows whose "path" is one of the discovered files get linked.
file_data = db.df.filter(pl.col("path").is_in(list(map(str, filelist))))

with tqdm(file_data.iter_rows(named=True), total=len(file_data)) as t:
    for data in t:
        pth = Path(data["path"])
        hash_ = str(data[category])
        # Prefix the complete original file name. Joining the stem and then
        # calling with_suffix() would cut the combined name at its last dot, so
        # a dotted stem ("a.b.png") or a dotted category value ("0.5") would
        # lose part of the name and could collide with other links.
        new_path: Path = output / f"{hash_}_{pth.name}"
        if not new_path.exists() or overwrite:
            # missing_ok=True: on a first run there is nothing to remove yet, and
            # a plain unlink() would raise FileNotFoundError on the first file.
            new_path.unlink(missing_ok=True)
            new_path.symlink_to(pth)
            t.set_description_str(hash_)
