In [1]:
import sys
import os
from pathlib import Path

# Put the project's root directory on the Python path so that both the 'src'
# and 'scripts' packages can be imported from this notebook.
# NOTE: the root is derived from the current working directory (two levels up),
# so the kernel must be started from the notebook's own folder.
project_root = Path(os.getcwd()).parent.parent
if str(project_root) not in sys.path:
    # Insert at position 0 so the project's packages take precedence.
    sys.path.insert(0, str(project_root))

# Now we can import the miners and script functions using their full paths from the root
from src.algorithms.fuzzy.cuFFIMiner import cuFFIMiner
from src.algorithms.fuzzy.naiveFFIMiner import naiveFFIMiner

print("Successfully imported cuFFIMiner and naiveFFIMiner.")
print(f"Project Root: {project_root}")

# Base folders used by the later cells; each dataset gets its own subfolder
# underneath them.
data_dir = project_root / 'data' / 'fuzzy'
results_dir = project_root / 'results' / 'fuzzy'

Successfully imported cuFFIMiner and naiveFFIMiner.
Project Root: /export/home1/ltarun/cuda_pami


In [2]:
from pathlib import Path
import os, time, requests
import pandas as pd
from typing import List, Dict, Any, Tuple
from scripts.replicate_file import replicate_file
from scripts.fixedpoint_normalize import normalize_file
import matplotlib.pyplot as plt
from scripts.convert_to_parquet import convert_text_to_parquet

# New naming convention and directory layout:
#   data/fuzzy/<dataset_name>/ <dataset_name>.csv (original)
#                                 <dataset_name>_SF{N}_floating.ext
#                                 <dataset_name>_SF{N}_fixed.ext
#                                 <dataset_name>_SF{N}_quant_mult.txt
#                                 <dataset_name>_SF{N}_fixed.parquet
#                                 <dataset_name>_SF{N}_floating.parquet (NEW)
#   results/fuzzy/<dataset_name>/ ...

# -----------------------------
# Helpers using existing scripts
# -----------------------------

def _dataset_filename_from_url(url: str) -> str:
    return Path(url.split("?")[0]).name  # e.g. Fuzzy_retail.csv

def _dataset_name_no_ext(filename: str) -> str:
    return Path(filename).stem  # e.g. Fuzzy_retail

# We will place each dataset inside its own subfolder under data_dir

def download_dataset(url: str, base_data_dir: Path) -> Path:
    """Make sure the dataset behind ``url`` is available locally and return its path.

    The file lives at ``base_data_dir/<dataset_name>/<file_name>``. Lookup order:

    1. the file already exists in its subfolder -> reuse it;
    2. a legacy copy exists directly under ``base_data_dir`` -> copy it into
       the subfolder (the legacy file is left in place);
    3. otherwise download it.

    Data is first written to a sibling ``*.part`` file and renamed only once it
    is complete, so an interrupted copy/download can never leave a truncated
    file that a later call would accept as "Existing".

    Raises:
        ValueError: if no file name can be derived from ``url``.
        requests.HTTPError: if the server answers with an error status.
    """
    filename = _dataset_filename_from_url(url)
    if not filename:
        # An empty name would make local_path resolve to the directory itself.
        raise ValueError(f"Cannot derive a dataset file name from URL: {url!r}")
    name_root = _dataset_name_no_ext(filename)
    dataset_dir = base_data_dir / name_root
    dataset_dir.mkdir(parents=True, exist_ok=True)
    local_path = dataset_dir / filename
    if local_path.exists():
        print(f"[download] Existing: {local_path}")
        return local_path

    partial_path = local_path.with_name(local_path.name + ".part")
    try:
        # Fallback: if legacy path (without subfolder) exists, copy it over
        legacy_path = base_data_dir / filename
        if legacy_path.exists():
            print(f"[download] Copying legacy file into subfolder: {legacy_path} -> {local_path}")
            partial_path.write_bytes(legacy_path.read_bytes())
            partial_path.replace(local_path)
            return local_path

        print(f"[download] Fetch {url}")
        # Stream to disk instead of holding the whole payload in memory.
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as fh:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    if chunk:
                        fh.write(chunk)
        partial_path.replace(local_path)
    finally:
        # After a successful rename the temp file is gone; on failure remove it.
        if partial_path.exists():
            partial_path.unlink()
    print(f"[download] Saved {local_path}")
    return local_path




def ensure_floating_sf(original: Path, sf: int) -> Path:
    """Return the ``<stem>_SF{sf}_floating<ext>`` file next to ``original``.

    ``sf`` values below 1 are treated as 1. An existing file is reused;
    otherwise ``replicate_file`` is asked to create it.
    """
    sf = max(sf, 1)
    target_name = f"{original.stem}_SF{sf}_floating{original.suffix}"
    float_path = original.with_name(target_name)
    if not float_path.exists():
        replicate_file(str(original), sf, str(float_path))
        return float_path
    print(f"[SF] Using existing floating SF file: {float_path.name}")
    return float_path

def ensure_fixed_variant(floating_path: Path) -> Tuple[Path, int]:
    """Return the fixed-point companion of ``floating_path`` and its quant multiplier.

    Sibling names are derived from the floating file's stem with its trailing
    ``_floating`` tag removed (``base_SF10_floating`` -> ``base_SF10``):

    * ``<stem>_fixed<ext>``      - the fixed-point text file
    * ``<stem>_quant_mult.txt``  - the integer multiplier
    * ``<stem>_scale.txt``       - legacy name of the multiplier file

    If the fixed file and a multiplier file already exist they are reused (a
    legacy scale file is additionally re-written under the new name).
    Otherwise ``normalize_file`` is called and its result is returned.

    Only the trailing tag is stripped: a plain ``str.replace`` would also eat
    ``_floating`` occurring inside the dataset name and look for the wrong
    sibling files.
    NOTE(review): assumes ``normalize_file`` names its outputs with the same
    convention - confirm against scripts/fixedpoint_normalize.
    """
    floating_tag = '_floating'
    stem = floating_path.stem
    if stem.endswith(floating_tag):
        stem = stem[:-len(floating_tag)]  # e.g. base_SF10
    fixed_path = floating_path.with_name(f"{stem}_fixed{floating_path.suffix}")
    quant_file = floating_path.with_name(f"{stem}_quant_mult.txt")
    # Backward compatibility: if only old scale file exists, read it and rename
    legacy_scale = floating_path.with_name(f"{stem}_scale.txt")
    if fixed_path.exists() and quant_file.exists():
        quant_val = int(quant_file.read_text().strip())
        print(f"[fixed] Reusing existing fixed file: {fixed_path.name} quant_mult={quant_val}")
        return fixed_path, quant_val
    if fixed_path.exists() and legacy_scale.exists():
        quant_val = int(legacy_scale.read_text().strip())
        quant_file.write_text(str(quant_val)+'\n')
        print(f"[fixed] Upgraded legacy scale -> quant_mult: {legacy_scale.name} -> {quant_file.name}")
        return fixed_path, quant_val
    fixed_generated, quant_val = normalize_file(str(floating_path))
    return Path(fixed_generated), quant_val

def ensure_fixed_parquet(fixed_text: Path) -> Path:
    """Return the ``.parquet`` sibling of ``fixed_text``, converting it if absent."""
    parquet_path = fixed_text.with_suffix('.parquet')
    if not parquet_path.exists():
        convert_text_to_parquet(str(fixed_text), str(parquet_path))
    else:
        print(f"[parquet-fixed] Existing: {parquet_path.name}")
    return parquet_path

def ensure_floating_parquet(floating_text: Path) -> Path:
    """Create a floating parquet preserving float probabilities for naiveFFIMiner.

    Schema: item:str, prob:float64, txn_id:uint32
    Naming: *_SF{N}_floating.parquet

    An existing parquet is reused only when its ``prob`` column is a float
    column and at least one of its first five values has a fractional part;
    anything else (integer/string column, unreadable file) triggers a rebuild.

    Input format: one transaction per line, ``items : values`` with items and
    values each tab-separated. ``txn_id`` is the 1-based position of the line
    among the parsed rows. A row whose item and value counts differ is dropped
    as a whole, and a single value that does not parse as a float is dropped
    on its own; both cases are counted and reported.

    Returns:
        Path of the (existing or newly written) parquet file.
    """
    parquet_path = floating_text.with_suffix('.parquet')
    if parquet_path.exists():
        try:
            # Only the 'prob' column is inspected, so do not load the others.
            sample = pd.read_parquet(parquet_path, columns=['prob']).head(5)
            # If any prob has a fractional component assume it's already floating
            if not sample.empty and sample['prob'].dtype.kind in {'f'} and any((sample['prob'] % 1) != 0):
                print(f"[parquet-floating] Existing floating parquet: {parquet_path.name}")
                return parquet_path
            else:
                print(f"[parquet-floating] Existing parquet appears integer or non-float; rebuilding to preserve floats.")
        except Exception:
            # Best effort: any failure to read/validate simply means "rebuild".
            print("[parquet-floating] Could not validate existing parquet; rebuilding.")
    df = pd.read_csv(
        floating_text,
        sep=":",
        header=None,
        names=["items_str", "values_str"],
        dtype=str,
        engine='python'
    )
    df = df.fillna("")
    # Trailing tabs/newlines/spaces would otherwise produce empty trailing tokens.
    pattern = r"[\t\r\n ]+$"
    items_col = df["items_str"].str.replace(pattern, "", regex=True).str.split("\t")
    values_col = df["values_str"].str.replace(pattern, "", regex=True).str.split("\t")
    records = []
    skipped_rows = 0
    skipped_values = 0
    for txn_id, (items, values) in enumerate(zip(items_col, values_col), start=1):
        if len(items) != len(values):
            skipped_rows += 1
            continue
        for it, val in zip(items, values):
            if not it:
                continue
            try:
                fval = float(val)
            except ValueError:
                skipped_values += 1
                continue
            records.append((it, fval, txn_id))
    if skipped_rows or skipped_values:
        print(f"[parquet-floating] Skipped {skipped_rows} malformed rows and {skipped_values} non-numeric values")
    # Keep probability as float64 for arithmetic; do NOT coerce to string (previous bug)
    out_df = pd.DataFrame(records, columns=["item","prob","txn_id"]).astype({"item":"string","prob":"float64","txn_id":"uint32"})
    out_df.to_parquet(parquet_path, engine="pyarrow", index=False)

    print(f"[parquet-floating] Wrote {len(out_df)} rows -> {parquet_path.name}")
    return parquet_path

# Mapping supports

def support_to_float(support_int: int, quant_mult: int) -> float:
    """Map a quantized integer support to its float value.

    Divides by ``quant_mult`` when it is positive; otherwise the support is
    returned unchanged as a float.
    """
    return support_int / quant_mult if quant_mult > 0 else float(support_int)

# Mining both miners with unified quant_mult

def run_both_miners(fixed_parquet: Path, floating_parquet: Path, quant_mult: int, supports_scaled: List[int], results_subdir: Path, memory_type: str = "global", debug: bool = False) -> pd.DataFrame:
    """Run cuFFIMiner and naiveFFIMiner for every support threshold.

    For each quantized integer support, cuFFIMiner is run on ``fixed_parquet``
    with the integer threshold and naiveFFIMiner on ``floating_parquet`` with
    the threshold divided by ``quant_mult`` (see ``support_to_float``).
    Patterns are saved under ``results_subdir``; a miner is skipped when its
    pattern file already exists, in which case no metrics row is produced.
    A failing run is reported and recorded as a row carrying an ``error`` field.

    Returns:
        DataFrame with one row per miner run performed in this call.
    """
    results_subdir.mkdir(parents=True, exist_ok=True)
    collected: List[Dict[str, Any]] = []

    for sup_int in supports_scaled:
        # --- GPU miner on the fixed-point data --------------------------------
        cuffi_out = results_subdir / f"patterns_cuffi_sup{sup_int}.txt"
        if not cuffi_out.exists():
            try:
                gpu_miner = cuFFIMiner(str(fixed_parquet), min_support=sup_int, scaling_factor=quant_mult, memory_type=memory_type, debug=debug)
                gpu_miner.mine()
                gpu_miner.save(cuffi_out)
                gpu_miner.print_results()
                gpu_row = {
                    "algorithm": "cuFFIMiner",
                    "support_quant_int": sup_int,
                    "quant_mult": quant_mult,
                    "exec_time": gpu_miner.get_execution_time(),
                    "cpu_mem_mb": gpu_miner.get_memory_usage(),
                    "gpu_mem_bytes": getattr(gpu_miner, '_gpu_memory_usage', None),
                    "patterns_found": gpu_miner.get_pattern_count(),
                }
                collected.append(gpu_row)
            except Exception as err:
                print(f"[cuFFIMiner][ERROR] sup={sup_int}: {err}")
                collected.append({
                    "algorithm": "cuFFIMiner",
                    "support_quant_int": sup_int,
                    "quant_mult": quant_mult,
                    "error": str(err),
                })
        else:
            print(f"[cuFFIMiner] Skip sup={sup_int} (exists)")

        # --- Reference miner on the floating-point data -----------------------
        sup_float = support_to_float(sup_int, quant_mult)
        print('sup_float:', sup_float)
        print(f"[naiveFFIMiner] quant_int={sup_int} -> float={sup_float} (forced quant_mult={quant_mult})")
        naive_out = results_subdir / f"patterns_naive_sup{sup_int}.txt"
        if not naive_out.exists():
            try:
                ref_miner = naiveFFIMiner(str(floating_parquet), min_support=sup_float, quant_mult=quant_mult, debug=debug)
                ref_miner.mine()
                ref_miner.save(naive_out)
                ref_miner.print_results()
                ref_row = {
                    "algorithm": "naiveFFIMiner",
                    "support_quant_int": sup_int,
                    "support_float": sup_float,
                    "quant_mult": quant_mult,
                    "exec_time": ref_miner.get_execution_time(),
                    "cpu_mem_mb": ref_miner.get_memory_usage(),
                    "gpu_mem_bytes": getattr(ref_miner, '_gpu_memory_usage', None),
                    "patterns_found": ref_miner.get_pattern_count(),
                }
                collected.append(ref_row)
            except Exception as err:
                print(f"[naiveFFIMiner][ERROR] sup={sup_int}: {err}")
                collected.append({
                    "algorithm": "naiveFFIMiner",
                    "support_quant_int": sup_int,
                    "support_float": sup_float,
                    "quant_mult": quant_mult,
                    "error": str(err),
                })
        else:
            print(f"[naiveFFIMiner] Skip sup={sup_int} (exists)")
    return pd.DataFrame(collected)

In [3]:

# Global matplotlib defaults applied to every figure created after this cell.
plt.rcParams.update({
    # Font type 42 embeds TrueType fonts (instead of Type 3) in PDF/PS output.
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'figure.dpi': 150,
    'font.size': 11,
    'axes.titlesize': 12,
    'axes.labelsize': 11,
    'legend.fontsize': 9,
})

_METRIC_LABELS = {
    'exec_time': 'Execution Time (s)',
    'cpu_mem_mb': 'Peak CPU Memory (MB)',
    'gpu_mem_bytes': 'GPU Memory (MB)',
    'patterns_found': 'Patterns Found',
}

# Default figure settings; callers may override individual keys via ``fig_cfg``.
DEFAULT_FIG_CFG = dict(
    width=5.0,
    height=3.0,
    legend_loc='best',
    tight_layout=True,
)

def _ensure_dir(p: Path):
    p.mkdir(parents=True, exist_ok=True)

def _prep_df(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()
    if 'gpu_mem_bytes' in d.columns:
        d['gpu_mem_mb'] = d['gpu_mem_bytes'] / (1024**2)
    return d

def plot_metric(df: pd.DataFrame, metric: str, dataset_name: str, output_dir: Path,
                fig_cfg: dict | None = None, scale_x: bool = False):
    """Plot one metric against the support threshold, one line per algorithm.

    Args:
        df: metrics frame with ``algorithm`` and ``support_quant_int`` columns.
        metric: column to plot; ``gpu_mem_mb`` is derived from
            ``gpu_mem_bytes`` when that column is present.
        dataset_name: used in the title and the output file name.
        output_dir: directory (created if needed) receiving
            ``<dataset_name>_<metric>.pdf``.
        fig_cfg: optional overrides for ``DEFAULT_FIG_CFG``.
        scale_x: if true and ``quant_mult`` is available, the x values are
            ``support_quant_int * quant_mult``.

    A metric that is not available in the frame is skipped with a message
    instead of raising. Points are drawn in increasing x order so a line never
    doubles back when the rows are not sorted by support.
    """
    cfg = {**DEFAULT_FIG_CFG, **(fig_cfg or {})}
    d = _prep_df(df)
    if metric not in d.columns:
        print(f'Skipping {metric} (not present)')
        return
    xcol = 'support_quant_int'
    if scale_x and 'quant_mult' in d.columns:
        x = d[xcol] * d['quant_mult']
        xlabel = 'Support Threshold (raw * quant_mult)'
    else:
        x = d[xcol]
        xlabel = 'Support Threshold (quantized int)'
    label = _METRIC_LABELS.get(metric, metric)
    fig, ax = plt.subplots(figsize=(cfg['width'], cfg['height']))
    try:
        for algo, sub in d.groupby('algorithm'):
            xs = x.loc[sub.index].sort_values()
            ax.plot(xs, sub.loc[xs.index, metric], marker='o', label=algo)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(label)
        # \u2013 is an en dash; written as an escape so it survives re-encoding.
        ax.set_title(f"{dataset_name} \u2013 {label}")
        ax.grid(alpha=0.25, linestyle=':')
        ax.legend(loc=cfg['legend_loc'])
        if cfg.get('tight_layout', True):
            fig.tight_layout()
        _ensure_dir(output_dir)
        out_file = output_dir / f"{dataset_name}_{metric}.pdf"
        fig.savefig(out_file, format='pdf')
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)
    print(f"[figure] Wrote {out_file}")

def generate_all_figures(dataset_name: str, metrics_df: pd.DataFrame | None = None,
                          metrics_path: Path | None = None, output_subdir: str = 'figures',
                          metrics: list[str] | None = None, fig_cfg: dict | None = None,
                          scale_x: bool = False):
    """Write one PDF figure per requested metric for ``dataset_name``.

    Metrics come from ``metrics_df`` if given, else from ``metrics_path``,
    else from every ``metrics_SF*.csv`` in the dataset's results folder
    (concatenated, exact duplicates dropped). Figures go to
    ``results_dir/<dataset_name>/<output_subdir>``. Metrics missing from the
    data are skipped with a message.

    Raises:
        FileNotFoundError: when no metrics source is given and no
            ``metrics_SF*.csv`` file exists.
    """
    dataset_results = results_dir / dataset_name
    if metrics_df is None:
        if metrics_path:
            metrics_df = pd.read_csv(metrics_path)
        else:
            csv_files = sorted(dataset_results.glob('metrics_SF*.csv'))
            if not csv_files:
                raise FileNotFoundError(f"No metrics file found in {dataset_results}")
            loaded = [pd.read_csv(csv_file) for csv_file in csv_files]
            metrics_df = pd.concat(loaded, ignore_index=True).drop_duplicates()

    wanted = ['exec_time','cpu_mem_mb','gpu_mem_mb','patterns_found'] if metrics is None else metrics
    figures_dir = dataset_results / output_subdir
    for metric_name in wanted:
        # gpu_mem_mb can be derived from gpu_mem_bytes, so either column suffices.
        derivable = metric_name == 'gpu_mem_mb' and 'gpu_mem_bytes' in metrics_df.columns
        if not (metric_name in metrics_df.columns or derivable):
            print(f"[figure] Skip missing metric: {metric_name}")
            continue
        plot_metric(metrics_df, metric_name, dataset_name, figures_dir, fig_cfg=fig_cfg, scale_x=scale_x)
    print("[figure] All requested figures generated.")

def run_experiment(dataset_url: str, sf: int, supports_quant_int: List[int], memory_type: str = 'global', debug: bool = False,
                   generate_figures: bool = True, fig_metrics: List[str] | None = None, fig_cfg: Dict[str, Any] | None = None,
                   scale_x: bool = False, fig_subdir: str = 'figures') -> pd.DataFrame:
    """Prepare a dataset, run both miners on it and persist metrics and figures.

    Steps: download the dataset, build (or reuse) the SF-replicated floating
    file, its fixed-point variant and both parquet files, run the miners for
    every support in ``supports_quant_int``, then update
    ``results/<dataset>/metrics_SF{sf}.csv`` and optionally the figures.

    Miner runs whose pattern file already exists are skipped and produce no
    row, so the rows of this call are merged into the metrics file instead of
    replacing it: for each (algorithm, support) pair the newest row wins and
    older rows of other pairs are kept. No file is written when there is
    nothing new, which also avoids creating an empty, unparseable CSV.
    Figures are drawn from the merged metrics.

    Returns:
        DataFrame with the rows produced by this call only (empty when every
        run was skipped).
    """
    print("========== RUN EXPERIMENT (SF + quant_mult unified) ==========")
    print(f"Dataset URL : {dataset_url}")
    print(f"SF (concat) : {sf}")
    print(f"Quantized integer supports: {supports_quant_int}")

    original = download_dataset(dataset_url, data_dir)
    dataset_name = original.parent.name
    floating_sf_text = ensure_floating_sf(original, sf)
    fixed_file_text, quant_mult = ensure_fixed_variant(floating_sf_text)

    fixed_parquet = ensure_fixed_parquet(fixed_file_text)
    floating_parquet = ensure_floating_parquet(floating_sf_text)

    result_dir = results_dir / dataset_name
    metrics_df = run_both_miners(fixed_parquet, floating_parquet, quant_mult, supports_quant_int, result_dir, memory_type=memory_type, debug=debug)

    metrics_file = result_dir / f"metrics_SF{sf}.csv"
    stored_df = pd.DataFrame()
    if metrics_file.exists():
        try:
            stored_df = pd.read_csv(metrics_file)
        except pd.errors.EmptyDataError:
            # Left behind by an earlier run that had nothing to record.
            stored_df = pd.DataFrame()

    non_empty = [frame for frame in (stored_df, metrics_df) if not frame.empty]
    if non_empty:
        combined_df = pd.concat(non_empty, ignore_index=True)
        key_cols = [c for c in ('algorithm', 'support_quant_int') if c in combined_df.columns]
        if key_cols:
            # New rows come last in the concat, so keep='last' prefers them.
            combined_df = combined_df.drop_duplicates(subset=key_cols, keep='last')
        combined_df = combined_df.reset_index(drop=True)
    else:
        combined_df = metrics_df

    if len(metrics_df) > 0:
        combined_df.to_csv(metrics_file, index=False)
        print(f"[metrics] Saved {metrics_file}")
    else:
        print(f"[metrics] No new rows; {metrics_file} left unchanged")

    if generate_figures:
        try:
            generate_all_figures(dataset_name, metrics_df=combined_df, metrics=fig_metrics, fig_cfg=fig_cfg, scale_x=scale_x, output_subdir=fig_subdir)
        except Exception as e:
            # Figures are a convenience; a plotting failure must not lose the metrics.
            print(f"[figure][ERROR] {e}")

    print("============ DONE ============")
    return metrics_df

# Backward-compatible alias for the previous function name.
run_complete_experiment = run_experiment

In [4]:
# Fuzzy_retail benchmark: dataset replicated with sf=100.
retail = "https://u-aizu.ac.jp/~udayrage/datasets/fuzzyDatabases/Fuzzy_retail.csv"
# Supports are quantized integers; the naive miner receives support / quant_mult.
retail_sup = [40000, 45000, 50000, 55000, 60000]

metrics_retail = run_experiment(retail, 100, retail_sup)
# Rows for runs performed in this call (runs with existing pattern files are skipped).
metrics_retail

Dataset URL : https://u-aizu.ac.jp/~udayrage/datasets/fuzzyDatabases/Fuzzy_retail.csv
SF (concat) : 100
Quantized integer supports: [40000, 45000, 50000, 55000, 60000]
[download] Existing: /export/home1/ltarun/cuda_pami/data/fuzzy/Fuzzy_retail/Fuzzy_retail.csv
[SF] Using existing floating SF file: Fuzzy_retail_SF100_floating.csv
[fixed] Reusing existing fixed file: Fuzzy_retail_SF100_fixed.csv quant_mult=10
[parquet-fixed] Existing: Fuzzy_retail_SF100_fixed.parquet
[parquet-floating] Existing floating parquet: Fuzzy_retail_SF100_floating.parquet
[cuFFIMiner] Skip sup=40000 (exists)
sup_float: 4000.0
[naiveFFIMiner] quant_int=40000 -> float=4000.0 (forced quant_mult=10)
[naiveFFIMiner] Skip sup=40000 (exists)
[cuFFIMiner] Skip sup=45000 (exists)
sup_float: 4500.0
[naiveFFIMiner] quant_int=45000 -> float=4500.0 (forced quant_mult=10)
[naiveFFIMiner] Skip sup=45000 (exists)
[cuFFIMiner] Skip sup=50000 (exists)
sup_float: 5000.0
[naiveFFIMiner] quant_int=50000 -> float=5000.0 (forced quant

In [None]:
# Fuzzy_connect benchmark: dataset replicated with sf=100.
connect = "https://u-aizu.ac.jp/~udayrage/datasets/fuzzyDatabases/Fuzzy_connect.csv"
# Supports are quantized integers; the naive miner receives support / quant_mult.
# NOTE(review): the saved output of this cell lists only [25000000], so it
# appears to predate this support list - re-run to refresh it.
connect_sup = [25000000, 24000000, 23000000, 22000000, 21000000]

metrics_connect = run_experiment(connect, 100, connect_sup)
# Rows for runs performed in this call (runs with existing pattern files are skipped).
metrics_connect

Dataset URL : https://u-aizu.ac.jp/~udayrage/datasets/fuzzyDatabases/Fuzzy_connect.csv
SF (concat) : 100
Quantized integer supports: [25000000]
[download] Existing: /export/home1/ltarun/cuda_pami/data/fuzzy/Fuzzy_connect/Fuzzy_connect.csv
[SF] Using existing floating SF file: Fuzzy_connect_SF100_floating.csv
[fixed] Reusing existing fixed file: Fuzzy_connect_SF100_fixed.csv quant_mult=10
[parquet-fixed] Existing: Fuzzy_connect_SF100_fixed.parquet
[parquet-floating] Existing floating parquet: Fuzzy_connect_SF100_floating.parquet

--- cuFFIMiner Results ---
Execution Time: 50.6945 seconds
Peak CPU Memory Usage: 38185.53 MB
Peak GPU Memory Usage: 19409.06 MB
Patterns Found: 5004
------------------------------
sup_float: 2500000.0
[naiveFFIMiner] quant_int=25000000 -> float=2500000.0 (forced quant_mult=10)

--- naiveFFIMiner Results ---
Execution Time: 55.6094 seconds
Peak CPU Memory Usage: 38185.53 MB
Peak GPU Memory Usage: 19027.06 MB
Patterns Found: 5004
--------------------------------

Unnamed: 0,algorithm,support_quant_int,quant_mult,exec_time,cpu_mem_mb,gpu_mem_bytes,patterns_found,support_float
0,cuFFIMiner,25000000,10,50.69445,38185.527344,20351877120,5004,
1,naiveFFIMiner,25000000,10,55.609369,38185.527344,19951321088,5004,2500000.0


In [None]:
# Fuzzy_kosarak benchmark: dataset replicated with sf=100.
kosarak = "https://u-aizu.ac.jp/~udayrage/datasets/fuzzyDatabases/Fuzzy_kosarak.csv"
# Supports are quantized integers; the naive miner receives support / quant_mult.
kosarak_sup = [900000, 1000000]

metrics_kosarak = run_experiment(kosarak, 100, kosarak_sup)
# Rows for runs performed in this call (runs with existing pattern files are skipped).
metrics_kosarak

Dataset URL : https://u-aizu.ac.jp/~udayrage/datasets/fuzzyDatabases/Fuzzy_kosarak.csv
SF (concat) : 100
Quantized integer supports: [900000, 1000000]
[download] Fetch https://u-aizu.ac.jp/~udayrage/datasets/fuzzyDatabases/Fuzzy_kosarak.csv
[download] Saved /export/home1/ltarun/cuda_pami/data/fuzzy/Fuzzy_kosarak/Fuzzy_kosarak.csv
[replicate] Wrote: /export/home1/ltarun/cuda_pami/data/fuzzy/Fuzzy_kosarak/Fuzzy_kosarak_SF100_floating.csv (SF=100)


In [None]:
# Fuzzy_pumsb benchmark: dataset replicated with sf=100.
pumsb = "https://u-aizu.ac.jp/~udayrage/datasets/fuzzyDatabases/Fuzzy_pumsb.csv"
# Supports are quantized integers; the naive miner receives support / quant_mult.
pumsb_sup = [100000, 200000]

metrics_pumsb = run_experiment(pumsb, 100, pumsb_sup)
# Rows for runs performed in this call (runs with existing pattern files are skipped).
metrics_pumsb