In [1]:
import sys
import os
from pathlib import Path

# The repository root is taken to be two directory levels above the current
# working directory. It is placed at the front of sys.path (once) so that the
# top-level 'src' and 'scripts' packages can be imported from this notebook.
project_root = Path.cwd().parent.parent
if sys.path.count(str(project_root)) == 0:
    sys.path.insert(0, str(project_root))

# Now we can import the miners and script functions using their full paths from the root
from src.algorithms.fuzzy.cuFFIMiner import cuFFIMiner
from src.algorithms.fuzzy.naiveFFIMiner import naiveFFIMiner

# Confirm that the miner imports resolved and show which root was used.
print("Successfully imported cuFFIMiner and naiveFFIMiner.")
print(f"Project Root: {project_root}")

# Datasets and mining results each live in a 'fuzzy' subfolder under the root.
data_dir = project_root.joinpath('data', 'fuzzy')
results_dir = project_root.joinpath('results', 'fuzzy')

Successfully imported cuFFIMiner and naiveFFIMiner.
Project Root: /export/home1/ltarun/cuda_pami


In [2]:
from pathlib import Path
import os, time, requests
import pandas as pd
from typing import List, Dict, Any, Tuple
import importlib
from scripts import fixedpoint_normalize as _fpn_mod
import matplotlib.pyplot as plt
# ensure latest version of module (avoid stale cached copy)
importlib.reload(_fpn_mod)
from scripts.replicate_file import replicate_file
from scripts.fixedpoint_normalize import normalize_file


def _dataset_filename_from_url(url: str) -> str:
    return Path(url.split("?")[0]).name  # e.g. Fuzzy_retail.csv

def _dataset_name_no_ext(filename: str) -> str:
    return Path(filename).stem  # e.g. Fuzzy_retail

# We will place each dataset inside its own subfolder under data_dir

def download_dataset(url: str, base_data_dir: Path) -> Path:
    """Make the dataset behind ``url`` available locally and return its path.

    The file is stored as ``base_data_dir/<stem>/<filename>``. Resolution order:

    1. If that file already exists it is returned as-is.
    2. If a legacy copy exists directly under ``base_data_dir`` it is moved
       into the subfolder.
    3. Otherwise the file is downloaded.

    Args:
        url: Location of the dataset.
        base_data_dir: Directory under which the per-dataset folder is created.

    Returns:
        Path of the local dataset file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Function-scope import: the notebook's import cell does not provide it.
    import shutil

    filename = _dataset_filename_from_url(url)
    name_root = _dataset_name_no_ext(filename)
    dataset_dir = base_data_dir / name_root
    dataset_dir.mkdir(parents=True, exist_ok=True)
    local_path = dataset_dir / filename
    if local_path.exists():
        print(f"[download] Existing: {local_path}")
        return local_path
    # Fallback: if legacy path (without subfolder) exists, move it.
    # is_file() rather than exists(): for an extension-less filename the legacy
    # path equals the dataset directory that was just created above.
    legacy_path = base_data_dir / filename
    if legacy_path.is_file():
        print(f"[download] Moving legacy file into subfolder: {legacy_path} -> {local_path}")
        # A real move: no full in-memory copy and no stale duplicate left behind.
        shutil.move(str(legacy_path), str(local_path))
        return local_path
    print(f"[download] Fetch {url}")
    # Stream into a temporary sibling and rename only after the download has
    # completed, so an interrupted transfer is never mistaken for a cached file.
    partial_path = local_path.with_name(local_path.name + ".part")
    try:
        with requests.get(url, timeout=60, stream=True) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as fh:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    fh.write(chunk)
        partial_path.replace(local_path)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial_path.unlink(missing_ok=True)
    print(f"[download] Saved {local_path}")
    return local_path


def ensure_floating_sf(original: Path, sf: int) -> Path:
    """Return the ``<stem>_SF<sf>_floating<suffix>`` sibling of ``original``.

    ``sf`` values below 1 are treated as 1. When the target file is already on
    disk it is reused; otherwise ``replicate_file`` is asked to produce it.
    """
    sf = max(sf, 1)
    target_name = f"{original.stem}_SF{sf}_floating{original.suffix}"
    float_path = original.with_name(target_name)
    if not float_path.exists():
        replicate_file(str(original), sf, str(float_path))
    else:
        print(f"[SF] Using existing floating SF file: {float_path.name}")
    return float_path

def ensure_fixed_variant(floating_path: Path) -> Tuple[Path, int]:
    """Return ``(fixed_parquet_path, quant_mult)`` for a floating SF file.

    Cached artefacts next to ``floating_path`` are reused when available:

    * ``<stem>_fixed.parquet`` with ``<stem>_quant_mult.txt``: the multiplier
      is read from the text file.
    * ``<stem>_fixed.parquet`` with the older ``<stem>_scale.txt``: the value
      is read from the legacy file and also written to the quant_mult file
      (the legacy file itself is left in place).

    Otherwise ``normalize_file`` is called and its result returned.
    """
    stem = floating_path.stem.replace('_floating', '')  # e.g. base_SF10
    sibling = floating_path.with_name
    fixed_path = sibling(f"{stem}_fixed.parquet")
    quant_file = sibling(f"{stem}_quant_mult.txt")
    legacy_scale = sibling(f"{stem}_scale.txt")

    if fixed_path.exists():
        if quant_file.exists():
            quant_val = int(quant_file.read_text().strip())
            print(f"[fixed] Reusing existing fixed file: {fixed_path.name} quant_mult={quant_val}")
            return fixed_path, quant_val
        if legacy_scale.exists():
            # Backward compatibility with the old "scale" file name.
            quant_val = int(legacy_scale.read_text().strip())
            quant_file.write_text(str(quant_val)+'\n')
            print(f"[fixed] Upgraded legacy scale -> quant_mult: {legacy_scale.name} -> {quant_file.name}")
            return fixed_path, quant_val

    fixed_generated, quant_val = normalize_file(str(floating_path), write_fixed_text=False)
    return Path(fixed_generated), quant_val

def ensure_fixed_parquet(fixed_text: Path) -> Path:
    """Return the ``.parquet`` sibling of ``fixed_text``.

    Nothing is generated here; the parquet file is expected to exist already.

    Raises:
        FileNotFoundError: If the parquet file is missing.
    """
    parquet_path = fixed_text.with_suffix('.parquet')
    if not parquet_path.exists():
        raise FileNotFoundError(f"Expected parquet produced by pipeline missing: {parquet_path}")
    print(f"[parquet-fixed] Existing: {parquet_path.name}")
    return parquet_path

def ensure_floating_parquet(floating_text: Path) -> Path:
    """Return the ``.parquet`` sibling of ``floating_text``.

    If the file is absent, ``normalize_file`` is run once on ``floating_text``
    and the path is checked again.

    Raises:
        FileNotFoundError: If the parquet file is still missing afterwards.
    """
    parquet_path = floating_text.with_suffix('.parquet')
    if not parquet_path.exists():
        # Not produced yet: run the normalization pipeline, then re-check.
        normalize_file(str(floating_text), write_fixed_text=False)
        if not parquet_path.exists():
            raise FileNotFoundError(f"Floating parquet not found: {parquet_path}")
    return parquet_path

# Map quantized integer supports to floating-point supports

def support_to_float(support_int: int, quant_mult: int) -> float:
    """Divide ``support_int`` by ``quant_mult``.

    When ``quant_mult`` is not positive the support is returned unscaled, as a
    float.
    """
    return support_int / quant_mult if quant_mult > 0 else float(support_int)

# Run both miners using a single, shared quant_mult

def run_both_miners(fixed_parquet: Path, floating_parquet: Path, quant_mult: int, supports_scaled: List[int], results_subdir: Path, memory_type: str = "global", debug: bool = False) -> pd.DataFrame:
    """Run cuFFIMiner and naiveFFIMiner for every support threshold.

    For each entry of ``supports_scaled``, cuFFIMiner is run on
    ``fixed_parquet`` with the integer support, and naiveFFIMiner on
    ``floating_parquet`` with the support converted by ``support_to_float``.
    Patterns are saved under ``results_subdir``. A miner is skipped, and adds
    no row, when its pattern file for that support already exists.

    Returns:
        One row per miner run. Successful runs carry timing, memory and
        pattern-count fields; failed runs carry an ``error`` message instead.
    """
    results_subdir.mkdir(parents=True, exist_ok=True)
    records: List[Dict[str, Any]] = []

    for sup_int in supports_scaled:
        # cuFFIMiner: fixed-point parquet, integer support.
        cuffi_patterns_path = results_subdir / f"patterns_cuffi_sup{sup_int}.txt"
        if not cuffi_patterns_path.exists():
            try:
                gpu_miner = cuFFIMiner(
                    str(fixed_parquet),
                    min_support=sup_int,
                    scaling_factor=quant_mult,
                    memory_type=memory_type,
                    debug=debug,
                )
                gpu_miner.mine()
                gpu_miner.save(cuffi_patterns_path)
                gpu_miner.print_results()
                records.append({
                    "algorithm": "cuFFIMiner",
                    "support_quant_int": sup_int,
                    "quant_mult": quant_mult,
                    "exec_time": gpu_miner.get_execution_time(),
                    "cpu_mem_mb": gpu_miner.get_memory_usage(),
                    "gpu_mem_bytes": getattr(gpu_miner, '_gpu_memory_usage', None),
                    "patterns_found": gpu_miner.get_pattern_count(),
                })
            except Exception as exc:
                # Best effort: log the failure and keep going with the next run.
                print(f"[cuFFIMiner][ERROR] sup={sup_int}: {exc}")
                records.append({
                    "algorithm": "cuFFIMiner",
                    "support_quant_int": sup_int,
                    "quant_mult": quant_mult,
                    "error": str(exc),
                })
        else:
            print(f"[cuFFIMiner] Skip sup={sup_int} (exists)")

        # naiveFFIMiner: floating parquet, support converted back to float.
        sup_float = support_to_float(sup_int, quant_mult)
        print('sup_float:', sup_float)
        print(f"[naiveFFIMiner] quant_int={sup_int} -> float={sup_float} (forced quant_mult={quant_mult})")
        naive_patterns_path = results_subdir / f"patterns_naive_sup{sup_int}.txt"
        if not naive_patterns_path.exists():
            try:
                naive_miner = naiveFFIMiner(
                    str(floating_parquet),
                    min_support=sup_float,
                    quant_mult=quant_mult,
                    debug=debug,
                )
                naive_miner.mine()
                naive_miner.save(naive_patterns_path)
                naive_miner.print_results()
                records.append({
                    "algorithm": "naiveFFIMiner",
                    "support_quant_int": sup_int,
                    "support_float": sup_float,
                    "quant_mult": quant_mult,
                    "exec_time": naive_miner.get_execution_time(),
                    "cpu_mem_mb": naive_miner.get_memory_usage(),
                    "gpu_mem_bytes": getattr(naive_miner, '_gpu_memory_usage', None),
                    "patterns_found": naive_miner.get_pattern_count(),
                })
            except Exception as exc:
                print(f"[naiveFFIMiner][ERROR] sup={sup_int}: {exc}")
                records.append({
                    "algorithm": "naiveFFIMiner",
                    "support_quant_int": sup_int,
                    "support_float": sup_float,
                    "quant_mult": quant_mult,
                    "error": str(exc),
                })
        else:
            print(f"[naiveFFIMiner] Skip sup={sup_int} (exists)")

    return pd.DataFrame(records)

In [3]:

# Notebook-wide matplotlib defaults; they apply to every figure created after
# this cell has run.
plt.rcParams.update({
    # Font type 42 for PDF/PS output — presumably so text is embedded as
    # TrueType; confirm against the matplotlib rcParams documentation.
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'figure.dpi': 150,
    # Font sizes for body text, axes titles, axis labels and legend entries.
    'font.size': 11,
    'axes.titlesize': 12,
    'axes.labelsize': 11,
    'legend.fontsize': 9,
})

_METRIC_LABELS = {
    'exec_time': 'Execution Time (s)',
    'cpu_mem_mb': 'Peak CPU Memory (MB)',
    'gpu_mem_bytes': 'GPU Memory (MB)',
    'patterns_found': 'Patterns Found',
}

# Baseline figure settings; a caller-supplied fig_cfg overrides individual keys.
DEFAULT_FIG_CFG = dict(
    width=5.0,            # passed to plt.subplots(figsize=...)
    height=3.0,           # passed to plt.subplots(figsize=...)
    legend_loc='best',
    tight_layout=True,
)

def _ensure_dir(p: Path):
    p.mkdir(parents=True, exist_ok=True)

def _prep_df(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()
    if 'gpu_mem_bytes' in d.columns:
        d['gpu_mem_mb'] = d['gpu_mem_bytes'] / (1024**2)
    return d

def plot_metric(df: pd.DataFrame, metric: str, dataset_name: str, output_dir: Path,
                fig_cfg: dict | None = None, scale_x: bool = False):
    """Plot one metric against the support threshold and save it as a PDF.

    One line is drawn per value of the ``algorithm`` column. The figure is
    written to ``output_dir/<dataset_name>_<metric>.pdf``.

    Args:
        df: Metrics frame; needs ``algorithm`` and ``support_quant_int``.
        metric: Column to plot. ``gpu_mem_mb`` is derived from
            ``gpu_mem_bytes`` when that column is present.
        dataset_name: Used in the title and the output file name.
        output_dir: Directory for the PDF (created if missing).
        fig_cfg: Optional overrides for ``DEFAULT_FIG_CFG``.
        scale_x: If true and ``quant_mult`` is present, the x values are
            ``support_quant_int * quant_mult``.

    Returns:
        None. If ``metric`` is not available, a message is printed and no
        figure is created.
    """
    cfg = {**DEFAULT_FIG_CFG, **(fig_cfg or {})}
    d = _prep_df(df)
    # Check before creating the figure, so a missing column neither raises a
    # KeyError nor leaves an open figure behind.
    if metric not in d.columns:
        print(f'Skipping {metric} (not present)')
        return
    xcol = 'support_quant_int'
    if scale_x and 'quant_mult' in d.columns:
        x = d[xcol] * d['quant_mult']
        xlabel = 'Support Threshold (raw * quant_mult)'
    else:
        x = d[xcol]
        xlabel = 'Support Threshold (quantized int)'
    # Resolve the derived GPU column's label locally in case the label table
    # has no entry for it.
    fallback_label = 'GPU Memory (MB)' if metric == 'gpu_mem_mb' else metric
    metric_label = _METRIC_LABELS.get(metric, fallback_label)
    fig, ax = plt.subplots(figsize=(cfg['width'], cfg['height']))
    try:
        for algo, sub in d.groupby('algorithm'):
            ax.plot(x.loc[sub.index], sub[metric], marker='o', label=algo)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(metric_label)
        # \u2013 is an en dash; written as an escape so the separator survives
        # re-encoding of the notebook source.
        ax.set_title(f"{dataset_name} \u2013 {metric_label}")
        ax.grid(alpha=0.25, linestyle=':')
        ax.legend(loc=cfg['legend_loc'])
        if cfg.get('tight_layout', True):
            fig.tight_layout()
        _ensure_dir(output_dir)
        out_file = output_dir / f"{dataset_name}_{metric}.pdf"
        fig.savefig(out_file, format='pdf')
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
    print(f"[figure] Wrote {out_file}")

def generate_all_figures(dataset_name: str, metrics_df: pd.DataFrame | None = None,
                          metrics_path: Path | None = None, output_subdir: str = 'figures',
                          metrics: list[str] | None = None, fig_cfg: dict | None = None,
                          scale_x: bool = False):
    """Write one PDF figure per requested metric for ``dataset_name``.

    Metrics come from, in order of preference: ``metrics_df``; the CSV at
    ``metrics_path``; or every ``metrics_SF*.csv`` found under
    ``results_dir/<dataset_name>``, concatenated with exact duplicate rows
    dropped. Figures go to ``results_dir/<dataset_name>/<output_subdir>``.
    Metrics that are not available in the frame are skipped with a message.

    Raises:
        FileNotFoundError: If metrics have to be discovered on disk and no
            ``metrics_SF*.csv`` file exists.
    """
    if metrics_df is None and metrics_path:
        metrics_df = pd.read_csv(metrics_path)
    elif metrics_df is None:
        ds_dir = results_dir / dataset_name
        csv_files = sorted(ds_dir.glob('metrics_SF*.csv'))
        if len(csv_files) == 0:
            raise FileNotFoundError(f"No metrics file found in {ds_dir}")
        frames = [pd.read_csv(csv_file) for csv_file in csv_files]
        metrics_df = pd.concat(frames, ignore_index=True).drop_duplicates()

    wanted = ['exec_time','cpu_mem_mb','gpu_mem_mb','patterns_found'] if metrics is None else metrics
    out_dir = results_dir / dataset_name / output_subdir
    for m in wanted:
        # gpu_mem_mb can be derived at plot time from gpu_mem_bytes.
        derivable = m == 'gpu_mem_mb' and 'gpu_mem_bytes' in metrics_df.columns
        if m in metrics_df.columns or derivable:
            plot_metric(metrics_df, m, dataset_name, out_dir, fig_cfg=fig_cfg, scale_x=scale_x)
        else:
            print(f"[figure] Skip missing metric: {m}")
    print("[figure] All requested figures generated.")

def _merge_saved_metrics(new_rows: pd.DataFrame, metrics_file: Path) -> pd.DataFrame:
    """Combine this run's rows with the rows already stored in ``metrics_file``.

    Rows are keyed by (algorithm, support_quant_int); a row from the current
    run replaces a saved row with the same key.
    """
    if not metrics_file.exists():
        return new_rows
    try:
        saved = pd.read_csv(metrics_file)
    except pd.errors.EmptyDataError:
        # A previous run stored an empty frame; there is nothing to recover.
        return new_rows
    if new_rows.empty:
        return saved
    keys = ['algorithm', 'support_quant_int']
    if not set(keys).issubset(saved.columns):
        return new_rows
    combined = pd.concat([saved, new_rows], ignore_index=True)
    return combined.drop_duplicates(subset=keys, keep='last').reset_index(drop=True)


def run_experiment(dataset_url: str, sf: int, supports_quant_int: List[int], memory_type: str = 'global', debug: bool = False,
                   generate_figures: bool = True, fig_metrics: List[str] | None = None, fig_cfg: Dict[str, Any] | None = None,
                   scale_x: bool = False, fig_subdir: str = 'figures') -> pd.DataFrame:
    """Prepare a dataset, run both miners on it and persist metrics/figures.

    Steps: download (or reuse) the dataset, build the SF-replicated floating
    file and its fixed-point and parquet variants, run both miners for each
    support, update ``results_dir/<dataset>/metrics_SF<sf>.csv`` and
    optionally render figures.

    Miners are skipped for supports whose pattern files already exist, so a
    run may produce few or no new rows. New rows are therefore merged with
    the metrics saved by earlier runs instead of overwriting them.

    Args:
        dataset_url: URL of the dataset.
        sf: Scale factor passed to ``ensure_floating_sf``.
        supports_quant_int: Support thresholds in quantized integer units.
        memory_type: Forwarded to ``run_both_miners``.
        debug: Forwarded to ``run_both_miners``.
        generate_figures: Whether to call ``generate_all_figures``.
        fig_metrics: Metrics to plot (``None`` uses the defaults).
        fig_cfg: Figure configuration overrides.
        scale_x: Forwarded to the plotting code.
        fig_subdir: Sub-directory of the dataset results folder for figures.

    Returns:
        Metrics for this dataset/SF: rows of the current run merged with the
        rows saved earlier.
    """
    print("========== RUN EXPERIMENT (SF + quant_mult unified) ==========")
    print(f"Dataset URL : {dataset_url}")
    print(f"SF (concat) : {sf}")
    print(f"Quantized integer supports: {supports_quant_int}")

    original = download_dataset(dataset_url, data_dir)
    dataset_name = original.parent.name
    floating_sf_text = ensure_floating_sf(original, sf)
    fixed_parquet, quant_mult = ensure_fixed_variant(floating_sf_text)

    floating_parquet = ensure_floating_parquet(floating_sf_text)

    result_dir = results_dir / dataset_name
    new_metrics = run_both_miners(fixed_parquet, floating_parquet, quant_mult, supports_quant_int, result_dir, memory_type=memory_type, debug=debug)

    metrics_file = result_dir / f"metrics_SF{sf}.csv"
    metrics_df = _merge_saved_metrics(new_metrics, metrics_file)
    # Rewrite the file only when this run added rows. An empty frame is never
    # written, because pd.read_csv cannot parse the resulting file.
    if len(new_metrics) > 0:
        metrics_df.to_csv(metrics_file, index=False)
        print(f"[metrics] Saved {metrics_file}")

    if generate_figures:
        try:
            generate_all_figures(dataset_name, metrics_df=metrics_df, metrics=fig_metrics, fig_cfg=fig_cfg, scale_x=scale_x, output_subdir=fig_subdir)
        except Exception as e:
            # Figures are best effort; a plotting failure must not lose the metrics.
            print(f"[figure][ERROR] {e}")

    print("============ DONE ============")
    return metrics_df

# Backward-compatible alias for the same function.
run_complete_experiment = run_experiment


In [4]:
# Fuzzy retail dataset, SF=100; thresholds are in quantized-integer units.
retail = "https://u-aizu.ac.jp/~udayrage/datasets/fuzzyDatabases/Fuzzy_retail.csv"
retail_sup = [40_000, 45_000, 50_000, 55_000, 60_000]

metrics_retail = run_experiment(dataset_url=retail, sf=100, supports_quant_int=retail_sup)
metrics_retail

Dataset URL : https://u-aizu.ac.jp/~udayrage/datasets/fuzzyDatabases/Fuzzy_retail.csv
SF (concat) : 100
Quantized integer supports: [40000, 45000, 50000, 55000, 60000]
[download] Existing: /export/home1/ltarun/cuda_pami/data/fuzzy/Fuzzy_retail/Fuzzy_retail.csv
[SF] Using existing floating SF file: Fuzzy_retail_SF100_floating.csv
[fixed] Reusing existing fixed file: Fuzzy_retail_SF100_fixed.parquet quant_mult=10
[cuFFIMiner] Skip sup=40000 (exists)
sup_float: 4000.0
[naiveFFIMiner] quant_int=40000 -> float=4000.0 (forced quant_mult=10)
[naiveFFIMiner] Skip sup=40000 (exists)
[cuFFIMiner] Skip sup=45000 (exists)
sup_float: 4500.0
[naiveFFIMiner] quant_int=45000 -> float=4500.0 (forced quant_mult=10)
[naiveFFIMiner] Skip sup=45000 (exists)
[cuFFIMiner] Skip sup=50000 (exists)
sup_float: 5000.0
[naiveFFIMiner] quant_int=50000 -> float=5000.0 (forced quant_mult=10)
[naiveFFIMiner] Skip sup=50000 (exists)
[cuFFIMiner] Skip sup=55000 (exists)
sup_float: 5500.0
[naiveFFIMiner] quant_int=55000 

In [5]:
# Fuzzy connect dataset, SF=100; thresholds are in quantized-integer units.
connect = "https://u-aizu.ac.jp/~udayrage/datasets/fuzzyDatabases/Fuzzy_connect.csv"
connect_sup = [25_000_000, 24_500_000, 24_000_000, 23_500_000, 23_000_000]

metrics_connect = run_experiment(dataset_url=connect, sf=100, supports_quant_int=connect_sup)
metrics_connect

Dataset URL : https://u-aizu.ac.jp/~udayrage/datasets/fuzzyDatabases/Fuzzy_connect.csv
SF (concat) : 100
Quantized integer supports: [25000000, 24500000, 24000000, 23500000, 23000000]
[download] Existing: /export/home1/ltarun/cuda_pami/data/fuzzy/Fuzzy_connect/Fuzzy_connect.csv
[SF] Using existing floating SF file: Fuzzy_connect_SF100_floating.csv
[fixed] Reusing existing fixed file: Fuzzy_connect_SF100_fixed.parquet quant_mult=10
[cuFFIMiner] Skip sup=25000000 (exists)
sup_float: 2500000.0
[naiveFFIMiner] quant_int=25000000 -> float=2500000.0 (forced quant_mult=10)
[naiveFFIMiner] Skip sup=25000000 (exists)
[cuFFIMiner] Skip sup=24500000 (exists)
sup_float: 2450000.0
[naiveFFIMiner] quant_int=24500000 -> float=2450000.0 (forced quant_mult=10)
[naiveFFIMiner] Skip sup=24500000 (exists)
[cuFFIMiner] Skip sup=24000000 (exists)
sup_float: 2400000.0
[naiveFFIMiner] quant_int=24000000 -> float=2400000.0 (forced quant_mult=10)
[naiveFFIMiner] Skip sup=24000000 (exists)
[cuFFIMiner] Skip sup=

In [6]:
# Fuzzy kosarak dataset, SF=50; thresholds are in quantized-integer units.
kosarak = "https://u-aizu.ac.jp/~udayrage/datasets/fuzzyDatabases/Fuzzy_kosarak.csv"
kosarak_sup = [600_000, 700_000, 800_000, 900_000, 1_000_000]

metrics_kosarak = run_experiment(dataset_url=kosarak, sf=50, supports_quant_int=kosarak_sup)
metrics_kosarak

Dataset URL : https://u-aizu.ac.jp/~udayrage/datasets/fuzzyDatabases/Fuzzy_kosarak.csv
SF (concat) : 50
Quantized integer supports: [600000, 700000, 800000, 900000, 1000000]
[download] Existing: /export/home1/ltarun/cuda_pami/data/fuzzy/Fuzzy_kosarak/Fuzzy_kosarak.csv
[SF] Using existing floating SF file: Fuzzy_kosarak_SF50_floating.csv
[fixed] Reusing existing fixed file: Fuzzy_kosarak_SF50_fixed.parquet quant_mult=10
[cuFFIMiner] Skip sup=600000 (exists)
sup_float: 60000.0
[naiveFFIMiner] quant_int=600000 -> float=60000.0 (forced quant_mult=10)
[naiveFFIMiner] Skip sup=600000 (exists)
[cuFFIMiner] Skip sup=700000 (exists)
sup_float: 70000.0
[naiveFFIMiner] quant_int=700000 -> float=70000.0 (forced quant_mult=10)
[naiveFFIMiner] Skip sup=700000 (exists)
[cuFFIMiner] Skip sup=800000 (exists)
sup_float: 80000.0
[naiveFFIMiner] quant_int=800000 -> float=80000.0 (forced quant_mult=10)
[naiveFFIMiner] Skip sup=800000 (exists)
[cuFFIMiner] Skip sup=900000 (exists)
sup_float: 90000.0
[naiveF

In [7]:
# Fuzzy pumsb dataset, SF=100; thresholds are in quantized-integer units.
pumsb = "https://u-aizu.ac.jp/~udayrage/datasets/fuzzyDatabases/Fuzzy_pumsb.csv"
pumsb_sup = [20_000_000, 19_000_000, 18_000_000, 17_000_000, 16_000_000]

metrics_pumsb = run_experiment(dataset_url=pumsb, sf=100, supports_quant_int=pumsb_sup)
metrics_pumsb

Dataset URL : https://u-aizu.ac.jp/~udayrage/datasets/fuzzyDatabases/Fuzzy_pumsb.csv
SF (concat) : 100
Quantized integer supports: [20000000, 19000000, 18000000, 17000000, 16000000]
[download] Existing: /export/home1/ltarun/cuda_pami/data/fuzzy/Fuzzy_pumsb/Fuzzy_pumsb.csv
[SF] Using existing floating SF file: Fuzzy_pumsb_SF100_floating.csv
[fixed] Reusing existing fixed file: Fuzzy_pumsb_SF100_fixed.parquet quant_mult=10

--- cuFFIMiner Results ---
Execution Time: 22.3111 seconds
Peak CPU Memory Usage: 984.95 MB
Peak GPU Memory Usage: 22719.06 MB
Patterns Found: 3384
------------------------------
sup_float: 2000000.0
[naiveFFIMiner] quant_int=20000000 -> float=2000000.0 (forced quant_mult=10)

--- naiveFFIMiner Results ---
Execution Time: 29.4312 seconds
Peak CPU Memory Usage: 21217.97 MB
Peak GPU Memory Usage: 22719.06 MB
Patterns Found: 3384
---------------------------------

--- cuFFIMiner Results ---
Execution Time: 30.8682 seconds
Peak CPU Memory Usage: 21282.16 MB
Peak GPU Memor

Unnamed: 0,algorithm,support_quant_int,quant_mult,exec_time,cpu_mem_mb,gpu_mem_bytes,patterns_found,support_float
0,cuFFIMiner,20000000,10,22.311069,984.945312,23822663680,3384,
1,naiveFFIMiner,20000000,10,29.431204,21217.96875,23822663680,3384,2000000.0
2,cuFFIMiner,19000000,10,30.868198,21282.160156,24139333632,4840,
3,naiveFFIMiner,19000000,10,38.218514,22294.625,24139333632,4840,1900000.0
4,cuFFIMiner,18000000,10,46.270154,22294.625,24764284928,6983,
5,naiveFFIMiner,18000000,10,53.068277,22838.863281,24764284928,6983,1800000.0
6,cuFFIMiner,17000000,10,70.24703,22838.863281,24871239680,10196,
7,naiveFFIMiner,17000000,10,75.269862,23077.738281,24871239680,10196,1700000.0
8,cuFFIMiner,16000000,10,105.988672,23077.738281,25584271360,15240,
9,naiveFFIMiner,16000000,10,113.837074,23460.84375,25582174208,15240,1600000.0
