In [2]:
from __future__ import annotations

import os, sys, re, json, time, subprocess, textwrap
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import requests
import pandas as pd
import matplotlib.pyplot as plt

# -----------------------
# Project paths
# -----------------------
# The project root is taken to be two directory levels above the current
# working directory; data and results each get a "frequent" subfolder.
PROJECT_ROOT = Path(os.getcwd()).parent.parent  # adjust if needed
DATA_DIR, RESULTS_DIR = (
    PROJECT_ROOT / top_level / "frequent" for top_level in ("data", "results")
)

# Create both folders up front so later cells can write into them directly.
for p in (DATA_DIR, RESULTS_DIR):
    p.mkdir(parents=True, exist_ok=True)


def _filename_from_url(url: str) -> str:
    return Path(url.split("?")[0]).name

def _download(url: str, base_dir: Path) -> Path:
    """Make ``url`` available locally as ``base_dir/<stem>/<filename>``.

    Resolution order:
      1. ``base_dir/<stem>/<filename>`` already exists -> reuse it.
      2. A flat legacy file ``base_dir/<filename>`` exists -> copy it into the
         per-dataset directory (the legacy file itself is left in place).
      3. Otherwise download the URL.

    New files are first written to a ``.part`` sibling and renamed into place
    only when complete, so an interrupted copy/download can never be picked
    up as a valid cached file on the next run.

    Args:
        url: Location of the file to fetch.
        base_dir: Directory under which the per-dataset folder is created.

    Returns:
        Path of the local file.

    Raises:
        ValueError: if no file name can be derived from ``url``.
        requests.RequestException: on network failure or an HTTP error status.
    """
    import shutil  # local import: not part of the notebook's import cell

    filename = _filename_from_url(url)
    if not filename:
        # An empty name would make `out` resolve to `base_dir` itself, which
        # exists and would wrongly be returned as a "cached" file.
        raise ValueError(f"Cannot derive a file name from URL: {url!r}")

    name_root = Path(filename).stem
    dst_dir = base_dir / name_root
    dst_dir.mkdir(parents=True, exist_ok=True)
    out = dst_dir / filename
    if out.exists():
        print(f"[download] Using cached: {out}")
        return out

    tmp = out.with_name(out.name + ".part")
    legacy = base_dir / filename
    try:
        # is_file(), not exists(): for an extension-less file name the legacy
        # path is the dataset directory that was just created above.
        if legacy.is_file():
            shutil.copyfile(legacy, tmp)
            os.replace(tmp, out)
            print(f"[download] Copied legacy file -> {out}")
            return out

        print(f"[download] Fetch {url}")
        # Stream to disk in chunks instead of holding the whole body in memory.
        with requests.get(url, timeout=60, stream=True) as r:
            r.raise_for_status()
            with open(tmp, "wb") as fh:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    fh.write(chunk)
        os.replace(tmp, out)  # atomic publish of the finished file
    finally:
        # Only present if something failed before the rename.
        if tmp.exists():
            tmp.unlink()

    print(f"[download] Saved {out}")
    return out

def prepare_dataset(url: str, sf: int, quant_mult: int = 1000) -> Dict[str, Any]:
    """Download a dataset and create its SF-replicated text copy.

    Steps performed here:
      1. Ensure ``PROJECT_ROOT`` is importable, so ``scripts.*`` resolves.
      2. Download (or reuse the cached copy of) ``url`` under ``DATA_DIR``.
      3. Call ``scripts.replicate_file.replicate_file(original, sf, output)``
         to build ``<stem>_SF<sf>_floating<suffix>`` next to the original,
         unless that file already exists.

    The two parquet entries in the result are *derived paths only*: this
    function neither writes them nor checks that they exist.

    Args:
        url: Location of the source dataset.
        sf: Scale factor handed to ``replicate_file``; coerced with ``int()``
            and clamped to a minimum of 1.
        quant_mult: Value reported under ``'quant_mult'`` in the result.
            It is only passed through; nothing in this function applies it.

    Returns:
        Dict with keys ``'dataset_name'``, ``'original'``, ``'floating_text'``,
        ``'floating_parquet'``, ``'fixed_parquet'`` (paths as ``str``) and
        ``'quant_mult'``.

    Raises:
        ValueError / TypeError: if ``sf`` cannot be converted to ``int``.
        ImportError: if ``scripts.replicate_file`` is not importable.
    """
    if str(PROJECT_ROOT) not in sys.path:
        sys.path.insert(0, str(PROJECT_ROOT))

    original = _download(url, DATA_DIR)
    dataset_name = original.parent.name

    # 1) replicate floating text (SF-concatenated)
    from scripts.replicate_file import replicate_file
    sf = max(1, int(sf))
    floating_text = original.with_name(f"{original.stem}_SF{sf}_floating{original.suffix}")
    if floating_text.exists():
        print(f"[prep] Using existing: {floating_text.name}")
    else:
        replicate_file(str(original), sf, str(floating_text))
        print(f"[prep] Made: {floating_text.name}")

    return {
        'dataset_name': dataset_name,
        'original': str(original),
        'floating_text': str(floating_text),
        'floating_parquet': str(floating_text.with_suffix('.parquet')),
        'fixed_parquet': str(floating_text.with_name(f"{floating_text.stem}_fixed.parquet")),
        'quant_mult': quant_mult,
    }

In [3]:
from PAMI.frequentPattern.basic.FPGrowth import FPGrowth

t10 = "https://u-aizu.ac.jp/~udayrage/datasets/transactionalDatabases/Transactional_T10I4D100K.csv"
T10_MIN_SUP = 500  # second argument of FPGrowth (support threshold)

# prepare_dataset returns the local file paths; use them rather than a
# machine-specific absolute path so the cell runs on any checkout.
t10_files = prepare_dataset(t10, sf=100)

# Mining runs on the un-replicated original file, as in the earlier run.
# NOTE(review): the SF100 copy in t10_files['floating_text'] is prepared but
# not mined here; confirm whether that is intended.
obj = FPGrowth(t10_files['original'], T10_MIN_SUP)
obj.mine()
obj.printResults()

[download] Using cached: /export/home1/ltarun/cuda_pami/data/frequent/Transactional_T10I4D100K/Transactional_T10I4D100K.csv
[prep] Using existing: Transactional_T10I4D100K_SF100_floating.csv
Frequent patterns were generated successfully using frequentPatternGrowth algorithm
Total number of Frequent Patterns: 1072
Total Memory in USS: 425541632
Total Memory in RSS 490536960
Total ExecutionTime in ms: 2.8800368309020996


In [None]:
# URL of the "Transactional_retail" CSV; only defined here, not downloaded in this cell.
retail = "https://u-aizu.ac.jp/~udayrage/datasets/transactionalDatabases/Transactional_retail.csv"

In [None]:
# URL of the "Transactional_chess1" CSV; only defined here, not downloaded in this cell.
chess = "https://u-aizu.ac.jp/~udayrage/datasets/transactionalDatabases/Transactional_chess1.csv"

In [None]:
# URL of the "Transactional_BMS_POS" CSV; only defined here, not downloaded in this cell.
bms = "https://u-aizu.ac.jp/~udayrage/datasets/transactionalDatabases/Transactional_BMS_POS.csv"

In [None]:
# URL of the "Transactional_connect" CSV; only defined here, not downloaded in this cell.
connect = "https://u-aizu.ac.jp/~udayrage/datasets/transactionalDatabases/Transactional_connect.csv"

In [None]:
# URL of the "Transactional_kosarak" CSV; only defined here, not downloaded in this cell.
kosarak = "https://u-aizu.ac.jp/~udayrage/datasets/transactionalDatabases/Transactional_kosarak.csv"