In [1]:
INPUT_PATH = "data/processed/cleaned_huim_data.csv"
OUTPUT_SPMF_PATH = "data/processed/utility_db.txt"
# Gán trọng số ưu tiên: Sản phẩm nào quan trọng thì nhân hệ số cao hơn
STRATEGIC_WEIGHTS = {'85123A': 2.0, '22423': 1.5}

In [2]:
import pandas as pd
df = pd.read_csv(INPUT_PATH)

# Create mapping from StockCode to integer IDs for SPMF
unique_items = df['StockCode'].unique()
item_to_id = {item: idx for idx, item in enumerate(unique_items)}
id_to_item = {idx: item for item, idx in item_to_id.items()}

# Save mapping for later use in results interpretation
import pickle
with open("data/processed/item_mapping.pkl", "wb") as f:
    pickle.dump({"item_to_id": item_to_id, "id_to_item": id_to_item}, f)

def get_utility(row):
    # u(i, T) = Quantity * UnitPrice * Weight
    weight = STRATEGIC_WEIGHTS.get(str(row['StockCode']), 1.0)
    return row['Quantity'] * row['UnitPrice'] * weight

df['Utility'] = df.apply(get_utility, axis=1)

In [3]:
def format_spmf(group):
    items = " ".join(str(item_to_id[code]) for code in group['StockCode'].tolist())
    total_u = int(group['Utility'].sum())
    utils = " ".join(group['Utility'].astype(int).astype(str).tolist())
    return f"{items}:{total_u}:{utils}"

spmf_data = df.groupby('InvoiceNo').apply(format_spmf)
with open(OUTPUT_SPMF_PATH, 'w') as f:
    f.write("\n".join(spmf_data.tolist()))
print(f"Đã tạo file Utility cho {len(spmf_data)} hóa đơn.")

Đã tạo file Utility cho 16646 hóa đơn.


  spmf_data = df.groupby('InvoiceNo').apply(format_spmf)
