In [2]:
%%time
import os
import sys
import copy
from datetime import datetime
import gc
import pickle as pkl
import shelve

import pandas as pd
import numpy as np
import cudf
    
sys.path.append("../input/")
from handmhelpers import io as h_io, sub as h_sub, cv as h_cv, fe as h_fe
from handmhelpers import modeling as h_modeling, candidates as h_can, pairs as h_pairs

  if entities is not ():


CPU times: user 7.35 s, sys: 1.24 s, total: 8.59 s
Wall time: 13.3 s


## Load and convert data

In [5]:
from datetime import timedelta
import cudf
import numpy as np

def patched_day_week_numbers(dates: cudf.Series):
    """Map every date to a week number, numbering weeks backwards from the last date.

    Days are counted from the earliest date (starting at 1) and then shifted so
    that the latest date lands exactly on a multiple of 7. The final week is
    therefore always a full 7-day week, and any partial week sits at the start
    of the range.

    Args:
        dates: cuDF series of values accepted by ``cudf.to_datetime``.

    Returns:
        An ``int8`` cuDF series with one week number per input row.
    """
    parsed = cudf.to_datetime(dates)
    distinct = cudf.Series(parsed.unique())

    # 1-based day offset of each distinct date from the earliest one.
    day_index = (distinct - distinct.min() + timedelta(1)).dt.days
    # Shift so the last day is a multiple of 7 (leftover days go to the start).
    day_index = day_index - day_index.max() % 7
    # Ceiling division by 7, done with plain arithmetic (no applymap).
    week_of_day = (day_index + 6) // 7

    lookup = cudf.DataFrame(
        {"day_weeks": week_of_day, "unique_dates": distinct}
    ).set_index("unique_dates")["day_weeks"]
    return parsed.map(lookup).astype("int8")

# Replace the helper module's day_week_numbers with the patched version defined
# in this cell, so later calls through h_fe.day_week_numbers use it.
import handmhelpers.fe as h_fe
h_fe.day_week_numbers = patched_day_week_numbers

In [6]:
%%time

# Load the three tables; the unpacking order follows the order of `files`
# (customers -> c, transactions -> t, articles -> a).
c, t, a = h_io.load_data(files=['customers.csv', 'transactions_train.csv', 'articles.csv'])

# The returned path is reused later (params and submission) as index_to_id_dict_path.
# NOTE(review): presumably this also shrinks customer_id in c and t in place - confirm in h_fe.
index_to_id_dict_path = h_fe.reduce_customer_id_memory(c, [t])
# Order matters: week_number must be derived from the original t_dat values
# before t_dat itself is overwritten on the next line.
t["week_number"] = h_fe.day_week_numbers(t["t_dat"])
t["t_dat"] = h_fe.day_numbers(t["t_dat"])

CPU times: user 3.03 s, sys: 2.43 s, total: 5.46 s
Wall time: 10.2 s


## Get item pairs

In [7]:
%%time
# Build article pairs for several historical weeks (94-104), 15 pairs per
# article, to raise candidate recall.
pairs_per_item = 15

week_number_pairs = {}
for week_number in range(94, 105):
    print(f"Creating pairs for week number {week_number}")
    pairs_for_week = h_pairs.create_pairs(t, week_number, pairs_per_item, verbose=False)
    week_number_pairs[week_number] = pairs_for_week

Creating pairs for week number 94
Creating pairs for week number 95
Creating pairs for week number 96
Creating pairs for week number 97
Creating pairs for week number 98
Creating pairs for week number 99
Creating pairs for week number 100
Creating pairs for week number 101
Creating pairs for week number 102
Creating pairs for week number 103
Creating pairs for week number 104
CPU times: user 32.4 s, sys: 5.99 s, total: 38.4 s
Wall time: 39.6 s


## Main retrieval/features function!

In [14]:
def create_candidates_with_features_df(t, c, a, customer_batch=None, **kwargs):
    """Build the (customer, article) candidate table with all features attached.

    Candidates from several sources (recent purchases, last customer weeks,
    article pairs, age buckets) are unioned, filtered, and joined with the
    features collected in ``features_db`` plus per-candidate rule features.

    Reads the module-level ``week_number_pairs`` dict, indexed by
    ``label_week - 1``.

    Args:
        t: transactions frame (must carry ``t_dat`` and ``week_number``).
        c: customers frame.
        a: articles frame (must carry ``article_id`` and the configured columns).
        customer_batch: optional collection of customer ids to restrict to.
            It is only used when the label frame is empty; when labels exist,
            the customers that appear in the labels take precedence.
        **kwargs: run configuration. Keys read directly here: ``label_week``,
            ``feature_periods``, ``ca_num_weeks``, ``clw_num_weeks``,
            ``clw_num_pair_weeks``, ``pa_num_weeks``, ``hier_col``,
            ``num_recent_candidates``, ``num_recent_articles``,
            ``num_age_buckets``, ``hier_cols``, ``hier_decay_gamma``
            (optional, default 0.3), ``lag_days``, ``cv``, ``article_columns``
            and ``selected_features``. The full dict is also forwarded to
            ``h_can.filter_candidates``.

    Returns:
        ``(cand_with_f_df, label_df)``: the candidate/feature frame and the
        label frame produced by ``h_cv.feature_label_split``.

    Raises:
        KeyError: if a required configuration key is missing, or if no pair
            table exists for ``label_week - 1``.
        AssertionError: if attaching features changed the number of candidate
            rows (which indicates duplicate keys in a feature frame).
    """
    # Split the transactions into a feature frame and a label frame for the label week
    # (the split itself is implemented in h_cv.feature_label_split).
    features_df, label_df = h_cv.feature_label_split(
        t, kwargs["label_week"], kwargs["feature_periods"]
    )
    # Re-express time as "how many days/weeks ago" so the model does not see absolute calendar positions
    features_df["t_dat"] = h_fe.how_many_ago(features_df["t_dat"])
    features_df["week_number"] = h_fe.how_many_ago(features_df["week_number"])

    # Use the article-pair table built for the week immediately before the label week
    article_pairs_df = week_number_pairs[kwargs["label_week"] - 1]

    # Decide which customers to process: those with labels, the given batch, or everyone (None)
    if len(label_df) > 0:
        customers = label_df["customer_id"].unique()
    elif customer_batch is not None:
        customers = customer_batch
    else:
        customers = None

    # ----- Generate candidates from several sources and keep their rule features -----
    features_db = {}

    # Recently purchased candidates (window controlled by ca_num_weeks), for short-term recall
    recent_customer_cand, features_db["customer_article"] = h_can.create_recent_customer_candidates(
        features_df, kwargs["ca_num_weeks"], customers=customers
    )

    # Candidates from the customer's last weeks plus their paired articles
    # (controlled by clw_num_weeks and clw_num_pair_weeks)
    (
        cust_last_week_cand,
        cust_last_week_pair_cand,
        features_db["clw"],
        features_db["clw_pairs"],
    ) = h_can.create_last_customer_weeks_and_pairs(
        features_df, article_pairs_df, kwargs["clw_num_weeks"], kwargs["clw_num_pair_weeks"], customers=customers
    )

    # Popular articles per hierarchy column; counts are set by num_recent_candidates / num_recent_articles.
    # NOTE(review): the candidate frame returned here is discarded (`_`); only the
    # features are kept, so popular articles never enter the candidate set - confirm this is intended.
    _, features_db["popular_articles"] = h_can.create_popular_article_cand(
        features_df, c, a, kwargs["pa_num_weeks"], kwargs["hier_col"],
        num_candidates=kwargs["num_recent_candidates"],
        num_articles=kwargs["num_recent_articles"],
        customers=customers,
    )

    # Age-bucket candidates, plus pair features per bucket.
    # NOTE(review): age_bucket_cust_features is never used after this call - confirm it can be dropped.
    (
        age_bucket_can,
        age_bucket_cust_features,
        age_bucket_pair_features,
    ) = h_can.create_age_bucket_candidates(
        features_df, c, kwargs["num_age_buckets"], articles=kwargs["num_recent_articles"], customers=customers
    )
    features_db["age_bucket"] = age_bucket_pair_features

    # Collect one rule score from each candidate source into rule_features_df
    def build_rule_part(cand_df, feature_tuple, score_col, rule_name):
        # Assumes feature_tuple[1] is a frame holding customer_id, article_id and
        # score_col (possibly in its index) - TODO confirm against the h_can helpers.
        feature_df = feature_tuple[1].reset_index()[["customer_id", "article_id", score_col]]
        tmp = cand_df.merge(feature_df, on=["customer_id", "article_id"], how="left")
        tmp = tmp.rename(columns={score_col: "rule_score"})
        # Candidates without a matching score get the sentinel -1.
        tmp["rule_score"] = tmp["rule_score"].fillna(-1)
        tmp["rule"] = rule_name
        return tmp[["customer_id", "article_id", "rule", "rule_score"]]

    rule_parts = [
        build_rule_part(recent_customer_cand, features_db["customer_article"], "ca_purchase_count", "recent"),
        build_rule_part(cust_last_week_cand, features_db["clw"], "ca_count", "last_weeks"),
        build_rule_part(cust_last_week_pair_cand, features_db["clw_pairs"], "pair_lift", "pairs"),
        build_rule_part(age_bucket_can, features_db["age_bucket"], "article_bucket_count", "age_bucket"),
    ]
    # Sort by descending score within each (rule, customer) so cumcount yields a 0-based rank.
    rule_df = cudf.concat(rule_parts).sort_values(
        ["rule", "customer_id", "rule_score"], ascending=[True, True, False]
    )
    rule_df["rank_within_rule"] = rule_df.groupby(["rule", "customer_id"]).cumcount()

    # One row per candidate: number of sources, best score and best rank across sources.
    rule_features_df = (
        rule_df.groupby(["customer_id", "article_id"])
        .agg({"rule": "nunique", "rule_score": "max", "rank_within_rule": "min"})
        .reset_index()
    )
    rule_features_df.columns = ["customer_id", "article_id", "n_sources", "best_rule_score", "best_rank_within_rule"]

    # Add one 0/1 flag column per source (recent/last_weeks/pairs/age_bucket)
    for rule_name in rule_df["rule"].unique().to_pandas():
        flag_df = rule_df[rule_df["rule"] == rule_name][["customer_id", "article_id"]].drop_duplicates()
        flag_df[f"{rule_name}_flag"] = 1
        rule_features_df = rule_features_df.merge(flag_df, how="left", on=["customer_id", "article_id"])
        rule_features_df[f"{rule_name}_flag"] = rule_features_df[f"{rule_name}_flag"].fillna(0).astype("int8")

    # Union all candidate sources and drop duplicates
    cand = cudf.concat([recent_customer_cand, cust_last_week_cand, cust_last_week_pair_cand, age_bucket_can])\
              .drop_duplicates()\
              .sort_values(["customer_id", "article_id"])\
              .reset_index(drop=True)
    del recent_customer_cand, cust_last_week_cand, cust_last_week_pair_cand, age_bucket_can

    cand = h_can.filter_candidates(cand, t, **kwargs)

    # ----- Build additional behaviour / price / recency / lag features -----
    h_fe.create_cust_hier_features(features_df, a, kwargs["hier_cols"], features_db)
    h_fe.create_cust_hier_decay_features(features_df, a, kwargs["hier_cols"], features_db,
                                         decay_gamma=kwargs.get("hier_decay_gamma", 0.3))
    # Rename the decay column so each hierarchy gets its own "last_seen_<hier>_weeks_ago" name
    for k, v in list(features_db.items()):
        if k.endswith("_decay_features"):
            # Assumes these keys look like "cust_<hier>_decay_features" - TODO confirm in h_fe.
            hier = k[len("cust_"):-len("_decay_features")]
            cols, df = v
            df = df.rename(columns={"last_seen_category_weeks_ago": f"last_seen_{hier}_weeks_ago"})
            # NOTE(review): `cols` is stored unchanged; if it lists column names it may
            # still refer to the old name - verify against add_features_to_candidates.
            features_db[k] = (cols, df)

    h_fe.create_price_features(features_df, features_db)
    h_fe.create_cust_features(c, features_db)
    h_fe.create_article_cust_features(features_df, c, features_db)
    h_fe.create_lag_features(features_df, a, kwargs["lag_days"], features_db)
    h_fe.create_rebuy_features(features_df, features_db)
    h_fe.create_cust_t_features(features_df, a, features_db)
    h_fe.create_art_t_features(features_df, a, features_db)
    del features_df

    # Restrict the candidates again when running on a customer subset
    if customers is not None:
        cand = cand[cand["customer_id"].isin(customers)]

    # Report candidate recall/precision during CV to track the model's ceiling
    if kwargs["cv"]:
        ground_truth_candidates = label_df[["customer_id", "article_id"]].drop_duplicates()
        h_cv.report_candidates(cand, ground_truth_candidates)
        del ground_truth_candidates

    # Attach the features to the candidates
    cand_with_f_df = h_can.add_features_to_candidates(cand, features_db, c, a)
    cand_with_f_df = cand_with_f_df.merge(rule_features_df, how="left", on=["customer_id", "article_id"])

    # Add article columns manually (shelve could not be used for these)
    for article_col in kwargs["article_columns"]:
        art_col_map = a.set_index("article_id")[article_col]
        cand_with_f_df[article_col] = cand_with_f_df["article_id"].map(art_col_map)

    # Encode non-numeric columns with the per-category mean of best_rule_score (when that
    # column exists) instead of arbitrary ordinal codes.
    # NOTE(review): the dtype allow-list below omits unsigned ints, so a uint column would be
    # encoded too; the means are also recomputed from each call's own candidates, so the same
    # category can map to different values across weeks/batches - confirm this is acceptable.
    target_col = "best_rule_score" if "best_rule_score" in cand_with_f_df.columns else None
    for col in cand_with_f_df.columns:
        if col in ["customer_id", "article_id"]:
            continue
        if str(cand_with_f_df[col].dtype) not in ["int8","int16","int32","int64","float16","float32","float64","bool"]:
            if target_col:
                te = cand_with_f_df.groupby(col)[target_col].mean()
                cand_with_f_df[col] = cand_with_f_df[col].map(te).fillna(0).astype("float32")
            else:
                cand_with_f_df[col] = cand_with_f_df[col].astype("category").cat.codes.astype("float32")

    # Keep only the pre-selected feature subset, if one was given
    if kwargs["selected_features"] is not None:
        cand_with_f_df = cand_with_f_df[["customer_id", "article_id"] + kwargs["selected_features"]]

    # The joins above must not have multiplied or dropped candidate rows.
    assert len(cand) == len(cand_with_f_df), "seem to have duplicates in the feature dfs"
    del cand

    return cand_with_f_df, label_df


In [15]:
def calculate_model_score(ids_df, preds, truth_df):
    """Score model outputs against the ground truth.

    Args:
        ids_df: candidate identifier frame aligned with ``preds``.
        preds: model scores, one per row of ``ids_df``.
        truth_df: frame from which ``h_cv.ground_truth`` derives the true labels.

    Returns:
        The value of ``h_cv.comp_average_precision`` rounded to 5 decimals.
    """
    predicted = h_modeling.create_predictions(ids_df, preds)
    # Ground truth as a series of predictions indexed by customer.
    actual = h_cv.ground_truth(truth_df).set_index("customer_id")["prediction"]
    return round(h_cv.comp_average_precision(actual, predicted), 5)

## Parameters - one place for all!

In [16]:
# Configuration for the CV runs (multiple weeks, more candidates/estimators)
cv_params = {
    "cv": True,                     # enables the candidate recall/precision report
    "feature_periods": 105,         # passed to h_cv.feature_label_split; presumably the number of history weeks used for features
    "label_week": 104,              # default label week; presumably overridden per run by cv_weeks
    "index_to_id_dict_path": index_to_id_dict_path,
    "pairs_file_version": "_v3_5_ex",
    "num_recent_candidates": 80,    # forwarded as num_candidates to create_popular_article_cand
    "num_recent_articles": 20,      # forwarded as num_articles (popular) and articles (age bucket)
    "hier_col": "department_no",
    "ca_num_weeks": 3,
    "clw_num_weeks": 12,
    "clw_num_pair_weeks": 2,
    "pa_num_weeks": 2,              # week window passed to create_popular_article_cand
    "num_age_buckets": 4,
    "filter_recent_art_weeks": 1,
    "filter_num_articles": None,
    "lag_days": [1, 3, 7, 14, 28],
    "article_columns": ["index_code"],
    "hier_cols": [
        "department_no", "section_no", "index_group_no", "index_code",
        "product_type_no", "product_group_name"
    ],
    "hier_decay_gamma": 0.3,
    "selected_features": None,
    "lgbm_params": {                # model capacity used for CV
        "n_estimators": 400,
        "learning_rate": 0.05,
        "num_leaves": 64,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
    },
    "log_evaluation": 10,
    "early_stopping": 30,
    "eval_at": 12,
    "save_model": True,
    "num_concats": 5,               # presumably the number of training weeks concatenated (the CV log prepares 5)
}

# Configuration for submission train/predict (like CV but with more estimators; predicts with an ensemble of 2 models)
sub_params = {
    "cv": False,
    "feature_periods": 105,
    "label_week": 105,
    "index_to_id_dict_path": index_to_id_dict_path,
    "pairs_file_version": "_v3_5_ex",
    "num_recent_candidates": 80,
    "num_recent_articles": 20,
    "hier_col": "department_no",
    "ca_num_weeks": 3,
    "clw_num_weeks": 12,
    "clw_num_pair_weeks": 2,
    "pa_num_weeks": 2,
    "num_age_buckets": 4,
    "filter_recent_art_weeks": 1,
    "filter_num_articles": None,
    "lag_days": [1, 3, 7, 14, 28],
    "article_columns": ["index_code"],
    "hier_cols": [
        "department_no", "section_no", "index_group_no", "index_code",
        "product_type_no", "product_group_name"
    ],
    "hier_decay_gamma": 0.3,
    "selected_features": None,
    "lgbm_params": {                # more trees than the CV configuration
        "n_estimators": 500,
        "learning_rate": 0.05,
        "num_leaves": 64,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
    },
    "log_evaluation": 10,
    "eval_at": 12,
    "prediction_models": ["model_104", "model_105"],  # models whose scores are averaged at prediction time
    "save_model": True,
    "num_concats": 5,
}


In [17]:
# Callables handed to the h_modeling runners: the candidate/feature builder and the scorer.
cand_features_func = create_candidates_with_features_df
scoring_func = calculate_model_score

In [18]:
%%time
# Run cross-validation for weeks 102-104 (several weeks give a more stable estimate than a single week)
cv_weeks = [102, 103, 104]
results = h_modeling.run_all_cvs(
    t, c, a, cand_features_func, scoring_func,
    cv_weeks=cv_weeks, **cv_params
)

preparing training modeling dfs for 101...
candidates recall: 9.63% (24,570/255,172)
candidates precision: 0.58% (24,570/4,270,526)
preparing training modeling dfs for 100...
candidates recall: 9.63% (22,224/230,825)
candidates precision: 0.55% (22,224/4,039,298)
preparing training modeling dfs for 99...
candidates recall: 9.35% (22,171/237,160)
candidates precision: 0.53% (22,171/4,209,718)
preparing training modeling dfs for 98...
candidates recall: 8.48% (22,010/259,512)
candidates precision: 0.50% (22,010/4,395,378)
preparing training modeling dfs for 97...
candidates recall: 7.82% (22,502/287,700)
candidates precision: 0.48% (22,502/4,640,200)
concatenating all weeks together
preparing evaluation modeling dfs...
candidates recall: 10.32% (24,561/238,074)
candidates precision: 0.61% (24,561/4,033,310)
[LightGBM] [Info] Total groups: 82212, total data: 5960226
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.849291 seconds.
You can set `force_r

  model = ForestInference.load(model_path, model_type="lightgbm", output_class=False)


Train AUC 0.7845
Train score:  0.06832
Eval AUC 0.7467
Eval score: 0.03406


age_bucket_flag         0
last_weeks_flag         0
pairs_flag              0
recent_flag             1
ca_count               11
                     ... 
rebuy_count_ratio     398
art_sales_channel     406
trend_ratio_7_28      429
cust_sales_channel    461
last_1_days_count     516
Length: 71, dtype: int32
Finished cv of week 102 in 0:07:51.128386. Score: 0.03406

preparing training modeling dfs for 102...
candidates recall: 10.32% (24,561/238,074)
candidates precision: 0.61% (24,561/4,033,310)
preparing training modeling dfs for 101...
candidates recall: 9.63% (24,570/255,172)
candidates precision: 0.58% (24,570/4,270,526)
preparing training modeling dfs for 100...
candidates recall: 9.63% (22,224/230,825)
candidates precision: 0.55% (22,224/4,039,298)
preparing training modeling dfs for 99...
candidates recall: 9.35% (22,171/237,160)
candidates precision: 0.53% (22,171/4,209,718)
preparing training modeli

  model = ForestInference.load(model_path, model_type="lightgbm", output_class=False)


Train AUC 0.7824
Train score:  0.06976
Eval AUC 0.7575
Eval score: 0.03419


last_weeks_flag             0
pairs_flag                  0
recent_flag                 0
age_bucket_flag             1
pair_article_max_price     15
                         ... 
cust_sales_channel        370
trend_ratio_7_28          374
rebuy_count_ratio         377
newness_days              390
last_1_days_count         491
Length: 71, dtype: int32
Finished cv of week 103 in 0:07:00.376217. Score: 0.03419

preparing training modeling dfs for 103...
candidates recall: 9.97% (22,729/227,910)
candidates precision: 0.59% (22,729/3,859,190)
preparing training modeling dfs for 102...
candidates recall: 10.32% (24,561/238,074)
candidates precision: 0.61% (24,561/4,033,310)
preparing training modeling dfs for 101...
candidates recall: 9.63% (24,570/255,172)
candidates precision: 0.58% (24,570/4,270,526)
preparing training modeling dfs for 100...
candidates recall: 9.63% (22,224/230,825)
candidates precision: 0.55%

  model = ForestInference.load(model_path, model_type="lightgbm", output_class=False)


Train AUC 0.7844
Train score:  0.07276
Eval AUC 0.7459
Eval score: 0.03548


last_weeks_flag             0
pairs_flag                  0
recent_flag                 0
age_bucket_flag             1
pair_article_max_price     15
                         ... 
rebuy_count_ratio         362
art_sales_channel         370
trend_ratio_7_28          378
newness_days              438
last_1_days_count         451
Length: 71, dtype: int32
Finished cv of week 104 in 0:07:09.521138. Score: 0.03548

Finished all 3 cvs in 0:22:01.025739. Average cv score: 0.03458
CPU times: user 38min 20s, sys: 59.1 s, total: 39min 19s
Wall time: 22min 1s


In [19]:
from cuml.fil import ForestInference as _FI

def _load_compat(*args, output_class=None, is_classifier=None, **kwargs):
    """Call the original ``ForestInference.load``, accepting the legacy ``output_class``.

    Older helper code passes ``output_class``; it is mapped onto
    ``is_classifier`` unless the caller gave ``is_classifier`` explicitly.
    When neither is supplied, nothing is forwarded, so the library's own
    default for ``is_classifier`` applies.

    Args:
        *args: positional arguments forwarded unchanged to the original loader.
        output_class: legacy flag; used as ``is_classifier`` when that is None.
        is_classifier: current flag; takes precedence over ``output_class``.
        **kwargs: remaining keyword arguments forwarded unchanged.

    Returns:
        Whatever the original ``ForestInference.load`` returns.
    """
    if is_classifier is None and output_class is not None:
        is_classifier = output_class
    if is_classifier is not None:
        kwargs["is_classifier"] = is_classifier
    return _real_load(*args, **kwargs)


# Install the wrapper only once. Without this guard, re-running the cell would
# rebind _real_load to the already-patched wrapper, which would then call
# itself until RecursionError.
if not getattr(_FI.load, "_output_class_compat", False):
    _real_load = _FI.load
    _load_compat._output_class_compat = True
    _FI.load = staticmethod(_load_compat)

In [20]:
import warnings
# Silence the FutureWarning about the deprecated `output_class` parameter,
# which the helper code still passes when loading models.
warnings.filterwarnings(
    "ignore",
    message=r".*Parameter `output_class` was deprecated.*",
    category=FutureWarning,
)

In [23]:
import math
import handmhelpers.modeling as h_modeling

def full_sub_predict_run_small_batches(t, c, a, cand_features_func, batch_splits=8, **kwargs):
    """Predict for all customers in small slices, averaging the scores of several models.

    The customers in ``c`` are cut into ``batch_splits`` contiguous slices so
    that candidate/feature generation and prediction never handle all
    customers at once (to avoid running out of memory). For every slice, each
    model listed in ``kwargs["prediction_models"]`` is loaded, used, and
    released again, so only one model is held at a time.

    Args:
        t: transactions frame, forwarded to ``h_modeling.prepare_prediction_dfs``.
        c: customers frame; must support ``len``, slicing and a ``customer_id`` column.
        a: articles frame, forwarded to ``h_modeling.prepare_prediction_dfs``.
        cand_features_func: candidate/feature builder, forwarded as well.
        batch_splits: number of customer slices (>= 1). Slices that would be
            empty (more splits than customers) are skipped.
        **kwargs: run configuration; ``prediction_models`` (one path or a
            sequence of paths) is required, and everything is forwarded to
            ``h_modeling.prepare_prediction_dfs``.

    Returns:
        The concatenation of the per-slice prediction frames.

    Raises:
        ValueError: if ``prediction_models`` is missing or empty, if
            ``batch_splits`` is below 1, or if ``c`` holds no customers.
    """
    # Validate up front: otherwise a missing model list only surfaces as an
    # opaque TypeError after the first (expensive) feature batch was built.
    model_paths = kwargs.get("prediction_models")
    if isinstance(model_paths, str):
        model_paths = [model_paths]
    if not model_paths:
        raise ValueError("kwargs['prediction_models'] must list at least one saved model")
    if batch_splits < 1:
        raise ValueError(f"batch_splits must be >= 1, got {batch_splits}")
    model_nums = len(model_paths)

    n = len(c)
    batch_preds = []
    for idx in range(batch_splits):
        start = idx * n // batch_splits
        end = (idx + 1) * n // batch_splits
        if start == end:
            # More splits than customers: nothing to predict for this slice.
            continue
        customer_batch = c[start:end]["customer_id"].to_pandas().to_list()

        print(f"generating candidates/features for batch #{idx+1} of {batch_splits}")
        sub_ids_df, sub_X = h_modeling.prepare_prediction_dfs(
            t, c, a, cand_features_func, customer_batch=customer_batch, **kwargs
        )
        print(f"candidate/features shape of batch: ({sub_X.shape[0]:,}, {sub_X.shape[1]})")

        # Ensemble: every model contributes its score divided by the model count.
        sub_pred = None
        for model_path in model_paths:
            model = h_modeling.ForestInference.load(model_path, model_type="lightgbm", output_class=False)
            weighted = h_modeling.pred_in_batches(model, sub_X) / model_nums
            del model  # release the model before loading the next one
            if sub_pred is None:
                sub_pred = weighted
            else:
                sub_pred += weighted

        batch_preds.append(h_modeling.create_predictions(sub_ids_df, sub_pred))
        del sub_ids_df, sub_X, sub_pred

    if not batch_preds:
        raise ValueError("no customers to predict for: `c` is empty")
    return cudf.concat(batch_preds)

# Override the helper module's predict function with the small-batch version
h_modeling.full_sub_predict_run = full_sub_predict_run_small_batches

In [25]:
%%time
gc.collect()
# Train on the full submission data; models are saved according to sub_params
h_modeling.full_sub_train_run(t, c, a, cand_features_func, scoring_func, **sub_params)
# Predict in small batches (batch_splits=8) to avoid OOM, using the model_104/model_105 ensemble
predictions = h_modeling.full_sub_predict_run(
    t, c, a, cand_features_func, batch_splits=8, **sub_params
)

preparing training modeling dfs for 104...
preparing training modeling dfs for 103...
preparing training modeling dfs for 102...
preparing training modeling dfs for 101...
preparing training modeling dfs for 100...
concatenating all weeks together
[LightGBM] [Info] Total groups: 86093, total data: 5761241
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.873492 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12668
[LightGBM] [Info] Number of data points in the train set: 5761241, number of used features: 71
[10]	train's map@12: 0.238545	train's ndcg@12: 0.319674
[20]	train's map@12: 0.242711	train's ndcg@12: 0.324727
[30]	train's map@12: 0.247015	train's ndcg@12: 0.329551
[40]	train's map@12: 0.250503	train's ndcg@12: 0.333571
[50]	train's map@12: 0.253272	train's ndcg@12: 0.336888
[60]	train's map@12: 0.255654	train's ndcg@12: 0.339579
[

In [26]:
# Build the submission frame from the predictions (the helper is given the
# id-mapping path created at load time) and write it to CSV.
sub = h_sub.create_sub(c["customer_id"], predictions, index_to_id_dict_path)
sub.to_csv('dev_submission.csv', index=False)

# Sanity check: preview the first rows and the overall shape.
display(sub.head())
print(sub.shape)

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0779781015 0568601044 0858856005 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0863583001 0918522001 0448509014 0915529003 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0794321008 0805000001 0918522001 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0861803009 0852584001 0730683050 0928206001 09...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0730683050 0918522001 0791587021 0928206001 09...


(1371980, 2)
