# 52nd Place Solution Notebook

This notebook is a cleaned version of my final submission.  
See [this post](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/324076/) for some details about my solution.

The notebook has minimal code in it - most of the code is imported from my [handmhelpers dataset](https://www.kaggle.com/datasets/jacob34/handmhelpers), which is synced to [this GitHub repo](https://github.com/JacobCP/kaggle-handm-helpers).  
See [this post](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/324078) for some details about my code development.  

**Please note:**  
I plan to keep updating the GitHub repo as I try to recreate some of the strategies shared by the winning teams.  
Some of those changes may break the code this notebook relies on.  
To keep this notebook functional, I will no longer update the dataset to reflect changes made to the repo - it will remain at commit 86c412e902a7692b24e15791322a8dfeb5a761eb.

In [None]:
%%time
import os
import sys
import copy
from datetime import datetime
import gc
import pickle as pkl
import shelve

import pandas as pd
import numpy as np
import cudf
    
sys.path.append("../input/")
from handmhelpers import io as h_io, sub as h_sub, cv as h_cv, fe as h_fe
from handmhelpers import modeling as h_modeling, candidates as h_can, pairs as h_pairs

## Load and convert data

In [None]:
from datetime import timedelta
import cudf
import numpy as np

def patched_day_week_numbers(dates: cudf.Series):
    """Return the week number of every entry in ``dates`` as an int8 series.

    Days are counted from the earliest date (which is day 1). The count is
    then shifted so that the latest date falls on the last day of a full
    7-day week; any leftover days at the very start end up in week 0.
    """
    parsed = cudf.to_datetime(dates)
    distinct_dates = cudf.Series(parsed.unique())

    # 1-based day offset of each distinct date from the earliest date.
    day_offsets = (distinct_dates - distinct_dates.min() + timedelta(1)).dt.days

    # Push the incomplete week to the front so the last date closes a week.
    day_offsets -= day_offsets.max() % 7

    # Ceiling division by 7 with plain integer arithmetic (no applymap).
    week_of_date = cudf.DataFrame(
        {"day_weeks": (day_offsets + 6) // 7, "unique_dates": distinct_dates}
    ).set_index("unique_dates")["day_weeks"]

    # Broadcast the per-date week number back onto every input row.
    return parsed.map(week_of_date).astype("int8")

# Monkey-patch the helper module: from here on, `h_fe.day_week_numbers`
# resolves to `patched_day_week_numbers` defined earlier in this cell.
import handmhelpers.fe as h_fe
h_fe.day_week_numbers = patched_day_week_numbers

In [None]:
%%time

# Load the three competition tables; the rest of the notebook refers to them
# as c (customers.csv), t (transactions_train.csv) and a (articles.csv).
c, t, a = h_io.load_data(files=['customers.csv', 'transactions_train.csv', 'articles.csv'])        

# The returned path is stored in the parameter dicts and passed to
# h_sub.create_sub when the submission is written.
# NOTE(review): presumably this also rewrites customer_id in c and t to a
# compact index - confirm in handmhelpers.fe.
index_to_id_dict_path = h_fe.reduce_customer_id_memory(c, [t])
# week_number must be derived first: the next line overwrites the t_dat
# column it is computed from.
t["week_number"] = h_fe.day_week_numbers(t["t_dat"])
t["t_dat"] = h_fe.day_numbers(t["t_dat"])

## Get item pairs

In [None]:
%%time

# Number of pairs requested from h_pairs.create_pairs for each item.
pairs_per_item = 5

# week number -> pairs built by h_pairs.create_pairs for that week.
# create_candidates_with_features_df looks entries up by `label_week - 1`.
week_number_pairs = {}
for week_number in range(96, 105):  # weeks 96..104 inclusive
    print(f"Creating pairs for week number {week_number}")
    pairs_for_week = h_pairs.create_pairs(
        t, week_number, pairs_per_item, verbose=False
    )
    week_number_pairs[week_number] = pairs_for_week

## Main retrieval/features function!

In [None]:
def _build_rule_features(rule_sources):
    """Summarise which retrieval rules proposed each (customer, article) pair.

    Parameters
    ----------
    rule_sources : list of tuple
        One ``(cand_df, feature_tuple, score_col, rule_name)`` entry per
        retrieval rule. ``cand_df`` holds the rule's candidates,
        ``feature_tuple[1]`` is the feature frame that carries ``score_col``
        (it is ``reset_index()``-ed before use), and ``rule_name`` labels the
        rule.

    Returns
    -------
    cudf.DataFrame
        One row per (customer_id, article_id) with ``n_sources`` (number of
        distinct rules), ``best_rule_score`` (max score, -1 where a rule had
        no score), ``best_rank_within_rule`` (min 0-based rank) and one int8
        ``<rule_name>_flag`` column for EVERY rule in ``rule_sources``.
    """
    key_cols = ["customer_id", "article_id"]

    parts = []
    for cand_df, feature_tuple, score_col, rule_name in rule_sources:
        score_df = feature_tuple[1].reset_index()[key_cols + [score_col]]
        part = cand_df.merge(score_df, on=key_cols, how="left")
        part = part.rename(columns={score_col: "rule_score"})
        # candidates without a score for this rule get the sentinel -1
        part["rule_score"] = part["rule_score"].fillna(-1)
        part["rule"] = rule_name
        parts.append(part[key_cols + ["rule", "rule_score"]])

    rule_df = cudf.concat(parts)
    del parts  # the per-rule frames are no longer needed once concatenated

    # rank each customer's candidates inside a rule, best score first
    rule_df = rule_df.sort_values(
        ["rule", "customer_id", "rule_score"], ascending=[True, True, False]
    )
    rule_df["rank_within_rule"] = rule_df.groupby(["rule", "customer_id"]).cumcount()

    aggregations = {"rule": "nunique", "rule_score": "max", "rank_within_rule": "min"}

    # One indicator per configured rule, in a fixed (sorted) order, so the set
    # and order of flag columns never depend on which rules happened to yield
    # candidates for this particular call. Taking the max of the indicator in
    # the same groupby replaces one merge per rule.
    flag_cols = []
    for rule_name in sorted(source[3] for source in rule_sources):
        flag_col = f"{rule_name}_flag"
        rule_df[flag_col] = (rule_df["rule"] == rule_name).astype("int8")
        aggregations[flag_col] = "max"
        flag_cols.append(flag_col)

    rule_features_df = rule_df.groupby(key_cols).agg(aggregations).reset_index()
    rule_features_df = rule_features_df.rename(
        columns={
            "rule": "n_sources",
            "rule_score": "best_rule_score",
            "rank_within_rule": "best_rank_within_rule",
        }
    )
    for flag_col in flag_cols:
        rule_features_df[flag_col] = rule_features_df[flag_col].astype("int8")

    return rule_features_df


def _encode_non_numeric_columns(df, skip=("customer_id", "article_id")):
    """Replace every non-numeric column of ``df`` (in place) by float32 category codes.

    Columns listed in ``skip`` are left untouched. Unsigned integer dtypes
    count as numeric and are kept as they are.

    NOTE(review): the codes come from the values present in ``df`` at call
    time, so the same value may receive a different code in another call or
    batch - confirm this is acceptable for the downstream model.
    """
    numeric_dtypes = {
        "int8", "int16", "int32", "int64",
        "uint8", "uint16", "uint32", "uint64",
        "float16", "float32", "float64",
        "bool",
    }
    for col in df.columns:
        if col in skip:
            continue
        if str(df[col].dtype) not in numeric_dtypes:
            df[col] = df[col].astype("category").cat.codes.astype("float32")
    return df


def create_candidates_with_features_df(t, c, a, customer_batch=None, **kwargs):
    """Build the candidate (customer, article) pairs for one week and attach features.

    Parameters
    ----------
    t, c, a :
        Transactions, customers and articles frames.
    customer_batch : optional
        Customers to restrict to; only used when the label split is empty.
        When labels exist, the customers present in the labels are used.
    **kwargs :
        Required keys read here: ``label_week``, ``feature_periods``,
        ``ca_num_weeks``, ``clw_num_weeks``, ``clw_num_pair_weeks``,
        ``pa_num_weeks``, ``hier_col``, ``num_recent_candidates``,
        ``num_recent_articles``, ``num_age_buckets``, ``hier_cols``,
        ``lag_days``, ``cv``, ``article_columns``, ``selected_features``.
        Optional: ``hier_decay_gamma`` (default 0.3). The whole dict is also
        forwarded to ``h_can.filter_candidates``.

    Returns
    -------
    tuple
        ``(cand_with_f_df, label_df)``: the candidates with their feature
        columns, and the label frame from ``h_cv.feature_label_split``.

    Raises
    ------
    KeyError
        If a required key is missing from ``kwargs`` or the module-level
        ``week_number_pairs`` has no entry for ``label_week - 1``.
    AssertionError
        If adding features changed the number of candidate rows.
    """
    # splitting cv
    features_df, label_df = h_cv.feature_label_split(
        t, kwargs["label_week"], kwargs["feature_periods"]
    )

    # converting relative day_number
    features_df["t_dat"] = h_fe.how_many_ago(features_df["t_dat"])
    features_df["week_number"] = h_fe.how_many_ago(features_df["week_number"])

    # pairs built for the week right before the label week
    article_pairs_df = week_number_pairs[kwargs["label_week"] - 1]

    # check if we can limit customers
    if len(label_df) > 0:
        customers = label_df["customer_id"].unique()
    elif customer_batch is not None:
        customers = customer_batch
    else:
        customers = None

    ############################################
    # creating candidates (and adding features)
    ###########################################

    # name -> (columns, feature frame); filled by the h_can / h_fe helpers
    features_db = {}

    # creating candidate (and saving features created)
    recent_customer_cand, features_db["customer_article"] = (
        h_can.create_recent_customer_candidates(
            features_df,
            kwargs["ca_num_weeks"],
            customers=customers,
        )
    )

    (
        cust_last_week_cand,
        cust_last_week_pair_cand,
        features_db["clw"],
        features_db["clw_pairs"],
    ) = h_can.create_last_customer_weeks_and_pairs(
        features_df,
        article_pairs_df,
        kwargs["clw_num_weeks"],
        kwargs["clw_num_pair_weeks"],
        customers=customers,
    )

    # only the features are kept here; the popular-article candidates
    # themselves are discarded
    _, features_db["popular_articles"] = h_can.create_popular_article_cand(
        features_df,
        c,
        a,
        kwargs["pa_num_weeks"],
        kwargs["hier_col"],
        num_candidates=kwargs["num_recent_candidates"],
        num_articles=kwargs["num_recent_articles"],
        customers=customers,
    )

    # the second return value (customer-level features) is not used
    (
        age_bucket_can,
        _,
        age_bucket_pair_features,
    ) = h_can.create_age_bucket_candidates(
        features_df,
        c,
        kwargs["num_age_buckets"],
        articles=kwargs["num_recent_articles"],
        customers=customers,
    )
    features_db["age_bucket"] = age_bucket_pair_features

    # Source/rule features. They are merged onto the candidates directly
    # further down instead of being registered in features_db. The list is
    # built inline so no extra reference keeps the candidate frames alive
    # after the `del` below.
    rule_features_df = _build_rule_features(
        [
            (
                recent_customer_cand,
                features_db["customer_article"],
                "ca_purchase_count",
                "recent",
            ),
            (cust_last_week_cand, features_db["clw"], "ca_count", "last_weeks"),
            (
                cust_last_week_pair_cand,
                features_db["clw_pairs"],
                "pair_lift",
                "pairs",
            ),
            (
                age_bucket_can,
                features_db["age_bucket"],
                "article_bucket_count",
                "age_bucket",
            ),
        ]
    )

    cand = [
        recent_customer_cand,
        cust_last_week_cand,
        cust_last_week_pair_cand,
        age_bucket_can,
    ]
    cand = cudf.concat(cand).drop_duplicates()
    cand = cand.sort_values(["customer_id", "article_id"]).reset_index(drop=True)

    del recent_customer_cand, cust_last_week_cand, cust_last_week_pair_cand, age_bucket_can

    cand = h_can.filter_candidates(cand, t, **kwargs)

    # creating other features
    h_fe.create_cust_hier_features(features_df, a, kwargs["hier_cols"], features_db)
    h_fe.create_cust_hier_decay_features(
        features_df,
        a,
        kwargs["hier_cols"],
        features_db,
        decay_gamma=kwargs.get("hier_decay_gamma", 0.3),
    )
    # After create_cust_hier_decay_features: every "*_decay_features" entry
    # uses the same generic column name, so give each one a name that
    # includes its hierarchy level.
    for k, v in list(features_db.items()):
        if k.endswith("_decay_features"):
            # key layout assumed to be "cust_<hier>_decay_features"
            hier = k[len("cust_"):-len("_decay_features")]
            cols, df = v
            df = df.rename(
                columns={
                    "last_seen_category_weeks_ago": f"last_seen_{hier}_weeks_ago"
                }
            )
            features_db[k] = (cols, df)

    h_fe.create_price_features(features_df, features_db)
    h_fe.create_cust_features(c, features_db)
    h_fe.create_article_cust_features(features_df, c, features_db)
    h_fe.create_lag_features(features_df, a, kwargs["lag_days"], features_db)
    h_fe.create_rebuy_features(features_df, features_db)
    h_fe.create_cust_t_features(features_df, a, features_db)
    h_fe.create_art_t_features(features_df, features_db)

    del features_df

    # another filter at the end, for the ones that didn't get filtered earlier
    if customers is not None:
        cand = cand[cand["customer_id"].isin(customers)]

    # report on recall/precision of candidates
    if kwargs["cv"]:
        ground_truth_candidates = label_df[["customer_id", "article_id"]].drop_duplicates()
        h_cv.report_candidates(cand, ground_truth_candidates)
        del ground_truth_candidates

    # adding features to candidates
    cand_with_f_df = h_can.add_features_to_candidates(
        cand, features_db, c, a
    )

    # rule_features_df has one row per pair, so this left join cannot add rows
    cand_with_f_df = cand_with_f_df.merge(
        rule_features_df, how="left", on=["customer_id", "article_id"]
    )
    del rule_features_df

    # manually adding article features (couldn't use shelve for some reason)
    for article_col in kwargs["article_columns"]:
        art_col_map = a.set_index("article_id")[article_col]
        cand_with_f_df[article_col] = cand_with_f_df["article_id"].map(art_col_map)

    # force every non-numeric column to category codes so the model only
    # receives numeric input
    cand_with_f_df = _encode_non_numeric_columns(cand_with_f_df)

    # limiting features
    if kwargs["selected_features"] is not None:
        cand_with_f_df = cand_with_f_df[
            ["customer_id", "article_id"] + kwargs["selected_features"]
        ]

    assert len(cand) == len(cand_with_f_df), "seem to have duplicates in the feature dfs"
    del cand

    return cand_with_f_df, label_df


In [None]:
def calculate_model_score(ids_df, preds, truth_df):
    """Score model output against the ground truth, rounded to 5 decimals.

    ``ids_df`` and ``preds`` are turned into predictions by
    ``h_modeling.create_predictions``; ``truth_df`` is turned into the
    per-customer ground truth by ``h_cv.ground_truth``. Both are then passed
    to ``h_cv.comp_average_precision``.
    """
    predictions = h_modeling.create_predictions(ids_df, preds)

    truth_by_customer = h_cv.ground_truth(truth_df).set_index("customer_id")
    true_labels = truth_by_customer["prediction"]

    raw_score = h_cv.comp_average_precision(true_labels, predictions)
    return round(raw_score, 5)

## Parameters - one place for all!

In [None]:
# Settings for the cross-validation run (passed as **kwargs to
# h_modeling.run_all_cvs, which in turn receives
# create_candidates_with_features_df as its candidate/feature function).
# Keys that create_candidates_with_features_df does not read are presumably
# consumed inside handmhelpers - confirm there.
cv_params = {
    # when truthy, candidate recall/precision is reported against the labels
    "cv": True,
    # label_week / feature_periods are handed to h_cv.feature_label_split;
    # item pairs are looked up for label_week - 1
    "feature_periods": 105,
    "label_week": 104,
    "index_to_id_dict_path": index_to_id_dict_path,
    "pairs_file_version": "_v3_5_ex",
    # popular-article features (h_can.create_popular_article_cand);
    # num_recent_articles is also passed to create_age_bucket_candidates
    "num_recent_candidates": 36,
    "num_recent_articles": 12,
    "hier_col": "department_no",
    # week windows for the recent-customer and last-customer-weeks candidates
    "ca_num_weeks": 3,
    "clw_num_weeks": 12,
    "clw_num_pair_weeks": 2,
    "pa_num_weeks": 1,
    "num_age_buckets": 4,
    "filter_recent_art_weeks": 1,
    "filter_num_articles": None,
    # passed to h_fe.create_lag_features
    "lag_days": [1, 3, 7, 14, 28],
    # article columns mapped onto the candidates by article_id
    "article_columns": ["index_code"],
    # hierarchy columns for the customer-hierarchy (and decay) features
    "hier_cols": [
        "department_no", "section_no", "index_group_no", "index_code",
        "product_type_no", "product_group_name"
    ],
    "hier_decay_gamma": 0.3,
    # None keeps every feature column; otherwise a list of columns to keep
    "selected_features": None,
    "lgbm_params": {"n_estimators": 200, "num_leaves": 20},
    "log_evaluation": 10,
    "early_stopping": 20,
    "eval_at": 12,
    "save_model": True,
    "num_concats": 5,
}
# Settings for the submission run (passed as **kwargs to
# h_modeling.full_sub_train_run and h_modeling.full_sub_predict_run).
# Compared with cv_params: "cv" is False, "label_week" is 105,
# "num_recent_candidates" is 60, "n_estimators" is 150, there is no
# "early_stopping" entry, and "prediction_models" is added.
sub_params = {
    "cv": False,
    "feature_periods": 105,
    "label_week": 105,
    "index_to_id_dict_path": index_to_id_dict_path,
    "pairs_file_version": "_v3_5_ex",
    "num_recent_candidates": 60,
    "num_recent_articles": 12,
    "hier_col": "department_no",
    "ca_num_weeks": 3,
    "clw_num_weeks": 12,
    "clw_num_pair_weeks": 2,
    "pa_num_weeks": 1,
    "num_age_buckets": 4,
    "filter_recent_art_weeks": 1,
    "filter_num_articles": None,
    "lag_days": [1, 3, 7, 14, 28],
    "article_columns": ["index_code"],
    "hier_cols": [
        "department_no", "section_no", "index_group_no", "index_code",
        "product_type_no", "product_group_name"
    ],
    "hier_decay_gamma": 0.3,
    # None keeps every feature column; otherwise a list of columns to keep
    "selected_features": None,
    "lgbm_params": {
        "n_estimators": 150,
        "num_leaves": 20,    
    },
    "log_evaluation": 10,
    "eval_at": 12,
    # NOTE(review): presumably the names of the saved models used at
    # prediction time - confirm in handmhelpers.modeling
    "prediction_models": ["model_104", "model_105"],
    "save_model": True,
    "num_concats": 5,
}

In [None]:
# Callables handed to the h_modeling runners in the cells below: one builds
# candidates with features, the other scores predictions.
cand_features_func = create_candidates_with_features_df
scoring_func = calculate_model_score

In [None]:
%%time
# Run cross-validation for the listed label weeks using cv_params.
# week_number_pairs must contain an entry for each (cv week - 1).
cv_weeks = [104]
results = h_modeling.run_all_cvs(
    t, c, a, cand_features_func, scoring_func, 
    cv_weeks=cv_weeks, **cv_params
)

In [None]:
from cuml.fil import ForestInference as _FI

# Handle on the loader as it exists before patching; the shim delegates to it.
_real_load = _FI.load


def _load_compat(*args, output_class=None, is_classifier=None, **kwargs):
    """Call the original ``ForestInference.load``, accepting the legacy ``output_class``.

    The older helper code passes ``output_class``; it is mapped onto
    ``is_classifier``. An explicit ``is_classifier`` takes precedence. When
    neither is given, ``is_classifier`` is not forwarded at all, so the
    wrapped loader keeps its own default instead of receiving ``None``.

    All other positional and keyword arguments are passed through unchanged,
    and the wrapped loader's return value is returned as is.
    """
    if is_classifier is None:
        is_classifier = output_class
    if is_classifier is not None:
        kwargs["is_classifier"] = is_classifier
    return _real_load(*args, **kwargs)


# staticmethod: the shim must not receive the class/instance as first argument
_FI.load = staticmethod(_load_compat)

In [None]:
import warnings
# Silence the FutureWarning whose message matches the `output_class`
# deprecation notice; other warnings are unaffected.
warnings.filterwarnings(
    "ignore",
    message=r".*Parameter `output_class` was deprecated.*",
    category=FutureWarning,
)

In [None]:
%%time
# Run a garbage-collection pass before the submission training starts.
gc.collect()
# Train with the submission settings, then predict with the same settings.
h_modeling.full_sub_train_run(t, c, a, cand_features_func, scoring_func, **sub_params)
predictions = h_modeling.full_sub_predict_run(
    t, c, a, cand_features_func, **sub_params
)

In [None]:
# Build the submission frame from the customer ids, the predictions and the
# index->id mapping path returned by reduce_customer_id_memory, then write it
# to CSV without the index column.
sub = h_sub.create_sub(c["customer_id"], predictions, index_to_id_dict_path)
sub.to_csv('dev_submission.csv', index=False)

# Quick sanity check of the result (display is provided by the notebook).
display(sub.head())
print(sub.shape)