In [24]:
import polars as pl


def load_submission_with_confidence(tag, alpha=0.7, k=5):
    """Load one submission and blend its confidence with a rank-based score.

    Reads ``./ensemble/submission_{tag}_with_confidence.parquet``, which must
    contain ``confidence`` and ``selected`` columns (``selected`` is presumably
    the model's 1-based rank inside each ``ranker_id`` — confirm against the
    files).

    Args:
        tag: Submission identifier inserted into the file name.
        alpha: Share of the original ``confidence`` in the blend; the
            remaining ``1 - alpha`` goes to the rank-based score.
        k: Offset of the reciprocal-rank term ``1 / (k + selected)``.

    Returns:
        The loaded frame with ``confidence`` replaced by the blend and the
        ``selected`` / ``rrf_score`` columns removed.
    """
    print(tag)

    # Dividing by 1 / (k + 1) rescales the term so that selected == 1 maps to 1.0.
    rrf_ceiling = 1 / (k + 1)
    reciprocal_rank = 1 / (k + pl.col("selected"))
    blended_confidence = alpha * pl.col("confidence") + (1 - alpha) * pl.col(
        "rrf_score"
    )

    return (
        pl.read_parquet(f"./ensemble/submission_{tag}_with_confidence.parquet")
        .with_columns((reciprocal_rank / rrf_ceiling).alias("rrf_score"))
        .with_columns(blended_confidence.alias("confidence"))
        .drop(["selected", "rrf_score"])
    )


def load_submission_with_score(tag, alpha=0.8, k=5):
    """Load raw model scores and turn them into a blended confidence.

    Reads ``./submission/score/{tag}.parquet``, min-max normalises
    ``pred_score`` inside each ``ranker_id`` group, and blends the result with
    a rank-based score derived from the ``selected`` column.

    Args:
        tag: Submission identifier inserted into the file name.
        alpha: Share of the normalised score in the blend; the remaining
            ``1 - alpha`` goes to the rank-based score.
        k: Offset of the reciprocal-rank term ``1 / (k + selected)``.

    Returns:
        A frame with exactly the columns ``Id``, ``ranker_id`` and
        ``confidence``.
    """
    print(tag)
    df = pl.read_parquet(f"./submission/score/{tag}.parquet")

    score = pl.col("pred_score")
    group_min = score.min().over("ranker_id")
    score_range = score.max().over("ranker_id") - group_min

    # A group with a single row (or all-equal scores) has a zero range, and
    # 0.0 / 0.0 would put NaN into the confidence. Such groups carry no
    # ordering information, so every row gets the bottom of the [0, 1] scale.
    df = df.with_columns(
        pl.when(score_range == 0)
        .then(0.0)
        .otherwise((score - group_min) / score_range)
        .alias("confidence")
    )

    # Dividing by 1 / (k + 1) rescales the term so that selected == 1 maps to 1.0.
    rrf_max = 1 / (k + 1)
    df = df.with_columns(((1 / (k + pl.col("selected"))) / rrf_max).alias("rrf_score"))

    df = df.with_columns(
        (alpha * pl.col("confidence") + (1 - alpha) * pl.col("rrf_score")).alias(
            "confidence"
        )
    )

    # The final select keeps only the keys and the blended confidence, so the
    # helper columns do not need a separate drop.
    return df.select(["Id", "ranker_id", "confidence"])


# Per-model frames; filled by the loading loop further down in this cell.
dfs = []

# Maps a submission tag to a two-element list `[flag, score]`.
# - `score` becomes part of the model's column name (`confidence_{score}`);
#   presumably the public leaderboard score of that run — confirm.
# - `flag` (0 or 1) is not used by the loading loop in this cell.
# Commented-out entries are runs currently excluded from the ensemble.
# NOTE(review): two active entries with the same score would map to the same
# column name.
timetag_score = {
    # "20250716132706": [0, 0.51381],
    # "20250718083308": [0, 0.51822],
    # "20250719002505": [0, 0.51859],
    # "20250720003111": [0, 0.50693],   # lgb
    # "20250721025740": [0, 0.51960],
    # "20250722050939": [1, 0.52070],
    "20250721083807": [1, 0.52244],
    "20250724032338": [1, 0.51345],  # lgb
    # "20250725040223": [1, 0.52309],
    # "20250725083055": [1, 0.52391],
    "20250727084025": [1, 0.52795],
    # "20250728094305": [1, 0.52492],
    # "20250731122023": [1, 0.52015],
    # "20250729084249": [1, 0.51822],
    "20250802074816": [1, 0.52603],
    "20250807032439": [1, 0.52538],
    "20250804001151": [1, 0.51244],  # lgb
    # "20250809093033": [1, 0.52612],
    # "20250812094947": [1, 0.52364],
    "20250815103211": [1, 0.52000],
    "dl_ranker": [1, 0.48755],
    # "combined": [1, 0.51244],
    # "0.49242": [1, 0.49242],
    # "0.49260": [1, 0.49260],
    # "0.49380": [1, 0.49380],
}

# Score of each model, keyed by the name of its renamed confidence column.
model_quality_dict = {}

for timetag, score_list in timetag_score.items():
    # Every tag goes through the same loader: the former
    # `startswith("2025")` test called it in both arms, so the branch was
    # dropped.
    df = load_submission_with_confidence(timetag)

    # Only the score (second element) is needed; the leading flag is ignored.
    score = score_list[1]
    confidence_col = f"confidence_{score}"

    # Give each model's confidence a unique name so the frames can be joined.
    df = df.with_columns((pl.col("confidence")).alias(confidence_col)).drop(
        "confidence"
    )
    model_quality_dict[confidence_col] = score
    dfs.append(df)

# Itinerary columns carried along with the keys next to the model confidences.
COLS_TO_COMPARE = [
    "legs0_departureAt",
    "legs0_arrivalAt",
    "legs1_departureAt",
    "legs1_arrivalAt",
    "legs0_segments0_flightNumber",
    "legs1_segments0_flightNumber",
]

test = pl.read_parquet("./data/test.parquet")

# Start from the test keys and attach one confidence column per model.
df_combined = test.select(["Id", "ranker_id"] + COLS_TO_COMPARE)
for model_df in dfs:
    df_combined = df_combined.join(model_df, on=["Id", "ranker_id"])

print(df_combined)

20250721083807
20250724032338
20250727084025
20250802074816
20250807032439
20250804001151
20250815103211
dl_ranker
shape: (6_897_776, 16)
┌──────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ Id       ┆ ranker_id ┆ legs0_dep ┆ legs0_arr ┆ … ┆ confidenc ┆ confidenc ┆ confidenc ┆ confidenc │
│ ---      ┆ ---       ┆ artureAt  ┆ ivalAt    ┆   ┆ e_0.52538 ┆ e_0.51244 ┆ e_0.52    ┆ e_0.48755 │
│ i64      ┆ str       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│          ┆           ┆ str       ┆ str       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64       │
╞══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 18144679 ┆ c9373e5f7 ┆ 2024-12-1 ┆ 2024-12-1 ┆ … ┆ 0.804672  ┆ 0.705253  ┆ 0.795468  ┆ 0.95544   │
│          ┆ 72e43d593 ┆ 9T06:50:0 ┆ 9T11:20:0 ┆   ┆           ┆           ┆           ┆           │
│          ┆ dd6ad2fa9 ┆ 0         ┆ 0         ┆   ┆  

## Group-wise Spearman

In [6]:
import pandas as pd
from scipy.stats import spearmanr
from joblib import Parallel, delayed
from tqdm import tqdm
import numpy as np

# Every column of df_combined whose name starts with "confidence".
confidence_cols = [col for col in df_combined.columns if col.startswith("confidence")]

# Keys plus confidences as a pandas frame for the per-group processing below.
df_pd = df_combined.select(["Id", "ranker_id"] + confidence_cols).to_pandas()
# Materialise the (ranker_id, sub-frame) pairs once; both Parallel loops below
# iterate over this list.
groups = list(df_pd.groupby("ranker_id"))


# 1. Compute the global weights
def compute_spearman_weights(df, confidence_cols):
    """Weight each confidence column by how little it agrees with the others.

    For every pair of columns the Spearman correlation is computed on the rows
    where both values are present. A column's "uniqueness" is one minus its
    mean correlation with the *other* columns, and the weights are the
    uniquenesses normalised to sum to 1.

    Args:
        df: pandas DataFrame containing the columns in ``confidence_cols``.
        confidence_cols: Names of the columns to weight.

    Returns:
        pandas Series of weights indexed by ``confidence_cols``. Uniform
        weights are returned when there is a single column, when all
        uniquenesses sum to zero, or when any of them is NaN. An empty Series
        is returned for an empty ``confidence_cols``.
    """
    n_cols = len(confidence_cols)
    if n_cols == 0:
        return pd.Series(dtype=float)

    uniform = pd.Series([1 / n_cols] * n_cols, index=confidence_cols)
    if n_cols == 1:
        # There is no "other" model to compare against.
        return uniform

    corr = np.zeros((n_cols, n_cols))
    for i in range(n_cols):
        col_i_vals = df[confidence_cols[i]]
        for j in range(i, n_cols):
            col_j_vals = df[confidence_cols[j]]
            # Keep only rows where both models have a value. The mask must be
            # taken on the raw columns: filling NaN first would make it
            # all-True and correlate the missing entries as zeros.
            mask = col_i_vals.notna() & col_j_vals.notna()
            if mask.sum() < 2:
                rho = 0.0
            else:
                rho, _ = spearmanr(col_i_vals[mask], col_j_vals[mask])
                if pd.isna(rho):
                    # Undefined correlation (e.g. a constant column): treat
                    # the pair as uncorrelated.
                    rho = 0.0
            corr[i, j] = rho
            corr[j, i] = rho

    # Mean over the off-diagonal entries. Subtract the actual diagonal rather
    # than a hard-coded 1, because the diagonal is 0 for a column whose
    # self-correlation was undefined.
    off_diagonal_sum = corr.sum(axis=1) - np.diag(corr)
    mean_corr = pd.Series(off_diagonal_sum / (n_cols - 1), index=confidence_cols)

    model_uniqueness = 1 - mean_corr.clip(-1, 1)
    if model_uniqueness.sum() == 0 or model_uniqueness.isna().any():
        return uniform
    return model_uniqueness / model_uniqueness.sum()


# Weights computed once over all rows of df_pd, without grouping.
print("计算全局 Spearman 权重...")
global_weights = compute_spearman_weights(df_pd, confidence_cols)
print(global_weights)


# 2. Compute the weights for each group
def compute_group_weight(gid, group):
    """Return ``(gid, weights)`` for one ``ranker_id`` group.

    Groups with more than 10 rows get Spearman-based weights from
    ``compute_spearman_weights``; smaller groups get equal weights for every
    column in the module-level ``confidence_cols``.

    Args:
        gid: Group identifier, passed through unchanged.
        group: pandas DataFrame holding the rows of that group.

    Returns:
        Tuple of ``gid`` and a pandas Series indexed by ``confidence_cols``.
    """
    if len(group) > 10:
        return gid, compute_spearman_weights(group, confidence_cols)

    n_models = len(confidence_cols)
    equal_share = pd.Series([1 / n_models] * n_models, index=confidence_cols)
    return gid, equal_share


print("计算每个 group Spearman 权重...")
# One task per (ranker_id, sub-frame) pair, run with n_jobs=32 on the loky backend.
results = Parallel(n_jobs=32, backend="loky")(
    delayed(compute_group_weight)(gid, group) for gid, group in tqdm(groups)
)
# compute_group_weight returns (gid, weights) pairs, so this is a gid -> weights lookup.
group_weights = dict(results)

# 3. Blend the weights and fuse the confidences inside each group
beta = 0.7  # share given to the global weights


def fuse_group_rows(rows, group_w, global_w, beta=0.7):
    """Add a ``fused_confidence`` column to a copy of ``rows``.

    The global and per-group weights are mixed as
    ``beta * global_w + (1 - beta) * group_w`` and renormalised to sum to 1.
    Missing confidences are replaced by 0 before the weighted sum is taken.

    Args:
        rows: pandas DataFrame with (some of) the weighted columns.
        group_w: pandas Series of per-group weights, indexed by column name.
        global_w: pandas Series of global weights, indexed by column name.
        beta: Share given to ``global_w``.

    Returns:
        A copy of ``rows`` with NaNs in the weighted columns set to 0 and the
        extra ``fused_confidence`` column; ``rows`` itself is not modified.
    """
    blended = beta * global_w + (1 - beta) * group_w
    blended = blended / blended.sum()

    # Only columns that actually exist in this frame take part in the sum.
    usable = [name for name in blended.index if name in rows]

    fused = rows.copy()
    fused[usable] = fused[usable].fillna(0)
    fused["fused_confidence"] = fused[usable].dot(blended[usable])
    return fused


def parallel_fuse_group(gid_group_pair):
    """Fuse one ``(gid, group_df)`` pair using the module-level weights.

    Looks up the group's weights in the global ``group_weights`` mapping,
    falling back to ``global_weights`` when the group has no entry, and
    delegates to ``fuse_group_rows`` with the global ``beta``.
    """
    group_id, frame = gid_group_pair
    return fuse_group_rows(
        frame,
        group_weights.get(group_id, global_weights),
        global_weights,
        beta,
    )


print("并行融合组内 confidence ...")

# Fuse the groups in parallel (`groups` is a list of (gid, group_df) pairs).
# Each task receives only the weights of its own group. Dispatching
# `parallel_fuse_group` instead makes the workers depend on the module-level
# `group_weights` dict, which then has to be serialised together with the
# notebook-defined function — far more data than the one Series a task needs.
fused_groups = Parallel(n_jobs=32, backend="loky")(
    delayed(fuse_group_rows)(
        group, group_weights.get(gid, global_weights), global_weights, beta
    )
    for gid, group in tqdm(groups)
)

# Stitch all groups back together
df_pd = pd.concat(fused_groups, ignore_index=True)
df_fused = df_pd[["Id", "ranker_id", "fused_confidence"]]

print(df_fused.head())

KeyboardInterrupt: 

In [13]:
# Rank rows inside each ranker_id by fused confidence: the highest value gets
# selected == 1, and "ordinal" gives every row a distinct rank.
df_ranked = pl.DataFrame(df_fused).with_columns(
    pl.col("fused_confidence")
    .rank(method="ordinal", descending=True)
    .over("ranker_id")
    .cast(pl.Int32)
    .alias("selected")
)

# Row order is taken from the first loaded submission
df_original = dfs[0].select("Id", "ranker_id")

# Left join so the output keeps that order and only the required columns
final_submission = df_original.join(
    df_ranked.select("Id", "ranker_id", "selected"),
    how="left",
    on=["Id", "ranker_id"],
)
print(final_submission.head())

shape: (5, 3)
┌──────────┬─────────────────────────────────┬──────────┐
│ Id       ┆ ranker_id                       ┆ selected │
│ ---      ┆ ---                             ┆ ---      │
│ i64      ┆ str                             ┆ i32      │
╞══════════╪═════════════════════════════════╪══════════╡
│ 18144679 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 5        │
│ 18144680 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 26       │
│ 18144681 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 244      │
│ 18144682 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 66       │
│ 18144683 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 75       │
└──────────┴─────────────────────────────────┴──────────┘


In [14]:
# Persist the group-wise Spearman ensemble (path is relative to the working directory).
final_submission.write_parquet("submission_ensemble_fused.parquet")

## Global Spearman

In [25]:
# Every column of df_combined whose name starts with "confidence".
confidence_cols = [col for col in df_combined.columns if col.startswith("confidence")]
# Model-by-model Spearman correlation matrix over all rows (computed in pandas).
corr_df = df_combined.select(confidence_cols).to_pandas().corr(method="spearman")
print(corr_df)

                    confidence_0.52244  confidence_0.51345  \
confidence_0.52244            1.000000            0.949373   
confidence_0.51345            0.949373            1.000000   
confidence_0.52795            0.957935            0.939309   
confidence_0.52603            0.958322            0.937384   
confidence_0.52538            0.957988            0.935733   
confidence_0.51244            0.934095            0.945280   
confidence_0.52               0.945438            0.924711   
confidence_0.48755            0.904576            0.902773   

                    confidence_0.52795  confidence_0.52603  \
confidence_0.52244            0.957935            0.958322   
confidence_0.51345            0.939309            0.937384   
confidence_0.52795            1.000000            0.982414   
confidence_0.52603            0.982414            1.000000   
confidence_0.52538            0.979736            0.981877   
confidence_0.51244            0.955373            0.952257   
confide

In [26]:
# Mean correlation of each model with the *other* models: drop the
# self-correlation from the row sum and divide by the number of other models.
# NOTE(review): subtracting 1 assumes every diagonal entry is exactly 1 —
# confirm no confidence column is constant.
mean_corr = corr_df.apply(lambda row: (row.sum() - 1) / (len(row) - 1), axis=1)

# The less a model agrees with the rest, the higher its uniqueness.
model_uniqueness = 1 - mean_corr
print(model_uniqueness)

confidence_0.52244    0.056039
confidence_0.51345    0.066491
confidence_0.52795    0.046666
confidence_0.52603    0.046582
confidence_0.52538    0.047500
confidence_0.51244    0.062276
confidence_0.52       0.057037
confidence_0.48755    0.106839
dtype: float64


In [27]:
# Uniqueness is the only ingredient of the score here; normalise it so the
# weights sum to 1.
combined_score = model_uniqueness
weights = combined_score / combined_score.sum()
print(weights)

confidence_0.52244    0.114499
confidence_0.51345    0.135854
confidence_0.52795    0.095348
confidence_0.52603    0.095177
confidence_0.52538    0.097052
confidence_0.51244    0.127241
confidence_0.52       0.116537
confidence_0.48755    0.218292
dtype: float64


In [28]:
# Weighted sum of the per-model confidences; a missing confidence counts as 0.
# The weights are paired with the columns by position.
weighted_conf = sum(
    df_combined[name].fill_null(0) * weight
    for name, weight in zip(confidence_cols, weights.to_numpy())
)

# Store the blend as "pred_score" (overwritten if the column already exists).
df_combined = df_combined.with_columns(weighted_conf.alias("pred_score"))

In [29]:
# Rank rows inside each ranker_id by the blended score: the highest value gets
# selected == 1, and "ordinal" gives every row a distinct rank.
df_ranked = df_combined.with_columns(
    selected=pl.col("pred_score")
    .rank(method="ordinal", descending=True)
    .over("ranker_id")
    .cast(pl.Int32)
)

# Row order is taken from the first loaded submission
df_original = dfs[0].select("Id", "ranker_id")

# Left join so the output keeps that order and only the required columns
final_submission = df_original.join(
    df_ranked.select("Id", "ranker_id", "selected"),
    how="left",
    on=["Id", "ranker_id"],
)
print(final_submission.head())

shape: (5, 3)
┌──────────┬─────────────────────────────────┬──────────┐
│ Id       ┆ ranker_id                       ┆ selected │
│ ---      ┆ ---                             ┆ ---      │
│ i64      ┆ str                             ┆ i32      │
╞══════════╪═════════════════════════════════╪══════════╡
│ 18144679 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 4        │
│ 18144680 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 32       │
│ 18144681 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 260      │
│ 18144682 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 11       │
│ 18144683 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 60       │
└──────────┴─────────────────────────────────┴──────────┘


In [30]:
# Persist the global Spearman ensemble (path is relative to the working directory).
final_submission.write_parquet("./submission_ensemble.parquet")

## PCA

In [3]:
import polars as pl
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.decomposition import PCA


def load_submission_with_confidence(tag):
    """Read one submission's confidence file and remove its ``selected`` column.

    NOTE: running this cell replaces any earlier definition of the same name
    in the kernel, including the variant that takes ``alpha`` and ``k``.

    Args:
        tag: Submission identifier inserted into the file name.

    Returns:
        The contents of ``./ensemble/submission_{tag}_with_confidence.parquet``
        without the ``selected`` column.
    """
    submission = pl.read_parquet(f"./ensemble/submission_{tag}_with_confidence.parquet")
    return submission.drop("selected")


# # 模型和分数配置
# timetag_score = {
#     "20250721025740": [0, 0.51960],
#     "20250722050939": [1, 0.52070],
#     "20250721083807": [1, 0.52244],
#     "20250724032338": [1, 0.51345],  # 注意去除字符串后缀
#     "20250725083055": [1, 0.52391],
#     "20250727084025": [1, 0.52795],
#     "20250728094305": [1, 0.52492],
#     "20250729084249": [1, 0.51822],
#     "0.49242": [1, 0.49242],
# }

# # 载入数据，重命名 confidence 列
# dfs = []
# for timetag, score_list in timetag_score.items():
#     df = load_submission_with_confidence(timetag)
#     _, score = score_list
#     score = float(score)  # 确保是float
#     df = df.with_columns(pl.col("confidence").alias(f"confidence_{score}")).drop(
#         "confidence"
#     )
#     dfs.append(df)

# # 合并所有模型结果
# df_combined = dfs[0]
# for i in range(1, len(dfs)):
#     df_combined = df_combined.join(dfs[i], on=["Id", "ranker_id"])

# Collect the names of all confidence columns.
# NOTE: the loading code above is commented out, so this cell relies on
# `df_combined` and `dfs` left in the kernel by an earlier cell.
confidence_cols = [col for col in df_combined.columns if col.startswith("confidence")]
X = df_combined.select(confidence_cols).to_numpy()

# Pairwise Spearman correlation matrix between the models
n_models = len(confidence_cols)
corr_mat = np.zeros((n_models, n_models))
for i in range(n_models):
    for j in range(i, n_models):
        corr, _ = spearmanr(X[:, i], X[:, j])
        if np.isnan(corr):
            # Undefined correlation (e.g. a constant column): treat as uncorrelated.
            corr = 0
        corr_mat[i, j] = corr
        corr_mat[j, i] = corr

# Mean correlation with the *other* models only. Averaging the full row would
# include the self-correlation on the diagonal and understate every model's
# uniqueness; the other Spearman sections of this notebook exclude it as well.
mean_corr = (corr_mat.sum(axis=1) - np.diag(corr_mat)) / max(n_models - 1, 1)
model_uniqueness = 1 - mean_corr.clip(-1, 1)

# Fall back to equal weights when the uniquenesses cannot be normalised
# (all models perfectly correlated), instead of dividing by zero.
uniqueness_total = model_uniqueness.sum()
if uniqueness_total > 0:
    spearman_weights = model_uniqueness / uniqueness_total
else:
    spearman_weights = np.full(n_models, 1 / max(n_models, 1))

print("Spearman 权重:")
for col, w in zip(confidence_cols, spearman_weights):
    print(f"{col}: {w:.4f}")

# Scale each model's column by its Spearman weight
X_weighted = X * spearman_weights

# PCA fusion: the first principal component is the ensemble score
pca = PCA(n_components=1)
pc1_scores = pca.fit_transform(X_weighted).flatten()

# The sign of a principal component is arbitrary. Orient PC1 so that its
# loadings are positive on balance; otherwise "higher score = better" would be
# reversed and the descending rank below would put the worst rows first.
pc1_loadings = pca.components_[0]
if pc1_loadings.sum() < 0:
    pc1_loadings = -pc1_loadings
    pc1_scores = -pc1_scores

print("\nPCA 各模型权重:")
for col, weight in zip(confidence_cols, pc1_loadings):
    print(f"{col}: {weight:.4f}")

# Attach the fused score to the data
df_combined = df_combined.with_columns(pl.Series("ensemble_confidence", pc1_scores))

# Rank inside each group by the fused score (larger is better)
df_ranked = df_combined.with_columns(
    pl.col("ensemble_confidence")
    .rank(method="ordinal", descending=True)
    .over("ranker_id")
    .cast(pl.Int32)
    .alias("selected")
)

# Prepare the output in the row order of the first loaded submission
df_original = dfs[0].select(["Id", "ranker_id"])
final_submission = df_original.join(
    df_ranked.select(["Id", "ranker_id", "selected"]),
    on=["Id", "ranker_id"],
    how="left",
)

print("\n最终结果示例:")
print(final_submission.head())

KeyboardInterrupt: 

In [16]:
# Persist the Spearman-weighted PCA ensemble (path is relative to the working directory).
final_submission.write_parquet("submission_ensemble_spearman_pca.parquet")

## Spearman weight as feature & PCA

In [17]:
import polars as pl
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.decomposition import PCA


def load_submission_with_confidence(tag):
    """Load a submission's confidence file without its ``selected`` column.

    NOTE: running this cell replaces any earlier definition of the same name
    in the kernel.

    Args:
        tag: Submission identifier inserted into the file name.

    Returns:
        The frame read from
        ``./ensemble/submission_{tag}_with_confidence.parquet`` minus
        ``selected``.
    """
    return pl.read_parquet(
        f"./ensemble/submission_{tag}_with_confidence.parquet"
    ).drop("selected")


# Models and their scores: tag -> [flag, score]; only the score is used below.
timetag_score = {
    "20250721025740": [0, 0.51960],
    "20250722050939": [1, 0.52070],
    "20250721083807": [1, 0.52244],
    "20250724032338": [1, 0.51345],
    "20250725083055": [1, 0.52391],
    "20250727084025": [1, 0.52795],
    "20250728094305": [1, 0.52492],
    "20250729084249": [1, 0.51822],
    "0.49242": [1, 0.49242],
}

# Load every model and rename its confidence column after the model's score
dfs = []
for timetag, score_list in timetag_score.items():
    _, score = score_list
    score = float(score)
    df = load_submission_with_confidence(timetag)
    df = df.with_columns(pl.col("confidence").alias(f"confidence_{score}"))
    dfs.append(df.drop("confidence"))

# Join all model frames on the row keys, starting from the first model
df_combined = dfs[0]
for other_model in dfs[1:]:
    df_combined = df_combined.join(other_model, on=["Id", "ranker_id"])

# Collect the confidence column names and the matching data matrix
confidence_cols = [col for col in df_combined.columns if col.startswith("confidence")]
X = df_combined.select(confidence_cols).to_numpy()

# Pairwise Spearman correlation matrix between the models
n_models = len(confidence_cols)
corr_mat = np.zeros((n_models, n_models))
for i in range(n_models):
    for j in range(i, n_models):
        corr, _ = spearmanr(X[:, i], X[:, j])
        if np.isnan(corr):
            # Undefined correlation (e.g. a constant column): treat as uncorrelated.
            corr = 0
        corr_mat[i, j] = corr
        corr_mat[j, i] = corr

# Mean correlation with the *other* models only. Averaging the full row would
# include the self-correlation on the diagonal and understate every model's
# uniqueness; the other Spearman sections of this notebook exclude it as well.
mean_corr = (corr_mat.sum(axis=1) - np.diag(corr_mat)) / max(n_models - 1, 1)
model_uniqueness = 1 - mean_corr.clip(-1, 1)

# Fall back to equal weights when the uniquenesses cannot be normalised
# (all models perfectly correlated), instead of dividing by zero.
uniqueness_total = model_uniqueness.sum()
if uniqueness_total > 0:
    spearman_weights = model_uniqueness / uniqueness_total
else:
    spearman_weights = np.full(n_models, 1 / max(n_models, 1))

print("Spearman 权重:")
for col, w in zip(confidence_cols, spearman_weights):
    print(f"{col}: {w:.4f}")

# Repeat the Spearman weights on every row and append them to X as extra
# features.
# NOTE(review): these appended columns are identical on every row, i.e. they
# have zero variance, so they cannot influence the principal component — the
# recorded run prints 0.0000 loadings for all of them. The weights therefore
# have no effect on the fused score in this variant.
weights_repeated = np.tile(spearman_weights, (X.shape[0], 1))
X_enhanced = np.hstack([X, weights_repeated])  # shape (n_rows, 2 * n_models)

# PCA fusion on the augmented features
pca = PCA(n_components=1)
pc1_scores = pca.fit_transform(X_enhanced).flatten()

# The sign of a principal component is arbitrary. Orient PC1 so that its
# loadings are positive on balance; otherwise "higher score = better" would be
# reversed and the descending rank below would put the worst rows first.
pc1_loadings = pca.components_[0]
if pc1_loadings.sum() < 0:
    pc1_loadings = -pc1_loadings
    pc1_scores = -pc1_scores

print("\nPCA 各模型权重（前半段是原始模型列，后半段是权重列）:")
for i, col in enumerate(confidence_cols):
    print(f"{col}: {pc1_loadings[i]:.4f}")
for i in range(len(confidence_cols)):
    print(f"weight_{i}: {pc1_loadings[len(confidence_cols)+i]:.4f}")

# Attach the fused score
df_combined = df_combined.with_columns(pl.Series("ensemble_confidence", pc1_scores))

# Rank inside each group (a larger score ranks earlier)
df_ranked = df_combined.with_columns(
    pl.col("ensemble_confidence")
    .rank(method="ordinal", descending=True)
    .over("ranker_id")
    .cast(pl.Int32)
    .alias("selected")
)

# Output in the row order of the first loaded submission
df_original = dfs[0].select(["Id", "ranker_id"])
final_submission = df_original.join(
    df_ranked.select(["Id", "ranker_id", "selected"]),
    on=["Id", "ranker_id"],
    how="left",
)

print("\n最终融合排序结果示例:")
print(final_submission.head())

Spearman 权重:
confidence_0.5196: 0.0937
confidence_0.5207: 0.0915
confidence_0.52244: 0.0930
confidence_0.51345: 0.1339
confidence_0.52391: 0.0949
confidence_0.52795: 0.0932
confidence_0.52492: 0.0910
confidence_0.51822: 0.0970
confidence_0.49242: 0.2119

PCA 各模型权重（前半段是原始模型列，后半段是权重列）:
confidence_0.5196: 0.3358
confidence_0.5207: 0.3361
confidence_0.52244: 0.3359
confidence_0.51345: 0.3300
confidence_0.52391: 0.3356
confidence_0.52795: 0.3359
confidence_0.52492: 0.3362
confidence_0.51822: 0.3353
confidence_0.49242: 0.3186
weight_0: 0.0000
weight_1: 0.0000
weight_2: -0.0000
weight_3: 0.0000
weight_4: 0.0000
weight_5: 0.0000
weight_6: 0.0000
weight_7: -0.0000
weight_8: -0.0000

最终融合排序结果示例:
shape: (5, 3)
┌──────────┬─────────────────────────────────┬──────────┐
│ Id       ┆ ranker_id                       ┆ selected │
│ ---      ┆ ---                             ┆ ---      │
│ i64      ┆ str                             ┆ i32      │
╞══════════╪═════════════════════════════════╪══════════╡
│

## Ensemble of ensembles

In [81]:
import polars as pl


def load_submission_with_confidence(tag):
    """Return a submission's confidence file with the ``selected`` column dropped.

    NOTE: running this cell replaces any earlier definition of the same name
    in the kernel.

    Args:
        tag: Submission identifier inserted into the file name.

    Returns:
        The frame read from
        ``./ensemble/submission_{tag}_with_confidence.parquet`` minus
        ``selected``.
    """
    source_path = f"./ensemble/submission_{tag}_with_confidence.parquet"
    return pl.read_parquet(source_path).drop("selected")


def combine_same_model(timetag_score: dict):
    """Load several runs and join them into one frame.

    Each run's ``confidence`` column is renamed to ``confidence_{score}`` so
    the runs can sit side by side after joining on ``Id`` and ``ranker_id``.

    Args:
        timetag_score: Mapping of submission tag to a two-element
            ``[flag, score]`` sequence; only ``score`` is used.

    Returns:
        Tuple ``(combined, first)``: the joined frame, and the renamed frame
        of the first run on its own.
    """
    renamed_runs = []
    for timetag, (_flag, score) in timetag_score.items():
        run = load_submission_with_confidence(timetag)
        run = run.with_columns((pl.col("confidence")).alias(f"confidence_{score}"))
        renamed_runs.append(run.drop("confidence"))

    combined = renamed_runs[0]
    for run in renamed_runs[1:]:
        combined = combined.join(run, on=["Id", "ranker_id"])

    return combined, renamed_runs[0]

In [82]:
# One dict per model family, each mapping submission tag -> [flag, score].
# The dicts are passed to combine_same_model, which uses only the score.
# Presumably xgb / lgb / dlr stand for XGBoost, LightGBM and a deep-learning
# ranker — confirm. Commented-out entries are excluded runs.
xgb_score = {
    "20250721083807": [1, 0.52244],
    "20250725083055": [1, 0.52391],
    "20250727084025": [1, 0.52795],
    "20250802074816": [1, 0.52603],
    "20250807032439": [1, 0.52538],
}

lgb_score = {
    # "20250720003111": [0, 0.50693],
    "20250724032338": [1, 0.51345],
    "20250804001151": [1, 0.51244],
    # "20250805032352": [1, 0.51143],
}

dlr_score = {
    "dl_ranker": [1, 0.48755],
    # "0.49380": [1, 0.49380],
}

In [83]:
# Join the runs of each family. `anchor` is the first xgb run's frame; a later
# cell takes the output row order from it.
xgb_combined, anchor = combine_same_model(xgb_score)
lgb_combined, _ = combine_same_model(lgb_score)
dlr_combined, _ = combine_same_model(dlr_score)

In [84]:
def weighted_confidence_fusion(df: pl.DataFrame, prefix: str) -> pl.DataFrame:
    """Fuse all ``confidence_*`` columns into one score-weighted column.

    Each column's weight is the number embedded in its name
    (``confidence_<score>``), divided by the sum of those numbers.

    Args:
        df: Frame with ``Id``, ``ranker_id`` and one or more
            ``confidence_<score>`` columns.
        prefix: Prefix of the output column, ``{prefix}_confidence_fused``.

    Returns:
        Frame with the columns ``Id``, ``ranker_id`` and
        ``{prefix}_confidence_fused``.

    Raises:
        ValueError: If ``df`` has no column starting with ``confidence_``.
    """
    conf_cols = [col for col in df.columns if col.startswith("confidence_")]
    if not conf_cols:
        # Without this check the empty sum below is the int 0, which fails
        # later with an unrelated-looking AttributeError on `.alias`.
        raise ValueError("df has no 'confidence_<score>' columns to fuse")

    # Build the weights from the score embedded in each column name
    raw_scores = [float(col.split("_")[1]) for col in conf_cols]
    score_total = sum(raw_scores)  # computed once, not once per column
    weights = [score / score_total for score in raw_scores]

    # Weighted sum
    weighted_sum_expr = sum(
        pl.col(col) * weight for col, weight in zip(conf_cols, weights)
    )
    fused_col = f"{prefix}_confidence_fused"

    return df.with_columns(weighted_sum_expr.alias(fused_col)).select(
        ["Id", "ranker_id", fused_col]
    )


# One fused confidence column per model family.
xgb_fused = weighted_confidence_fusion(xgb_combined, "xgb")
lgb_fused = weighted_confidence_fusion(lgb_combined, "lgb")
dlr_fused = weighted_confidence_fusion(dlr_combined, "dlr")
print(xgb_fused.head())
print(lgb_fused.head())
print(dlr_fused.head())

shape: (5, 3)
┌──────────┬─────────────────────────────────┬──────────────────────┐
│ Id       ┆ ranker_id                       ┆ xgb_confidence_fused │
│ ---      ┆ ---                             ┆ ---                  │
│ i64      ┆ str                             ┆ f64                  │
╞══════════╪═════════════════════════════════╪══════════════════════╡
│ 18144679 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 0.984909             │
│ 18144680 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 0.912308             │
│ 18144681 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 0.375555             │
│ 18144682 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 0.858067             │
│ 18144683 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 0.809281             │
└──────────┴─────────────────────────────────┴──────────────────────┘
shape: (5, 3)
┌──────────┬─────────────────────────────────┬──────────────────────┐
│ Id       ┆ ranker_id                       ┆ lgb_confidence_fused │
│ ---      ┆ ---                             ┆ ---            

In [85]:
# Put the three family-level scores side by side
ensemble_df = xgb_fused.join(lgb_fused, on=["Id", "ranker_id"]).join(
    dlr_fused, on=["Id", "ranker_id"]
)

# Hand-picked blend weights per model family.
# Deliberately not called `group_weights`: that name holds the per-ranker_id
# weight lookup read by `parallel_fuse_group` in the group-wise Spearman
# section, and reusing it here would overwrite that lookup in the kernel.
family_weights = {
    "xgb": 0.4,
    "lgb": 0.3,
    "dlr": 0.3,
}

# Weighted sum of the fused columns, derived from the dict so the weights and
# the formula cannot drift apart.
ensemble_df = ensemble_df.with_columns(
    sum(
        pl.col(f"{family}_confidence_fused") * weight
        for family, weight in family_weights.items()
    ).alias("final_confidence")
)
print(ensemble_df.head())

shape: (5, 6)
┌──────────┬─────────────────┬─────────────────┬─────────────────┬────────────────┬────────────────┐
│ Id       ┆ ranker_id       ┆ xgb_confidence_ ┆ lgb_confidence_ ┆ dlr_confidence ┆ final_confiden │
│ ---      ┆ ---             ┆ fused           ┆ fused           ┆ _fused         ┆ ce             │
│ i64      ┆ str             ┆ ---             ┆ ---             ┆ ---            ┆ ---            │
│          ┆                 ┆ f64             ┆ f64             ┆ f64            ┆ f64            │
╞══════════╪═════════════════╪═════════════════╪═════════════════╪════════════════╪════════════════╡
│ 18144679 ┆ c9373e5f772e43d ┆ 0.984909        ┆ 0.958664        ┆ 0.997567       ┆ 0.980833       │
│          ┆ 593dd6ad2fa90f6 ┆                 ┆                 ┆                ┆                │
│          ┆ …               ┆                 ┆                 ┆                ┆                │
│ 18144680 ┆ c9373e5f772e43d ┆ 0.912308        ┆ 0.905136        ┆ 0.77129   

In [86]:
# Keep only the keys and the final blended score
df_fused = ensemble_df.select("Id", "ranker_id", "final_confidence")

# Rank rows inside each ranker_id: the highest final_confidence gets
# selected == 1, and "ordinal" gives every row a distinct rank.
df_ranked = df_fused.with_columns(
    selected=pl.col("final_confidence")
    .rank(method="ordinal", descending=True)
    .over("ranker_id")
    .cast(pl.Int32)
)
print(df_ranked.head())

shape: (5, 4)
┌──────────┬─────────────────────────────────┬──────────────────┬──────────┐
│ Id       ┆ ranker_id                       ┆ final_confidence ┆ selected │
│ ---      ┆ ---                             ┆ ---              ┆ ---      │
│ i64      ┆ str                             ┆ f64              ┆ i32      │
╞══════════╪═════════════════════════════════╪══════════════════╪══════════╡
│ 18144679 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 0.980833         ┆ 2        │
│ 18144680 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 0.867851         ┆ 36       │
│ 18144681 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 0.402796         ┆ 251      │
│ 18144682 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 0.922796         ┆ 9        │
│ 18144683 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 0.836111         ┆ 50       │
└──────────┴─────────────────────────────────┴──────────────────┴──────────┘


In [87]:
# Row order is taken from the anchor frame (the first xgb run)
df_original = anchor.select("Id", "ranker_id")

# Left join so the output keeps that order and only the required columns
final_submission = df_original.join(
    df_ranked.select("Id", "ranker_id", "selected"),
    how="left",
    on=["Id", "ranker_id"],
)
print(final_submission.head())

shape: (5, 3)
┌──────────┬─────────────────────────────────┬──────────┐
│ Id       ┆ ranker_id                       ┆ selected │
│ ---      ┆ ---                             ┆ ---      │
│ i64      ┆ str                             ┆ i32      │
╞══════════╪═════════════════════════════════╪══════════╡
│ 18144679 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 2        │
│ 18144680 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 36       │
│ 18144681 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 251      │
│ 18144682 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 9        │
│ 18144683 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 50       │
└──────────┴─────────────────────────────────┴──────────┘


In [88]:
# Persist the family-level ensemble (path is relative to the working directory).
final_submission.write_parquet("./submission_ensemble_ensemble.parquet")