## task1

In [5]:
# ======================================================================
# TASK 1 — is_cheater
# FE v2 + Polynomial Interaction Features + LGBM + XGB + CatBoost + Stacking + F2 Tuning
# ======================================================================

!pip install catboost xgboost lightgbm --quiet

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import fbeta_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

SEED = 42
N_FOLDS = 5
TARGET = "is_cheater"


# ============================================================
# 1. Load data
# ============================================================
TRAIN_PATH = "/kaggle/input/dataset22/train.csv"
TEST_PATH  = "/kaggle/input/dataset22/test.csv"

train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)

print("Train shape:", train.shape, " Test shape:", test.shape)




Train shape: (99872, 34)  Test shape: (25889, 33)


In [6]:
# ============================================================
# 2. FE v2 + Polynomial Interaction Features (auto)
# ============================================================
def fe_v2_poly(train_df: pd.DataFrame, test_df: pd.DataFrame):
    train_df = train_df.copy()
    test_df = test_df.copy()
    
    # ---------- basic missing handling ----------
    full = pd.concat([train_df.drop(columns=[TARGET]), test_df], axis=0, ignore_index=True)
    
    for col in full.columns:
        if full[col].dtype == "O":
            full[col] = full[col].fillna("missing")
        else:
            full[col] = full[col].fillna(full[col].median())
    
    # reconstruct back
    train_features = full.iloc[:len(train_df)].copy()
    test_features  = full.iloc[len(train_df):].copy()
    
    # ---------- detect types ----------
    ignore_cols = ["id"]
    num_cols = [c for c in train_features.columns if c not in ignore_cols and train_features[c].dtype != "O"]
    cat_cols = [c for c in train_features.columns if c not in ignore_cols and train_features[c].dtype == "O"]
    
    print("Numeric cols:", len(num_cols), " | Categorical cols:", len(cat_cols))
    
    # ---------- base numeric transforms ----------
    for df in [train_features, test_features]:
        num_data = df[num_cols].astype(float)
        
        # row-wise stats
        df["num_mean"] = num_data.mean(axis=1)
        df["num_std"]  = num_data.std(axis=1)
        df["num_max"]  = num_data.max(axis=1)
        df["num_min"]  = num_data.min(axis=1)
        
        # log / sqrt for skewed features
        for col in num_cols:
            col_clip = num_data[col].clip(lower=0)
            if (col_clip > 0).mean() > 0.3:
                df[f"{col}_log1p"] = np.log1p(col_clip)
            df[f"{col}_sqrt"] = np.sqrt(col_clip)
    
    # update numeric list after new features
    num_cols_all = [c for c in train_features.columns if c not in ignore_cols and train_features[c].dtype != "O"]
    
    # ---------- Polynomial-style interaction (auto) ----------
    # ใช้ top-K numeric features ที่ variance สูงสุดมาทำ pairwise interaction
    K = min(15, len(num_cols_all))  # limit เพื่อกัน feature ระเบิด
    
    var_series = train_features[num_cols_all].var().sort_values(ascending=False)
    top_num_cols = list(var_series.index[:K])
    print("Top numeric for interactions:", top_num_cols)
    
    def add_poly_interactions(df, cols):
        # สร้าง sum, diff, product, ratio (ระวังหาร 0)
        for i in range(len(cols)):
            for j in range(i+1, len(cols)):
                c1, c2 = cols[i], cols[j]
                c1v = df[c1].astype(float)
                c2v = df[c2].astype(float)
                df[f"{c1}_plus_{c2}"]  = c1v + c2v
                df[f"{c1}_minus_{c2}"] = c1v - c2v
                df[f"{c1}_mul_{c2}"]   = c1v * c2v
                df[f"{c1}_ratio_{c2}"] = c1v / (c2v.abs() + 1e-3)
    
    for df in [train_features, test_features]:
        add_poly_interactions(df, top_num_cols)
    
    # ---------- LabelEncode categorical ----------
    full2 = pd.concat([train_features, test_features], axis=0, ignore_index=True)
    cat_cols = [c for c in full2.columns if full2[c].dtype == "O"]
    print("Detected categorical columns for LabelEncoder:", cat_cols)
    
    for col in cat_cols:
        le = LabelEncoder()
        full2[col] = le.fit_transform(full2[col].astype(str))
    
    train_final = full2.iloc[:len(train_df)].copy()
    test_final  = full2.iloc[len(train_df):].copy()
    
    # add target back
    train_final[TARGET] = train_df[TARGET].values
    
    # final feature list
    features = [c for c in train_final.columns if c not in ["id", TARGET]]
    
    return train_final, test_final, features

print("Applying FE v2 + Polynomial interactions ...")
train_fe, test_fe, features = fe_v2_poly(train, test)
print("Num features after FE:", len(features))

X = train_fe[features]
train_fe[TARGET] = pd.to_numeric(train_fe[TARGET], errors='coerce').fillna(0).astype(int)
y = train_fe[TARGET].values
X_test = test_fe[features]

pos_rate = y.mean()
print(f"Positive class rate: {pos_rate:.4f}")

Applying FE v2 + Polynomial interactions ...
Numeric cols: 31  | Categorical cols: 1
Top numeric for interactions: ['account_age_days', 'num_max', 'friend_network_size', 'avg_session_length_min', 'num_std', 'reaction_time_ms', 'damage_per_round', 'survival_time_avg', 'level', 'win_rate', 'headshot_percentage', 'num_mean', 'accuracy_score', 'account_age_days_sqrt', 'reports_received']
Detected categorical columns for LabelEncoder: ['id', 'player_id']
Num features after FE: 518
Positive class rate: 0.3417


In [8]:
# ============================================================
# 3. Model definitions (Ultra: 3 trees + stacking)
# ============================================================

np.random.seed(SEED)
LGB_PARAMS = dict(
    objective="binary",
    metric="binary_logloss",
    learning_rate=0.03,
    num_leaves=80,
    max_depth=-1,
    min_child_samples=40,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=2.0,
    reg_alpha=1.0,
    random_state=SEED,
    n_estimators=4000,
    verbose=-1
)

XGB_PARAMS = dict(
    objective="binary:logistic",
    eval_metric="logloss",
    learning_rate=0.03,
    max_depth=8,
    subsample=0.9,
    colsample_bytree=0.9,
    min_child_weight=4,
    reg_lambda=2.0,
    reg_alpha=2.0,
    tree_method="gpu_hist",  # ถ้าไม่มี GPU ให้เปลี่ยนเป็น "hist"
    random_state=SEED,
    n_estimators=4000
)

CAT_PARAMS = dict(
    loss_function="Logloss",
    eval_metric="Logloss",
    learning_rate=0.03,
    depth=7,
    l2_leaf_reg=4.0,
    random_strength=1.5,
    iterations=4000,
    task_type="GPU",  # ถ้าไม่มี GPU ให้ลบบรรทัดนี้
    verbose=False,
    random_seed=SEED
)

In [9]:
# ============================================================
# 4. K-Fold + OOF for stacking
# ============================================================
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

oof_lgb = np.zeros(len(X))
oof_xgb = np.zeros(len(X))
oof_cat = np.zeros(len(X))

test_lgb = np.zeros(len(X_test))
test_xgb = np.zeros(len(X_test))
test_cat = np.zeros(len(X_test))

def f2_score(y_true, y_prob, thr):
    y_pred = (y_prob > thr).astype(int)
    return fbeta_score(y_true, y_pred, beta=2)

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n========== Fold {fold}/{N_FOLDS} ==========")
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]
    
    # LightGBM
    m_lgb = lgb.LGBMClassifier(**LGB_PARAMS)
    m_lgb.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)]
    )
    oof_lgb[va_idx] = m_lgb.predict_proba(X_va)[:, 1]
    test_lgb += m_lgb.predict_proba(X_test)[:, 1] / N_FOLDS
    
    # XGBoost
    m_xgb = xgb.XGBClassifier(**XGB_PARAMS)
    m_xgb.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        early_stopping_rounds=200,
        verbose=False
    )
    oof_xgb[va_idx] = m_xgb.predict_proba(X_va)[:, 1]
    test_xgb += m_xgb.predict_proba(X_test)[:, 1] / N_FOLDS
    
    # CatBoost
    m_cat = CatBoostClassifier(**CAT_PARAMS)
    m_cat.fit(
        X_tr, y_tr,
        eval_set=(X_va, y_va),
        early_stopping_rounds=200
    )
    oof_cat[va_idx] = m_cat.predict_proba(X_va)[:, 1]
    test_cat += m_cat.predict_proba(X_test)[:, 1] / N_FOLDS
    
    # quick F2 check @0.5
    for name, oof_part in [("lgb", oof_lgb), ("xgb", oof_xgb), ("cat", oof_cat)]:
        f2 = f2_score(y, oof_part, 0.5)
        print(f"  current F2 ({name}) @0.5 = {f2:.4f}", end=" | ")
    print()



  current F2 (lgb) @0.5 = 0.1416 |   current F2 (xgb) @0.5 = 0.1401 |   current F2 (cat) @0.5 = 0.1393 | 

  current F2 (lgb) @0.5 = 0.2754 |   current F2 (xgb) @0.5 = 0.2720 |   current F2 (cat) @0.5 = 0.2701 | 

  current F2 (lgb) @0.5 = 0.3997 |   current F2 (xgb) @0.5 = 0.3949 |   current F2 (cat) @0.5 = 0.3908 | 

  current F2 (lgb) @0.5 = 0.5166 |   current F2 (xgb) @0.5 = 0.5105 |   current F2 (cat) @0.5 = 0.5059 | 

  current F2 (lgb) @0.5 = 0.6252 |   current F2 (xgb) @0.5 = 0.6187 |   current F2 (cat) @0.5 = 0.6123 | 


In [10]:
# ============================================================
# 5. Stacking Meta-Model (Logistic Regression)
# ============================================================
meta_train = np.vstack([oof_lgb, oof_xgb, oof_cat]).T
meta_test  = np.vstack([test_lgb, test_xgb, test_cat]).T

meta_clf = LogisticRegression(
    C=2.0,
    max_iter=1000,
    class_weight="balanced",
    random_state=SEED
)
meta_clf.fit(meta_train, y)

oof_meta = meta_clf.predict_proba(meta_train)[:, 1]
test_meta = meta_clf.predict_proba(meta_test)[:, 1]

In [11]:
# ============================================================
# 6. Threshold tuning on OOF (F2)
# ============================================================
print("\n========== Threshold tuning (F2) on OOF ==========")
best_thr = 0.5
best_f2 = 0.0
for t in np.arange(0.05, 0.95, 0.01):
    f2 = f2_score(y, oof_meta, t)
    if f2 > best_f2:
        best_f2 = f2
        best_thr = t

print(f"Best threshold (meta): {best_thr:.3f}  |  F2 = {best_f2:.5f}")

# optional: F2 of each model at best_thr
for name, oof_part in [("lgb", oof_lgb), ("xgb", oof_xgb), ("cat", oof_cat), ("meta", oof_meta)]:
    f2 = f2_score(y, oof_part, best_thr)
    print(f"  F2 {name:5s} @ {best_thr:.3f} = {f2:.5f}")



Best threshold (meta): 0.190  |  F2 = 0.83301
  F2 lgb   @ 0.190 = 0.83304
  F2 xgb   @ 0.190 = 0.83169
  F2 cat   @ 0.190 = 0.82972
  F2 meta  @ 0.190 = 0.83301


In [12]:
# ============================================================
# 7. Final prediction & submission
# ============================================================
final_pred = (test_meta > best_thr).astype(int)

sub = pd.DataFrame({
    "id": test["id"],
    "is_cheater": final_pred
})
sub.to_csv("submission_task1_poly_ultra.csv", index=False)
print("\nSaved: submission_task1_poly_ultra.csv")



Saved: submission_task1_poly_ultra.csv
