In [1]:
!pip install lightgbm
!pip install catboost
!pip install xgboost

[0m

In [2]:
pip install --upgrade pip

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
pip install --upgrade xgboost

[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss

def calculate_map7(y_true, y_pred, id_series):
    df = pd.DataFrame({'id': id_series, 'y_true': y_true, 'y_pred': y_pred})
    df.sort_values(['id', 'y_pred'], ascending=[True, False], inplace=True)
    df['rank'] = df.groupby('id').cumcount()
    df = df[df['rank'] < 7]
    df['precision_at_k'] = df['y_true'] / (df['rank'] + 1)
    map7 = df.groupby('id')['precision_at_k'].sum().mean()
    return map7

def clean_data(df):
    df = df.replace(["None", "N/A", "nan"], np.nan)
    # convert date columns
    for col in ['id4','id5']:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')
    # numeric placeholders
    num_cols = df.select_dtypes(include=['number']).columns
    df[num_cols] = df[num_cols].replace(-9999.0, np.nan)
    return df

train = pd.read_parquet("/kaggle/input/amex-problem1/train_data.parquet")
test = pd.read_parquet("/kaggle/input/amex-problem1/test_data.parquet")
event = pd.read_parquet("/kaggle/input/amex-problem1/add_event.parquet")
trans = pd.read_parquet("/kaggle/input/amex-problem1/add_trans.parquet")
offer = pd.read_parquet("/kaggle/input/amex-problem1/offer_metadata.parquet")

train = clean_data(train)
test = clean_data(test)
event = clean_data(event)
trans = clean_data(trans)
offer = clean_data(offer)

In [5]:
target = train["y"]
id_col = 'id2'

cols_to_exclude = ['y', 'id1', 'id2', 'id3', 'id4', 'id5']

common_cols = list(set(train.columns) & set(test.columns))
feature_cols = [col for col in common_cols if col not in cols_to_exclude]

X = train[feature_cols].copy()
X_test = test[feature_cols].copy()

for col in X.columns:
    X[col] = pd.to_numeric(X[col], errors='coerce')
    X_test[col] = pd.to_numeric(X_test[col], errors='coerce')

X_test = X_test[X.columns]

print(f"Training with {len(X.columns)}")
target = pd.to_numeric(target)

Training with 366


In [6]:
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
import xgboost as xgb
from catboost import CatBoostClassifier

tscv = TimeSeriesSplit(n_splits=5)
splits = list(tscv.split(X))
train_idx, val_idx = splits[-1]

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = target.iloc[train_idx], target.iloc[val_idx]
id_val = train.iloc[val_idx][id_col]

print("\n--- Training LightGBM ---")
lgb_model = LGBMClassifier(
    objective='binary',
    random_state=42,
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=9
)
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[early_stopping(20, verbose=False), log_evaluation(100)]
)
val_pred_lgb = lgb_model.predict_proba(X_val)[:, 1]
print(f"MAP@7 LightGBM: {calculate_map7(y_val, val_pred_lgb, id_val):.5f}")

print("\n--- Training XGBoost ---")
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'learning_rate': 0.03,
    'max_depth': 9,
    'seed': 42
}

xgb_model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=1000,
    evals=[(dval, 'eval')],
    early_stopping_rounds=20,
    verbose_eval=100
)
val_pred_xgb = xgb_model.predict(dval)
print(f"MAP@7 XGBoost: {calculate_map7(y_val, val_pred_xgb, id_val):.5f}")

print("\n--- Training CatBoost ---")
cat_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.03,
    depth=9,
    loss_function='Logloss',
    random_seed=42,
    verbose=100
)
cat_model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=20
)
val_pred_cat = cat_model.predict_proba(X_val)[:, 1]
print(f"MAP@7 CatBoost: {calculate_map7(y_val, val_pred_cat, id_val):.5f}")



--- Training LightGBM ---
[LightGBM] [Info] Number of positive: 31192, number of negative: 610612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.442696 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 47455
[LightGBM] [Info] Number of data points in the train set: 641804, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.048601 -> initscore=-2.974300
[LightGBM] [Info] Start training from score -2.974300
[100]	valid_0's binary_logloss: 0.108895
MAP@7 LightGBM: 0.08625

--- Training XGBoost ---
[0]	eval-logloss:0.17478
[100]	eval-logloss:0.10681
[139]	eval-logloss:0.10639
MAP@7 XGBoost: 0.08758

--- Training CatBoost ---
0:	learn: 0.6386980	test: 0.6391665	best: 0.6391665 (0)	total: 160ms	remaining: 2m 39s
100:	learn: 0.0973625	test: 0.1118770	best: 0.1118770 (100)	total: 8.5s	remaining: 1m 15s
200:	learn: 0.0868455	test: 0.1092716	best: 0.1092675 (199)	total: 17s	remaini

In [7]:
print("\n--- Calculating Feature Importances ---")

xgb_importance_dict = xgb_model.get_score(importance_type='weight')
xgb_importance = [xgb_importance_dict.get(f, 0) for f in X_train.columns]

importances = pd.DataFrame({
    "feature": X_train.columns,
    "lgb_importance": lgb_model.feature_importances_,
    "xgb_importance": xgb_importance,
    "cat_importance": cat_model.get_feature_importance()
})

importances["mean_importance"] = importances[["lgb_importance", "xgb_importance", "cat_importance"]].mean(axis=1)

top_features = importances.sort_values(by="mean_importance", ascending=False).head(100)

print("\n--- Top 100 Features by Mean Importance ---")
print(top_features["feature"].values)



--- Calculating Feature Importances ---

--- Top 100 Features by Mean Importance ---
['f350' 'f366' 'f363' 'f206' 'f351' 'f204' 'f364' 'f39' 'f314' 'f210'
 'f38' 'f41' 'f365' 'f139' 'f30' 'f132' 'f224' 'f58' 'f361' 'f203' 'f358'
 'f207' 'f216' 'f219' 'f336' 'f77' 'f51' 'f85' 'f22' 'f76' 'f343' 'f2'
 'f151' 'f319' 'f346' 'f137' 'f169' 'f337' 'f342' 'f134' 'f28' 'f344'
 'f140' 'f172' 'f9' 'f341' 'f167' 'f340' 'f31' 'f217' 'f142' 'f59' 'f125'
 'f47' 'f214' 'f10' 'f312' 'f68' 'f5' 'f98' 'f107' 'f29' 'f130' 'f133'
 'f225' 'f141' 'f1' 'f138' 'f143' 'f12' 'f11' 'f212' 'f100' 'f170' 'f8'
 'f46' 'f115' 'f355' 'f347' 'f180' 'f345' 'f43' 'f93' 'f146' 'f215' 'f27'
 'f113' 'f339' 'f74' 'f149' 'f126' 'f6' 'f173' 'f186' 'f26' 'f338' 'f96'
 'f356' 'f67' 'f116']


In [8]:
top_100_features = [
    'f350' 'f366' 'f363' 'f206' 'f351' 'f204' 'f364' 'f39' 'f314' 'f210'
 'f38' 'f41' 'f365' 'f139' 'f30' 'f132' 'f224' 'f58' 'f361' 'f203' 'f358'
 'f207' 'f216' 'f219' 'f336' 'f77' 'f51' 'f85' 'f22' 'f76' 'f343' 'f2'
 'f151' 'f319' 'f346' 'f137' 'f169' 'f337' 'f342' 'f134' 'f28' 'f344'
 'f140' 'f172' 'f9' 'f341' 'f167' 'f340' 'f31' 'f217' 'f142' 'f59' 'f125'
 'f47' 'f214' 'f10' 'f312' 'f68' 'f5' 'f98' 'f107' 'f29' 'f130' 'f133'
 'f225' 'f141' 'f1' 'f138' 'f143' 'f12' 'f11' 'f212' 'f100' 'f170' 'f8'
 'f46' 'f115' 'f355' 'f347' 'f180' 'f345' 'f43' 'f93' 'f146' 'f215' 'f27'
 'f113' 'f339' 'f74' 'f149' 'f126' 'f6' 'f173' 'f186' 'f26' 'f338' 'f96'
 'f356' 'f67' 'f116'
]
id_cols = ['id1', 'id2', 'id3', 'id4', 'id5']

In [9]:
def create_features(df, offer_df, trans_df, event_df, top_feats):
    df = df.copy()
    for c in ['id2','id3','id4','id5']:
        if c in df.columns: df[c]=df[c].astype(str)
    if 'id3' in offer_df.columns: offer_df['id3']=offer_df['id3'].astype(str)
    if 'id2' in trans_df.columns: trans_df['id2']=trans_df['id2'].astype(str)
    if 'id2' in event_df.columns: event_df['id2']=event_df['id2'].astype(str)
    if 'id3' in event_df.columns: event_df['id3']=event_df['id3'].astype(str)

    for f in top_feats:
        if f in df.columns:
            df[f]=pd.to_numeric(df[f],errors='coerce')

    tnum = [c for c in trans_df.select_dtypes(include='number').columns if c not in ['id2','id3']]
    if tnum:
        agg = trans_df.groupby('id2')[tnum].agg(['mean','sum','count']).reset_index()
        agg.columns = ['id2'] + [f"trans_{c}_{stat}" for c,stat in agg.columns.tolist() if c!='id2']
        df = df.merge(agg,on='id2',how='left')

    if {'id2','id3','id4'}.issubset(event_df.columns) and 'id5' in df.columns:
        event_df['id4']=pd.to_datetime(event_df['id4'],errors='coerce')
        df['id5']=pd.to_datetime(df['id5'],errors='coerce')
        ev = event_df[['id2','id3','id4']].drop_duplicates().rename(columns={'id4':'evt_time'})
        df = df.merge(ev,on=['id2','id3'],how='left')
        df['days_event_offer_valid'] = (df['id5'] - df['evt_time']).dt.days
        df.drop(columns='evt_time',inplace=True)

    if 'id3' in offer_df.columns:
        tmp = offer_df[['id3']].copy()
        for c in ['id6','id7']:
            if c in offer_df.columns:
                tmp[c]=offer_df[c].astype('category').cat.codes
        df = df.merge(tmp,on='id3',how='left')

    for c in df.columns:
        if pd.api.types.is_numeric_dtype(df[c]):
            df[c]=df[c].fillna(0)
        else:
            df[c]=df[c].fillna(df[c].mode().iat[0] if not df[c].mode().empty else '')
    return df

train_fe = create_features(train, offer, trans, event, top_100_features)
test_fe  = create_features(test,  offer, trans, event, top_100_features)

In [10]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

y_train = train_fe['y'].astype(int)

feature_cols = [col for col in train_fe.columns if col not in ['y', 'id1', 'id2', 'id3', 'id4', 'id5']]

X_train = train_fe[feature_cols]
X_test  = test_fe[feature_cols]

from sklearn.preprocessing import LabelEncoder

combined = pd.concat([X_train, X_test], axis=0)

for col in combined.select_dtypes(include='object').columns:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col].astype(str))

X_train = combined.iloc[:len(X_train), :]
X_test = combined.iloc[len(X_train):, :]

model = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

model.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [11]:
X_test = test_fe.copy()
model_features = model.feature_names_in_
for col in model_features:
    if col not in X_test.columns:
        X_test[col] = 0
X_test = X_test[model_features].apply(pd.to_numeric, errors='coerce').fillna(0)

print("\nPredicting on the test set...")
y_preds = model.predict_proba(X_test)[:, 1]
test_fe["pred"] = y_preds + 1e-6 * np.random.rand(len(y_preds))  # small jitter for tie-breaking

print("Formatting submission file...")
for col in ['id1', 'id2', 'id3', 'id5']:
    if col in test_fe.columns:
        test_fe[col] = test_fe[col].astype(str).str.strip()

submission = test_fe[['id1', 'id2', 'id3', 'id5', 'pred']]
submission.to_csv("submission.csv", index=False)

print(f"\n✅ Submission saved: submission.csv with shape {submission.shape}")
print(submission.head())



Predicting on the test set...


  test_fe["pred"] = y_preds + 1e-6 * np.random.rand(len(y_preds))  # small jitter for tie-breaking


Formatting submission file...

✅ Submission saved: submission.csv with shape (369301, 5)
                                               id1      id2     id3  \
0   1362907_91950_16-23_2023-11-04 18:56:26.000794  1362907   91950   
1      1082599_88356_16-23_2023-11-04 06:08:53.373  1082599   88356   
2  1888466_958700_16-23_2023-11-05 10:07:28.000725  1888466  958700   
3     1888971_795739_16-23_2023-11-04 12:25:28.244  1888971  795739   
4      1256369_82296_16-23_2023-11-05 06:45:26.657  1256369   82296   

          id5      pred  
0  2023-11-04  0.019711  
1  2023-11-04  0.015595  
2  2023-11-05  0.007623  
3  2023-11-04  0.072134  
4  2023-11-05  0.013924  


In [18]:
# Show rows where predicted probability is above 0.9
high_confidence_preds = test_fe[test_fe["pred"] > 0.5]

print(f"\n🔍 Number of high-confidence predictions (pred > 0.9): {len(high_confidence_preds)}")
print(high_confidence_preds[["id1", "id2", "id3", "id5", "pred"]].head(10))  # Show top 10 for preview



🔍 Number of high-confidence predictions (pred > 0.9): 297
                                                id1      id2     id3  \
343     1049108_30424_16-23_2023-11-04 11:38:46.817  1049108   30424   
971      1735932_7337_16-23_2023-11-05 17:00:59.528  1735932    7337   
3573    1489669_80893_16-23_2023-11-04 18:11:50.214  1489669   80893   
5226    1014253_95807_16-23_2023-11-05 23:45:58.409  1014253   95807   
7525    1489669_32049_16-23_2023-11-04 18:11:41.857  1489669   32049   
10026   1865930_61740_16-23_2023-11-04 14:25:56.400  1865930   61740   
15437   1489669_90984_16-23_2023-11-04 18:11:44.712  1489669   90984   
17030  1489669_920103_16-23_2023-11-05 21:42:14.705  1489669  920103   
17148  1639764_241382_16-23_2023-11-05 16:58:12.593  1639764  241382   
17587   1014253_25086_16-23_2023-11-05 23:48:38.110  1014253   25086   

              id5      pred  
343    2023-11-04  0.527222  
971    2023-11-05  0.517609  
3573   2023-11-04  0.656351  
5226   2023-11-05  0.609581 