In [1]:
import pandas as pd

# sub=pd.read_csv("/kaggle/input/amex-hack/685404e30cfdb_submission_template.csv")
# data=pd.read_csv('/kaggle/input/amex-hack/data_dictionary.csv')
train = pd.read_parquet("/kaggle/input/train-dataset/train_data.parquet")
# transactions = pd.read_parquet('/kaggle/input/datahumara/add_trans.parquet')
# events = pd.read_parquet('/kaggle/input/datahumara/add_event.parquet')
# offers = pd.read_parquet('/kaggle/input/datahumara/offer_metadata.parquet')
test = pd.read_parquet('/kaggle/input/test-dataset/test_data.parquet')

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# ========== TRAIN FUNCTION ==========
def preprocess_train(df: pd.DataFrame):
    df = df.copy()

    # ---- 1. Time Features ----
    df['DateTime'] = pd.to_datetime(df['id4'])
    df = df.sort_values(['id2', 'DateTime'])

    df['hour'] = df['DateTime'].dt.hour.astype('Int8')
    df['dayofweek'] = df['DateTime'].dt.dayofweek.astype('Int8')
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype('Int8')

    df['prev_time'] = df.groupby('id2')['DateTime'].shift(1)
    df['time_since_last_offer'] = (
        df['DateTime'] - df['prev_time']
    ).dt.total_seconds().fillna(-1).astype('float32')

    df['user_offer_count'] = df.groupby('id2').cumcount().add(1).astype('Int16')

    df = df.drop(columns=['DateTime', 'prev_time', 'id4', 'id5'], errors='ignore')

    # ---- 2. Drop Columns ----
    drop_cols = [
        "f13", "f14", "f15", "f16", "f17", "f18", "f19", "f20", "f21",
        "f34", "f80", "f84", "f112", "f120", "f122", "f135", "f136", "f360"
    ]
    df = df.drop(columns=drop_cols, errors='ignore')

    # ---- 3. Label Encoding ----
    binary_label_encode = ['f50', 'f52', 'f53', 'f354']
    label_encoders = {}
    for col in binary_label_encode:
        if col in df.columns:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
            label_encoders[col] = le

    # ---- 4. One-Hot Encoding ----
    multi_class_string = ['f54', 'f56', 'f55', 'f57', 'f42']
    multi_class_numeric = ['f48', 'f349']
    one_hot_cols = multi_class_string + multi_class_numeric
    df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)
    one_hot_columns = df.columns.tolist()

    # ---- 5. Boolean Features ----
    hot_encoded_cols = [f"f{i}" for i in range(226, 310)]
    for col in hot_encoded_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').astype(bool)

    # ---- 6. Type Casting for Numerics ----
    int_cols = ['id2', 'id3', 'y', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f43', 'f44', 'f45', 'f46', 'f47', 'f49', 'f51', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f81', 'f82', 'f83', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90', 'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f102', 'f103', 'f104', 'f105', 'f106', 'f107', 'f108', 'f109', 'f110', 'f111', 'f113', 'f114', 'f115', 'f116', 'f117', 'f118', 'f119', 'f121', 'f123', 'f124', 'f125', 'f126', 'f127', 'f128', 'f129', 'f130', 'f131', 'f132', 'f133', 'f134', 'f137', 'f138', 'f139', 'f140', 'f141', 'f142', 'f143', 'f144', 'f145', 'f146', 'f147', 'f148', 'f149', 'f150', 'f151', 'f152', 'f153', 'f154', 'f155', 'f156', 'f157', 'f158', 'f159', 'f160', 'f161', 'f162', 'f163', 'f164', 'f165', 'f166', 'f167', 'f168', 'f169', 'f170', 'f171', 'f172', 'f173', 'f174', 'f175', 'f176', 'f177', 'f178', 'f179', 'f180', 'f181', 'f182', 'f183', 'f184', 'f185', 'f186', 'f187', 'f188', 'f189', 'f190', 'f191', 'f192', 'f193', 'f194', 'f195', 'f196', 'f197', 'f198', 'f199', 'f200', 'f201', 'f202', 'f203', 'f204', 'f205', 'f206', 'f207', 'f208', 'f209', 'f210', 'f211', 'f212', 'f213', 'f214', 'f215', 'f216', 'f217', 'f218', 'f219', 'f220', 'f221', 'f222', 'f223', 'f224', 'f225', 'f310', 'f311', 'f312', 'f313', 'f314', 'f315', 'f316', 'f317', 'f318', 'f319', 'f320', 'f321', 'f322', 'f323', 'f324', 'f325', 'f326', 'f327', 'f328', 'f329', 'f330', 'f331', 'f332', 'f333', 'f334', 'f335', 'f336', 'f337', 'f338', 'f339', 'f340', 'f341', 'f342', 'f343', 'f344', 'f345', 'f346', 'f347', 'f348', 'f350', 'f351', 'f352', 'f353', 'f355', 'f356', 'f357', 'f358', 'f359', 'f361', 'f362', 'f363', 'f364', 'f365', 'f366']

    for col in int_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').astype('float32')

    # ---- 7. Cast Target ----
    if 'y' in df.columns:
        df['y'] = df['y'].astype('int8')

    # ---- 8. y-based Features ----
    y_based_features = []
    if 'y' in df.columns:
        df['user_click_count'] = (
            df.groupby('id2')['y'].cumsum().astype('Int16')
        )
        df['user_ctr_so_far'] = (
            df['user_click_count'] / df['user_offer_count']
        ).replace([np.inf, np.nan], 0).astype('float32')
        y_based_features += ['user_click_count', 'user_ctr_so_far']

    # ---- 9. Time-Based Expanding User Stats (y-independent) ----
    df['user_avg_time_gap'] = (
        df.groupby('id2')['time_since_last_offer']
        .expanding()
        .mean()
        .reset_index(level=0, drop=True)
        .fillna(-1)
    ).astype('float32')

    df['user_avg_hour'] = (
        df.groupby('id2')['hour']
        .expanding()
        .mean()
        .reset_index(level=0, drop=True)
    ).astype('float32')

    df['user_weekend_rate'] = (
        df.groupby('id2')['is_weekend']
        .expanding()
        .mean()
        .reset_index(level=0, drop=True)
    ).astype('float32')

    return df, label_encoders, one_hot_columns, y_based_features


In [4]:
train, label_encoders, one_hot_columns, y_based_features = preprocess_train(train)

  df['user_click_count'] = (
  df['user_ctr_so_far'] = (
  df['user_avg_time_gap'] = (
  df['user_avg_hour'] = (
  df['user_weekend_rate'] = (


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 770164 entries, 575716 to 217226
Columns: 384 entries, id1 to user_weekend_rate
dtypes: Int16(2), Int8(3), bool(113), float32(260), int64(4), int8(1), object(1)
memory usage: 891.7+ MB


In [6]:
train = train.drop('id1', axis = 1)

In [7]:
train

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id2,id3,y,f1,f2,f3,f4,f5,f6,f7,...,f48_3.0,f48_4.0,f349_4,f349_5,f349_6,user_click_count,user_ctr_so_far,user_avg_time_gap,user_avg_hour,user_weekend_rate
575716,1000043.0,93516.0,0,,,,,41.0,32.0,,...,False,False,False,False,True,0,0.0,-1.0000,13.0,0.0
575715,1000043.0,69026.0,0,,,,,41.0,32.0,,...,False,False,False,False,True,0,0.0,-0.3160,13.0,0.0
323212,1000078.0,622070.0,0,,,,,,,,...,False,False,False,True,False,0,0.0,-1.0000,3.0,0.0
258665,1000099.0,209183.0,0,40.0,35.0,,,46.0,44.0,50.0,...,False,False,False,False,True,0,0.0,-1.0000,8.0,0.0
65052,1000101.0,69479.0,0,,,,,,,,...,True,False,False,True,False,0,0.0,-1.0000,12.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763103,1910542.0,923960.0,0,30.0,39.0,,,99.0,94.0,34.0,...,False,False,False,False,True,0,0.0,-0.4470,8.0,0.0
763102,1910542.0,88935.0,0,30.0,39.0,,,99.0,94.0,34.0,...,False,False,False,False,True,0,0.0,0.0350,8.0,0.0
87878,1910565.0,415582.0,0,,,,,24.0,13.0,19.0,...,True,False,True,False,False,0,0.0,-1.0000,11.0,0.0
87877,1910565.0,2486.0,0,,,,,24.0,13.0,19.0,...,True,False,True,False,False,0,0.0,-0.4505,11.0,0.0


In [8]:
def preprocess_test(df: pd.DataFrame, label_encoders: dict, one_hot_columns: list, y_based_features: list):
    df = df.copy()

    # ---- 1. Time Features ----
    df['DateTime'] = pd.to_datetime(df['id4'])
    df = df.sort_values(['id2', 'DateTime'])

    df['hour'] = df['DateTime'].dt.hour.astype('Int8')
    df['dayofweek'] = df['DateTime'].dt.dayofweek.astype('Int8')
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype('Int8')

    df['prev_time'] = df.groupby('id2')['DateTime'].shift(1)
    df['time_since_last_offer'] = (
        df['DateTime'] - df['prev_time']
    ).dt.total_seconds().fillna(-1).astype('float32')

    df['user_offer_count'] = df.groupby('id2').cumcount().add(1).astype('Int16')

    df = df.drop(columns=['DateTime', 'prev_time', 'id4', 'id5'], errors='ignore')

    # ---- 2. Drop Columns ----
    drop_cols = [
        "f13", "f14", "f15", "f16", "f17", "f18", "f19", "f20", "f21",
        "f34", "f80", "f84", "f112", "f120", "f122", "f135", "f136", "f360"
    ]
    df = df.drop(columns=drop_cols, errors='ignore')

    # ---- 3. Label Encoding ----
    for col, le in label_encoders.items():
        if col in df.columns:
            df[col] = le.transform(df[col].astype(str))

    # ---- 4. One-Hot Encoding ----
    multi_class_string = ['f54', 'f56', 'f55', 'f57', 'f42']
    multi_class_numeric = ['f48', 'f349']
    one_hot_cols = multi_class_string + multi_class_numeric
    df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)

    # ---- 5. Align Columns to Training ----
    df = df.reindex(columns=one_hot_columns, fill_value=0)

    # ---- 6. Boolean Features ----
    hot_encoded_cols = [f"f{i}" for i in range(226, 310)]
    for col in hot_encoded_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').astype(bool)

    # ---- 7. Type Casting ----
    int_cols = ['id2', 'id3', 'y', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f43', 'f44', 'f45', 'f46', 'f47', 'f49', 'f51', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f81', 'f82', 'f83', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90', 'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f102', 'f103', 'f104', 'f105', 'f106', 'f107', 'f108', 'f109', 'f110', 'f111', 'f113', 'f114', 'f115', 'f116', 'f117', 'f118', 'f119', 'f121', 'f123', 'f124', 'f125', 'f126', 'f127', 'f128', 'f129', 'f130', 'f131', 'f132', 'f133', 'f134', 'f137', 'f138', 'f139', 'f140', 'f141', 'f142', 'f143', 'f144', 'f145', 'f146', 'f147', 'f148', 'f149', 'f150', 'f151', 'f152', 'f153', 'f154', 'f155', 'f156', 'f157', 'f158', 'f159', 'f160', 'f161', 'f162', 'f163', 'f164', 'f165', 'f166', 'f167', 'f168', 'f169', 'f170', 'f171', 'f172', 'f173', 'f174', 'f175', 'f176', 'f177', 'f178', 'f179', 'f180', 'f181', 'f182', 'f183', 'f184', 'f185', 'f186', 'f187', 'f188', 'f189', 'f190', 'f191', 'f192', 'f193', 'f194', 'f195', 'f196', 'f197', 'f198', 'f199', 'f200', 'f201', 'f202', 'f203', 'f204', 'f205', 'f206', 'f207', 'f208', 'f209', 'f210', 'f211', 'f212', 'f213', 'f214', 'f215', 'f216', 'f217', 'f218', 'f219', 'f220', 'f221', 'f222', 'f223', 'f224', 'f225', 'f310', 'f311', 'f312', 'f313', 'f314', 'f315', 'f316', 'f317', 'f318', 'f319', 'f320', 'f321', 'f322', 'f323', 'f324', 'f325', 'f326', 'f327', 'f328', 'f329', 'f330', 'f331', 'f332', 'f333', 'f334', 'f335', 'f336', 'f337', 'f338', 'f339', 'f340', 'f341', 'f342', 'f343', 'f344', 'f345', 'f346', 'f347', 'f348', 'f350', 'f351', 'f352', 'f353', 'f355', 'f356', 'f357', 'f358', 'f359', 'f361', 'f362', 'f363', 'f364', 'f365', 'f366']

    for col in int_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').astype('float32')

    # ---- 8. Time-Based Expanding User Stats (only y-independent)
    df['user_avg_time_gap'] = (
        df.groupby('id2')['time_since_last_offer']
        .expanding()
        .mean()
        .reset_index(level=0, drop=True)
        .fillna(-1)
    ).astype('float32')

    df['user_avg_hour'] = (
        df.groupby('id2')['hour']
        .expanding()
        .mean()
        .reset_index(level=0, drop=True)
    ).astype('float32')

    df['user_weekend_rate'] = (
        df.groupby('id2')['is_weekend']
        .expanding()
        .mean()
        .reset_index(level=0, drop=True)
    ).astype('float32')

    # ---- 9. Fill y-based Features with 0 ----
    for col in y_based_features:
        df[col] = 0.0

    return df


In [9]:
sub_df = test[['id1', 'id2', 'id3', 'id5']].copy()

In [10]:
test = preprocess_test(test, label_encoders, one_hot_columns, y_based_features)

  df['user_avg_time_gap'] = (
  df['user_avg_hour'] = (
  df['user_weekend_rate'] = (
  df[col] = 0.0
  df[col] = 0.0


In [11]:
test

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id1,id2,id3,y,f1,f2,f3,f4,f5,f6,...,f48_3.0,f48_4.0,f349_4,f349_5,f349_6,user_avg_time_gap,user_avg_hour,user_weekend_rate,user_click_count,user_ctr_so_far
138784,1000061_9914_16-23_2023-11-05 09:11:35.557,1000061.0,9914.0,0.0,,,,,,,...,False,False,0,0,False,-1.000000,9.000000,1.0,0.0,0.0
138780,1000061_23690_16-23_2023-11-05 09:11:36.193,1000061.0,23690.0,0.0,,,,,,,...,False,False,0,0,False,-0.182000,9.000000,1.0,0.0,0.0
138778,1000061_522188_16-23_2023-11-05 09:11:37.242,1000061.0,522188.0,0.0,,,,,,,...,False,False,0,0,False,0.228333,9.000000,1.0,0.0,0.0
138799,1000061_5420674_16-23_2023-11-05 09:28:04.153,1000061.0,5420674.0,0.0,,,,,,,...,False,False,0,0,False,246.899002,9.000000,1.0,0.0,0.0
138779,1000061_27945_16-23_2023-11-05 09:28:04.157,1000061.0,27945.0,0.0,,,,,,,...,False,False,0,0,False,197.520004,9.000000,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86055,1910523_65430_16-23_2023-11-04 08:24:17.803,1910523.0,65430.0,0.0,49.0,49.0,,,100.0,92.0,...,False,False,0,0,False,-1.000000,8.000000,1.0,0.0,0.0
86054,1910523_57342_16-23_2023-11-04 08:24:17.804,1910523.0,57342.0,0.0,49.0,49.0,,,100.0,92.0,...,False,False,0,0,False,-0.499500,8.000000,1.0,0.0,0.0
88841,1910523_65699008_16-23_2023-11-05 06:26:41.492,1910523.0,65699008.0,0.0,49.0,49.0,,,100.0,92.0,...,False,False,0,0,False,26447.562500,7.333333,1.0,0.0,0.0
88842,1910523_735023_16-23_2023-11-05 06:26:42.053,1910523.0,735023.0,0.0,49.0,49.0,,,100.0,92.0,...,False,False,0,0,False,19835.812500,7.000000,1.0,0.0,0.0


In [12]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 369301 entries, 138784 to 150670
Columns: 384 entries, id1 to user_ctr_so_far
dtypes: Int16(1), Int8(3), bool(111), float32(260), float64(2), int64(6), object(1)
memory usage: 436.7+ MB


In [13]:
test = test.drop('id1', axis = 1)

In [14]:
test = test.drop('y', axis = 1)

In [15]:
test

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id2,id3,f1,f2,f3,f4,f5,f6,f7,f8,...,f48_3.0,f48_4.0,f349_4,f349_5,f349_6,user_avg_time_gap,user_avg_hour,user_weekend_rate,user_click_count,user_ctr_so_far
138784,1000061.0,9914.0,,,,,,,,,...,False,False,0,0,False,-1.000000,9.000000,1.0,0.0,0.0
138780,1000061.0,23690.0,,,,,,,,,...,False,False,0,0,False,-0.182000,9.000000,1.0,0.0,0.0
138778,1000061.0,522188.0,,,,,,,,,...,False,False,0,0,False,0.228333,9.000000,1.0,0.0,0.0
138799,1000061.0,5420674.0,,,,,,,,,...,False,False,0,0,False,246.899002,9.000000,1.0,0.0,0.0
138779,1000061.0,27945.0,,,,,,,,,...,False,False,0,0,False,197.520004,9.000000,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86055,1910523.0,65430.0,49.0,49.0,,,100.0,92.0,84.0,46.0,...,False,False,0,0,False,-1.000000,8.000000,1.0,0.0,0.0
86054,1910523.0,57342.0,49.0,49.0,,,100.0,92.0,84.0,46.0,...,False,False,0,0,False,-0.499500,8.000000,1.0,0.0,0.0
88841,1910523.0,65699008.0,49.0,49.0,,,100.0,92.0,84.0,46.0,...,False,False,0,0,False,26447.562500,7.333333,1.0,0.0,0.0
88842,1910523.0,735023.0,49.0,49.0,,,100.0,92.0,84.0,46.0,...,False,False,0,0,False,19835.812500,7.000000,1.0,0.0,0.0


In [16]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
)

# Features and target
X = train.drop(columns='y')
y = train['y']

# Split into training and validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),  # similar to class_weight='balanced'
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
model.fit(X_train, y_train)

# Predict
y_proba = model.predict_proba(X_val)[:, 1]
y_pred = model.predict(X_val)

# Evaluation
print("----- XGBoost Results -----")
print(f"ROC-AUC Score: {roc_auc_score(y_val, y_proba):.4f}")
print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
print(f"Precision: {precision_score(y_val, y_pred):.4f}")
print(f"Recall: {recall_score(y_val, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_val, y_pred):.4f}")

----- XGBoost Results -----
ROC-AUC Score: 0.9930
Accuracy: 0.9740
Precision: 0.6678
Recall: 0.9082
F1 Score: 0.7697


In [17]:
# Combine back into val_data to create `val_data` with predictions
val_data = X_val.copy()
val_data["y"] = y_val
val_data["id2"] = train.loc[X_val.index, "id2"]
val_data["id3"] = train.loc[X_val.index, "id3"]
val_data["pred_proba"] = model.predict_proba(X_val)[:, 1]

In [18]:
def mapk(actual, predicted, k=7):
    """
    Computes the mean average precision at k.
    `actual`: dict with keys as user IDs and values as clicked offer ID
    `predicted`: dict with keys as user IDs and values as list of predicted offer IDs (ranked)
    """
    def apk(actual_id, predicted_list, k):
        if actual_id not in predicted_list[:k]:
            return 0.0
        return 1.0 / (predicted_list[:k].index(actual_id) + 1)

    total_score = 0.0
    count = 0
    for user in actual:
        if user in predicted:
            total_score += apk(actual[user], predicted[user], k)
            count += 1
    return total_score / count if count else 0.0
# Build ground truth from validation data
ground_truth = val_data[val_data['y'] == 1].set_index('id2')['id3'].to_dict()

# Generate prediction ranks
predicted_ranks = (
    val_data[["id2", "id3", "pred_proba"]]
    .sort_values(by=["id2", "pred_proba"], ascending=[True, False])
    .groupby("id2")["id3"]
    .apply(list)
    .to_dict()
)

# Calculate MAP@7
score = mapk(actual=ground_truth, predicted=predicted_ranks, k=7)
print(f"MAP@7 Score: {score:.5f}")

MAP@7 Score: 0.67835


In [19]:
import joblib

# Save the model
joblib.dump(model, 'final_model_higher_score.pkl')

['final_model_higher_score.pkl']

In [20]:
import joblib
model = joblib.load('/kaggle/input/last_model/other/default/1/final_model_higher_score.pkl')

In [21]:
print("Columns in test but not in train:")
print(set(test.columns) - set(model.feature_names_in_))

print("\nColumns in train but not in test:")
print(set(model.feature_names_in_) - set(test.columns))


Columns in test but not in train:
set()

Columns in train but not in test:
set()


In [22]:
# Align test columns to training set
test = test[model.feature_names_in_]

In [23]:
test['id2'].info()

<class 'pandas.core.series.Series'>
Index: 369301 entries, 138784 to 150670
Series name: id2
Non-Null Count   Dtype  
--------------   -----  
369301 non-null  float32
dtypes: float32(1)
memory usage: 4.2 MB


In [24]:
test['id2'].head()

138784    1000061.0
138780    1000061.0
138778    1000061.0
138799    1000061.0
138779    1000061.0
Name: id2, dtype: float32

In [25]:
test['id3'].head()

138784       9914.0
138780      23690.0
138778     522188.0
138799    5420674.0
138779      27945.0
Name: id3, dtype: float32

In [26]:
test['id2'] = test['id2'].astype(np.float32)
test['id3'] = test['id3'].astype(np.float32)

In [27]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 369301 entries, 138784 to 150670
Columns: 382 entries, id2 to user_weekend_rate
dtypes: Int16(1), Int8(3), bool(111), float32(259), float64(2), int64(6)
memory usage: 432.5 MB


In [28]:
test['pred'] = model.predict_proba(test)[:, 1]

  test['pred'] = model.predict_proba(test)[:, 1]


In [29]:
sub_df

Unnamed: 0,id1,id2,id3,id5
46756,1362907_91950_16-23_2023-11-04 18:56:26.000794,1362907,91950,2023-11-04
57819,1082599_88356_16-23_2023-11-04 06:08:53.373,1082599,88356,2023-11-04
15390,1888466_958700_16-23_2023-11-05 10:07:28.000725,1888466,958700,2023-11-05
145730,1888971_795739_16-23_2023-11-04 12:25:28.244,1888971,795739,2023-11-04
146085,1256369_82296_16-23_2023-11-05 06:45:26.657,1256369,82296,2023-11-05
...,...,...,...,...
1806,1874443_95537_16-23_2023-11-05 09:21:24.182,1874443,95537,2023-11-05
127494,1541978_5718_16-23_2023-11-05 00:56:43.946,1541978,5718,2023-11-05
106947,1887841_85905_16-23_2023-11-05 20:40:43.312,1887841,85905,2023-11-05
158372,1569367_944713_16-23_2023-11-05 00:43:04.335,1569367,944713,2023-11-05


In [30]:
test

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id2,id3,f1,f2,f3,f4,f5,f6,f7,f8,...,f48_4.0,f349_4,f349_5,f349_6,user_click_count,user_ctr_so_far,user_avg_time_gap,user_avg_hour,user_weekend_rate,pred
138784,1000061.0,9914.0,,,,,,,,,...,False,0,0,False,0.0,0.0,-1.000000,9.000000,1.0,0.000011
138780,1000061.0,23690.0,,,,,,,,,...,False,0,0,False,0.0,0.0,-0.182000,9.000000,1.0,0.000006
138778,1000061.0,522188.0,,,,,,,,,...,False,0,0,False,0.0,0.0,0.228333,9.000000,1.0,0.000025
138799,1000061.0,5420674.0,,,,,,,,,...,False,0,0,False,0.0,0.0,246.899002,9.000000,1.0,0.000021
138779,1000061.0,27945.0,,,,,,,,,...,False,0,0,False,0.0,0.0,197.520004,9.000000,1.0,0.000031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86055,1910523.0,65430.0,49.0,49.0,,,100.0,92.0,84.0,46.0,...,False,0,0,False,0.0,0.0,-1.000000,8.000000,1.0,0.000009
86054,1910523.0,57342.0,49.0,49.0,,,100.0,92.0,84.0,46.0,...,False,0,0,False,0.0,0.0,-0.499500,8.000000,1.0,0.000004
88841,1910523.0,65699008.0,49.0,49.0,,,100.0,92.0,84.0,46.0,...,False,0,0,False,0.0,0.0,26447.562500,7.333333,1.0,0.000004
88842,1910523.0,735023.0,49.0,49.0,,,100.0,92.0,84.0,46.0,...,False,0,0,False,0.0,0.0,19835.812500,7.000000,1.0,0.000004


In [31]:
# Only if the floats are integers (e.g., 123.0)
test['id2'] = test['id2'].astype(float).astype(int).astype(str)
test['id3'] = test['id3'].astype(float).astype(int).astype(str)

In [32]:
# Create a combined column in both DataFrames
test['id2_id3'] = test['id2'].astype(str) + '_' + test['id3'].astype(str)
sub_df['id2_id3'] = sub_df['id2'].astype(str) + '_' + sub_df['id3'].astype(str)

# Create a mapping from df1
prob_map = test.set_index('id2_id3')['pred'].to_dict()

# Apply the mapping to df2
sub_df['pred'] = sub_df['id2_id3'].map(prob_map)

  test['id2_id3'] = test['id2'].astype(str) + '_' + test['id3'].astype(str)


In [33]:
sub_df = sub_df.drop('id2_id3', axis = 1)

In [34]:
sub_df

Unnamed: 0,id1,id2,id3,id5,pred
46756,1362907_91950_16-23_2023-11-04 18:56:26.000794,1362907,91950,2023-11-04,0.000134
57819,1082599_88356_16-23_2023-11-04 06:08:53.373,1082599,88356,2023-11-04,0.000012
15390,1888466_958700_16-23_2023-11-05 10:07:28.000725,1888466,958700,2023-11-05,0.000112
145730,1888971_795739_16-23_2023-11-04 12:25:28.244,1888971,795739,2023-11-04,0.000005
146085,1256369_82296_16-23_2023-11-05 06:45:26.657,1256369,82296,2023-11-05,0.000003
...,...,...,...,...,...
1806,1874443_95537_16-23_2023-11-05 09:21:24.182,1874443,95537,2023-11-05,0.000013
127494,1541978_5718_16-23_2023-11-05 00:56:43.946,1541978,5718,2023-11-05,0.000016
106947,1887841_85905_16-23_2023-11-05 20:40:43.312,1887841,85905,2023-11-05,0.000031
158372,1569367_944713_16-23_2023-11-05 00:43:04.335,1569367,944713,2023-11-05,0.000020


In [35]:
sub_df = (
    sub_df
    .sort_values(['id2', 'pred'], ascending=[True, False])
    .reset_index(drop=True)
)

In [36]:
sub_df

Unnamed: 0,id1,id2,id3,id5,pred
0,1000061_1802_16-23_2023-11-05 09:47:54.545,1000061,1802,2023-11-05,0.000097
1,1000061_62395_16-23_2023-11-05 09:28:07.805,1000061,62395,2023-11-05,0.000067
2,1000061_403431_16-23_2023-11-05 09:28:10.592,1000061,403431,2023-11-05,0.000062
3,1000061_72292_16-23_2023-11-05 09:28:07.697,1000061,72292,2023-11-05,0.000054
4,1000061_944713_16-23_2023-11-05 09:28:12.529,1000061,944713,2023-11-05,0.000031
...,...,...,...,...,...
369296,1910523_65430_16-23_2023-11-04 08:24:17.803,1910523,65430,2023-11-04,0.000009
369297,1910523_65699008_16-23_2023-11-05 06:26:41.492,1910523,65699008,2023-11-05,0.000004
369298,1910523_57342_16-23_2023-11-04 08:24:17.804,1910523,57342,2023-11-04,0.000004
369299,1910523_735023_16-23_2023-11-05 06:26:42.053,1910523,735023,2023-11-05,0.000004


In [37]:
sub_df.to_csv('r2_submission_fileModel-Citizens.csv', index=False)