In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score, roc_auc_score
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GroupShuffleSplit
import itertools
from torch.utils.data import DataLoader

In [2]:
# read the files changes path to where the data is stored
test = pd.read_csv(r"C:\Users\youpz\Documents\Master\P5\Data mining techniques\Assignment2\data\dmt-2025-2nd-assignment\test_set_VU_DM.csv")
train = pd.read_csv(r"C:\Users\youpz\Documents\Master\P5\Data mining techniques\Assignment2\data\dmt-2025-2nd-assignment\training_set_VU_DM.csv")
sample = pd.read_csv(r"C:\Users\youpz\Documents\Master\P5\Data mining techniques\Assignment2\data\dmt-2025-2nd-assignment\submission_sample.csv")

In [3]:
def missing_values_table(df):
    """Returns a DataFrame with missing counts and percent missing for each column."""
    n = len(df)
    missing_count   = df.isna().sum()
    missing_percent = 100 * missing_count / n
    missing_df = (
        pd.DataFrame({
            'missing_count':   missing_count,
            'missing_percent': missing_percent
        })
        .sort_values('missing_percent', ascending=False)
    )
    return missing_df

In [4]:
mv = missing_values_table(train)
print(mv.head(20))       # top 20 most‐missing columns
# Or to filter down to “lots” of missing, say >30%:
print(mv[mv['missing_percent'] > 30])

                           missing_count  missing_percent
comp1_rate_percent_diff          4863908        98.095353
comp6_rate_percent_diff          4862173        98.060362
comp1_rate                       4838417        97.581250
comp1_inv                        4828788        97.387053
comp4_rate_percent_diff          4827261        97.356256
gross_bookings_usd               4819957        97.208949
comp7_rate_percent_diff          4819832        97.206428
comp6_rate                       4718190        95.156511
visitor_hist_starrating          4706481        94.920364
visitor_hist_adr_usd             4705359        94.897735
comp6_inv                        4697371        94.736633
comp4_rate                       4650969        93.800797
comp7_rate                       4642999        93.640058
srch_query_affinity_score        4640941        93.598552
comp4_inv                        4614684        93.069001
comp7_inv                        4601925        92.811677
comp3_rate_per

In [5]:
########################## some beginning on the feature engineering


def preprocess_missing_and_competitors(train_df, test_df):
    # 1) Drop features with >93% missing or that leak the target
    drop_cols = [
        # competitor 1,4,6,7 are ~97–98% missing → too sparse to learn
        *[f'comp{i}_{t}' for i in [1,4,6,7] for t in ['rate','inv','rate_percent_diff']],
        'gross_bookings_usd'  # only in train, leaks booking price
    ]
    train_df.drop(columns=drop_cols, errors='ignore', inplace=True)
    test_df.drop(columns=drop_cols, errors='ignore', inplace=True)

    # 2) Impute & flag user history features
    #    Missing means “no prior purchases” → keep with sentinel + flag
    for df in (train_df, test_df):
        # visitor_hist_starrating
        df['hist_star_na'] = df['visitor_hist_starrating'].isna().astype(int)
        # fill with median starrating across users
        star_med = train_df['visitor_hist_starrating'].median()
        df['visitor_hist_starrating'] = df['visitor_hist_starrating'].fillna(star_med)

        # visitor_hist_adr_usd (avg USD spend)
        df['hist_adr_na'] = df['visitor_hist_adr_usd'].isna().astype(int)
        adr_med = train_df['visitor_hist_adr_usd'].median()
        df['visitor_hist_adr_usd'] = df['visitor_hist_adr_usd'].fillna(adr_med)

    # 3) Impute & flag affinity score
    #    Null means “hotel never seen” → fill with global minimum and flag
    affinity_min = train_df['srch_query_affinity_score'].min(skipna=True)
    for df in (train_df, test_df):
        df['affinity_na'] = df['srch_query_affinity_score'].isna().astype(int)
        df['srch_query_affinity_score'] = (
            df['srch_query_affinity_score']
            .fillna(affinity_min)
        )

    # 4) Keep & impute competitor 2,3,5,8 features (~50–90% missing)
    #    Null → “no data” sentinel (for categorical) or 0 (for percent diff), plus flag
    keep_comps = [2,3,5,8]
    for i in keep_comps:
        # availability flag
        inv_col = f'comp{i}_inv'
        flag_col = f'comp{i}_inv_na'
        for df in (train_df, test_df):
            df[flag_col] = df[inv_col].isna().astype(int)
            # fill null with 2 (new category: 0=no avail,1=avail,2=no data)
            df[inv_col] = df[inv_col].fillna(2).astype(int)

        # price‐compare flag
        rate_col = f'comp{i}_rate'
        rate_flag = f'comp{i}_rate_na'
        for df in (train_df, test_df):
            df[rate_flag] = df[rate_col].isna().astype(int)
            # fill null as “no data” = 2
            df[rate_col] = df[rate_col].fillna(2).astype(int)

        # percent_diff
        pdiff_col = f'comp{i}_rate_percent_diff'
        pdiff_flag = f'comp{i}_pdiff_na'
        for df in (train_df, test_df):
            df[pdiff_flag] = df[pdiff_col].isna().astype(int)
            # fill null as 0% diff (no info)
            df[pdiff_col] = df[pdiff_col].fillna(0.0)

    # 5) Bucket orig_destination_distance
    #    Missing → sentinel bucket + flag
    for df in (train_df, test_df):
        df['dist_na'] = df['orig_destination_distance'].isna().astype(int)
        df['orig_destination_distance'] = (
            df['orig_destination_distance'].fillna(-1)
        )
        # define bins (in km)
        bins = [-1, 0, 10, 50, 200, np.inf]
        labels = ['missing','0-10km','10-50km','50-200km','200km+']
        df['dist_bucket'] = pd.cut(
            df['orig_destination_distance'],
            bins=bins, labels=labels
        )

    return train_df, test_df

def create_base_features(df):
    """1) Parse datetime & basic price/historical features."""
    df = df.copy()
    # --- Date/time splits ---
    df['date_time']   = pd.to_datetime(df['date_time'])
    df['search_year'] = df['date_time'].dt.year
    df['search_month']= df['date_time'].dt.month
    df['search_day']  = df['date_time'].dt.day
    df['search_hour'] = df['date_time'].dt.hour

    # --- Price per night & hist price devation ---
    df['price_per_night'] = df['price_usd'] / df['srch_length_of_stay']
    df['price_vs_historical'] = df['price_usd'] - df['prop_log_historical_price']
    df['price_vs_historical'].fillna(0, inplace=True)
    return df

def add_destination_stats(train_df, test_df):
    """6) Dest‑level total searches & booking rate."""
    dest = (
        train_df
        .groupby('srch_destination_id')
        .agg(dest_searches=('srch_id','count'),
             dest_bookings=('booking_bool','sum'))
        .assign(dest_booking_rate=lambda x: x.dest_bookings / x.dest_searches)
        .reset_index()
    )
    # Reassign merge result back to each DataFrame
    train_df = train_df.merge(
        dest[['srch_destination_id','dest_searches','dest_booking_rate']],
        on='srch_destination_id', how='left'
    )
    test_df = test_df.merge(
        dest[['srch_destination_id','dest_searches','dest_booking_rate']],
        on='srch_destination_id', how='left'
    )
    return train_df, test_df

def add_within_search_features(df):
    """7) Z‑scores & deltas in each search group."""
    grp = df.groupby('srch_id')
    # price
    df['price_mean_srch'] = grp['price_usd'].transform('mean')
    df['price_std_srch']  = grp['price_usd'].transform('std').fillna(1)
    df['price_zscore']    = (df['price_usd'] - df['price_mean_srch']) / df['price_std_srch']
    # stars
    df['star_mean_srch']  = grp['prop_starrating'].transform('mean')
    df['star_delta_srch'] = df['prop_starrating'] - df['star_mean_srch']
    # user delta
    df['star_delta_user'] = df['prop_starrating'] - df['visitor_hist_starrating']
    # distance
    df['dist_mean_srch']  = grp['orig_destination_distance'].transform('mean')
    df['dist_std_srch']   = grp['orig_destination_distance'].transform('std').fillna(1)
    df['dist_zscore']     = (df['orig_destination_distance'] - df['dist_mean_srch']) / df['dist_std_srch']
    return df

def add_temporal_features(df):
    """8) Weekday/weekend & check‑in weekend flags."""
    # day‑of‑week for search
    df['search_dow'] = df['date_time'].dt.weekday  # 0=Mon…6=Sun
    df['is_search_weekend'] = df['search_dow'].isin([5,6]).astype(int)
    # approximate check‑in day
    checkin = df['date_time'] + pd.to_timedelta(df['srch_booking_window'], 'D')
    df['checkin_dow'] = checkin.dt.weekday
    df['is_checkin_weekend'] = df['checkin_dow'].isin([5,6]).astype(int)
    return df

def add_ranks(df):
    """9) Dense ranks of price, star & distance within each search."""
    df['price_rank'] = df.groupby('srch_id')['price_usd'].rank('dense', ascending=True)
    df['star_rank']  = df.groupby('srch_id')['prop_starrating'].rank('dense', ascending=False)
    df['dist_rank']  = df.groupby('srch_id')['orig_destination_distance'].rank('dense', ascending=True)
    return df

def one_hot_encode_columns(train_df, test_df, columns_to_encode):
    """
    Converts specified string columns in train and test DataFrames to one-hot encoded features.

    Args:
        train_df (pd.DataFrame): The training DataFrame.
        test_df (pd.DataFrame): The testing DataFrame.
        columns_to_encode (list): A list of column names (strings) to be one-hot encoded.

    Returns:
        tuple: A tuple containing the modified training and testing DataFrames with one-hot encoded columns.
    """
    train_processed = train_df.copy()
    test_processed = test_df.copy()

    for col in columns_to_encode:
        if col in train_processed.columns and col in test_processed.columns:
            # Get unique values from both train and test to ensure consistent encoding
            all_unique_values = pd.concat([train_processed[col], test_processed[col]]).unique()

            for value in all_unique_values:
                train_processed[f'{col}_{value}'] = (train_processed[col] == value).astype(int)
                test_processed[f'{col}_{value}'] = (test_processed[col] == value).astype(int)

            # Drop the original categorical column
            train_processed.drop(columns=[col], inplace=True)
            test_processed.drop(columns=[col], inplace=True)
        else:
            print(f"Warning: Column '{col}' not found in both train and test DataFrames. Skipping one-hot encoding for this column.")

    return train_processed, test_processed

In [6]:
class ExpediaDataset(Dataset):
    def __init__(self, X, y):
        # X: numpy array of shape (n_samples, n_features)
        # y: array-like of shape (n_samples,)
        self.X = torch.tensor(np.asarray(X), dtype=torch.float32)
        self.y = torch.tensor(np.asarray(y), dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class DeepRecommender(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.3),

            nn.Linear(128, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.2),

            nn.Linear(64, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Dropout(0.1),

            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.net(x).squeeze(1)

# Training loop
optimizer = torch.optim.Adam
criterion = nn.BCEWithLogitsLoss()

def train_nn(model, train_loader, val_loader, epochs=3, lr=1e-3, device='cpu'):
    model.to(device)
    opt = optimizer(model.parameters(), lr=lr)
    for epoch in range(epochs):
        model.train()
        for Xb, yb in train_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            opt.zero_grad()
            loss = criterion(model(Xb), yb)
            loss.backward(); opt.step()
        model.eval()
        val_loss = np.mean([criterion(model(Xv.to(device)), yv.to(device)).item()
                             for Xv, yv in val_loader])
        print(f"Epoch {epoch+1}/{epochs} - Val Loss: {val_loss:.4f}")
    return model.cpu()

# Prediction function

def predict_nn(model, loader, device='cpu'):
    model.to(device).eval()
    preds = []
    with torch.no_grad():
        for Xb, _ in loader:
            preds.extend(torch.sigmoid(model(Xb.to(device))).cpu().numpy())
    return np.array(preds)
def listnet_loss(scores, labels, group_ids):
    """
    Listwise softmax cross-entropy (ListNet) over sessions.
    scores:    Tensor of shape (batch,)
    labels:    Tensor of shape (batch,)
    group_ids: 1D numpy array of same length, giving srch_id for each element
    """
    loss = 0.0
    count = 0
    for q in np.unique(group_ids):
        idx = np.where(group_ids == q)[0]
        if len(idx) < 2:
            continue
        s_q = scores[idx]        # Tensor [n_q]
        y_q = labels[idx].float()# Tensor [n_q]
        P    = torch.softmax(y_q, dim=0)
        P_hat= torch.softmax(s_q, dim=0)
        loss += -torch.sum(P * torch.log(P_hat + 1e-8))
        count += 1
    return loss / max(count, 1)

def ensemble_predictions(nn_preds, lgbm_preds, weights=[0.5, 0.5]):
    """Ensembles predictions using weighted averaging."""
    return weights[0] * nn_preds + weights[1] * lgbm_preds

In [7]:
# 0) Define target up front
y = train['booking_bool'] * 5 + train['click_bool']

# 1) Run through your FE pipeline in one loop
train_feat, test_feat = train.copy(), test.copy()
steps = [
    # 2-DF functions: (train_df, test_df) → (train_df, test_df)
    preprocess_missing_and_competitors,
    add_destination_stats,
    lambda tr, te: one_hot_encode_columns(tr, te, ['dist_bucket']),
    # 1-DF functions must be wrapped:
    lambda tr, te: (create_base_features(tr),     create_base_features(te)),
    lambda tr, te: (add_within_search_features(tr), add_within_search_features(te)),
    lambda tr, te: (add_temporal_features(tr),    add_temporal_features(te)),
    lambda tr, te: (add_ranks(tr),                add_ranks(te)),
]

for fn in steps:
    train_feat, test_feat = fn(train_feat, test_feat)


# 2) Build feature list
drop = ['date_time','gross_bookings_usd','position',
        'click_bool','booking_bool','srch_id','prop_id']
features = [c for c in train_feat.columns if c not in drop]

# 3) Split into train/valid groups
mask_tr = train_feat['srch_id'].isin(
    train_test_split(
        train_feat['srch_id'].unique(), test_size=0.2, random_state=22
    )[0]
)
X = train_feat[features]
X_test = test_feat[features]
mask_va = ~mask_tr

# 4) Clean & impute based on TRAIN medians
for df in (X, X_test):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
for col in features:
    med = X[col].median()
    X[col].fillna(med, inplace=True)
    X_test[col].fillna(med, inplace=True)

# 5) Scale & slice
scaler = StandardScaler()
X_tr = scaler.fit_transform(X[mask_tr])
X_va = scaler.transform(X[~mask_tr])
X_test = scaler.transform(X_test)

y_tr, y_va = y[mask_tr], y[~mask_tr]

# 6) Build grouping arrays
grp_tr = train_feat['srch_id'][mask_tr].value_counts(sort=False).values
grp_va = train_feat['srch_id'][~mask_tr].value_counts(sort=False).values

# 7) LightGBM datasets
train_data = lgb.Dataset(X_tr, label=y_tr, group=grp_tr)
valid_data = lgb.Dataset(X_va, label=y_va, group=grp_va,
                         reference=train_data)

##NN
# Automatically use GPU if available, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X_tr_nn = X_tr           # shape (n_train, n_feats)
X_val_nn = X_va          # shape (n_val,   n_feats)
y_tr_nn = y_tr.values    # turn into NumPy if it's a Series
y_val_nn= y_va.values

# grab the same session‐ids used for grouping
id_tr = train_feat['srch_id'][mask_tr].values
id_va = train_feat['srch_id'][mask_va].values

X_test_nn = X_test       # same test matrix you already scaled

# 2) Build your loaders
tr_loader = DataLoader(ExpediaDataset(X_tr_nn, y_tr_nn), batch_size=512, shuffle=True)
va_loader = DataLoader(ExpediaDataset(X_val_nn, y_val_nn), batch_size=512)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['price_vs_historical'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['price_vs_historical'].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/p

In [8]:
#run lgb model and set paramters

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [5],
    'learning_rate': 0.01,
    'num_leaves': 128,
    'verbose': -1
}

#dont make the model run endlesly 
model_lgb = lgb.train(
    params,
    train_data,
    valid_sets=[valid_data],
    callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)],
    num_boost_round=3000
)


Training until validation scores don't improve for 100 rounds
[100]	valid_0's ndcg@5: 0.375802
[200]	valid_0's ndcg@5: 0.37893
[300]	valid_0's ndcg@5: 0.379789
[400]	valid_0's ndcg@5: 0.380012
[500]	valid_0's ndcg@5: 0.380079
Early stopping, best iteration is:
[421]	valid_0's ndcg@5: 0.380106


In [9]:

# Predict relevance scores for each test row
model_lgb_pred = model_lgb.predict(X_test)
"""
# Insert those scores into the sample submission DataFrame
sample['score'] = preds

#  Sort by search session (ascending) and score (descending)
#    so that for each srch_id, the most relevant prop_id comes first
submission = sample.sort_values(
    ['srch_id', 'score'],
    ascending=[True, False]
)

# keep only the required columns and write to CSV
#    Kaggle expects: srch_id, prop_id (in ranked order)
submission[['srch_id', 'prop_id']].to_csv(
    'submission.csv',
    index=False
)
print("Submission.csv adjusted with new scores!")
""" 


'\n# Insert those scores into the sample submission DataFrame\nsample[\'score\'] = preds\n\n#  Sort by search session (ascending) and score (descending)\n#    so that for each srch_id, the most relevant prop_id comes first\nsubmission = sample.sort_values(\n    [\'srch_id\', \'score\'],\n    ascending=[True, False]\n)\n\n# keep only the required columns and write to CSV\n#    Kaggle expects: srch_id, prop_id (in ranked order)\nsubmission[[\'srch_id\', \'prop_id\']].to_csv(\n    \'submission.csv\',\n    index=False\n)\nprint("Submission.csv adjusted with new scores!")\n'

In [10]:
"""
# --- 1) ListNet loss (unchanged) ---
def listnet_loss(scores, labels, group_ids):
    loss, count = 0.0, 0
    for q in np.unique(group_ids):
        idx = np.where(group_ids == q)[0]
        if len(idx) < 2:
            continue
        s_q, y_q = scores[idx], labels[idx].float()
        P, P_hat = torch.softmax(y_q, 0), torch.softmax(s_q, 0)
        loss += -torch.sum(P * torch.log(P_hat + 1e-8))
        count += 1
    return loss / max(count, 1)

param_grid = {
    'lr':           [1e-3, 2e-3],
    'batch_size':   [512, 1024],
    'dropout':      [0.1, 0.2],
    'weight_decay': [0.0, 1e-4]
}

best = {'ndcg': 0.0, 'cfg': None}

for lr, bs, drop, wd in itertools.product(*param_grid.values()):
    tr_loader = DataLoader(ExpediaDataset(X_tr, y_tr.values),
                           batch_size=bs, shuffle=True)
    va_loader = DataLoader(ExpediaDataset(X_va, y_va.values),
                           batch_size=bs, shuffle=False)

    mdl = DeepRecommender(X_tr.shape[1]).to(device)
    for m in mdl.modules():
        if isinstance(m, nn.Dropout):
            m.p = drop

    opt = torch.optim.Adam(
        mdl.parameters(),
        lr=lr,
        weight_decay=wd
    )
    sched = torch.optim.lr_scheduler.OneCycleLR(
        opt,
        max_lr=lr * 10,
        steps_per_epoch=len(tr_loader),
        epochs=10
    )

    best_ndcg, stale = 0.0, 0
    for epoch in range(1, 8):
        # — train —
        mdl.train()
        for Xb, yb in tr_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            opt.zero_grad()
            loss = listnet_loss(mdl(Xb), yb, id_tr[:len(Xb)])
            loss.backward()
            opt.step()

        # — validate —
        mdl.eval()
        preds = []
        with torch.no_grad():
            for Xb, _ in va_loader:
                preds.extend(mdl(Xb.to(device)).cpu().numpy())
        preds = np.array(preds)

        # compute mean NDCG@5 using the same y_va array
        ndcgs = []
        for q in np.unique(id_va):
            idx = np.where(id_va == q)[0]
            if len(idx) > 1:
                true = y_va.values[idx]
                score = preds[idx]
                ndcgs.append(ndcg_score([true], [score], k=5))
        mean_ndcg = np.mean(ndcgs)
        sched.step(mean_ndcg)

        if mean_ndcg > best_ndcg + 1e-4:
            best_ndcg, stale = mean_ndcg, 0
        else:
            stale += 1
        if stale >= 5:
            break

    print(f"cfg lr={lr}, bs={bs}, drop={drop} → Val NDCG@5: {best_ndcg:.4f}")
    if best_ndcg > best['ndcg']:
        best.update({'ndcg': best_ndcg, 'cfg': (lr, bs, drop)})

print("🏆 Best config:", best)
"""






cfg lr=0.0005, bs=128, drop=0.1 → Val NDCG@5: 0.3600




cfg lr=0.0005, bs=128, drop=0.1 → Val NDCG@5: 0.3616




cfg lr=0.0005, bs=128, drop=0.1 → Val NDCG@5: 0.3578




KeyboardInterrupt: 

In [23]:
# --- After you found best['cfg'] and before building full_loader ---  
# Reconstruct the full feature & label arrays:
X_full = np.vstack([X_tr, X_va])         # shape = (n_train + n_val, n_feats)
y_full = np.concatenate([y_tr.values, y_va.values])  # same length

# If you also need the full group‐ids for ListNet loss on the full set:
id_full = np.concatenate([id_tr, id_va])           

# Now you can build your full_loader:
full_loader = DataLoader(
    ExpediaDataset(X_full, y_full),
    batch_size=bs,       # from your best config
    shuffle=True
)


In [None]:
# unpack your best cfg (now including wd if you added it to the grid)
#lr, bs, drop, wd = best['cfg']
lr, bs, drop, wd = 0.0005, 128, 0.1, 0.0

# rebuild full‐training loader with your chosen batch size
full_loader = DataLoader(ExpediaDataset(X_full, y_full), 
                         batch_size=bs, shuffle=True)

# 1) Instantiate and set dropout
final_nn = DeepRecommender(X_full.shape[1]).to(device)
for m in final_nn.modules():
    if isinstance(m, nn.Dropout):
        m.p = drop

# 2) Optimizer with weight decay
opt = torch.optim.Adam(final_nn.parameters(), 
                       lr=lr, 
                       weight_decay=wd)

# 3) One‐Cycle LR scheduler
sched = torch.optim.lr_scheduler.OneCycleLR(
    opt,
    max_lr=lr * 10,
    steps_per_epoch=len(full_loader),
    epochs=30
)

# 4) Training loop with gradient clipping
for epoch in range(1, 31):
    print(epoch)
    final_nn.train()
    total_loss = 0.0
    for Xb, yb in full_loader:
        Xb, yb = Xb.to(device), yb.to(device)
        opt.zero_grad()
        scores = final_nn(Xb)
        loss = listnet_loss(scores, yb, id_full[:len(Xb)])
        loss.backward()
        # clip gradients
        torch.nn.utils.clip_grad_norm_(final_nn.parameters(), max_norm=5.0)
        opt.step()
        sched.step()
        total_loss += loss.item() * Xb.size(0)
    avg_loss = total_loss / len(full_loader.dataset)
    print(f"Epoch {epoch}/30 — Avg ListNet Loss: {avg_loss:.4f} — LR: {opt.param_groups[0]['lr']:.1e}")


1
Epoch 1/30 — Avg ListNet Loss: 2.8024 — LR: 3.4e-04
2
Epoch 2/30 — Avg ListNet Loss: 2.7907 — LR: 7.6e-04
3
Epoch 3/30 — Avg ListNet Loss: 2.7893 — LR: 1.4e-03
4
Epoch 4/30 — Avg ListNet Loss: 2.7871 — LR: 2.2e-03
5


In [None]:
# --- 4) Produce final test preds ---
# --- 4) Produce final test preds ---
test_loader = DataLoader(
    ExpediaDataset(X_test),   # just X_test, no dummy y
    batch_size=bs,
    shuffle=False
)

nn_test_preds = []
final_nn.eval()
with torch.no_grad():
    for Xb in test_loader:             # each batch is just the features
        Xb = Xb.to(device)
        nn_test_preds.extend(
            torch.sigmoid(final_nn(Xb))
            .cpu()
            .numpy()
        )
nn_test_preds = np.array(nn_test_preds)


In [None]:
"""# --- Ensemble the predictions ---
ensemble_final_preds = ensemble_predictions(nn_test_preds, model_lgb_pred, weights=[0.5, 0.5]) # Adjust weights

# Predict relevance scores for each test row
preds_ens = ensemble_final_preds

# Insert those scores into the sample submission DataFrame
sample['score'] = preds_ens

# Sort by search session (ascending) and score (descending)
#   so that for each srch_id, the most relevant prop_id comes first
submission = sample.sort_values(
    ['srch_id', 'score'],
    ascending=[True, False]
)

# keep only the required columns and write to CSV
#   Kaggle expects: srch_id, prop_id (in ranked order)
submission[['srch_id', 'prop_id']].to_csv(
    'submission.csv',
    index=False
)
print("Submission.csv adjusted with new scores!")"""

In [None]:
"""
# 1) LightGBM val set predictions
# Replace `model_lgb` with whatever variable you named your trained LightGBM model
val_preds_lgb = model_lgb.predict(X_va)   # shape = (n_val,)

# 2) NN val set predictions
final_nn.eval()
nn_val_preds = []
with torch.no_grad():
    for Xb, _ in va_loader:  # va_loader from your NN split on X_va/y_va
        nn_val_preds.extend(
            torch.sigmoid(final_nn(Xb.to(device))).cpu().numpy()
        )
nn_val_preds = np.array(nn_val_preds)      # shape = (n_val,)

# 3) Blend them
w_nn, w_lgb = 0.4, 0.6
ensemble_val = w_nn * nn_val_preds + w_lgb * val_preds_lgb


ndcgs = []
for q in np.unique(id_va):
    idx = np.where(id_va == q)[0]
    if len(idx) > 1:
        true_rel  = y_val_nn[idx]    # your val labels array
        score_rel = ensemble_val[idx]
        ndcgs.append(ndcg_score([true_rel], [score_rel], k=5))

mean_ndcg5 = np.mean(ndcgs)
print(f"Ensembled Val NDCG@5: {mean_ndcg5:.4f}")


In [None]:
print("NN preds:", nn_test_preds.min(), nn_test_preds.max())
print("LGB preds:", model_lgb_pred.min(), model_lgb_pred.max())

In [None]:


# 1) Min–max scale each prediction array into [0,1]
def minmax(arr):
    return (arr - arr.min()) / (arr.max() - arr.min() + 1e-8)

nn_scaled  = minmax(nn_test_preds)
lgb_scaled = minmax(model_lgb_pred)   # or whatever your LGB test‐preds variable is

# 2) Do the two blends
ens1 = ensemble_predictions(nn_scaled, lgb_scaled, [0.5, 0.5])
ens2 = ensemble_predictions(nn_scaled, lgb_scaled, [0.0, 1.0])

# 3) Compare their sorted‐order permutations
order1 = np.argsort(ens1)
order2 = np.argsort(ens2)

# 4) Compute fraction of positions that differ
fraction_changed = np.mean(order1 != order2)
print(f"Fraction of test‐rows whose position changes: {fraction_changed:.4%}")


In [None]:


best_w, best_score = None, 0.0

for w_nn in np.linspace(0, 1, 11):      # 0.0, 0.1, …, 1.0
    w_lgb = 1 - w_nn

    # 1) Compute the raw blended test‐fold scores
    blended = w_nn * nn_val_preds + w_lgb * val_preds_lgb

    # 2) Compute NDCG@5 per session directly on these raw scores
    ndcgs = []
    for q in np.unique(id_va):
        idx = np.where(id_va == q)[0]
        if len(idx) > 1:
            true = y_val_nn[idx]          # your relevance labels
            scores = blended[idx]         # raw blended scores
            ndcgs.append(ndcg_score([true], [scores], k=5))
    mean_ndcg = np.mean(ndcgs)

    print(f"w_nn={w_nn:.1f}, w_lgb={w_lgb:.1f} → Val NDCG@5: {mean_ndcg:.4f}")
    if mean_ndcg > best_score:
        best_score, best_w = mean_ndcg, w_nn

print(f"\n🏆 Best blend: w_nn={best_w:.2f}, w_lgb={1-best_w:.2f} → NDCG@5={best_score:.4f}")


In [None]:
df = sample[['srch_id']].copy()
df['nn_rank']  = pd.DataFrame({'score': nn_test_preds,  'srch': sample['srch_id']}) \
                    .groupby('srch')['score'] \
                    .rank(method='dense', ascending=False)
df['lgb_rank'] = pd.DataFrame({'score': model_lgb_pred,'srch': sample['srch_id']}) \
                    .groupby('srch')['score'] \
                    .rank(method='dense', ascending=False)

# weighted rank
w_nn, w_lgb = best_w, (1-best_w)
df['ensemble_rank'] = w_nn * df['nn_rank'] + w_lgb * df['lgb_rank']

# use that to sort
submission = sample.assign(_rank=df['ensemble_rank']) \
    .sort_values(['srch_id','_rank'], ascending=[True,True]) \
    [['srch_id','prop_id']]



In [None]:
# keep only the required columns and write to CSV
#   Kaggle expects: srch_id, prop_id (in ranked order)
submission[['srch_id', 'prop_id']].to_csv(
    'submission.csv',
    index=False
)
print("Submission.csv adjusted with new scores!")