In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

In [2]:
# read the files changes path to where the data is stored
test = pd.read_csv(r"C:\Users\youpz\Documents\Master\P5\Data mining techniques\Assignment2\data\dmt-2025-2nd-assignment\test_set_VU_DM.csv")
train = pd.read_csv(r"C:\Users\youpz\Documents\Master\P5\Data mining techniques\Assignment2\data\dmt-2025-2nd-assignment\training_set_VU_DM.csv")
sample = pd.read_csv(r"C:\Users\youpz\Documents\Master\P5\Data mining techniques\Assignment2\data\dmt-2025-2nd-assignment\submission_sample.csv")

In [3]:
#number op srch_id
print(len(sample['srch_id'].unique()), 'number of search IDs in sample')
print(len(sample['prop_id'].unique()), 'number of different hotels in sample')
print(len(test['srch_id'].unique()),'number of search ids in testdataset')

199549 number of search IDs in sample
129438 number of different hotels in sample
199549 number of search ids in testdataset


In [4]:
# Check if srch_ids match between sample, test, and train --> to get a feeling for the code

sample_ids = set(sample['srch_id'].unique())
test_ids = set(test['srch_id'].unique())
train_ids = set(train['srch_id'].unique())

# Check sample vs. test
if sample_ids != test_ids:
    missing_in_sample = test_ids - sample_ids
    missing_in_test = sample_ids - test_ids
    print(f"Mismatch in srch_ids between sample and test! Missing in sample: {len(missing_in_sample)}, Missing in test: {len(missing_in_test)}")
else:
    print("srch_id sets match between sample and test.")

# Check sample vs. train
if sample_ids != train_ids:
    missing_in_sample_train = train_ids - sample_ids
    missing_in_train_sample = sample_ids - train_ids
    print(f"Mismatch in srch_ids between sample and train! Missing in sample: {len(missing_in_sample_train)}, Missing in train: {len(missing_in_train_sample)}")
else:
    print("srch_id sets match between sample and train.")



srch_id sets match between sample and test.
Mismatch in srch_ids between sample and train! Missing in sample: 79881, Missing in train: 79635


In [5]:
########################## some beginning on the feature engineering

def create_base_features(df):
    # Parse date_time into components
    df['date_time'] = pd.to_datetime(df['date_time'])
    df['search_year']  = df['date_time'].dt.year
    df['search_month'] = df['date_time'].dt.month
    df['search_day']   = df['date_time'].dt.day
    df['search_hour']  = df['date_time'].dt.hour

    # Price per night
    df['price_per_night'] = df['price_usd'] / df['srch_length_of_stay']

    # Historical price deviation
    hist_mean = df['visitor_hist_adr_usd'].mean()
    df['hist_price_usd'] = df['visitor_hist_adr_usd'].fillna(hist_mean)
    df['price_diff_hist'] = df['price_usd'] - df['hist_price_usd']
    df['price_rat_hist']  = df['price_usd'] / (df['hist_price_usd'] + 1e-6)

    # Competitor percent-diffs log transformation
    diff_cols = [c for c in df.columns if 'rate_percent_diff' in c]
    for c in diff_cols:
        df[c] = df[c].fillna(0)
        df[f'log_{c}'] = np.log1p(df[c].abs()) * np.sign(df[c])

    # Flag missing historical stars
    df['hist_star_na'] = df['visitor_hist_starrating'].isna().astype(int)
    df['visitor_hist_starrating'] = df['visitor_hist_starrating'].fillna(-1)

    return df

In [6]:
#apply new featrues
train_feat = create_base_features(train.copy())
test_feat  = create_base_features(test.copy())

In [7]:
############################################################ create new features

# Compute popularity and booking rate per destination
dest_stats = (
    train_feat
    .groupby('srch_destination_id')
    .agg(dest_searches=('srch_id', 'count'),
         dest_bookings=('booking_bool', 'sum'))
    .reset_index()
)
dest_stats['dest_booking_rate'] = dest_stats['dest_bookings'] / dest_stats['dest_searches']

# merge new features
for df in (train_feat, test_feat):
    df.merge(dest_stats[['srch_destination_id','dest_searches','dest_booking_rate']],
             on='srch_destination_id', how='left')

In [8]:
#rank data
for df in (train_feat, test_feat):
    df['price_rank'] = df.groupby('srch_id')['price_usd'].rank(method='dense', ascending=True)
    df['star_rank']  = df.groupby('srch_id')['prop_starrating'].rank(method='dense', ascending=False)
    df['dist_rank']  = df.groupby('srch_id')['orig_destination_distance']\
                         .rank(method='dense', ascending=True)


In [9]:
############################################### create train and test data

# Create relevance label for training
y = train_feat['booking_bool'] * 5 + train_feat['click_bool']

# List of features to drop (train-only or unneeded)
drop_cols = ['date_time','click_bool','booking_bool','gross_bookings_usd','position']
features = [c for c in train_feat.columns if c not in drop_cols + ['srch_id','prop_id']]

#create X train and X_test
X = train_feat[features]
X_test = test_feat[features]

# Extract unique query IDs so we can split at the search-session level
group_ids = train_feat['srch_id'].unique()

# train test split 80/20
train_ids, valid_ids = train_test_split(
    group_ids,
    test_size=0.2,
    random_state=22
)

# Build boolean masks that mark every row in train_feat as belonging
#    either to the train split or the validation split, based on its srch_id
mask_tr = train_feat['srch_id'].isin(train_ids)
mask_va = train_feat['srch_id'].isin(valid_ids)

# Subset your feature matrix X and label vector y according to those masks
#    so X_tr/y_tr and X_va/y_va correspond to disjoint sets of queries
X_tr, y_tr = X[mask_tr], y[mask_tr]
X_va, y_va = X[mask_va], y[mask_va]

# Compute the “group sizes” for LightGBM’s ranker: for each query (srch_id),
#    count how many candidate rows belong to it.  This array of counts
#    tells the ranker where one query ends and the next begins.
groups_tr = train_feat[mask_tr].groupby('srch_id').size().values
groups_va = train_feat[mask_va].groupby('srch_id').size().values

#get train and valdiation data to train the model
train_data = lgb.Dataset(X_tr, label=y_tr, group=groups_tr)
valid_data = lgb.Dataset(X_va, label=y_va, group=groups_va)

In [10]:
#run lgb model and set paramters

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [5],
    'learning_rate': 0.05,
    'num_leaves': 64,
    'verbose': -1
}

#dont make the model run endlesly 
model = lgb.train(
    params,
    train_data,
    valid_sets=[valid_data],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)],
    num_boost_round=1000
)


Training until validation scores don't improve for 50 rounds
[100]	valid_0's ndcg@5: 0.377543
[200]	valid_0's ndcg@5: 0.382643
[300]	valid_0's ndcg@5: 0.385069
[400]	valid_0's ndcg@5: 0.38626
[500]	valid_0's ndcg@5: 0.386712
Early stopping, best iteration is:
[500]	valid_0's ndcg@5: 0.386712


In [11]:
# Predict relevance scores for each test row
preds = model.predict(X_test)

# Insert those scores into the sample submission DataFrame
sample['score'] = preds

#  Sort by search session (ascending) and score (descending)
#    so that for each srch_id, the most relevant prop_id comes first
submission = sample.sort_values(
    ['srch_id', 'score'],
    ascending=[True, False]
)

# keep only the required columns and write to CSV
#    Kaggle expects: srch_id, prop_id (in ranked order)
submission[['srch_id', 'prop_id']].to_csv(
    'submission.csv',
    index=False
)