In [40]:
from sklearn.datasets import load_svmlight_file
from sklearn.metrics import ndcg_score
import numpy as np
import lightgbm as lgb

# Load the dataset for one fold
def load_one_fold(data_path):
    X_train, y_train, qid_train = load_svmlight_file(str(data_path + 'train.txt'), query_id=True)
    X_test, y_test, qid_test = load_svmlight_file(str(data_path + 'test.txt'), query_id=True)
    y_train = y_train.astype(int)
    y_test = y_test.astype(int)
    _, group_train = np.unique(qid_train, return_counts=True)
    _, group_test = np.unique(qid_test, return_counts=True)
    return X_train, y_train, qid_train, group_train, X_test, y_test, qid_test, group_test

def ndcg_single_query(y_score, y_true, k):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)

# calculate NDCG score given a trained model 
def compute_ndcg_all(model, X_test, y_test, qids_test, k=10):
    unique_qids = np.unique(qids_test)
    ndcg_ = list()
    for i, qid in enumerate(unique_qids):
        y = y_test[qids_test == qid]

        if np.sum(y) == 0:
            continue

        p = model.predict(X_test[qids_test == qid])

        idcg = ndcg_single_query(y, y, k=k)
        ndcg_.append(ndcg_single_query(p, y, k=k) / idcg)
    return np.mean(ndcg_)

# get importance of features
def get_feature_importance(model, importance_type='gain'):
    return model.feature_importance(importance_type=importance_type)

In [22]:
from sklearn.datasets import load_svmlight_file
web10k_data_path = "MSLR-WEB10K/Fold1/"

# Load the Web10k dataset for one fold
X_train, y_train, qid_train = load_svmlight_file(str(web10k_data_path + 'train.txt'), query_id=True)
X_test, y_test, qid_test = load_svmlight_file(str(web10k_data_path + 'test.txt'), query_id=True)
X_vali, y_vali, qid_vali = load_svmlight_file(str(web10k_data_path + 'vali.txt'), query_id=True)

# Print the number of unique queries in total
total_unique_queries = len(set(np.concatenate((qid_train, qid_test,qid_vali))))
print(f"Total number of unique queries: {total_unique_queries}")

# Show the distribution of relevance labels
unique_labels, label_counts = np.unique(np.concatenate((y_train, y_test,y_vali)), return_counts=True)
print("\nDistribution of relevance labels:")
for label, count in zip(unique_labels, label_counts):
    print(f"Relevance label {label}: {count} samples")


Total number of unique queries: 10000

Distribution of relevance labels:
Relevance label 0.0: 624263 samples
Relevance label 1.0: 386280 samples
Relevance label 2.0: 159451 samples
Relevance label 3.0: 21317 samples
Relevance label 4.0: 8881 samples


# Answer 13

Total number of unique queries (including vali.txt) = 10,000 

Excluding vali.txt, Total number of unique queries = 8,000

Distribution of relevance labels (*including vali.txt* of Fold1):

| Relevance Label | Number of Samples |
|------------------|-------------------|
| 0.0              | 624263            |
| 1.0              | 386280            |
| 2.0              | 159451            |
| 3.0              | 21317             |
| 4.0              | 8881              |


In [36]:
def train_lightgbm_model(X_train, y_train, qid_train):
    params = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'ndcg_eval_at': [3, 5, 10],
        # Add other parameters as needed
    }

    train_data = lgb.Dataset(X_train, label=y_train, group=np.unique(qid_train, return_counts=True)[1])

    model = lgb.train(params, train_data)

    return model

# Function to evaluate model on test set and print nDCG scores for multiple k values
def evaluate_model(model, X_test, y_test, qid_test, fold_number, k_values=[3, 5, 10]):
    predictions = model.predict(X_test)
    print("#################################  EVALUATING MODEL-FOLD {}   #################################".format(fold_number))

    for k in k_values:
        ndcg_at_k = compute_ndcg_all(model, X_test, y_test, qid_test, k)
        print(f"nDCG@{k}: {ndcg_at_k}")
    print("\n################################################################################################")

# Train and evaluate LightGBM models for each fold
mslr_data_path = "MSLR-WEB10K"
for fold_number in range(1, 6):
    fold_path = f"{mslr_data_path}/Fold{fold_number}/"
    print(f"\nTraining model for {fold_path}")
    
    # Load one fold of MSLR data
    X_train, y_train, qid_train, group_train, X_test, y_test, qid_test, group_test = load_one_fold(fold_path)

    # Train a LightGBM model
    model = train_lightgbm_model(X_train, y_train, qid_train)

    # Evaluate the model on the test set
    evaluate_model(model, X_test, y_test, qid_test,fold_number,k_values=[3, 5, 10])




Training model for MSLR-WEB10K/Fold1/
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.074784 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25637
[LightGBM] [Info] Number of data points in the train set: 723412, number of used features: 136
#################################  EVALUATING MODEL-FOLD 1   #################################
nDCG@3: 0.4564571300800643
nDCG@5: 0.4632890672260867
nDCG@10: 0.48286731451235976

################################################################################################

Training model for MSLR-WEB10K/Fold2/
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.086566 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25623
[LightGBM] [Info] Number of d

# Answer 14

| Evaluation Metric (on test set) | Fold 1 | Fold 2 | Fold 3 | Fold 4 | Fold 5 |
|-------------------|--------|--------|--------|--------|--------|
| nDCG@3            | 0.456  | 0.454  | 0.449  | 0.461  | 0.470  |
| nDCG@5            | 0.463  | 0.457  | 0.458  | 0.466  | 0.471  |
| nDCG@10           | 0.483  | 0.477  | 0.476  | 0.488  | 0.490  |


In [41]:
# Function to get the top N important features based on 'gain'
def get_top_n_features(model, n=5):
    importance_type = 'gain'
    feature_importance = get_feature_importance(model, importance_type)
    top_indices = np.argsort(feature_importance)[::-1][:n]
    return top_indices

# Analyze and interpret results for each fold
for fold_number in range(1, 6):
    fold_path = f"{mslr_data_path}/Fold{fold_number}/"
    print(f"\nTraining model for {fold_path}")
    
    # Load one fold of MSLR data
    X_train, y_train, qid_train, group_train, X_test, y_test, qid_test, group_test = load_one_fold(fold_path)

    # Train a LightGBM model
    model = train_lightgbm_model(X_train, y_train, qid_train)

    # Get top 5 important features based on 'gain'
    top_features = get_top_n_features(model, n=5)
    
    print(f"Top 5 important features for Fold {fold_number}:")
    for i, feature_index in enumerate(top_features):
        print(f"{i + 1}. Feature {feature_index + 1}")
    print("#########################################################################")


Training model for MSLR-WEB10K/Fold1/
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.079854 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25637
[LightGBM] [Info] Number of data points in the train set: 723412, number of used features: 136
Top 5 important features for Fold 1:
1. Feature 134
2. Feature 8
3. Feature 108
4. Feature 55
5. Feature 130
#########################################################################

Training model for MSLR-WEB10K/Fold2/
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.082610 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25623
[LightGBM] [Info] Number of data points in the train set: 716683, number of used features: 136
Top 5 important features fo

# Answer 15

| Fold # | Top Feature 1 | Top Feature 2 | Top Feature 3 | Top Feature 4 | Top Feature 5 |
|------|---------------|---------------|---------------|---------------|---------------|
| 1    | 134           | 8             | 108           | 55            | 130           |
| 2    | 134           | 8             | 55            | 108           | 130           |
| 3    | 134           | 55            | 108           | 130           | 8             |
| 4    | 134           | 8             | 55            | 130           | 129           |
| 5    | 134           | 8             | 55            | 108           | 130           |


# Removing top 20 and removing bottom 60

In [69]:
# Function to remove top N features based on 'gain'
def remove_top_n_features(X, top_indices, n=20):
    X_csc = X.tocsc()
    indices_to_keep = np.setdiff1d(np.arange(X.shape[1]), top_indices[:n])
    return X_csc[:, indices_to_keep]

# Function to remove bottom N features based on 'gain'
def remove_bottom_n_features(X, top_indices, n=60):
    X_csc = X.tocsc()
    indices_to_keep = np.setdiff1d(np.arange(X.shape[1]), top_indices[-n:])
    return X_csc[:, indices_to_keep]

for fold_number in range(1, 6):
    fold_path = f"{mslr_data_path}/Fold{fold_number}/"
    print("**********************************************************************************************************\n")
    print(f"\nAnalyzing results for Fold {fold_number}")
    
    # Load one fold of MSLR data
    X_train, y_train, qid_train, group_train, X_test, y_test, qid_test, group_test = load_one_fold(fold_path)

    # Train a LightGBM model
    model = train_lightgbm_model(X_train, y_train, qid_train)

    # Get top 20 important features based on 'gain'
    top_features = get_top_n_features(model, n=20)

    # Remove top 20 features and train a new model
    X_train_removed_top = remove_top_n_features(X_train, top_features, n=20)
    X_test_removed_top = remove_top_n_features(X_test, top_features, n=20)
    print(f"REMOVING TOP 20 FEATURES for {fold_path}\n")
    print(f"--------------------------Training for {fold_path} (Removing TOP 20 features)-----------------------------:")
    new_model_top_removed = train_lightgbm_model(X_train_removed_top, y_train, qid_train)
    evaluate_model(new_model_top_removed, X_test_removed_top, y_test, qid_test,fold_number,k_values=[10])
    # Get top 60 least important features based on 'gain'
    least_features = get_top_n_features(model, n=60)

    # Remove bottom 60 features and train a new model
    X_train_removed_bottom = remove_bottom_n_features(X_train, least_features, n=60)
    X_test_removed_bottom = remove_bottom_n_features(X_test, least_features, n=60)
    print(f"REMOVING BOTTOM 60 FEATURES for {fold_path}\n")

    print(f"--------------------------Training for {fold_path}  (Removing BOTTOM 60 features)-----------------------------:")
    print("\n**********************************************************************************************************")
    new_model_bottom_removed = train_lightgbm_model(X_train_removed_bottom, y_train, qid_train)
    evaluate_model(new_model_bottom_removed, X_test_removed_bottom, y_test, qid_test,fold_number,k_values=[10])



**********************************************************************************************************


Analyzing results for Fold 1
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.074343 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25637
[LightGBM] [Info] Number of data points in the train set: 723412, number of used features: 136
REMOVING TOP 20 FEATURES for MSLR-WEB10K/Fold1/

--------------------------Training for MSLR-WEB10K/Fold1/ (Removing TOP 20 features)-----------------------------:
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075550 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 21582
[LightGBM] [Info] Number of data points in the train set: 723412, number of used 

# Answer 16

• Remove top 20 features:

| Fold # | nDCG@10          |
|------|------------------|
| 1    | 0.408            |
| 2    | 0.405            |
| 3    | 0.412            |
| 4    | 0.412            |
| 5    | 0.417            |


We note that the performance measured by nDCG@10 shows a decrease in values compared to the original model by around 5%, indicating that these features play a role in the model's predictive capabilities. But the decrease is not much. The reasons could be probably owing to redundancy in features: the removed features might be redundant or highly correlated with other features in the dataset. In such cases, the model can still rely on the correlated information provided by the remaining features. It may also be the case that the LightGBM model may have enough capacity to compensate for the removal of a few top features. The model may have learned alternative patterns from the remaining features.


• Remove bottom 60 features:


| Fold | nDCG@10          |
|------|------------------|
| 1    | 0.376            |
| 2    | 0.372            |
| 3    | 0.374            |
| 4    | 0.376            |
| 5    | 0.380            |


Removing least 60 important features leads to close to an 7 % decrement in performance. This clearly makes sense with the intuition that lower importance features don't contribute much to the model performance, with features being highly redundant. The removed features might be redundant or less informative, and their exclusion did not lead to a loss of crucial information. With respect to the model on removing top 20, it only caused a further 2% loss which is miniscule even after removing 60 feature vectors and hence we note that only the top few features are very important in a model's decision mechanism.


