In [1]:
# Notebook-wide configuration constants.
# NOTE(review): NUM_QUERIES, RESULTS_LEN, NUM_FEATURES and RUN_ID are not referenced
# in the cells visible here - presumably they are consumed further down; confirm
# before changing or removing them.
NUM_QUERIES = 1000
RESULTS_LEN = 400
# Passed to get_datasets() as `slate_length`.
SLATE_LENGTH = 400
# NOTE(review): the dataset build below reports "Feature dimension: 3180", not 300 -
# confirm what this constant is meant to describe.
NUM_FEATURES = 300

RUN_ID = 1

In [2]:
import pickle

# Load the exported data; the pickled object is unpacked into two names,
# property_map and grouped_rankings, which the cells below consume.
# SECURITY: pickle.load can execute arbitrary code while deserializing, so this
# file must come from a trusted source.
with open("data/data_export.pkl", "rb") as file:
    property_map, grouped_rankings = pickle.load(file)

In [3]:
from torchvision.transforms import transforms
from allrank.data.dataset_loading import LibSVMDataset, FixLength, ToTensor, fix_length_to_longest_slate
from scipy.sparse import csr_matrix
import numpy as np

def flatten_groups(grouped_rankings):
    """Collect the innermost lists of a four-level nested mapping into one flat list.

    Args:
        grouped_rankings: Mapping nested four levels deep (the keys appear to be
            search date, check-in, duration of stay and guest count, judging by
            how the data is named) whose innermost values are sliceable
            sequences of ranking items.

    Returns:
        A list with one entry per innermost sequence, in mapping iteration
        order. Each entry is a slice copy (``seq[:]``) of that sequence, so
        the returned groups do not alias the containers in ``grouped_rankings``.
    """
    return [
        leaf_items[:]
        for by_check_in in grouped_rankings.values()
        for by_duration in by_check_in.values()
        for by_guest_count in by_duration.values()
        for leaf_items in by_guest_count.values()
    ]


def _last_occurrence_indices(rankings):
    """Return the sorted original positions of the last occurrence of each distinct ranking."""
    rankings = np.asarray(rankings)
    # np.unique reports first occurrences within the array it is given, i.e. the
    # *reversed* array here, so the indices must be mapped back with len - 1 - idx
    # before they can be used on the un-reversed data.
    _, first_in_reversed = np.unique(rankings[::-1], return_index=True)
    return np.sort(len(rankings) - 1 - first_in_reversed)


def _item_features(item, property_map):
    """Build one feature row: the item's scaled attrs followed by its property's attrs and encoded title."""
    prop = property_map[item["propertyId"]]
    return np.concatenate([item["scaled_attrs"], prop["attrs"], prop["encoded_title"]])


def combine_data(ranking_groups, property_map, slate_length=400, seed=42):
    """Flatten ranking groups into feature / relevance / query-id arrays.

    Within each group, items sharing a ranking value are de-duplicated (the
    last occurrence wins), the surviving items are shuffled, and each ranking
    is converted to a relevance label ``max(slate_length - ranking, 0)``.

    Args:
        ranking_groups: Sequence of groups. Each group is an indexable sequence
            of item dicts with keys "scaled_attrs", "propertyId", "ranking",
            "durationOfStay", "searchDateOffset", "checkInDateOffset" and
            "guestCount".
        property_map: Mapping from property id to a dict with keys "attrs",
            "encoded_title", "airbnbId" and "guestyListingId".
        slate_length: Value the rankings are subtracted from to obtain labels.
        seed: Seed passed to ``np.random.seed`` (NumPy's global RNG) before
            the per-group shuffles.

    Returns:
        Tuple ``(all_attrs, all_rankings, all_queries, query_index)``:
        ``all_attrs`` is float32 with shape (total_rows, feature_dim),
        ``all_rankings`` is float32 with the inverted rankings,
        ``all_queries`` is int32 with the group's position in
        ``ranking_groups`` for every row, and ``query_index`` maps that
        position to the group's metadata (taken from its last item) plus
        "listingIds", a list aligned with the group's rows. Empty groups
        contribute no rows and get ``{'listingIds': []}``.

    Raises:
        IndexError: If no group contains any item, since the feature dimension
            cannot be determined.
        KeyError: If a kept item references a property id missing from
            ``property_map``.
    """
    np.random.seed(seed)

    # First pass - count the rows that survive de-duplication so the output
    # arrays can be allocated once.
    total_rows = sum(
        len(_last_occurrence_indices([item["ranking"] for item in group]))
        for group in ranking_groups
    )
    print(f"Total rows: {total_rows}")

    # Feature dimension comes from the first available item.
    sample_item = next((group[0] for group in ranking_groups if len(group) > 0), None)
    if sample_item is None:
        raise IndexError("combine_data needs at least one non-empty ranking group")
    feature_dim = len(_item_features(sample_item, property_map))
    print(f"Feature dimension: {feature_dim}")

    # Pre-allocate the output arrays.
    all_attrs = np.zeros((total_rows, feature_dim), dtype=np.float32)
    all_rankings = np.zeros(total_rows, dtype=np.float32)
    all_queries = np.zeros(total_rows, dtype=np.int32)

    query_index = {}
    current_row = 0

    # Second pass - fill the arrays group by group.
    for idx, ranking_group in enumerate(ranking_groups):
        if len(ranking_group) == 0:
            # Nothing to store; keep an index entry so every group position is present.
            query_index[idx] = {'listingIds': []}
            continue

        rankings = np.array([item["ranking"] for item in ranking_group])

        # Remove duplicates (keep the last item for each ranking), then shuffle.
        keep = _last_occurrence_indices(rankings)
        order = keep[np.random.permutation(len(keep))]
        kept_items = [ranking_group[i] for i in order]

        # Features and listing ids are built only for the kept items, in row order.
        group_data_np = np.array([_item_features(item, property_map) for item in kept_items])
        listing_ids = []
        for item in kept_items:
            prop = property_map[item["propertyId"]]
            listing_ids.append({
                'airbnb_id': prop["airbnbId"],
                'guesty_listing_id': prop["guestyListingId"]
            })

        # Group-level metadata is read from the last item of the group.
        last_item = ranking_group[-1]
        group_metadata = {
            "durationOfStay": last_item["durationOfStay"],
            "searchDateOffset": last_item["searchDateOffset"],
            "checkInDateOffset": last_item["checkInDateOffset"],
            "guestCount": last_item["guestCount"]
        }

        # Invert ranking so that a smaller ranking yields a larger label, floored at 0.
        inverted_rankings = np.clip(slate_length - rankings[order], 0, None)

        group_size = len(order)
        rows = slice(current_row, current_row + group_size)
        all_attrs[rows] = group_data_np
        all_rankings[rows] = inverted_rankings
        all_queries[rows] = idx

        query_index[idx] = {
            **group_metadata,
            'listingIds': listing_ids
        }

        current_row += group_size

    return all_attrs, all_rankings, all_queries, query_index

def get_dataset(ds_attrs, ds_ranks, ds_qids, is_train, slate_length):
    """Wrap feature / label / query-id arrays in a LibSVMDataset with a transform attached.

    Args:
        ds_attrs: Dense feature matrix; converted to a scipy CSR matrix.
        ds_ranks: Per-row labels handed to LibSVMDataset.
        ds_qids: Per-row query ids handed to LibSVMDataset.
        is_train: When true, the transform is
            ``Compose([FixLength(slate_length), ToTensor()])``; otherwise it is
            whatever ``fix_length_to_longest_slate`` returns for the dataset.
        slate_length: Length given to FixLength; only used when ``is_train``.

    Returns:
        The LibSVMDataset with its ``transform`` attribute set.
    """
    dataset = LibSVMDataset(csr_matrix(ds_attrs), ds_ranks, ds_qids)
    dataset.transform = (
        transforms.Compose([FixLength(slate_length), ToTensor()])
        if is_train
        else fix_length_to_longest_slate(dataset)
    )
    return dataset


def get_datasets(grouped_rankings, property_map, valid_percent, slate_length, seed=42):
    """Build train and validation datasets from the nested ranking export.

    The ranking groups are flattened, randomly split into validation and
    training subsets, turned into arrays with ``combine_data`` and wrapped
    with ``get_dataset``.

    Args:
        grouped_rankings: Nested mapping accepted by ``flatten_groups``.
        property_map: Property lookup forwarded to ``combine_data``.
        valid_percent: Fraction of the groups assigned to validation; the
            validation size is ``int(valid_percent * number_of_groups)``.
        slate_length: Forwarded to ``combine_data`` and ``get_dataset``.
        seed: Seed for the train/validation split and, via ``combine_data``,
            for the within-group shuffles.

    Returns:
        Tuple ``(train_ds, valid_ds, train_index, valid_index)``. The index
        dicts are the ``query_index`` results of ``combine_data``; their keys
        are positions within the respective train / validation group list.
    """
    ranking_groups = flatten_groups(grouped_rankings)
    total_groups = len(ranking_groups)

    # Shuffle group order with a generator seeded from `seed`; without this the
    # split depends on whatever state NumPy's global RNG happens to be in, so
    # re-running the notebook gives a different train/validation partition.
    split_rng = np.random.RandomState(seed)
    indices = split_rng.permutation(total_groups)
    valid_size = int(valid_percent * total_groups)

    valid_groups = [ranking_groups[i] for i in indices[:valid_size]]
    train_groups = [ranking_groups[i] for i in indices[valid_size:]]

    # Combine groups into arrays
    train_attrs, train_ranks, train_qids, train_index = combine_data(train_groups, property_map, slate_length, seed)
    valid_attrs, valid_ranks, valid_qids, valid_index = combine_data(valid_groups, property_map, slate_length, seed)

    train_ds = get_dataset(train_attrs, train_ranks, train_qids, True, slate_length)
    valid_ds = get_dataset(valid_attrs, valid_ranks, valid_qids, False, slate_length)

    return train_ds, valid_ds, train_index, valid_index

In [4]:
# Build the datasets, holding out 30% of the ranking groups for validation.
train_ds, valid_ds, train_index, valid_index = get_datasets(
    grouped_rankings=grouped_rankings,
    property_map=property_map,
    valid_percent=0.3,
    slate_length=SLATE_LENGTH,
)

Total rows: 3155204
Feature dimension: 3180
Total rows: 1361651
Feature dimension: 3180


In [5]:
train_ds[0]

(tensor([[ 0.0000,  0.2000,  0.3119,  ...,  0.0384,  0.0290,  0.0493],
         [ 0.0000,  0.2000,  0.3119,  ..., -0.0346, -0.0326,  0.0482],
         [ 0.0000,  0.2000,  0.3119,  ..., -0.0239,  0.0638,  0.0216],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]),
 tensor([285., 321., 276., 374., 350., 327., 381., 270., 253., 250., 348., 352.,
         366., 282., 259., 361., 338., 393., 300., 306., 299., 367., 377., 375.,
         386., 268., 347., 385., 255., 307., 277., 302., 356., 398., 320., 333.,
         369., 266., 260., 334., 378., 252., 353., 371., 324., 319., 291., 345.,
         363., 311., 323., 387., 359., 248., 368., 290., 365., 329., 325., 287.,
         336., 391., 295., 314., 309., 343., 357., 358., 258., 389., 349., 318.,
         310., 272., 263., 388., 380., 289., 395., 376., 354., 316., 247