In [17]:
import pandas as pd
import random
import sys
import os

In [18]:
# Add the src folder to the path
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

In [19]:
import utils.data_generation as utils_datagen

In [20]:
# Set seed for reproducibility
random.seed(42)

In [None]:
# Set constants related to data generation
N_DESTINATIONS = 20                                         # Number of travel destinations (e.g. London)            
N_PRODUCT_CATEGORIES = 10                                   # Number of product categories (e.g. Day trip, transport, food tour)
RANGE_PRODUCT_DESTINATION = {"min": 10, "max": 48}          # Number of products per destination. 
RANGE_DURATION = {"min": 30, "max": 300}                    # Duration in minutes.
N_QUERIES = 10000                                           # Number of query records to generate
N_MAX_PRODUCTS_PER_QUERY = 24                               # Maximum number of products per query

In [22]:
# Create product data
product_data = []

for destination_id in range(1, N_DESTINATIONS+1):
    n_products_dest = random.randint(RANGE_PRODUCT_DESTINATION["min"], RANGE_PRODUCT_DESTINATION["max"])
    
    for product_destination_serial in range(1, n_products_dest + 1):
        # product_destination_serial is the n-th product in the destination.
        product_code = f"{destination_id}-{product_destination_serial}"
        product_category = random.randint(1, N_PRODUCT_CATEGORIES)
        duration = random.randint(RANGE_DURATION["min"], RANGE_DURATION["max"])
        
        product_data.append({
            'destination_id': destination_id,
            'product_code': product_code,
            'product_category': product_category,
            'duration': duration
        })

# Create DataFrame
product_df = pd.DataFrame(product_data)

print(f"Generated {len(product_df)} products across {N_DESTINATIONS} destinations")
print(product_df.head(10))



Generated 595 products across 20 destinations
   destination_id product_code  product_category  duration
0               1          1-1                 1       170
1               1          1-2                 4       144
2               1          1-3                 3        82
3               1          1-4                 9        74
4               1          1-5                10       246
5               1          1-6                 1        45
6               1          1-7                 2       141
7               1          1-8                 4       288
8               1          1-9                10        43
9               1         1-10                 9       131


In [None]:
# Create query data
query_data = []

for _ in range(N_QUERIES):
    # Generate visitor_id as UUID-like string
    visitor_id = utils_datagen.generate_visitor_id()
    query_id = utils_datagen.generate_query_id()
    
    # Generate destination_id from 1 to N_DESTINATIONS
    destination_id = random.randint(1, N_DESTINATIONS)
    
    query_data.append({
        'visitor_id': visitor_id,
        'query_id': query_id,
        'destination_id': destination_id
    })

# Create DataFrame
query_df = pd.DataFrame(query_data)

print(f"Generated {len(query_df)} queries")
print(query_df.head(10))

# Now, join query_df with product_df, and randomly retain up to 24 rows/products per query without replacement.
# For simplicity, assume each product appears on the search results equally likely (unrealistic)
# These are the products that were returned by the search.
# Gives a FutureWarning that can be ignored
model_df = query_df.merge(product_df, on='destination_id', how='left')
model_df = model_df.groupby('query_id').apply(lambda x: x.sample(n=min(N_MAX_PRODUCTS_PER_QUERY, len(x)))).reset_index(drop=True) 



Generated 10000 queries
                             visitor_id         query_id  destination_id
0  9a7de93a-a105-48b9-9ba5-c0882f61b495  erNHu-9GCL-gR0O              18
1  bf0149b8-886c-4c4c-a457-375399af1b48  SnBov-CzfA-Pxj5               3
2  3987c43a-679c-449a-8e01-71712eeff8f2  ffAUx-id7I-Pz26              14
3  0dcd0cd7-3ba4-45c3-be35-d00e3a580d27  1S8Lv-JnOn-g0wV              19
4  8a216404-18cc-447c-b973-195fe82e18d3  oAIVM-PILO-QbqV              11
5  60b317a4-2cc4-43ed-a203-8f7adbe72ad7  vliOe-T8fG-xCjt              11
6  40e2489a-76f0-4c88-ad2a-307b22b8c08a  WJfdj-MPqP-EBqU               4
7  c91d3b91-13d6-4869-bbd5-eee048cf80a9  vgQKG-tn9d-m6VV              10
8  cf8b92b4-fa80-4789-92ce-2afa46c13066  uaUkx-RFZX-e1cb               3
9  ddd0cfee-b956-489d-9014-99a66d1d9e0b  1JJRz-hbuX-MZ5f               4


  model_df = model_df.groupby('query_id').apply(lambda x: x.sample(n=min(24, len(x)))).reset_index(drop=True)


In [24]:
model_df

Unnamed: 0,visitor_id,query_id,destination_id,product_code,product_category,duration
0,1be358f1-1b37-4885-8d96-5af24ce8ffcf,001eE-o8dn-NzoC,19,19-3,3,53
1,1be358f1-1b37-4885-8d96-5af24ce8ffcf,001eE-o8dn-NzoC,19,19-9,6,244
2,1be358f1-1b37-4885-8d96-5af24ce8ffcf,001eE-o8dn-NzoC,19,19-8,7,262
3,1be358f1-1b37-4885-8d96-5af24ce8ffcf,001eE-o8dn-NzoC,19,19-11,7,80
4,1be358f1-1b37-4885-8d96-5af24ce8ffcf,001eE-o8dn-NzoC,19,19-4,1,185
...,...,...,...,...,...,...
211214,486d88dd-a0d0-4de6-9a70-9d691519d3d2,zzzm9-Thwo-JXfk,3,3-23,3,111
211215,486d88dd-a0d0-4de6-9a70-9d691519d3d2,zzzm9-Thwo-JXfk,3,3-3,9,155
211216,486d88dd-a0d0-4de6-9a70-9d691519d3d2,zzzm9-Thwo-JXfk,3,3-14,3,165
211217,486d88dd-a0d0-4de6-9a70-9d691519d3d2,zzzm9-Thwo-JXfk,3,3-15,3,156


In [27]:
# Save model_df to data/model_df.csv. Parquet likely a better choice, but using csv for simplicity and to reduce dependencies.
model_df.to_csv('../data/model_df.csv', index=False)