# Module imports

In [11]:
import pandas as pd
import random
import sys
import os

In [12]:
# Add the src folder to the path
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

In [13]:
import utils.data_generation as utils_datagen

# Constants and setup

In [14]:
# Set seed for reproducibility
random.seed(42)

In [15]:
# Set constants related to data generation
N_DESTINATIONS = 20                                         # Number of travel destinations (e.g. London)            
N_PRODUCT_CATEGORIES = 10                                   # Number of product categories (e.g. Day trip, transport, food tour)
RANGE_PRODUCT_DESTINATION = {"min": 10, "max": 48}          # Number of products per destination. 
RANGE_DURATION = {"min": 30, "max": 300}                    # Duration in minutes.
N_QUERIES = 10000                                           # Number of query records to generate
N_MAX_PRODUCTS_PER_QUERY = 24                               # Maximum number of products per query
N_RANGE_PRODUCT_CTR = {"min": 0.05, "max": 0.12}            # Maximum click through rate for a product
LIST_DEVICES = ("desktop", "mobile", "tablet")              # Devices that can be used to search
LIST_LANGUAGES = ("en", "fr", "ja")                         # Languages used for the UI

# Generate data

In [16]:
# Create product data
product_data = []

for destination_id in range(1, N_DESTINATIONS+1):
    n_products_dest = random.randint(RANGE_PRODUCT_DESTINATION["min"], RANGE_PRODUCT_DESTINATION["max"])
    
    for product_destination_serial in range(1, n_products_dest + 1):
        # product_destination_serial is the n-th product in the destination.
        product_code = f"{destination_id}-{product_destination_serial}"
        product_category = random.randint(1, N_PRODUCT_CATEGORIES)
        duration = random.randint(RANGE_DURATION["min"], RANGE_DURATION["max"])
        ctr_14d = random.uniform(N_RANGE_PRODUCT_CTR["min"], N_RANGE_PRODUCT_CTR["max"])
        
        product_data.append({
            'destination_id': destination_id,
            'product_code': product_code,
            'product_category': str(product_category),
            'duration': duration,
            'ctr_14d': ctr_14d,
            'ctr_30d': ctr_14d + random.uniform(-0.01, 0.01)    # A random number based on the 14d CTR +- 1%
        })

# Create DataFrame
product_df = pd.DataFrame(product_data)

print(f"Generated {len(product_df)} products across {N_DESTINATIONS} destinations")
print(product_df.head(10))



Generated 578 products across 20 destinations
   destination_id product_code product_category  duration   ctr_14d   ctr_30d
0               1          1-1                1       170  0.067142  0.059933
1               1          1-2                2        74  0.091334  0.081970
2               1          1-3                2       141  0.066286  0.068327
3               1          1-4                9       131  0.100121  0.104148
4               1          1-5                7       142  0.081445  0.077008
5               1          1-6                1       111  0.098870  0.095675
6               1          1-7                3       140  0.117005  0.113737
7               1          1-8                2       224  0.056770  0.063720
8               1          1-9               10       165  0.106499  0.111094
9               1         1-10                9        93  0.118118  0.115689


In [17]:
# Create query data
query_data = []

for _ in range(N_QUERIES):
    # Generate visitor_id as UUID-like string
    visitor_id = utils_datagen.generate_visitor_id()
    query_id = utils_datagen.generate_query_id()
    
    # Generate destination_id from 1 to N_DESTINATIONS
    destination_id = random.randint(1, N_DESTINATIONS)
    
    query_data.append({
        'visitor_id': visitor_id,
        'query_id': query_id,
        'destination_id': destination_id,
        'device': random.choice(LIST_DEVICES),
        'language': random.choice(LIST_LANGUAGES),
    })

# Create DataFrame
query_df = pd.DataFrame(query_data)

print(f"Generated {len(query_df)} queries")
print(query_df.head(10))

# Now, join query_df with product_df, and randomly retain up to 24 rows/products per query without replacement.
# For simplicity, assume each product appears on the search results equally likely (unrealistic)
# These are the products that were returned by the search.
# Gives a FutureWarning that can be ignored
model_df = query_df.merge(product_df, on='destination_id', how='left')
model_df = model_df.groupby('query_id').apply(lambda x: x.sample(n=min(N_MAX_PRODUCTS_PER_QUERY, len(x)))).reset_index(drop=True) 



Generated 10000 queries
                             visitor_id         query_id  destination_id  \
0  93688aa7-9713-44cd-a036-b86a79872e05  Syeko-Zy3u-Ui9S              17   
1  b41582b3-fd92-44aa-9c99-3d1a25d05d15  dBZvV-7yVb-q9Ew              13   
2  c370cec9-de58-448d-ac34-7be0c50b239f  QOfMX-SfjS-NTtg               1   
3  fa95ff12-7cc7-4203-8677-2f69893514b2  wqilB-Izbv-fLux              10   
4  2a88f33e-ccd8-44d3-b842-1b795b7e7e68  WSI9p-GQkq-qjkr              10   
5  49b4d0a8-c601-488e-82be-c133c0f7c477  w6SEM-Ciay-0x1B              14   
6  66548355-3a0a-4fae-ac3a-90816b9dd5d3  it6mt-78sR-EJpx              13   
7  7b993b59-8a97-4866-b428-4385f6537eec  5m9lV-bT6m-XZe0              14   
8  c08004e2-ba27-47d7-889f-7073a5f2801c  ispjA-xBLK-H51z               2   
9  b644879b-afed-4aed-a9a4-a979c6f4691f  3HNfV-OwOc-8QyU              14   

    device language  
0   tablet       fr  
1   mobile       en  
2   mobile       fr  
3   mobile       fr  
4   mobile       en  
5   tab

  model_df = model_df.groupby('query_id').apply(lambda x: x.sample(n=min(N_MAX_PRODUCTS_PER_QUERY, len(x)))).reset_index(drop=True)


In [18]:
# Randomly assign clicks to products to a column named "clicked", based on ctr_14d.
# This is a simple (and flawed) way to simulate the click-through rate of a product.
model_df["clicked"] = model_df.apply(lambda row: 1 if random.random() < row["ctr_14d"] else 0, axis=1)

In [19]:
# Some final checks
model_df.head(10)

Unnamed: 0,visitor_id,query_id,destination_id,device,language,product_code,product_category,duration,ctr_14d,ctr_30d,clicked
0,e20b98b6-f567-4780-b794-b764cb8f817a,0003S-UcYv-V9CH,7,mobile,fr,7-34,4,263,0.088589,0.08626,0
1,e20b98b6-f567-4780-b794-b764cb8f817a,0003S-UcYv-V9CH,7,mobile,fr,7-11,3,187,0.091653,0.098138,0
2,e20b98b6-f567-4780-b794-b764cb8f817a,0003S-UcYv-V9CH,7,mobile,fr,7-9,10,131,0.07976,0.080651,0
3,e20b98b6-f567-4780-b794-b764cb8f817a,0003S-UcYv-V9CH,7,mobile,fr,7-19,10,253,0.094435,0.094269,1
4,e20b98b6-f567-4780-b794-b764cb8f817a,0003S-UcYv-V9CH,7,mobile,fr,7-33,5,242,0.108431,0.118036,0
5,e20b98b6-f567-4780-b794-b764cb8f817a,0003S-UcYv-V9CH,7,mobile,fr,7-12,5,254,0.058705,0.062478,0
6,e20b98b6-f567-4780-b794-b764cb8f817a,0003S-UcYv-V9CH,7,mobile,fr,7-2,8,182,0.110212,0.120012,0
7,e20b98b6-f567-4780-b794-b764cb8f817a,0003S-UcYv-V9CH,7,mobile,fr,7-16,10,40,0.11997,0.123416,0
8,e20b98b6-f567-4780-b794-b764cb8f817a,0003S-UcYv-V9CH,7,mobile,fr,7-20,2,270,0.074357,0.071023,0
9,e20b98b6-f567-4780-b794-b764cb8f817a,0003S-UcYv-V9CH,7,mobile,fr,7-21,2,112,0.073086,0.07696,1


# Save data

In [20]:
# Save model_df to data/model_df.csv. Parquet likely a better choice, but using csv for simplicity and to reduce dependencies.
model_df.to_csv('../data/model_df.csv', index=False)