# Module imports

In [70]:
import pandas as pd
import random
import sys
import os

In [71]:
# Add the src folder to the path
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

In [72]:
import utils.data_generation as utils_datagen

# Constants and setup

In [73]:
# Set seed for reproducibility
random.seed(42)

In [74]:
# Set constants related to data generation
N_DESTINATIONS = 20                                         # Number of travel destinations (e.g. London)            
N_PRODUCT_CATEGORIES = 10                                   # Number of product categories (e.g. Day trip, transport, food tour)
RANGE_PRODUCT_DESTINATION = {"min": 10, "max": 48}          # Number of products per destination. 
RANGE_DURATION = {"min": 30, "max": 300}                    # Duration in minutes.
N_QUERIES = 10000                                           # Number of query records to generate
N_MAX_PRODUCTS_PER_QUERY = 24                               # Maximum number of products per query
N_RANGE_PRODUCT_CTR = {"min": 0.05, "max": 0.12}            # Maximum click through rate for a product
LIST_DEVICES = ("desktop", "mobile", "tablet")              # Devices that can be used to search
LIST_LANGUAGES = ("en", "fr", "ja")                         # Languages used for the UI
LIST_HAS_BEEN_REFERRED = (True, False)                      # Whether the user has been referred to the site. 

# Generate data

In [75]:
# Create product data
product_data = []

for destination_id in range(1, N_DESTINATIONS+1):
    n_products_dest = random.randint(RANGE_PRODUCT_DESTINATION["min"], RANGE_PRODUCT_DESTINATION["max"])
    
    for product_destination_serial in range(1, n_products_dest + 1):
        # product_destination_serial is the n-th product in the destination.
        product_code = f"{destination_id}-{product_destination_serial}"
        product_category = random.randint(1, N_PRODUCT_CATEGORIES)
        duration = random.randint(RANGE_DURATION["min"], RANGE_DURATION["max"])
        ctr_14d = random.uniform(N_RANGE_PRODUCT_CTR["min"], N_RANGE_PRODUCT_CTR["max"])
        
        product_data.append({
            'destination_id': destination_id,
            'product_code': product_code,
            'product_category': str(product_category),
            'duration': duration,
            'ctr_14d': ctr_14d,
            'ctr_30d': ctr_14d + random.uniform(-0.01, 0.01)    # A random number based on the 14d CTR +- 1%
        })

# Create DataFrame
product_df = pd.DataFrame(product_data)

print(f"Generated {len(product_df)} products across {N_DESTINATIONS} destinations")
print(product_df.head(10))



Generated 578 products across 20 destinations
   destination_id product_code product_category  duration   ctr_14d   ctr_30d
0               1          1-1                1       170  0.067142  0.059933
1               1          1-2                2        74  0.091334  0.081970
2               1          1-3                2       141  0.066286  0.068327
3               1          1-4                9       131  0.100121  0.104148
4               1          1-5                7       142  0.081445  0.077008
5               1          1-6                1       111  0.098870  0.095675
6               1          1-7                3       140  0.117005  0.113737
7               1          1-8                2       224  0.056770  0.063720
8               1          1-9               10       165  0.106499  0.111094
9               1         1-10                9        93  0.118118  0.115689


In [76]:
# Create query data
query_data = []

for _ in range(N_QUERIES):
    # Generate visitor_id as UUID-like string
    visitor_id = utils_datagen.generate_visitor_id()
    query_id = utils_datagen.generate_query_id()
    
    # Generate destination_id from 1 to N_DESTINATIONS
    destination_id = random.randint(1, N_DESTINATIONS)
    
    query_data.append({
        'visitor_id': visitor_id,
        'query_id': query_id,
        'destination_id': destination_id,
        'device': random.choice(LIST_DEVICES),
        'language': random.choice(LIST_LANGUAGES),
        'has_been_referred': random.choice(LIST_HAS_BEEN_REFERRED)
    })

# Create DataFrame
query_df = pd.DataFrame(query_data)

print(f"Generated {len(query_df)} queries")
print(query_df.head(10))

# Now, join query_df with product_df, and randomly retain up to 24 rows/products per query without replacement.
# For simplicity, assume each product appears on the search results equally likely (unrealistic)
# These are the products that were returned by the search.
# Gives a FutureWarning that can be ignored
model_df = query_df.merge(product_df, on='destination_id', how='left')
model_df = model_df.groupby('query_id').apply(lambda x: x.sample(n=min(N_MAX_PRODUCTS_PER_QUERY, len(x)))).reset_index(drop=True) 



Generated 10000 queries
                             visitor_id         query_id  destination_id  \
0  acf19e48-40b6-4948-99ce-62de5c15af50  Syeko-Zy3u-Ui9S              17   
1  314f3e55-29cf-46a1-b2e2-e3fd49494f5c  goeLM-K8uN-WBoI              13   
2  893ad238-d143-4408-8fff-a3a0b62aae1f  fMXSf-jSNT-tgat              12   
3  f1ebffcc-ad34-4ff0-9c86-be5d7e49e622  fAIFg-fILW-1azW              18   
4  65762053-ac16-4961-a277-b98a42c86440  Qkqqj-krsW-w6SE              20   
5  0cfdcf95-e7b2-4daf-b1b0-b9085cdffe3e  iay0x-1BAZ-OKsL               7   
6  26a471a2-80bb-4c80-929e-feec8932cae9  78sRE-JpxY-yhIK              18   
7  8520c098-48eb-4e64-9730-310d449ba359  7CRBs-QYXz-ENpO              14   
8  d0d4da96-264f-46f0-8ddb-c2dc8e760c25  9DyF5-QHXw-3HNf               8   
9  1ae72123-8c8e-4c83-824e-271ee4806253  kLINO-7uXg-gnUI              19   

    device language  has_been_referred  
0   tablet       fr               True  
1   mobile       en              False  
2   mobile      

  model_df = model_df.groupby('query_id').apply(lambda x: x.sample(n=min(N_MAX_PRODUCTS_PER_QUERY, len(x)))).reset_index(drop=True)


In [77]:
# Randomly assign clicks to products to a column named "clicked", based on ctr_14d.
# This is a simple (and flawed) way to simulate the click-through rate of a product.
model_df["clicked"] = model_df.apply(lambda row: random.random() < row["ctr_14d"], axis=1)

In [80]:
# Some final checks
model_df.head(10)

Unnamed: 0,visitor_id,query_id,destination_id,device,language,has_been_referred,product_code,product_category,duration,ctr_14d,ctr_30d,clicked
0,859012e3-304f-4b03-9de6-ba30d2f854df,00GwU-BJbx-OTn3,19,tablet,ja,False,19-6,8,54,0.11806,0.126892,False
1,859012e3-304f-4b03-9de6-ba30d2f854df,00GwU-BJbx-OTn3,19,tablet,ja,False,19-10,3,267,0.114032,0.11528,False
2,859012e3-304f-4b03-9de6-ba30d2f854df,00GwU-BJbx-OTn3,19,tablet,ja,False,19-18,4,134,0.077764,0.072551,True
3,859012e3-304f-4b03-9de6-ba30d2f854df,00GwU-BJbx-OTn3,19,tablet,ja,False,19-17,4,90,0.102282,0.107368,False
4,859012e3-304f-4b03-9de6-ba30d2f854df,00GwU-BJbx-OTn3,19,tablet,ja,False,19-12,6,70,0.114556,0.121498,False
5,859012e3-304f-4b03-9de6-ba30d2f854df,00GwU-BJbx-OTn3,19,tablet,ja,False,19-2,10,193,0.090423,0.095874,False
6,859012e3-304f-4b03-9de6-ba30d2f854df,00GwU-BJbx-OTn3,19,tablet,ja,False,19-11,7,98,0.077189,0.072289,True
7,859012e3-304f-4b03-9de6-ba30d2f854df,00GwU-BJbx-OTn3,19,tablet,ja,False,19-7,7,298,0.072311,0.070638,False
8,859012e3-304f-4b03-9de6-ba30d2f854df,00GwU-BJbx-OTn3,19,tablet,ja,False,19-1,9,178,0.066391,0.074719,False
9,859012e3-304f-4b03-9de6-ba30d2f854df,00GwU-BJbx-OTn3,19,tablet,ja,False,19-9,8,153,0.109454,0.105469,False


# Save data

In [81]:
# Save model_df to data/model_df.csv. Parquet likely a better choice, but using csv for simplicity and to reduce dependencies.
model_df.to_csv('../data/model_df.csv', index=False)