Please note that I'd usually first screen the data via tools like ydata-profiling, which is a check not performed in this sample code.

# Module imports

In [55]:
import os
import pandas as pd
import sys

from sklearn.model_selection import train_test_split


In [56]:
# Add the src folder to the path
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

In [57]:
import utils.data_validation as utils_dataval

# Constants

In [58]:
TRAIN_PROP = 0.8
RANDOM_STATE = 42

# Load data

In [59]:
df = pd.read_csv('../data/model_df.csv')

In [60]:
df.head()

Unnamed: 0,visitor_id,query_id,destination_id,device,language,has_been_referred,product_code,product_category,duration,ctr_14d,ctr_30d,clicked
0,859012e3-304f-4b03-9de6-ba30d2f854df,00GwU-BJbx-OTn3,19,tablet,ja,False,19-6,8,54,0.11806,0.126892,False
1,859012e3-304f-4b03-9de6-ba30d2f854df,00GwU-BJbx-OTn3,19,tablet,ja,False,19-10,3,267,0.114032,0.11528,False
2,859012e3-304f-4b03-9de6-ba30d2f854df,00GwU-BJbx-OTn3,19,tablet,ja,False,19-18,4,134,0.077764,0.072551,True
3,859012e3-304f-4b03-9de6-ba30d2f854df,00GwU-BJbx-OTn3,19,tablet,ja,False,19-17,4,90,0.102282,0.107368,False
4,859012e3-304f-4b03-9de6-ba30d2f854df,00GwU-BJbx-OTn3,19,tablet,ja,False,19-12,6,70,0.114556,0.121498,False


# Process data

In [61]:
# Create mean-encoded variables based on device and product_category.
# Done as device is static within query. Columns are only useful to LGBM Ranker models if there are variations within a query.

def create_mean_encoded_variables(df, categorical_column, target_column):
    return df.groupby(categorical_column)[target_column].transform('mean')

df["device_prodcat_mean_encoded"] = create_mean_encoded_variables(df, ["device", "product_category"], "clicked")
df["language_prodcat_mean_encoded"] = create_mean_encoded_variables(df, ["language", "product_category"], "clicked")

In [62]:
df["product_category"] = df["product_category"].astype("category")

# Validate data

Please note that only a small amount of tests are written and I have not provided a full test suite that includes tests cases due to time constraints.

In [63]:
utils_dataval.validate_categorical_columns(df, ["product_category"])

Validating column 'product_category'
Column 'product_category' is categorical (dtype: category)
All columns are categorical.


In [64]:
utils_dataval.validate_float_columns(df, ["ctr_14d", "ctr_30d"])

Validating column 'ctr_14d'
Validating column 'ctr_30d'
All columns are float.


In [65]:
column_range_dict = {"ctr_14d": {"min": 0, "max": 1}, "ctr_30d": {"min": 0, "max": 1}}
utils_dataval.validate_float_range(df, column_range_dict)

Validating column 'ctr_14d'
Validating column 'ctr_30d'
All columns are float and within range.


In [66]:
utils_dataval.validate_no_nulls(df, ["device", "language", "has_been_referred"])

Validating column 'device'
Validating column 'language'
Validating column 'has_been_referred'
All columns do not contain nulls.


# Split Data

In [67]:
# For simplicity, randomly split the data into training and test sets by query_id.


# Get unique query_ids
unique_query_ids = df['query_id'].unique()

# Split query_ids into train (80%) and test (20%)
train_query_ids, test_query_ids = train_test_split(
    unique_query_ids, 
    test_size=TRAIN_PROP, 
    random_state=RANDOM_STATE
)

# Create train and test dataframes based on query_id splits
train_df = df[df['query_id'].isin(train_query_ids)].copy()
test_df = df[df['query_id'].isin(test_query_ids)].copy()

print(f"Total queries: {len(unique_query_ids)}")
print(f"Train queries: {len(train_query_ids)} ({len(train_query_ids)/len(unique_query_ids)*100:.1f}%)")
print(f"Test queries: {len(test_query_ids)} ({len(test_query_ids)/len(unique_query_ids)*100:.1f}%)")


Total queries: 10000
Train queries: 2000 (20.0%)
Test queries: 8000 (80.0%)


# Save data

In [68]:
# Save model_df to data/model_df.csv. Parquet likely a better choice, but using csv for simplicity and to reduce dependencies.
train_df.to_csv('../data/train_df.csv', index=False)
test_df.to_csv('../data/test_df.csv', index=False)