In [1]:
import pandas as pd
import numpy as np

# --- 1. Load Data ---
print("Loading data...")
file_path = 'data/train.csv'

# Only the columns named here are parsed from the CSV; every other column in
# the file is skipped at read time.
cols_to_use = [
    'click',
    'hour',
    'banner_pos',
    'site_category',
    'app_category',
    'device_type',
    'device_conn_type',
    'device_id',
    'C1',
    'C14',
    'C15',
    'C16',
    'C17',
    'C18',
    'C19',
    'C20',
    'C21',
]

# Read at most the first 3,000,000 data rows of the file.
df = pd.read_csv(
    file_path,
    usecols=cols_to_use,
    nrows=3_000_000,
)

print("Data loaded. Shape:", df.shape)

# --- 2. Create Basic Time & Behavioral Features ---
print("Creating basic features...")
# Order rows by 'hour' so the running count below only looks at earlier rows.
# pandas' default sort algorithm (quicksort) is not stable, so rows that share
# the same 'hour' could be reordered relative to the file; a stable sort keeps
# tied rows in their original order, which makes user_ad_count follow the
# file's row order within each hour.
df = df.sort_values('hour', kind='mergesort')

# hour_of_day: the last two characters of 'hour' rendered as text.
# Presumably 'hour' is a YYMMDDHH stamp, making this 'HH' — confirm against
# the data. Note that the resulting column holds strings, not integers.
df['hour_of_day'] = df['hour'].astype(str).str[-2:]

# user_ad_count: for each row, the number of preceding rows (in the sorted
# order) with the same device_id. The first row of a device gets 0 and the
# current row is never counted, so the feature only uses earlier rows.
df['user_ad_count'] = df.groupby('device_id').cumcount()

# --- 3. Split Data *BEFORE* Advanced Encoding ---
# The hold-out set is carved off before any target statistics are computed,
# so the encodings built afterwards are derived from training rows only.
# NOTE(review): this is a shuffled random split (stratified on 'click'), not
# a chronological one, so test rows may predate training rows — confirm that
# this is intended given the time-ordered features created above.
from sklearn.model_selection import train_test_split
import gc

train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['click'],
)

# The combined frame is no longer needed: drop the name and run a collection
# pass so its memory can be reclaimed.
del df
gc.collect()

print("Data split complete. Training shape:", train_df.shape, "Testing shape:", test_df.shape)

# --- 4. Advanced Target Encoding ---
print("Performing Target Encoding...")
target = 'click'
# Mean of the target over the training rows; used later as the fallback value
# for categories that have no entry in a lookup.
global_mean = train_df[target].mean()

# Columns whose raw values will be replaced by a per-category target mean.
high_card_features = [
    'banner_pos', 'site_category', 'app_category', 'device_id',
    'device_type', 'device_conn_type',
    'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]

# Build one lookup per column: category value -> mean of the target among the
# training rows holding that value.
# NOTE(review): the means are unsmoothed and are later mapped back onto
# train_df itself, so a category that occurs once in training (likely common
# for device_id) is encoded as its own label — consider smoothing or
# out-of-fold encoding.
encoding_maps = {}
for col in high_card_features:
    print(f"Encoding {col}...")
    encoding_maps[col] = train_df[target].groupby(train_df[col]).mean()

# --- 5. Apply Encodings to Both Datasets ---
def apply_target_encoding(df, encoding_maps, global_mean):
    """Return a copy of ``df`` with the mapped columns replaced by encodings.

    Args:
        df: Input frame; it is copied and left unmodified.
        encoding_maps: Mapping of column name to a lookup (category value ->
            encoded value) accepted by ``Series.map``.
        global_mean: Value used wherever the lookup produces a missing value,
            e.g. for categories absent from the lookup.

    Returns:
        A new DataFrame in which every column named in ``encoding_maps`` holds
        the looked-up values; all other columns are unchanged.
    """
    encoded = df.copy()
    for column in encoding_maps:
        lookup = encoding_maps[column]
        # Translate categories to their encoded value, then fall back to the
        # global value for anything the lookup did not cover.
        encoded[column] = encoded[column].map(lookup).fillna(global_mean)
    return encoded

# Encode the training and test frames with the same lookups, both learned
# from the training rows; categories without a lookup entry receive
# global_mean.
train_df_encoded = apply_target_encoding(
    df=train_df,
    encoding_maps=encoding_maps,
    global_mean=global_mean,
)
test_df_encoded = apply_target_encoding(
    df=test_df,
    encoding_maps=encoding_maps,
    global_mean=global_mean,
)

print("Feature Engineering complete!")

Loading data...
Data loaded. Shape: (3000000, 17)
Creating basic features...


ModuleNotFoundError: No module named 'sklearn'