# User income prediction
## Author: Yotam Dery
## Date: 03/03/2025

# Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

# Data Loading

In [None]:
# Loading the train data (path is relative to the notebook's working directory)
df_train = pd.read_csv('train_home_assignment.csv')
print('train shape is: {}'.format(df_train.shape))

# Loading the test data (transformed and scored in the final-prediction section)
df_test = pd.read_csv('test_home_assignment.csv')
print('test shape is: {}'.format(df_test.shape))

In [None]:
# First look at the train set: display its first rows
df_train.head()

* Seems like the most meaningful features would be indicators for whether a user will take a specific action based on their shopping behavior.

# Train-validation split

* We'd like to first perform the train-validation split to ensure that missing values in the validation set are imputed using training set statistics, <br>
and to prevent data leakage. <br>
We'll use an 80-20 split (80% for training, 20% for validation).

In [None]:
# Define features (every column except the target) and target
# NOTE(review): later cells group/plot by a column named "tag" and fit classifiers on
# y_train - confirm that "org_price_usd_following_30_days" is really the intended target.
X = df_train.drop(columns=["org_price_usd_following_30_days"])
y = df_train["org_price_usd_following_30_days"]

# Perform a reproducible (random_state=42) 80-20 train-validation split.
# NOTE(review): no `stratify=` argument is passed, so this is a plain random split,
# not a stratified one.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Confirm the split sizes
X_train.shape, X_val.shape, y_train.shape, y_val.shape

# EDA

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the training split
X_train.describe()

In [None]:
# Histogram of the target values observed in the training split
target_histogram = go.Histogram(
    x=y_train,
    nbinsx=20,  # upper bound on the number of bins; adjust as needed
    opacity=0.75,
    marker={"color": "blue", "line": {"width": 1}},
)
fig = go.Figure(data=[target_histogram])

# Titles, axis labels, theme and spacing between bars
layout_options = {
    "title": "Distribution of Target Variable in Training Set",
    "xaxis_title": "Target Variable",
    "yaxis_title": "Count",
    "template": "plotly_white",
    "bargap": 0.1,
}
fig.update_layout(**layout_options)

fig.show()

## Univariate Analysis (Feature Distributions)

In [None]:
# Numeric columns whose distributions we want to inspect
numerical_features = [
    "#viewed_ads",
    "#times_visited_website",
    "#products_in_cart",
    "target_product_price",
    "target_product_description_length",
    "age",
]

# One 30-bin histogram per numeric feature, all sharing the same look
histogram_options = {"nbins": 30, "template": "plotly_white"}
for feature in numerical_features:
    fig = px.histogram(
        X_train,
        x=feature,
        title=f"Distribution of {feature}",
        **histogram_options,
    )
    fig.show()

Insights from Univariate Analysis: <br>
1. Some features (e.g., target_product_price) may have right-skewed distributions, meaning log transformation might help. <br> <br>
2. There are some outliers for features like #viewed_ads and #times_visited_website. <br> As the span of the bins is not too large, I do think that these values can be useful for the prediction.

In [None]:
# Categorical columns whose value frequencies we want to inspect
categorical_features = [
    "target_product_price_color",
    "shopper_segment",
    "delivery_time",
    "target_product_category",
]

# One bar chart of category frequencies per feature
for feature in categorical_features:
    # Two-column frequency table: <feature>, "count"
    category_counts = (
        X_train[feature]
        .value_counts()
        .rename_axis(feature)
        .reset_index(name="count")
    )

    bar_options = dict(
        x=feature,
        y="count",
        text=category_counts["count"],
        title=f"Distribution of {feature}",
        labels={feature: feature, "count": "Count"},
        template="plotly_white",
    )
    fig = px.bar(category_counts, **bar_options)
    fig.show()

<b> Insights from Categorical Feature Analysis: </b> <br>

1. Imbalanced categories: some categories may be dominant, while others are underrepresented. <br>
   We may need to group less frequent categories into an "Other" category.

2. Feature encoding decisions: <br>
   Low cardinality features (shopper_segment, delivery_time) → Label Encoding. <br>
   High cardinality features (target_product_category, target_product_price_color) → One-Hot Encoding.

## Bivariate Analysis (Feature Relationships)
In this step, we'll analyze how different features relate to the target variable (tag) to uncover important patterns.

In [None]:
# Numeric columns to compare across the two target classes
numerical_features = [
    "#viewed_ads",
    "#times_visited_website",
    "#products_in_cart",
    "target_product_price",
    "target_product_description_length",
    "age",
]

# Training features and target side by side, so plots can group by the target
df = pd.concat([X_train, y_train], axis=1)

# One boxplot per numeric feature, split and coloured by "tag"
box_options = {"x": "tag", "color": "tag", "template": "plotly_white"}
for feature in numerical_features:
    fig = px.box(
        df,
        y=feature,
        title=f"Distribution of {feature} by Target (tag)",
        labels={"tag": "Tag (0 = No, 1 = Yes)", feature: feature},
        **box_options,
    )
    fig.show()

<b> Insights from Boxplots </b>

Compare distributions for tag=0 vs. tag=1:
1. Q: Are users who added more products to the cart (#products_in_cart) more likely to convert? <br>
   A: We can see that the median number of products in the cart is higher for tag=1 compared to tag=0.
   
2. Q: Does age impact conversions? <br>
   A: The age distributions for tag=0 and tag=1 are quite similar, meaning age does not significantly impact purchase likelihood.

In [None]:
# Categorical features to compare against the target (tag)
categorical_features = ["target_product_price_color", "shopper_segment", "delivery_time"]
df = pd.concat([X_train, y_train], axis=1)

# (tag value, legend entry, bar colour) - colours stay fixed across all charts
tag_styles = [
    (0, "Tag 0 (No)", "blue"),
    (1, "Tag 1 (Yes)", "red"),
]

# One grouped bar chart per feature: category counts for each tag value
for feature in categorical_features:
    df_counts = df.groupby([feature, "tag"]).size().reset_index(name="count")

    fig = go.Figure()
    for tag_value, legend_name, bar_color in tag_styles:
        # Rows of the count table that belong to this tag value
        tag_rows = df_counts[df_counts["tag"] == tag_value]
        fig.add_trace(go.Bar(
            x=tag_rows[feature],
            y=tag_rows["count"],
            name=legend_name,
            marker_color=bar_color,
            opacity=0.8
        ))

    fig.update_layout(
        title=f"Distribution of {feature} by Target (tag)",
        xaxis_title=feature,
        yaxis_title="Count",
        barmode="group",  # side-by-side bars instead of stacked
        template="plotly_white"
    )

    fig.show()

<b> Insights from Categorical Feature Analysis </b>
1. Q: Does shopper_segment affect conversions? <br>
   A: Yes! Heavy shoppers likely have a higher probability of conversion (tag=1) than new shoppers.

2. Q: Does target_product_price_color affect conversions? <br>
   A: Not significantly — there may be minor differences, but no clear trend.


## Correlation Analysis
Identify highly correlated features (which could be redundant).

In [None]:
# Pairwise correlations between the numerical features and the target
df = pd.concat([X_train, y_train], axis=1)
correlation_columns = [
    "#viewed_ads",
    "#times_visited_website",
    "#products_in_cart",
    "target_product_price",
    "target_product_description_length",
    "age",
    "tag",
]
correlation_matrix = df[correlation_columns].corr()

# Annotated Plotly heatmap; coefficients are rounded to two decimals for display
heatmap_options = dict(
    z=np.round(correlation_matrix.values, 2),
    x=correlation_matrix.columns.tolist(),
    y=correlation_matrix.index.tolist(),
    colorscale="blues",
    showscale=True,
)
fig = ff.create_annotated_heatmap(**heatmap_options)

fig.update_layout(title="Feature Correlation Heatmap")

fig.show()

print("Seems like there are no highly correlated numeric features. We might use feature selection techniques afterwards")

## Explore missing values

In [None]:
# Share of missing values per column (in %), restricted to columns that actually
# have missing values and ordered from most to least affected
missing_values = (
    X_train.isnull().mean().mul(100)
    .loc[lambda share: share > 0]
    .sort_values(ascending=False)
)

# Bar chart with the formatted percentage written on each bar
missing_bar = go.Bar(
    x=missing_values.index,
    y=missing_values.values,
    text=[f"{p:.2f}%" for p in missing_values.values],
    textposition='auto',
    marker={"color": 'orange'},
    opacity=0.8,
)
fig = go.Figure(data=[missing_bar])

layout_options = {
    "title": "Percentage of Missing Values per Feature",
    "xaxis_title": "Feature",
    "yaxis_title": "Missing Values (%)",
    "template": "plotly_white",
}
fig.update_layout(**layout_options)

fig.show()

print("Seems like there are 3 features with missing values. We'll fill those missing values based on each features' distribution")

# Feature engineering

In [None]:
# Reminder of what the raw training features look like before engineering new ones
X_train.head()

In [None]:
def create_behavioral_features(df):
    """
    Adds ratio-based behavioral features to a copy of the given dataframe.

    Every denominator is shifted by +1 so that a zero count (no ads viewed, no visits,
    empty cart) yields a finite ratio instead of inf/NaN. The previous implementation
    wrote `a / b + 1`, which Python evaluates as `(a / b) + 1` and therefore still
    divided by zero.

    Parameters:
    df (pd.DataFrame): Input dataframe containing '#products_in_cart', '#viewed_ads',
        '#times_visited_website' and 'target_product_price'.

    Returns:
    pd.DataFrame: Copy of the input dataframe with four additional ratio columns.
    """
    # Copy dataframe to avoid modifying the original
    df = df.copy()

    products_in_cart = df['#products_in_cart']
    viewed_ads = df['#viewed_ads']
    times_visited = df['#times_visited_website']

    # Measures intent - high values indicate users add items quickly
    df['avg_products_in_cart_per_viewed_ads'] = products_in_cart / (viewed_ads + 1)
    # Measures conversion efficiency
    df['avg_products_in_cart_per_times_visited'] = products_in_cart / (times_visited + 1)
    # Measures ad exposure
    df['avg_viewed_ads_per_times_visited'] = viewed_ads / (times_visited + 1)
    # Captures average price per cart item - budget
    df['price_per_product'] = df['target_product_price'] / (products_in_cart + 1)

    return df

In [None]:
def create_categorical_features_encoding(df):
    """
    Transforms categorical features in the given dataframe by:
    1. Applying Label Encoding to low-cardinality categorical features.
    2. Splitting 'target_product_category' into primary and secondary categories.
    3. Applying One-Hot Encoding (first level dropped) to the primary/secondary
       categories and to 'target_product_price_color'.

    The encoding is derived from the dataframe that is passed in, so two dataframes
    with different category sets can produce different codes and columns.

    Parameters:
    df (pd.DataFrame): Input dataframe containing 'shopper_segment', 'delivery_time',
        'target_product_category' and 'target_product_price_color'.

    Returns:
    pd.DataFrame: Transformed copy of the dataframe with encoded categorical features.
    """
    # Copy dataframe to avoid modifying the original
    df = df.copy()

    # Label Encoding for low-cardinality categorical features. Codes follow the sorted
    # category order (the same codes sklearn's LabelEncoder assigns to non-missing
    # values); a missing value gets the code -1.
    label_encode_features = ["shopper_segment", "delivery_time"]
    for feature in label_encode_features:
        df[feature] = df[feature].astype("category").cat.codes.astype("int64")

    # Splitting 'target_product_category' into primary and secondary categories.
    # reindex guarantees both columns exist even when no value contains " - "
    # (str.split(expand=True) then returns a single column).
    category_parts = (
        df["target_product_category"]
        .str.split(" - ", n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    df["primary_target_product_category"] = category_parts[0]
    # If no secondary category exists, use the literal "None".
    # Plain assignment instead of a chained `fillna(inplace=True)`, which does not
    # update the dataframe when pandas Copy-on-Write is active.
    df["secondary_target_product_category"] = category_parts[1].fillna("None")

    # Drop the original 'target_product_category' column
    df = df.drop(columns=["target_product_category"])

    # One-Hot Encoding; dummy columns are appended in the order listed here
    one_hot_features = [
        "primary_target_product_category",
        "secondary_target_product_category",
        "target_product_price_color",
    ]
    df = pd.get_dummies(df, columns=one_hot_features, drop_first=True)

    return df

In [None]:
def apply_sin_cos_transformation(df):
    """
    Encodes the hour of the 'timestamp' column as a point on the unit circle.

    The hour (0-23) is parsed from "%H:%M:%S" strings and mapped to
    sin(2*pi*hour/24) and cos(2*pi*hour/24). The input dataframe is left untouched.

    Parameters:
    df (pd.DataFrame): Input dataframe containing the 'timestamp' column.

    Returns:
    pd.DataFrame: New dataframe without 'timestamp' (and without any 'hour_of_day'
    column) and with 'sin_hour' and 'cos_hour' added.
    """
    # Hour of day (0-23) parsed from the time-of-day strings
    hour_of_day = pd.to_datetime(df["timestamp"], format="%H:%M:%S").dt.hour

    # Angle of that hour on a 24-hour clock face
    hour_angle = 2 * np.pi * hour_of_day / 24

    # drop() returns a new frame, so the caller's dataframe is not modified
    without_time_columns = df.drop(columns=["timestamp", "hour_of_day"], errors="ignore")
    return without_time_columns.assign(
        sin_hour=np.sin(hour_angle),
        cos_hour=np.cos(hour_angle),
    )

# Rationale for the cyclic encoding, printed so it shows up in the cell output.
# The second message used to call XGBoost a distance-based model; it is a tree-based one.
print("Bucketing (morning: 6-12, afternoon: 12-18) creates hard boundaries that may not reflect actual user behavior.")
print("By using sin(hour) and cos(hour), the transition between hours is smooth, so both tree-based models (e.g. XGBoost) and distance-based models can interpret time more effectively.")
print("This preserves the cyclic nature of time, making 23:00 and 01:00 close in value rather than distant.")

In [None]:
def impute_missing_values(df, medians=None):
    """
    Imputes missing values in `#viewed_ads`, `#times_visited_website` and `age`
    with a per-column median.

    Median imputation is robust against outliers and skewed distributions.

    Parameters:
    df (pd.DataFrame): Input dataframe with missing values.
    medians (dict or pd.Series, optional): Fill value per feature, e.g. medians
        computed on the training set. For any feature not present in `medians`
        (or when `medians` is None) the median of `df` itself is used.

    Returns:
    pd.DataFrame: Copy of the input dataframe with missing values imputed.
    """
    # Copy dataframe to avoid modifying the original
    df = df.copy()

    num_features = ["#viewed_ads", "#times_visited_website", "age"]
    for feature in num_features:
        if medians is not None and feature in medians:
            fill_value = medians[feature]
        else:
            fill_value = df[feature].median()
        # Plain assignment instead of a chained `fillna(inplace=True)`, which does not
        # update the dataframe when pandas Copy-on-Write is active
        df[feature] = df[feature].fillna(fill_value)

    return df

In [None]:
# Define the sklearn transformer
class CustomDataTransformer(BaseEstimator, TransformerMixin):
    """
    Custom sklearn Transformer that applies:
    1. Missing value imputation
    2. Categorical feature encoding (Label Encoding & One-Hot Encoding)
    3. Timestamp transformation (Sin/Cos encoding for cyclic time representation)

    Everything data-dependent is learned in `fit` from the data passed to it
    (imputation medians, label-encoding maps, output column layout) and re-applied
    unchanged in `transform`. Validation/test data are therefore never used to derive
    statistics, and every transformed set has the same columns in the same order.

    Behavioral ratio features (`create_behavioral_features`) are currently disabled.
    """

    # Numerical features that are median-imputed
    IMPUTED_FEATURES = ["#viewed_ads", "#times_visited_website", "age"]
    # Low-cardinality categorical features that are label encoded
    LABEL_ENCODED_FEATURES = ["shopper_segment", "delivery_time"]
    # Code assigned to a missing label or to a category that was not seen during fit
    UNSEEN_CATEGORY_CODE = -1

    def fit(self, X, y=None):
        """
        Learn the imputation medians, the label-encoding maps and the output
        column layout from X (the training data).
        """
        self.medians_ = X[self.IMPUTED_FEATURES].median()

        # category -> integer code, codes following the sorted category order
        self.label_maps_ = {
            feature: {
                category: code
                for code, category in enumerate(sorted(X[feature].dropna().unique()))
            }
            for feature in self.LABEL_ENCODED_FEATURES
        }

        # Column layout produced on the fitting data; transform() aligns to it
        self.feature_columns_ = self._apply_steps(X).columns.tolist()
        return self

    def transform(self, X):
        """
        Apply all transformations to the dataset using what was learned in fit.

        Raises:
        sklearn.exceptions.NotFittedError: If called before `fit`.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "feature_columns_")

        X = self._apply_steps(X)

        # Align to the fitted layout: columns missing from this dataframe (e.g. a
        # one-hot category that does not occur in it) are added filled with 0, and
        # columns that were not produced during fit are dropped.
        return X.reindex(columns=self.feature_columns_, fill_value=0)

    def _apply_steps(self, X):
        """Run imputation, categorical encoding and timestamp encoding on a copy of X."""
        X = X.copy()

        # Impute with the medians learned during fit (not with X's own medians)
        for feature in self.IMPUTED_FEATURES:
            X[feature] = X[feature].fillna(self.medians_[feature])

        # Keep the raw labels: the encoding helper derives integer codes from the
        # dataframe it is given, which can differ between datasets
        raw_labels = X[self.LABEL_ENCODED_FEATURES].copy()

        # Apply categorical feature encoding
        X = create_categorical_features_encoding(X)

        # Replace the per-dataframe codes with the codes learned during fit
        for feature in self.LABEL_ENCODED_FEATURES:
            X[feature] = (
                raw_labels[feature]
                .map(self.label_maps_[feature])
                .fillna(self.UNSEEN_CATEGORY_CODE)
                .astype("int64")
            )

        # Apply timestamp transformation (sin/cos encoding)
        X = apply_sin_cos_transformation(X)

        return X

In [None]:
# Initialize the transformer; fit it on the training data only and transform that data
transformer = CustomDataTransformer()
X_train_transformed = transformer.fit_transform(X_train)  # Fit and transform on training set

# Run the same transformation steps on the validation data with the same transformer instance
X_val_transformed = transformer.transform(X_val)  # Transform validation set

# Modeling

In [None]:
def plot_roc_curve(y_true, y_prob, model_name="Model"):
    """
    Draws an interactive Plotly ROC curve next to the random-guess diagonal.

    Parameters:
    y_true (array-like): True binary labels.
    y_prob (array-like): Predicted probabilities for the positive class.
    model_name (str): Name of the model (used in the legend and the title).
    """
    # Points of the ROC curve; the thresholds are not needed for the plot
    fpr, tpr, _ = roc_curve(y_true, y_prob)

    model_trace = go.Scatter(
        x=fpr,
        y=tpr,
        mode='lines',
        name=f"{model_name} ROC Curve",
        line={"color": 'blue'},
    )
    # Diagonal reference line: performance of a random model
    random_trace = go.Scatter(
        x=[0, 1],
        y=[0, 1],
        mode='lines',
        name='Random Guess',
        line={"dash": 'dash', "color": 'black'},
    )

    fig = go.Figure(data=[model_trace, random_trace])
    fig.update_layout(
        title=f"ROC Curve for {model_name}",
        xaxis_title="False Positive Rate (FPR)",
        yaxis_title="True Positive Rate (Recall)",
        template="plotly_white"
    )

    fig.show()

In [None]:
def plot_confusion_matrix(y_true, y_pred, model_name="Model"):
    """
    Generates an interactive confusion matrix heatmap using Plotly.

    Parameters:
    y_true (array-like): True binary labels (0/1).
    y_pred (array-like): Predicted labels (0/1).
    model_name (str): Name of the model (for labeling).
    """
    # Fix the label order so the matrix is always 2x2 and matches the axis ticks
    # below, even when one of the classes is absent from y_true and y_pred
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Cell annotations: the raw counts as text
    annotations = [[f'{value}' for value in row] for row in cm]

    # Generate heatmap using Plotly
    fig = ff.create_annotated_heatmap(
        z=cm,
        x=['Pred: 0', 'Pred: 1'],
        y=['Actual: 0', 'Actual: 1'],
        colorscale="Blues",
        annotation_text=annotations,
        showscale=False
    )

    fig.update_layout(
        title=f"Confusion Matrix for {model_name}",
        xaxis_title="Predicted Label",
        yaxis_title="True Label",
        template="plotly_white"
    )

    fig.show()

## Create a baseline model

In [None]:
# Baseline: a shallow decision tree (depth capped at 5 to limit overfitting),
# trained on the transformed training set
baseline_model = DecisionTreeClassifier(max_depth=5, random_state=42)
baseline_model.fit(X_train_transformed, y_train)

# Validation-set predictions: hard labels and positive-class probabilities
y_pred = baseline_model.predict(X_val_transformed)
y_prob = baseline_model.predict_proba(X_val_transformed)[:, 1]

# Label-based metrics first, then the probability-based ROC-AUC
label_scorers = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
}
baseline_metrics = {
    metric_name: scorer(y_val, y_pred) for metric_name, scorer in label_scorers.items()
}
baseline_metrics["ROC-AUC"] = roc_auc_score(y_val, y_prob)

# Single-row results table
baseline_results = pd.DataFrame([baseline_metrics])
print("Baseline Model Performance:\n", baseline_results)

In [None]:
# Generate the ROC curve for the baseline Decision Tree model (validation labels vs. predicted probabilities)
plot_roc_curve(y_val, y_prob, model_name="Decision Tree")

In [None]:
# Generate the Confusion Matrix for the Decision Tree baseline model (validation labels vs. predicted labels)
plot_confusion_matrix(y_val, y_pred, model_name="Decision Tree")

## Find best parameters for each advanced model

In [None]:
# Hyperparameter tuning & evaluation helper shared by the advanced models
def tune_and_evaluate_model(model_name, X_train, y_train, X_val, y_val):
    """
    Grid-searches hyperparameters with 5-fold cross-validation (scored by ROC-AUC),
    evaluates the best estimator on the validation set, prints the metrics and
    plots the ROC curve and confusion matrix with the notebook's plotting helpers.

    Parameters:
    model_name (str): "random_forest" or "xgboost"
    X_train (pd.DataFrame): Transformed training features.
    y_train (pd.Series): Training labels.
    X_val (pd.DataFrame): Transformed validation features.
    y_val (pd.Series): Validation labels.

    Returns:
    best_model: Trained model with best parameters.
    best_params (dict): Best hyperparameters found by GridSearchCV.

    Raises:
    ValueError: If model_name is neither "random_forest" nor "xgboost".
    """
    # Registry: model name -> (estimator factory, hyperparameter grid).
    # Factories are lambdas so only the requested estimator is ever instantiated.
    model_registry = {
        "random_forest": (
            lambda: RandomForestClassifier(random_state=42, n_jobs=-1),
            {
                "n_estimators": [50, 100],
                "max_depth": [5, 10],
                "min_samples_split": [2, 5],
            },
        ),
        "xgboost": (
            lambda: xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss"),
            {
                "n_estimators": [50, 100],
                "max_depth": [3, 6],
                "learning_rate": [0.05, 0.1],
            },
        ),
    }

    if model_name not in model_registry:
        raise ValueError("Invalid model name. Choose 'random_forest' or 'xgboost'.")
    build_model, param_grid = model_registry[model_name]

    # Exhaustive search over the grid
    grid_search = GridSearchCV(
        build_model(),
        param_grid,
        cv=5,
        scoring="roc_auc",
        n_jobs=-1,
        verbose=2,
    )
    grid_search.fit(X_train, y_train)

    best_model, best_params = grid_search.best_estimator_, grid_search.best_params_

    # Validation-set predictions: hard labels and positive-class probabilities
    y_pred = best_model.predict(X_val)
    y_prob = best_model.predict_proba(X_val)[:, 1]

    # All metrics are computed before anything is printed
    metric_values = [
        ("Accuracy", accuracy_score(y_val, y_pred)),
        ("Precision", precision_score(y_val, y_pred)),
        ("Recall", recall_score(y_val, y_pred)),
        ("ROC-AUC", roc_auc_score(y_val, y_prob)),
    ]

    display_name = model_name.upper()
    print(f"Best {display_name} Parameters: {best_params}")
    for metric_label, metric_value in metric_values:
        print(f"{metric_label}: {metric_value:.4f}")

    # Diagnostic plots via the existing helpers
    plot_roc_curve(y_val, y_prob, model_name=display_name)
    plot_confusion_matrix(y_val, y_pred, model_name=display_name)

    return best_model, best_params

# Tune, train, and evaluate Random Forest (grid search on the training set, metrics on the validation set)
best_rf_model, best_rf_params = tune_and_evaluate_model("random_forest", X_train_transformed, y_train, X_val_transformed, y_val)

# Tune, train, and evaluate XGBoost (same data and procedure, so the two models are comparable)
best_xgb_model, best_xgb_params = tune_and_evaluate_model("xgboost", X_train_transformed, y_train, X_val_transformed, y_val)

# Final prediction
We can see that the XGBoost model is the best model. We'll choose it to generate our final predictions!

In [None]:
# Apply the same preprocessing steps to the raw test data
X_test_transformed = transformer.transform(df_test)  # Reuse the transformer instance created for the training data

In [None]:
# Drop the "id" column if it is present (errors="ignore" makes this a no-op otherwise)
X_test_transformed = X_test_transformed.drop(columns=["id"], errors="ignore")

In [None]:
# Predict probabilities for the test set with the tuned XGBoost model
y_test_prob = best_xgb_model.predict_proba(X_test_transformed)[:, 1]  # Get probability for class 1

# Predict class labels with the same model
y_test_pred = best_xgb_model.predict(X_test_transformed)

In [None]:
# Create a DataFrame for submission.
# The test frame is `df_test` (the previous `test_df` was undefined and raised NameError).
# Use its own "id" column when it has one; otherwise fall back to the row index.
test_ids = df_test["id"] if "id" in df_test.columns else df_test.index
test_predictions = pd.DataFrame({
    "id": np.asarray(test_ids),     # plain array, so rows pair up by position
    "predicted_prob": y_test_prob,  # Probability of class 1
    "predicted_class": y_test_pred  # Predicted class label
})

# Save to CSV file
test_predictions.to_csv("test_predictions.csv", index=False)
print("Predictions saved to test_predictions.csv")

In [None]:
# Compute evaluation metrics for the test set.
# `y_test` was never defined in this notebook, so it is derived here from the test
# file when that file carries the target column; an unlabeled test file is skipped
# instead of raising NameError.
target_column = y_train.name
y_test = df_test[target_column] if target_column in df_test.columns else None

if y_test is None:
    print(f"Test set has no '{target_column}' column - skipping test-set evaluation.")
else:
    accuracy_test = accuracy_score(y_test, y_test_pred)
    precision_test = precision_score(y_test, y_test_pred)
    recall_test = recall_score(y_test, y_test_pred)
    auc_test = roc_auc_score(y_test, y_test_prob)

    # Print performance metrics (the test predictions come from the XGBoost model)
    print("Test Set Performance for XGBoost:")
    print(f"Accuracy: {accuracy_test:.4f}")
    print(f"Precision: {precision_test:.4f}")
    print(f"Recall: {recall_test:.4f}")
    print(f"ROC-AUC: {auc_test:.4f}")


In [None]:
# Plot ROC Curve using existing function.
# Needs ground-truth test labels: skipped when `y_test` is undefined or None
# (unlabeled test set) instead of raising NameError.
if globals().get("y_test") is not None:
    plot_roc_curve(y_test, y_test_prob, model_name="XGBoost (Test Set)")
else:
    print("No test labels available - skipping the test-set ROC curve.")

In [None]:
# Plot Confusion Matrix using existing function.
# Needs ground-truth test labels: skipped when `y_test` is undefined or None
# (unlabeled test set) instead of raising NameError.
if globals().get("y_test") is not None:
    plot_confusion_matrix(y_test, y_test_pred, model_name="XGBoost (Test Set)")
else:
    print("No test labels available - skipping the test-set confusion matrix.")