Data Loading and Merging

In [None]:
import pandas as pd

# Read both raw tables from CSV files in the working directory:
# `sessions` first, then `transactions`.
sessions, transactions = (
    pd.read_csv(csv_path) for csv_path in ("sessions.csv", "transactions.csv")
)

# Flag every session whose (machine_id, site_session_id) pair also appears in
# the transactions table.
session_keys = ['machine_id', 'site_session_id']

# De-duplicate the keys so the left merge below cannot multiply session rows.
purchased_sessions = transactions[session_keys].drop_duplicates()

# `indicator=True` adds a `_merge` column; 'both' marks rows found on both sides.
sessions_with_indicator = sessions.merge(
    purchased_sessions,
    how='left',
    on=session_keys,
    indicator=True,
)
sessions['made_purchase'] = sessions_with_indicator['_merge'] == 'both'

Filtering for Abandonment Labels

In [None]:
# Intent filter: only sessions with more than one page view are kept.
multi_page_mask = sessions['pages_viewed'] > 1
filtered_sessions = sessions.loc[multi_page_mask].copy()

# The abandonment label is the logical negation of the purchase flag.
filtered_sessions['abandoned'] = ~filtered_sessions['made_purchase']

Feature Engineering

Time-of-Day Buckets

In [None]:
import datetime

def bucket_time(hour):
    """Map an hour of the day to a coarse time-of-day label.

    Buckets are half-open ranges: [6, 12) -> 'Morning', [12, 18) -> 'Afternoon',
    [18, 24) -> 'Evening'. Any other value (including hours 0-5 and NaN)
    falls through to 'Late Night'.
    """
    day_parts = (
        ('Morning', 6, 12),
        ('Afternoon', 12, 18),
        ('Evening', 18, 24),
    )
    for label, start, stop in day_parts:
        if start <= hour < stop:
            return label
    return 'Late Night'

# Parse the HH:MM:SS strings, keep only the hour component, then label each
# hour with its time-of-day bucket.
parsed_event_times = pd.to_datetime(filtered_sessions['event_time'], format="%H:%M:%S")
filtered_sessions['event_hour'] = parsed_event_times.dt.hour
filtered_sessions['time_of_day'] = filtered_sessions['event_hour'].map(bucket_time)


Pages Per Minute

In [None]:
# The small constant offset keeps zero-duration sessions from dividing by zero.
# NOTE(review): the column name assumes `duration` is measured in minutes — confirm.
offset_duration = filtered_sessions['duration'] + 0.01
filtered_sessions['pages_per_minute'] = filtered_sessions['pages_viewed'] / offset_duration

Referral Type Bucketing

In [None]:
def classify_referrer(domain):
    """Bucket a referring domain into a coarse traffic-source category.

    Matching is done on a whitespace-stripped, lower-cased copy of the value,
    so 'Facebook.com' and ' facebook.com ' are classified the same way as
    'facebook.com'.

    Parameters
    ----------
    domain : str or missing value
        Referring domain name. Missing (NaN/None), empty, or whitespace-only
        values are treated as direct traffic. Non-string values are converted
        with ``str()`` before matching.

    Returns
    -------
    str
        One of 'Direct', 'Social Media', 'Search Engine', 'Email Campaign',
        or 'Referral Site'. Categories are checked in that order, so a domain
        containing both 'google' and 'mail' is a 'Search Engine'.
    """
    if pd.isna(domain):
        return 'Direct'

    normalized = str(domain).strip().lower()
    if not normalized:
        return 'Direct'

    if 'facebook' in normalized or 'twitter' in normalized:
        return 'Social Media'
    if 'google' in normalized or 'bing' in normalized:
        return 'Search Engine'
    # 'mail' also matches every domain containing 'email'.
    if 'mail' in normalized:
        return 'Email Campaign'
    return 'Referral Site'

# Label each session with the traffic-source category of its referring domain.
referrer_domains = filtered_sessions['ref_domain_name']
filtered_sessions['referral_type'] = referrer_domains.map(classify_referrer)


Income Bracket Binning

In [None]:
def income_bracket(val):
    """Collapse a household-income code into a coarse bracket label.

    Codes 11-12 map to 'Low', 13-15 to 'Medium', and 16-18 to 'High'; any
    other value (including NaN) maps to 'Unknown'.
    NOTE(review): presumably these are the dataset's coded income categories —
    verify the code-to-income mapping against the data dictionary.
    """
    code_groups = (
        ('Low', (11, 12)),
        ('Medium', (13, 14, 15)),
        ('High', (16, 17, 18)),
    )
    for label, codes in code_groups:
        if val in codes:
            return label
    return 'Unknown'

# Translate each session's household-income code into its bracket label.
income_codes = filtered_sessions['household_income']
filtered_sessions['income_bracket'] = income_codes.map(income_bracket)


EDA and Visualizations

Pages Viewed vs. Abandonment

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram of page views with a KDE overlay, coloured by abandonment label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=filtered_sessions,
    x='pages_viewed',
    hue='abandoned',
    bins=30,
    kde=True,
    ax=ax,
)
ax.set_title("Pages Viewed Distribution by Abandonment")
ax.set_xlabel("Pages Viewed")
ax.set_ylabel("Session Count")
plt.show()

Abandonment by Time of Day

In [None]:
# Mean of the boolean `abandoned` flag per bucket, i.e. the abandonment rate.
abandon_by_time = filtered_sessions.groupby("time_of_day", as_index=False)["abandoned"].mean()

# Fix the bar order to follow the day rather than alphabetical sorting.
time_bucket_order = ["Morning", "Afternoon", "Evening", "Late Night"]
time_ax = sns.barplot(
    data=abandon_by_time,
    x="time_of_day",
    y="abandoned",
    order=time_bucket_order,
)
time_ax.set_title("Abandonment Rate by Time of Day")
time_ax.set_ylabel("Abandonment Rate")
plt.show()

Abandonment by Referral Source

In [None]:
# Mean of the boolean `abandoned` flag per referral type, i.e. the abandonment rate.
abandon_by_ref = filtered_sessions.groupby("referral_type", as_index=False)["abandoned"].mean()

# Horizontal bars: category on the y-axis, rate on the x-axis.
ref_ax = sns.barplot(data=abandon_by_ref, y="referral_type", x="abandoned")
ref_ax.set_title("Abandonment Rate by Referral Type")
ref_ax.set_xlabel("Abandonment Rate")
ref_ax.set_ylabel("Referral Type")
plt.show()

Modeling

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

# Select the numeric session features and the abandonment target
features = ['pages_viewed', 'duration', 'pages_per_minute']
X = filtered_sessions[features]
y = filtered_sessions['abandoned']

# Stratified 80/20 train/test split. The fixed random_state makes the split,
# and therefore every AUC computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Standardize: fit the scaler on the training rows only, then apply the same
# transform to the test rows so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic model trained on the standardized features
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)
# Column 1 of predict_proba is the probability of the second class in
# logreg.classes_ (True, i.e. abandoned, for a boolean target).
y_pred_prob = logreg.predict_proba(X_test_scaled)[:, 1]

print("LogReg AUC:", roc_auc_score(y_test, y_pred_prob))

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the unscaled features. class_weight='balanced' reweights
# classes inversely to their frequency; the fixed random_state makes the
# bootstrap samples and feature sub-sampling (and so the AUC) reproducible.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
)
rf.fit(X_train, y_train)
# Column 1 of predict_proba is the probability of the second class in rf.classes_.
rf_preds = rf.predict_proba(X_test)[:, 1]
print("Random Forest AUC:", roc_auc_score(y_test, rf_preds))

XGBoost

In [None]:
import xgboost as xgb

# Derive the positive-class weight from the training labels instead of
# hard-coding it: the usual setting is (negative count) / (positive count).
n_positive = int(y_train.sum())
n_negative = int(len(y_train) - n_positive)
# Fall back to a neutral weight if the training labels contain no positives.
pos_weight = n_negative / n_positive if n_positive else 1.0

# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases —
# confirm the installed version before removing it.
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=pos_weight,
    use_label_encoder=False,
    eval_metric='auc',
    random_state=42,  # fixed seed so the reported AUC is reproducible
)
xgb_model.fit(X_train, y_train)
# Column 1 of predict_proba is the probability of the positive class.
xgb_preds = xgb_model.predict_proba(X_test)[:, 1]

print("XGBoost AUC:", roc_auc_score(y_test, xgb_preds))

SHAP for Explainability

In [None]:
import shap

# Build a SHAP explainer around the fitted XGBoost model.
explainer = shap.Explainer(xgb_model)
# Compute SHAP values for the held-out test features.
shap_values = explainer(X_test)

# Bar chart summarising the SHAP values per feature.
# NOTE(review): by default this should show mean absolute SHAP value per
# feature — confirm against the installed shap version.
shap.plots.bar(shap_values)