# Predictive Analytics
## Customer Satisfaction Prediction Using Decision Trees

In this predictive analysis, we predict whether customers will be satisfied with their orders, so that satisfaction levels can be maintained and unhappy customers can be identified early enough to take countermeasures.

Our goal is to train a decision tree classifier that predicts whether a customer will give a high rating (4-5 stars) or a low rating (1-3 stars), based on factors such as delivery time, shipping costs, and product details.

Much of this notebook's Python code was generated by Claude 4 via its Visual Studio Code integration, which was also used for suggestions and improvements. However, everything has been reviewed and tailored for the exam manually, and we take full responsibility for the content.

In [1]:
# Import required libraries for predictive analytics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Database connection
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv

# Machine Learning libraries
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                           confusion_matrix, classification_report)
from sklearn.preprocessing import LabelEncoder
import random

# Notebook-wide display defaults: never truncate DataFrame columns when
# printing, and draw every figure with the seaborn-flavoured matplotlib theme
pd.options.display.max_columns = None
plt.style.use('seaborn-v0_8')

# Every artefact produced by this analysis is written below this directory;
# create it on first use and report which case applied
EXPORT_DIR = "predictive_exports"
if os.path.exists(EXPORT_DIR):
    print(f"Using existing export directory: {EXPORT_DIR}")
else:
    os.makedirs(EXPORT_DIR)
    print(f"Created export directory: {EXPORT_DIR}")

# Function to safely save plots avoiding file conflicts
def safe_plot_save(filename, dpi=300, close=True):
    """Save the current matplotlib figure as a timestamped PNG in EXPORT_DIR.

    The file is written as ``<stem>_<HHMMSS>.png``, where ``<stem>`` is
    ``filename`` without its final extension, so a new export does not
    overwrite (or collide with a locked copy of) an earlier one.

    Args:
        filename: Desired file name. Only the last extension is stripped, so
            dotted names such as ``"model.v2.png"`` keep their full stem.
        dpi: Resolution passed to ``plt.savefig``.
        close: Close the current figure afterwards. Defaults to ``True``
            (the original behaviour); pass ``False`` to keep the figure
            open, e.g. to display it after saving.

    Returns:
        The path of the written file, or ``None`` if saving failed.
    """
    try:
        timestamp = datetime.now().strftime("%H%M%S")
        # splitext removes only the trailing extension; splitting on the
        # first '.' would truncate dotted names and empty out './plot.png'
        stem, _ = os.path.splitext(filename)
        full_path = os.path.join(EXPORT_DIR, f"{stem}_{timestamp}.png")
        plt.savefig(full_path, dpi=dpi, bbox_inches='tight')
    except Exception as e:
        # Deliberately best-effort: a failed export must not abort the analysis
        print(f"Could not save {filename}: {e}")
        return None
    else:
        print(f"Plot saved as: {full_path}")
        return full_path
    finally:
        # Runs on success and on failure, so the figure is never leaked
        if close:
            plt.close()

# Function to get export path for any file
def get_export_path(filename):
    """Return the location of `filename` inside the export directory."""
    export_path = os.path.join(EXPORT_DIR, filename)
    return export_path

# Confirm that the setup cell ran to completion and where exports will go
for status_line in (
    "Libraries loaded successfully",
    "Customer Satisfaction Prediction Ready",
    f"All exports will be saved to: {EXPORT_DIR}",
):
    print(status_line)

Using existing export directory: predictive_exports
Libraries loaded successfully
Customer Satisfaction Prediction Ready
All exports will be saved to: predictive_exports


In [2]:
# Database connection and data extraction
# Reads the connection URL from the environment (a .env file is supported via
# load_dotenv), runs one query that joins reviews with customer, seller,
# product, sales and delivery data, and loads the result into `df`.
load_dotenv()
DB_URL = os.getenv('DB_URL')
if not DB_URL:
    # Without this guard create_engine(None) fails with an opaque SQLAlchemy
    # error; fail early with a message that says what to configure instead.
    raise RuntimeError("DB_URL is not set - define it in the environment or in a .env file")
engine = create_engine(DB_URL)

print("Connecting to database...")

# Query to get customer satisfaction data
# NOTE(review): fact_delivery_performance is joined on order_id only and
# fact_sales on (order_id, product_key); the data-quality check in the next
# cell reports duplicated review_key values, so one review can apparently
# appear on several rows - confirm that this grain is intended.
satisfaction_query = """
SELECT
    fcr.review_key,
    fcr.order_id,
    fcr.review_score,

    -- Customer demographics
    dc.customer_state,
    dc.customer_region,

    -- Seller information
    ds.seller_state,
    ds.seller_region,

    -- Product characteristics
    dp.product_category_english,
    dp.product_category_level_1,
    dp.product_weight_grams,
    dp.product_size_category,
    COALESCE(dp.product_length_cm * dp.product_height_cm * dp.product_width_cm, 0) as product_volume_cm3,

    -- Order financial metrics
    fs.total_item_value,
    fs.freight_value,
    fs.payment_value,

    -- Delivery performance (key satisfaction driver)
    fdp.estimated_delivery_days,
    fdp.actual_delivery_days,
    fdp.delivery_delay_days,
    fdp.is_on_time,
    fdp.order_value as total_order_value,
    fdp.item_count,

    -- Geographic distance factor
    CASE
        WHEN dc.customer_state = ds.seller_state THEN 'Same State'
        WHEN dc.customer_region = ds.seller_region THEN 'Same Region'
        ELSE 'Different Region'
    END as shipping_distance,

    -- Price competitiveness
    CASE
        WHEN fs.total_item_value > (fs.total_item_value + fs.freight_value) * 0.8 THEN 'Low Freight'
        WHEN fs.freight_value > fs.total_item_value * 0.3 THEN 'High Freight'
        ELSE 'Normal Freight'
    END as freight_category,

    -- Order complexity
    CASE
        WHEN fdp.item_count = 1 THEN 'Simple'
        WHEN fdp.item_count <= 3 THEN 'Medium'
        ELSE 'Complex'
    END as order_complexity

FROM fact_customer_reviews fcr
JOIN dim_customer dc ON fcr.customer_key = dc.customer_key
JOIN dim_seller ds ON fcr.seller_key = ds.seller_key
JOIN dim_product dp ON fcr.product_key = dp.product_key
JOIN fact_sales fs ON fcr.order_id = fs.order_id AND fcr.product_key = fs.product_key
JOIN fact_delivery_performance fdp ON fcr.order_id = fdp.order_id
WHERE fcr.review_score IS NOT NULL
    AND fdp.is_delivered = true
    AND dc.is_current = true
    AND dp.product_weight_grams > 0
    AND fdp.actual_delivery_days IS NOT NULL
ORDER BY fcr.review_key;
"""

print("Loading data...")
df = pd.read_sql(satisfaction_query, engine)

# Row and column counts of the raw query result; the column count includes
# identifier columns such as review_key and order_id
print("\nDataset Overview:")
print(f"Total reviews: {len(df):,}")
print(f"Features: {df.shape[1]} variables")
print(f"Date range: 2016-2018")
print(f"Goal: Customer satisfaction prediction")

Connecting to database...
Loading data...

Dataset Overview:
Total reviews: 104,650
Features: 24 variables
Date range: 2016-2018
Goal: Customer satisfaction prediction


In [3]:
# Looking at the customer satisfaction data
# Adds the binary target to `df` and prints data-quality figures, the class
# split, the per-score breakdown and a few headline averages.
print("CUSTOMER SATISFACTION ANALYSIS")

# Create target variable: High satisfaction (4-5 stars) vs Low satisfaction (1-3 stars)
df['high_satisfaction'] = (df['review_score'] >= 4).astype(int)

# Display sample records
print("\nSample customer reviews:")
display(df[['order_id', 'review_score', 'high_satisfaction', 'actual_delivery_days',
           'delivery_delay_days', 'customer_region', 'total_order_value']].head())

# Check data quality
print(f"\nData Quality Check:")
print(f"Missing values: {df.isnull().sum().sum()}")
print(f"Duplicate reviews: {df['review_key'].duplicated().sum()}")
# This counts distinct order_id values, so it is labelled as orders; the
# query result carries no customer identifier to count customers with
print(f"Unique orders: {df['order_id'].nunique():,}")

# Target variable distribution
satisfaction_dist = df['high_satisfaction'].value_counts(normalize=True)
# .get() keeps the report working even if one class is absent from the data
high_share = satisfaction_dist.get(1, 0.0)
low_share = satisfaction_dist.get(0, 0.0)
print(f"\nSatisfaction Split:")
print(f"High Satisfaction (4-5 stars): {high_share:.1%} ({df['high_satisfaction'].sum():,} reviews)")
print(f"Low Satisfaction (1-3 stars): {low_share:.1%} ({(df['high_satisfaction']==0).sum():,} reviews)")

# Review score breakdown
print(f"\nDetailed Review Scores:")
score_dist = df['review_score'].value_counts().sort_index()
for score, count in score_dist.items():
    percentage = count / len(df) * 100
    stars = '⭐' * int(score)
    print(f"{stars} {score}: {count:,} reviews ({percentage:.1f}%)")

# Key business metrics
print(f"\nKey Stats:")
print(f"Average review score: {df['review_score'].mean():.2f}/5.0")
print(f"Average delivery time: {df['actual_delivery_days'].mean():.1f} days")
print(f"Average delay: {df['delivery_delay_days'].mean():.1f} days")
print(f"On-time delivery rate: {df['is_on_time'].mean():.1%}")
print(f"Average order value: R$ {df['total_order_value'].mean():.2f}")

CUSTOMER SATISFACTION ANALYSIS

Sample customer reviews:


Unnamed: 0,order_id,review_score,high_satisfaction,actual_delivery_days,delivery_delay_days,customer_region,total_order_value
0,fc046d7776171871436844218f817d7d,5,1,27,0,Southeast,145.0
1,d4665434b01caa9dc3e3e78b3eb3593e,5,1,3,-22,Southeast,60.0
2,e28abf2eb2f1fbcbdc2dd0cd9a561671,5,1,7,-16,Southeast,150.0
3,04fb47576993a3cb0c12d4b25eab6e4e,5,1,4,-9,Southeast,259.9
4,5f358d797a49fe2f24352f73426215f6,5,1,1,-12,Southeast,68.89



Data Quality Check:
Missing values: 1456
Duplicate reviews: 9388
Total customers: 95,065

Satisfaction Split:
High Satisfaction (4-5 stars): 77.8% (81,414 reviews)
Low Satisfaction (1-3 stars): 22.2% (23,236 reviews)

Detailed Review Scores:
⭐ 1: 11,226 reviews (10.7%)
⭐⭐ 2: 3,336 reviews (3.2%)
⭐⭐⭐ 3: 8,674 reviews (8.3%)
⭐⭐⭐⭐ 4: 20,442 reviews (19.5%)
⭐⭐⭐⭐⭐ 5: 60,972 reviews (58.3%)

Key Stats:
Average review score: 4.11/5.0
Average delivery time: 12.4 days
Average delay: -11.9 days
On-time delivery rate: 93.4%
Average order value: R$ 148.67


In [4]:
# Satisfaction drivers analysis and visualization
# Eight-panel overview (2 x 4 grid) of how the binary target relates to
# delivery, cost, geography and product attributes. Panels 2-6 plot the
# row-normalised share of low/high satisfaction per group.
plt.figure(figsize=(16, 12))

# 1. Satisfaction distribution
plt.subplot(2, 4, 1)
# value_counts() orders by frequency; pin the order to [high, low] so the
# slices always line up with the hard-coded labels and colours below
satisfaction_counts = df['high_satisfaction'].value_counts().reindex([1, 0], fill_value=0)
plt.pie(satisfaction_counts.values, labels=['High Satisfaction', 'Low Satisfaction'],
        autopct='%1.1f%%', colors=['#4ecdc4', '#ff6b6b'])
plt.title('Customer Satisfaction Distribution')

# 2. Satisfaction by delivery performance
plt.subplot(2, 4, 2)
delivery_satisfaction = pd.crosstab(df['is_on_time'], df['high_satisfaction'], normalize='index')
delivery_satisfaction.plot(kind='bar', ax=plt.gca(), color=['#ff6b6b', '#4ecdc4'])
plt.title('Satisfaction by Delivery Performance')
# assumes is_on_time is boolean, so the sorted crosstab index is [False, True]
plt.xticks([0, 1], ['Delayed', 'On Time'], rotation=0)
plt.legend(['Low Satisfaction', 'High Satisfaction'])

# 3. Satisfaction by shipping distance
plt.subplot(2, 4, 3)
distance_satisfaction = pd.crosstab(df['shipping_distance'], df['high_satisfaction'], normalize='index')
distance_satisfaction.plot(kind='bar', ax=plt.gca(), color=['#ff6b6b', '#4ecdc4'])
plt.title('Satisfaction by Shipping Distance')
plt.xticks(rotation=45)
plt.legend(['Low Satisfaction', 'High Satisfaction'])

# 4. Satisfaction by freight category
plt.subplot(2, 4, 4)
freight_satisfaction = pd.crosstab(df['freight_category'], df['high_satisfaction'], normalize='index')
freight_satisfaction.plot(kind='bar', ax=plt.gca(), color=['#ff6b6b', '#4ecdc4'])
plt.title('Satisfaction by Freight Cost')
plt.xticks(rotation=45)
plt.legend(['Low Satisfaction', 'High Satisfaction'])

# 5. Delivery delay impact
plt.subplot(2, 4, 5)
# Open-ended outer edges: with fixed edges of -50 and 100, delays outside that
# range (the recorded data spans -147 to 188) became NaN and were silently
# dropped from this chart
delay_bins = pd.cut(df['delivery_delay_days'], bins=[-np.inf, 0, 5, 15, np.inf],
                   labels=['Early', '0-5 days', '6-15 days', '15+ days'])
delay_satisfaction = pd.crosstab(delay_bins, df['high_satisfaction'], normalize='index')
delay_satisfaction.plot(kind='bar', ax=plt.gca(), color=['#ff6b6b', '#4ecdc4'])
plt.title('Satisfaction by Delivery Delay')
plt.xticks(rotation=45)
plt.legend(['Low Satisfaction', 'High Satisfaction'])

# 6. Order value impact
plt.subplot(2, 4, 6)
# Terciles instead of three equal-width bins: order values are heavily
# right-skewed (recorded median 89.9, max 13440), so equal-width bins put
# almost every order into 'Low Value' and left the other bars near-empty
value_bins = pd.qcut(df['total_order_value'], q=3, labels=['Low Value', 'Medium Value', 'High Value'])
value_satisfaction = pd.crosstab(value_bins, df['high_satisfaction'], normalize='index')
value_satisfaction.plot(kind='bar', ax=plt.gca(), color=['#ff6b6b', '#4ecdc4'])
plt.title('Satisfaction by Order Value')
plt.xticks(rotation=45)
plt.legend(['Low Satisfaction', 'High Satisfaction'])

# 7. Regional satisfaction patterns
plt.subplot(2, 4, 7)
region_satisfaction = df.groupby('customer_region')['high_satisfaction'].mean().sort_values(ascending=True)
region_satisfaction.plot(kind='barh', ax=plt.gca(), color='#95a5a6')
plt.title('Satisfaction Rate by Region')
plt.xlabel('High Satisfaction Rate')

# 8. Product category satisfaction (six most frequent level-1 categories)
plt.subplot(2, 4, 8)
top_categories = df['product_category_level_1'].value_counts().head(6)
category_satisfaction = df[df['product_category_level_1'].isin(top_categories.index)].groupby(
    'product_category_level_1')['high_satisfaction'].mean().sort_values(ascending=True)
category_satisfaction.plot(kind='barh', ax=plt.gca(), color='#e74c3c')
plt.title('Satisfaction by Product Category')
plt.xlabel('High Satisfaction Rate')

plt.tight_layout()
safe_plot_save('customer_satisfaction_analysis.png')
# NOTE(review): safe_plot_save closes the figure by default, so this show()
# likely has nothing left to display - confirm whether inline display is wanted
plt.show()

print("Satisfaction analysis visualizations saved")
print("Key patterns identified for predictive modeling")

Plot saved as: predictive_exports\customer_satisfaction_analysis_201335.png
Satisfaction analysis visualizations saved
Key patterns identified for predictive modeling


In [5]:
# Feature engineering for decision tree model
print("FEATURE ENGINEERING")

# Work on a copy of the query result and keep only rows that have the review
# score plus the core delivery/value fields
model_df = df.copy().dropna(
    subset=['review_score', 'actual_delivery_days', 'total_order_value']
)

# Text-valued columns; each gets an integer-coded "<name>_encoded" twin below
categorical_features = [
    'customer_region',
    'seller_region',
    'shipping_distance',
    'freight_category',
    'order_complexity',
    'product_size_category'
]

# Columns that are already numeric and enter the model unchanged
numerical_features = [
    'estimated_delivery_days',
    'actual_delivery_days',
    'delivery_delay_days',
    'total_order_value',
    'freight_value',
    'item_count',
    'product_weight_grams'
]

# One encoder per categorical column, kept in a dict so the same mapping can
# be applied to new rows later
label_encoders = {feature: LabelEncoder() for feature in categorical_features}
print("\nEncoding categorical features:")

for feature, le in label_encoders.items():
    # astype(str) turns missing values into an ordinary text category
    model_df[f'{feature}_encoded'] = le.fit_transform(model_df[feature].astype(str))
    print(f"✓ {feature}: {len(le.classes_)} categories")

# Final column order of the feature matrix: numeric columns, then encoded ones
encoded_features = [f'{feat}_encoded' for feat in categorical_features]
all_features = [*numerical_features, *encoded_features]

# Feature matrix and target vector
X = model_df[all_features].copy()
y = model_df['high_satisfaction'].copy()

# Any gap still left in a feature column is filled with that column's median
X = X.fillna(X.median())

print(f"\nModel preparation:")
print(f"Feature matrix: {X.shape}")
print(f"Target vector: {y.shape}")
print(f"Features: {len(all_features)} total")
print(f"Class balance: {y.mean():.1%} high satisfaction")

# Descriptive statistics of every model input
print(f"\nFeature summary:")
display(X.describe().round(2))

FEATURE ENGINEERING

Encoding categorical features:
✓ customer_region: 5 categories
✓ seller_region: 5 categories
✓ shipping_distance: 3 categories
✓ freight_category: 3 categories
✓ order_complexity: 3 categories
✓ product_size_category: 4 categories

Model preparation:
Feature matrix: (104650, 13)
Target vector: (104650,)
Features: 13 total
Class balance: 77.8% high satisfaction

Feature summary:


Unnamed: 0,estimated_delivery_days,actual_delivery_days,delivery_delay_days,total_order_value,freight_value,item_count,product_weight_grams,customer_region_encoded,seller_region_encoded,shipping_distance_encoded,freight_category_encoded,order_complexity_encoded,product_size_category_encoded
count,104650.0,104650.0,104650.0,104650.0,104650.0,104650.0,104650.0,104650.0,104650.0,104650.0,104650.0,104650.0,104650.0
mean,24.37,12.42,-11.95,148.67,20.0,1.33,2098.65,3.38,3.78,0.99,0.89,1.78,0.89
std,8.79,9.39,10.06,253.34,15.66,1.0,3750.19,1.11,0.62,0.86,0.55,0.49,0.8
min,3.0,0.0,-147.0,0.85,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,7.0,-17.0,48.9,13.13,1.0,300.0,3.0,4.0,0.0,1.0,2.0,0.0
50%,24.0,10.0,-12.0,89.9,16.28,1.0,700.0,4.0,4.0,1.0,1.0,2.0,1.0
75%,29.0,16.0,-7.0,159.8,21.15,1.0,1800.0,4.0,4.0,2.0,1.0,2.0,1.0
max,156.0,208.0,188.0,13440.0,409.68,21.0,40425.0,4.0,4.0,2.0,2.0,2.0,3.0


In [6]:
# Model training setup
print("MODEL TRAINING SETUP")

# Set random seed
# Fixed rather than drawn with random.randint: a seed that changes on every
# run makes the split, the tree and the cross-validation folds irreproducible.
# 7567 is the seed printed by the recorded run of this notebook, so on the
# same data its recorded results should be reproducible.
RANDOM_SEED = 7567
print(f"Random seed: {RANDOM_SEED}")

# Train-test split: 90% training, 10% testing (as shown by Peyman)
# stratify=y keeps the class ratio the same in both parts.
# NOTE(review): the earlier data-quality check reports duplicated review_key
# values; with a purely random split, rows of the same review can land in
# both parts - consider de-duplicating or a grouped split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=RANDOM_SEED, stratify=y
)

print(f"\nData split:")
print(f"Training set: {len(X_train):,} samples (90%)")
print(f"Testing set: {len(X_test):,} samples (10%)")
print(f"Features: {X_train.shape[1]}")

# Check stratification worked
print(f"\nClass distribution check:")
print(f"Training - High satisfaction: {y_train.mean():.1%}")
print(f"Testing - High satisfaction: {y_test.mean():.1%}")
print(f"Overall - High satisfaction: {y.mean():.1%}")

print(f"\nReady for Decision Tree training")

MODEL TRAINING SETUP
Random seed: 7567

Data split:
Training set: 94,185 samples (90%)
Testing set: 10,465 samples (10%)
Features: 13

Class distribution check:
Training - High satisfaction: 77.8%
Testing - High satisfaction: 77.8%
Overall - High satisfaction: 77.8%

Ready for Decision Tree training


In [7]:
# Decision Tree Model Development
print("DECISION TREE MODEL TRAINING")

# Hyper-parameters collected in one place
tree_params = {
    'max_depth': 6,              # not too deep, to avoid overfitting
    'min_samples_split': 50,     # a node needs enough samples before it may split
    'min_samples_leaf': 25,      # minimum samples kept in each leaf
    'class_weight': 'balanced',  # handle class imbalance
    'random_state': RANDOM_SEED,
}
dt_model = DecisionTreeClassifier(**tree_params)

# Fit on the training split only
print("Training Decision Tree...")
dt_model.fit(X_train, y_train)

# Predict both splits so training and testing scores can be compared
y_pred_train = dt_model.predict(X_train)
y_pred_test = dt_model.predict(X_test)

# Calculate performance metrics
print(f"\nMODEL PERFORMANCE")

# The same four scorers are applied, in this order, to each split
scorers = (accuracy_score, precision_score, recall_score, f1_score)
train_accuracy, train_precision, train_recall, train_f1 = [
    scorer(y_train, y_pred_train) for scorer in scorers
]
test_accuracy, test_precision, test_recall, test_f1 = [
    scorer(y_test, y_pred_test) for scorer in scorers
]

print(f"Training Performance:")
print(f"  Accuracy:  {train_accuracy:.3f} ({train_accuracy:.1%})")
print(f"  Precision: {train_precision:.3f} ({train_precision:.1%})")
print(f"  Recall:    {train_recall:.3f} ({train_recall:.1%})")
print(f"  F1-Score:  {train_f1:.3f} ({train_f1:.1%})")

print(f"\nTesting Performance:")
print(f"  Accuracy:  {test_accuracy:.3f} ({test_accuracy:.1%})")
print(f"  Precision: {test_precision:.3f} ({test_precision:.1%})")
print(f"  Recall:    {test_recall:.3f} ({test_recall:.1%})")
print(f"  F1-Score:  {test_f1:.3f} ({test_f1:.1%})")

# Overfitting check: grade the train-minus-test accuracy gap first, then report
overfitting = train_accuracy - test_accuracy
if overfitting < 0.05:
    overfitting_verdict = f"  Good - low overfitting"
elif overfitting < 0.15:
    overfitting_verdict = f"  Moderate overfitting"
else:
    overfitting_verdict = f"  High overfitting - might need simpler model"
print(f"\nOverfitting Check:")
print(f"  Accuracy gap: {overfitting:.3f}")
print(overfitting_verdict)

# Per-class precision/recall/F1 on the test split
print(f"\nDetailed Results:")
detailed_report = classification_report(
    y_test,
    y_pred_test,
    target_names=['Low Satisfaction', 'High Satisfaction'],
)
print(detailed_report)

DECISION TREE MODEL TRAINING
Training Decision Tree...

MODEL PERFORMANCE
Training Performance:
  Accuracy:  0.721 (72.1%)
  Precision: 0.859 (85.9%)
  Recall:    0.767 (76.7%)
  F1-Score:  0.811 (81.1%)

Testing Performance:
  Accuracy:  0.712 (71.2%)
  Precision: 0.857 (85.7%)
  Recall:    0.756 (75.6%)
  F1-Score:  0.804 (80.4%)

Overfitting Check:
  Accuracy gap: 0.009
  Good - low overfitting

Detailed Results:
                   precision    recall  f1-score   support

 Low Satisfaction       0.40      0.56      0.46      2324
High Satisfaction       0.86      0.76      0.80      8141

         accuracy                           0.71     10465
        macro avg       0.63      0.66      0.63     10465
     weighted avg       0.75      0.71      0.73     10465



In [8]:
# Confusion Matrix Analysis
# Breaks the test-set predictions down into TN/FP/FN/TP, derives the rates
# from those four counts and saves a heatmap of the matrix.
print("CONFUSION MATRIX ANALYSIS")

# Generate confusion matrix (rows = actual, columns = predicted).
# labels=[0, 1] pins the result to 2x2 even if one class is missing from the
# test labels and predictions, so the four-way unpacking below cannot fail.
cm = confusion_matrix(y_test, y_pred_test, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print(f"Confusion Matrix:")
print(f"                    Predicted")
print(f"Actual    Low Sat  High Sat")
print(f"Low Sat     {tn:4d}      {fp:4d}    (TN: {tn}, FP: {fp})")
print(f"High Sat    {fn:4d}      {tp:4d}    (FN: {fn}, TP: {tp})")

# Calculate confusion matrix metrics (each guarded against a zero denominator)
print(f"\nConfusion Matrix Calculations:")
accuracy = (tp + tn) / (tp + tn + fp + fn)
true_positive_rate = tp / (tp + fn) if (tp + fn) > 0 else 0  # Recall/Sensitivity
true_negative_rate = tn / (tn + fp) if (tn + fp) > 0 else 0  # Specificity
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = true_positive_rate  # recall is the true positive rate by definition

print(f"Accuracy = (TP + TN) / (TP + TN + FP + FN) = {accuracy:.3f}")
print(f"True Positive Rate = TP / (TP + FN) = {true_positive_rate:.3f}")
print(f"True Negative Rate = TN / (TN + FP) = {true_negative_rate:.3f}")
print(f"Precision = TP / (TP + FP) = {precision:.3f}")
print(f"Recall = TP / (TP + FN) = {recall:.3f}")

# What this means for business
print(f"\nWhat this means:")
print(f"True Negatives (Correctly predicted low satisfaction): {tn:,}")
print(f"False Positives (Predicted high, actually low): {fp:,}")
print(f"False Negatives (Predicted low, actually high): {fn:,}")
print(f"True Positives (Correctly predicted high satisfaction): {tp:,}")

print(f"\nModel Reliability:")
print(f"Correctly identifies low satisfaction: {true_negative_rate:.1%}")
print(f"Correctly identifies high satisfaction: {true_positive_rate:.1%}")
print(f"Overall accuracy: {accuracy:.1%}")

# Visualize confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Low Satisfaction', 'High Satisfaction'],
            yticklabels=['Low Satisfaction', 'High Satisfaction'])
plt.title('Confusion Matrix - Customer Satisfaction Prediction')
plt.xlabel('Predicted')
plt.ylabel('Actual')

safe_plot_save('confusion_matrix_satisfaction.png')
# NOTE(review): safe_plot_save closes the figure by default, so this show()
# likely has nothing left to display - confirm whether inline display is wanted
plt.show()

print("Confusion matrix saved")

CONFUSION MATRIX ANALYSIS
Confusion Matrix:
                    Predicted
Actual    Low Sat  High Sat
Low Sat     1298      1026    (TN: 1298, FP: 1026)
High Sat    1984      6157    (FN: 1984, TP: 6157)

Confusion Matrix Calculations:
Accuracy = (TP + TN) / (TP + TN + FP + FN) = 0.712
True Positive Rate = TP / (TP + FN) = 0.756
True Negative Rate = TN / (TN + FP) = 0.559
Precision = TP / (TP + FP) = 0.857
Recall = TP / (TP + FN) = 0.756

What this means:
True Negatives (Correctly predicted low satisfaction): 1,298
False Positives (Predicted high, actually low): 1,026
False Negatives (Predicted low, actually high): 1,984
True Positives (Correctly predicted high satisfaction): 6,157

Model Reliability:
Correctly identifies low satisfaction: 55.9%
Correctly identifies high satisfaction: 75.6%
Overall accuracy: 71.2%
Plot saved as: predictive_exports\confusion_matrix_satisfaction_201337.png
Confusion matrix saved


In [9]:
# K-Fold Cross Validation
# Estimates how stable the tree's scores are across five stratified folds of
# the training data, then plots the per-fold score distribution.
print("K-FOLD CROSS VALIDATION")

# Imported here because the notebook's import cell only brings in cross_val_score
from sklearn.model_selection import cross_validate

# Setup K-Fold with stratification
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

# Run cross-validation for multiple metrics.
# A single multi-metric cross_validate call fits the model once per fold
# (5 fits) instead of once per fold per metric (20 fits with four separate
# cross_val_score calls). The folds and the seeded estimator are unchanged,
# so the scores are the same.
print("Running 5-fold cross-validation...")
cv_results = cross_validate(
    dt_model, X_train, y_train, cv=cv_folds,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
)
cv_accuracy = cv_results['test_accuracy']
cv_precision = cv_results['test_precision']
cv_recall = cv_results['test_recall']
cv_f1 = cv_results['test_f1']

print(f"\nCross-Validation Results:")
print(f"Accuracy:  {cv_accuracy.mean():.3f} ± {cv_accuracy.std():.3f}")
print(f"Precision: {cv_precision.mean():.3f} ± {cv_precision.std():.3f}")
print(f"Recall:    {cv_recall.mean():.3f} ± {cv_recall.std():.3f}")
print(f"F1-Score:  {cv_f1.mean():.3f} ± {cv_f1.std():.3f}")

print(f"\nFold-by-fold accuracy:")
for i, score in enumerate(cv_accuracy, 1):
    print(f"Fold {i}: {score:.3f}")

# Check model stability: grade the spread of accuracy across folds
accuracy_std = cv_accuracy.std()
print(f"\nModel Stability:")
if accuracy_std < 0.02:
    stability = "Excellent - very stable"
elif accuracy_std < 0.05:
    stability = "Good - stable performance"
else:
    stability = "Fair - some variability"

print(f"Performance stability: {stability}")
print(f"Standard deviation: {accuracy_std:.3f}")

# Check precision vs recall balance
precision_recall_diff = abs(cv_precision.mean() - cv_recall.mean())
print(f"\nPrecision vs Recall Balance:")
print(f"Difference: {precision_recall_diff:.3f}")
if precision_recall_diff < 0.1:
    print(f"Good balance between precision and recall")
else:
    print(f"Large gap - might need model tuning")

# Visualize cross-validation results (one box per metric, five folds each)
plt.figure(figsize=(10, 6))
metrics_data = [cv_accuracy, cv_precision, cv_recall, cv_f1]
plt.boxplot(metrics_data, labels=['Accuracy', 'Precision', 'Recall', 'F1-Score'])
plt.title('Cross-Validation Performance Distribution')
plt.ylabel('Score')
plt.grid(True, alpha=0.3)

safe_plot_save('cross_validation_results.png')
# NOTE(review): safe_plot_save closes the figure by default, so this show()
# likely has nothing left to display - confirm whether inline display is wanted
plt.show()

print("Cross-validation results saved")

K-FOLD CROSS VALIDATION
Running 5-fold cross-validation...

Cross-Validation Results:
Accuracy:  0.721 ± 0.009
Precision: 0.858 ± 0.003
Recall:    0.770 ± 0.016
F1-Score:  0.811 ± 0.008

Fold-by-fold accuracy:
Fold 1: 0.729
Fold 2: 0.706
Fold 3: 0.718
Fold 4: 0.732
Fold 5: 0.722

Model Stability:
Performance stability: Excellent - very stable
Standard deviation: 0.009

Precision vs Recall Balance:
Difference: 0.088
Good balance between precision and recall
Plot saved as: predictive_exports\cross_validation_results_201342.png
Cross-validation results saved


In [10]:
# Feature Importance Analysis
print("FEATURE IMPORTANCE ANALYSIS")

# Pair every model input with the importance the fitted tree assigned to it,
# most important first
importance_table = pd.DataFrame({
    'feature': all_features,
    'importance': dt_model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print(f"\nTop 10 Most Important Features:")
leading_ten = feature_importance.head(10)
for i, (feature_name, importance) in enumerate(
        zip(leading_ten['feature'], leading_ten['importance']), 1):
    print(f"{i:2d}. {feature_name}: {importance:.3f}")

# Text rendering of the fitted tree; only the first 20 lines are printed
print(f"\nDecision Tree Rules (sample):")
tree_rules = export_text(dt_model, feature_names=all_features,
                        class_names=['Low Satisfaction', 'High Satisfaction'])
rules_lines = tree_rules.split('\n')[:20]
for line in rules_lines:
    if not line.strip():
        continue  # skip blank lines
    print(line)

# The five strongest drivers, listed again as the headline result
print(f"\nKey Satisfaction Drivers:")
top_features = feature_importance.head(5)
for i, (feature, importance) in enumerate(
        zip(top_features['feature'], top_features['importance']), 1):
    print(f"{i}. {feature}: {importance:.3f}")

# Horizontal bar chart of the ten largest importances
plt.figure(figsize=(12, 8))
top_features_plot = feature_importance.head(10)
bar_positions = range(len(top_features_plot))
plt.barh(bar_positions, top_features_plot['importance'],
         color='steelblue', alpha=0.8)
plt.yticks(bar_positions, top_features_plot['feature'])
plt.xlabel('Feature Importance Score')
plt.title('Top 10 Customer Satisfaction Drivers')
plt.gca().invert_yaxis()  # largest importance at the top

# Write each bar's value just to the right of the bar
for i, v in enumerate(top_features_plot['importance']):
    plt.text(v + 0.005, i, f'{v:.3f}', va='center', fontweight='bold')

plt.tight_layout()
safe_plot_save('feature_importance_satisfaction.png')
plt.show()

print("Feature importance chart saved")

FEATURE IMPORTANCE ANALYSIS

Top 10 Most Important Features:
 1. delivery_delay_days: 0.597
 2. item_count: 0.243
 3. actual_delivery_days: 0.115
 4. estimated_delivery_days: 0.014
 5. shipping_distance_encoded: 0.008
 6. freight_value: 0.006
 7. total_order_value: 0.005
 8. customer_region_encoded: 0.005
 9. product_size_category_encoded: 0.003
10. seller_region_encoded: 0.002

Decision Tree Rules (sample):
|--- delivery_delay_days <= 1.50
|   |--- item_count <= 1.50
|   |   |--- actual_delivery_days <= 16.50
|   |   |   |--- actual_delivery_days <= 9.50
|   |   |   |   |--- actual_delivery_days <= 4.50
|   |   |   |   |   |--- product_size_category_encoded <= 1.50
|   |   |   |   |   |   |--- class: High Satisfaction
|   |   |   |   |   |--- product_size_category_encoded >  1.50
|   |   |   |   |   |   |--- class: High Satisfaction
|   |   |   |   |--- actual_delivery_days >  4.50
|   |   |   |   |   |--- freight_value <= 11.82
|   |   |   |   |   |   |--- class: High Satisfaction
| 

In [11]:
# Decision Tree Visualization
print("DECISION TREE VISUALIZATION")

# Rendering options; max_depth limits the drawing to the upper part of the
# tree so the node text stays readable
tree_plot_options = dict(
    feature_names=all_features,
    class_names=['Low Satisfaction', 'High Satisfaction'],
    filled=True,
    rounded=True,
    fontsize=9,
    max_depth=3,
)

plt.figure(figsize=(20, 12))
plot_tree(dt_model, **tree_plot_options)
plt.title('Customer Satisfaction Decision Tree\n(Top 3 Levels)', fontsize=16, pad=20)

safe_plot_save('decision_tree_satisfaction.png')
plt.show()

# Size of the complete fitted tree (not only the part that was drawn)
print(f"\nDecision Tree Info:")
print(f"Tree depth: {dt_model.get_depth()}")
print(f"Number of leaves: {dt_model.get_n_leaves()}")
print(f"Total nodes: {dt_model.tree_.node_count}")

# Short reading guide for the exported figure
print(f"\nHow to read the tree:")
for reading_hint in (
    f"• Each box is a decision rule",
    f"• Color shows confidence (darker = more confident)",
    f"• 'samples' = how many customers reach this point",
):
    print(reading_hint)

print("Decision tree saved")

DECISION TREE VISUALIZATION
Plot saved as: predictive_exports\decision_tree_satisfaction_201343.png

Decision Tree Info:
Tree depth: 6
Number of leaves: 52
Total nodes: 103

How to read the tree:
• Each box is a decision rule
• Color shows confidence (darker = more confident)
• 'samples' = how many customers reach this point
Decision tree saved


In [12]:
# Testing the model with sample scenarios
# Scores two hand-written orders with the fitted tree and prints the
# predicted class together with the model's probability for that class.
print("TESTING MODEL PREDICTIONS")

# Sample prediction scenarios ('type' is a display label, not a model input)
test_cases = [
    {
        'type': 'Bad Order Example',
        'estimated_delivery_days': 20,
        'actual_delivery_days': 25,
        'delivery_delay_days': 8,
        'total_order_value': 45.0,
        'freight_value': 15.0,
        'item_count': 1,
        'product_weight_grams': 3000,
        # Categorical values
        'customer_region': 'Northeast',
        'seller_region': 'Southeast',
        'shipping_distance': 'Different Region',
        'freight_category': 'High Freight',
        'order_complexity': 'Simple',
        'product_size_category': 'Large'
    },
    {
        'type': 'Good Order Example',
        'estimated_delivery_days': 7,
        'actual_delivery_days': 6,
        'delivery_delay_days': -1,
        'total_order_value': 150.0,
        'freight_value': 8.0,
        'item_count': 2,
        'product_weight_grams': 800,
        # Categorical values
        'customer_region': 'Southeast',
        'seller_region': 'Southeast',
        'shipping_distance': 'Same Region',
        'freight_category': 'Normal Freight',
        'order_complexity': 'Medium',
        'product_size_category': 'Medium'
    }
]

print(f"\nSample Predictions:")
for scenario in test_cases:
    # Read the label instead of pop()-ing it: popping mutated test_cases, so
    # running this loop a second time raised KeyError
    case_type = scenario['type']

    # Create feature vector for prediction from the same feature lists used
    # in training, so names and encodings cannot drift apart
    feature_row = {name: scenario[name] for name in numerical_features}
    for name in categorical_features:
        # Use trained encoders; transform() raises ValueError for a category
        # the encoder did not see when it was fitted
        feature_row[f'{name}_encoded'] = label_encoders[name].transform([scenario[name]])[0]
    # Select columns in training order - the model expects exactly this layout
    sample_features = pd.DataFrame([feature_row])[all_features]

    prediction = dt_model.predict(sample_features)[0]
    probability = dt_model.predict_proba(sample_features)[0]

    print(f"\n{case_type.upper()}:")
    print(f"   Delivery: {scenario['actual_delivery_days']} days (delay: {scenario['delivery_delay_days']} days)")
    print(f"   Order value: R$ {scenario['total_order_value']:.2f}")
    print(f"   Freight: R$ {scenario['freight_value']:.2f}")

    satisfaction_pred = 'High Satisfaction' if prediction == 1 else 'Low Satisfaction'
    # Probability the model assigns to the class it predicted
    confidence = probability[1] if prediction == 1 else probability[0]

    print(f"   Prediction: {satisfaction_pred} ({confidence:.1%} confidence)")

    if prediction == 0:  # Low satisfaction predicted
        print(f"   Action needed: Watch this order closely")
    else:
        print(f"   Looks good: Should be happy customer")

# NOTE(review): in the recorded feature-importance output, freight_value and
# customer_region_encoded each score below 0.01, so insights 2 and 4 do not
# appear to be backed by this model - confirm before presenting them.
print(f"\nKey Insights for Business:")
print(f"1. Delivery performance is the main driver")
print(f"2. High freight costs hurt satisfaction")
print(f"3. Can identify risky orders early")
print(f"4. Regional differences matter")

print(f"\nModel ready for use")

TESTING MODEL PREDICTIONS

Sample Predictions:

BAD ORDER EXAMPLE:
   Delivery: 25 days (delay: 8 days)
   Order value: R$ 45.00
   Freight: R$ 15.00
   Prediction: Low Satisfaction (96.6% confidence)
   Action needed: Watch this order closely

GOOD ORDER EXAMPLE:
   Delivery: 6 days (delay: -1 days)
   Order value: R$ 150.00
   Freight: R$ 8.00
   Prediction: High Satisfaction (56.3% confidence)
   Looks good: Should be happy customer

Key Insights for Business:
1. Delivery performance is the main driver
2. High freight costs hurt satisfaction
3. Can identify risky orders early
4. Regional differences matter

Model ready for use


In [13]:
# Export Results for PowerBI
print("EXPORTING RESULTS")

# 1. Model Performance Summary
# The cross-validation mean/std are not split by train vs. test, so the same
# two values close out both score columns.
cv_summary = [cv_accuracy.mean(), cv_accuracy.std()]
performance_columns = {
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1_Score', 'CV_Accuracy', 'CV_Std'],
    'Training': [train_accuracy, train_precision, train_recall, train_f1, *cv_summary],
    'Testing': [test_accuracy, test_precision, test_recall, test_f1, *cv_summary],
}
model_performance = pd.DataFrame(performance_columns)
performance_path = get_export_path('satisfaction_model_performance.csv')
model_performance.to_csv(performance_path, index=False)

# 2. Feature Importance for Dashboard
# assign() returns a new frame, so feature_importance itself is not modified;
# the extra column is the 'importance' column scaled by 100.
importance_pct = feature_importance['importance'] * 100
feature_importance_export = feature_importance.assign(importance_percentage=importance_pct)
importance_path = get_export_path('satisfaction_feature_importance.csv')
feature_importance_export.to_csv(importance_path, index=False)

# 3. Confusion Matrix Details
# One (metric, value, description) triple per exported row; the triples are
# split back into three parallel columns when building the frame.
confusion_rows = [
    ('True_Negatives', tn, 'Correctly predicted low satisfaction'),
    ('False_Positives', fp, 'Predicted high, actually low'),
    ('False_Negatives', fn, 'Predicted low, actually high'),
    ('True_Positives', tp, 'Correctly predicted high satisfaction'),
    ('True_Positive_Rate', true_positive_rate, 'Sensitivity'),
    ('True_Negative_Rate', true_negative_rate, 'Specificity'),
    ('Precision', precision, 'Positive predictive value'),
    ('Recall', recall, 'Sensitivity'),
]
confusion_metrics = pd.DataFrame({
    'Metric': [metric for metric, _, _ in confusion_rows],
    'Value': [value for _, value, _ in confusion_rows],
    'Description': [note for _, _, note in confusion_rows],
})
confusion_path = get_export_path('satisfaction_confusion_matrix.csv')
confusion_metrics.to_csv(confusion_path, index=False)

# 4. Test Set Predictions
# Start from a copy of the test features and append the true label, the model
# output and derived columns so every exported row is self-describing.
test_predictions = X_test.copy()
test_predictions['actual_satisfaction'] = y_test.values
test_predictions['predicted_satisfaction'] = y_pred_test
# Column 1 of predict_proba is used as the high-satisfaction probability
# (assumes the model's classes_ are ordered [0, 1] -- TODO confirm).
test_predictions['prediction_probability'] = dt_model.predict_proba(X_test)[:, 1]
test_predictions['correct_prediction'] = (y_test.values == y_pred_test).astype(int)

# Add labels for PowerBI
satisfaction_labels = {0: 'Low Satisfaction', 1: 'High Satisfaction'}
test_predictions['actual_label'] = test_predictions['actual_satisfaction'].map(satisfaction_labels)
test_predictions['predicted_label'] = test_predictions['predicted_satisfaction'].map(satisfaction_labels)

# pd.cut builds right-closed bins, i.e. (0, 0.3], (0.3, 0.7], (0.7, 1.0].
# Without include_lowest=True a probability of exactly 0.0 falls outside every
# bin and its risk_category becomes NaN instead of 'High Risk'.
test_predictions['risk_category'] = pd.cut(
    test_predictions['prediction_probability'],
    bins=[0, 0.3, 0.7, 1.0],
    labels=['High Risk', 'Medium Risk', 'Low Risk'],
    include_lowest=True,
)

test_predictions.to_csv(get_export_path('satisfaction_predictions_test_set.csv'), index=False)

# 5. Business Metrics Summary
# Each entry pairs a metric name with its already-formatted display value;
# only the row count is kept as a raw integer.
cv_score_text = f"{cv_accuracy.mean():.1%} ± {cv_accuracy.std():.1%}"
business_rows = [
    ('Total_Reviews_Analyzed', len(model_df)),
    ('High_Satisfaction_Rate', f"{model_df['high_satisfaction'].mean():.1%}"),
    ('Average_Review_Score', f"{model_df['review_score'].mean():.2f}"),
    ('Average_Delivery_Days', f"{model_df['actual_delivery_days'].mean():.1f}"),
    ('Average_Delay_Days', f"{model_df['delivery_delay_days'].mean():.1f}"),
    ('On_Time_Delivery_Rate', f"{model_df['is_on_time'].mean():.1%}"),
    ('Model_Accuracy', f"{test_accuracy:.1%}"),
    ('Model_Precision', f"{test_precision:.1%}"),
    ('Model_Recall', f"{test_recall:.1%}"),
    ('Cross_Validation_Score', cv_score_text),
]
business_metrics = pd.DataFrame({
    'Metric': [metric for metric, _ in business_rows],
    'Value': [value for _, value in business_rows],
})
business_path = get_export_path('satisfaction_business_metrics.csv')
business_metrics.to_csv(business_path, index=False)

# 6. Regional Analysis
# Aggregate per (customer_region, seller_region) pair. Named aggregation gives
# each output column its final name directly, so the result no longer depends
# on renaming flattened MultiIndex columns by position.
regional_analysis = (
    model_df.groupby(['customer_region', 'seller_region'])
    .agg(
        review_count=('high_satisfaction', 'count'),
        satisfaction_rate=('high_satisfaction', 'mean'),
        avg_review_score=('review_score', 'mean'),
        avg_delay_days=('delivery_delay_days', 'mean'),
        avg_order_value=('total_order_value', 'mean'),
    )
    .round(3)
    .reset_index()
)
# satisfaction_rate is already rounded to 3 decimals, so the percentage has at
# most 1 decimal; rounding again only strips binary-float noise
# (e.g. 56.99999999999999) from the exported CSV.
regional_analysis['satisfaction_percentage'] = (
    regional_analysis['satisfaction_rate'] * 100
).round(1)
regional_analysis.to_csv(get_export_path('satisfaction_regional_analysis.csv'), index=False)

# Summarise the export: list the CSV file names written by the steps above and
# repeat the headline test accuracy.
exported_files = (
    'satisfaction_model_performance.csv',
    'satisfaction_feature_importance.csv',
    'satisfaction_confusion_matrix.csv',
    'satisfaction_predictions_test_set.csv',
    'satisfaction_business_metrics.csv',
    'satisfaction_regional_analysis.csv',
)

print("\nExport Complete:")
print(f"Created {len(exported_files)} CSV files in {EXPORT_DIR} folder:")
for file_name in exported_files:
    print(f"  • {file_name}")
print(f"\nModel accuracy: {test_accuracy:.1%}")
print("Main driver: Delivery performance")
print(f"\nAll files saved to: {EXPORT_DIR}")

print("\nANALYSIS COMPLETE")

EXPORTING RESULTS

Export Complete:
Created 6 CSV files in predictive_exports folder:
  • satisfaction_model_performance.csv
  • satisfaction_feature_importance.csv
  • satisfaction_confusion_matrix.csv
  • satisfaction_predictions_test_set.csv
  • satisfaction_business_metrics.csv
  • satisfaction_regional_analysis.csv

Model accuracy: 71.2%
Main driver: Delivery performance

All files saved to: predictive_exports

ANALYSIS COMPLETE
