# E-Commerce Intent Prediction - Model Visualization

Bu notebook, model sonuçlarını görselleştirir:
- Confusion Matrix
- ROC Curve
- Feature Importance
- Pandas vs Spark Karşılaştırması

**Yazarlar:** Abdulkadir Külçe, Berkay Türk, Umut Çalıkkasap

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import train_test_split
# Install a blanket "ignore" filter so warnings raised after this point are
# not shown in cell output.
# NOTE(review): this also hides warnings that may matter (deprecations,
# undefined-metric notices) — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Style
# Apply the matplotlib style sheet first, then seaborn's 'husl' palette.
# Both calls change global plotting defaults, so their relative order may
# affect the resulting colours — keep it as is.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

print("Libraries loaded!")

## 1. Veri Yükleme ve Model Eğitimi

Görselleştirme için sample veri kullanıyoruz.

In [None]:
# Load the raw event log and keep a reproducible random sample of its rows.
SAMPLE_RATE = 0.05  # fraction of rows kept (5% sample)
DATA_PATH = "../data/2019-Oct.csv"

# Only the columns that the preprocessing and feature-engineering cells read.
# The whole file is parsed before sampling, so skipping unused columns lowers
# peak memory without changing which rows end up in the sample.
USED_COLS = [
    'event_time', 'event_type', 'product_id', 'category_code',
    'brand', 'price', 'user_session',
]

print(f"Loading data (sample rate: {SAMPLE_RATE*100}%)...")
df = pd.read_csv(DATA_PATH, usecols=USED_COLS)

# Fixed seed: every run selects the same rows.
# NOTE(review): rows (events) are sampled independently, so a session keeps
# only a subset of its events — confirm this is intended for session-level
# features.
df = df.sample(frac=SAMPLE_RATE, random_state=42)
print(f"Loaded {len(df):,} rows")

In [None]:
# Preprocessing (leakage-free): normalise columns, then keep only the events
# that happen no later than each session's first purchase.
import gc

print("Preprocessing...")
df['event_time'] = pd.to_datetime(df['event_time'])
for text_col in ('category_code', 'brand'):
    # Missing categories/brands become an explicit 'unknown' bucket.
    df[text_col] = df[text_col].fillna('unknown')

# Leakage prevention, step 1: earliest purchase time of every session that
# contains at least one purchase event.
is_purchase = df['event_type'] == 'purchase'
purchases = df.loc[is_purchase, ['user_session', 'event_time']]
first_purchase = (
    purchases.groupby('user_session')['event_time']
    .min()
    .rename('purchase_timestamp')
    .reset_index()
)

# Step 2: attach that time to every event of the session; sessions without
# a purchase get a missing purchase_timestamp from the left join.
df = df.merge(first_purchase, on='user_session', how='left')

# Step 3: keep all events of non-purchasing sessions, and for purchasing
# sessions keep events up to and including the first purchase itself.
never_purchased = df['purchase_timestamp'].isna()
not_after_purchase = df['event_time'] <= df['purchase_timestamp']
mask = never_purchased | not_after_purchase
df_clean = df.loc[mask].copy()

# The unfiltered frame is no longer needed; release it before aggregating.
del df
gc.collect()

print(f"Clean data: {len(df_clean):,} rows")

In [None]:
# Feature Engineering: collapse the cleaned event log into one row per
# session (index = user_session) holding the label and six numeric features.
print("Feature engineering...")

# 0/1 indicator columns so that a plain sum counts views / cart additions.
df_clean['is_view'] = (df_clean['event_type'] == 'view').astype(int)
df_clean['is_cart'] = (df_clean['event_type'] == 'cart').astype(int)

# Built-in aggregations only: the previous per-group Python lambdas for the
# label and the session duration were invoked once per session, which is far
# slower than the vectorised equivalents and yields the same values.
session_stats = df_clean.groupby('user_session').agg(
    purchase_rows=('purchase_timestamp', 'count'),  # non-null timestamps
    view_count=('is_view', 'sum'),
    cart_count=('is_cart', 'sum'),
    first_event=('event_time', 'min'),
    last_event=('event_time', 'max'),
    avg_price=('price', 'mean'),
    max_price=('price', 'max'),
    unique_items=('product_id', 'nunique'),
)

session_span = session_stats['last_event'] - session_stats['first_event']

session_features = pd.DataFrame({
    # 1 when the session has a purchase timestamp on any of its rows.
    'label': (session_stats['purchase_rows'] > 0).astype(int),
    'view_count': session_stats['view_count'],
    'cart_count': session_stats['cart_count'],
    # Seconds between the first and last retained event of the session.
    'session_duration': session_span.dt.total_seconds(),
    'avg_price': session_stats['avg_price'],
    'max_price': session_stats['max_price'],
    'unique_items': session_stats['unique_items'],
})
# Aggregates that could not be computed (e.g. no price values) become 0.
session_features = session_features.fillna(0)

print(f"Sessions: {len(session_features):,}")
print(f"Conversion rate: {session_features['label'].mean()*100:.2f}%")

In [None]:
# Undersampling: balance the classes by drawing as many non-purchasing
# sessions as there are purchasing ones, then shuffle the combined set.
print("Undersampling...")

session_label = session_features['label']
minority = session_features.loc[session_label == 1]
majority = session_features.loc[session_label == 0]

# Fixed seeds keep both the draw and the shuffle reproducible.
majority_sampled = majority.sample(n=minority.shape[0], random_state=42)
stacked = pd.concat([minority, majority_sampled])
balanced = stacked.sample(frac=1, random_state=42)

print(f"Balanced dataset: {len(balanced):,} sessions")

In [None]:
# Model Training: fit a small random forest on the balanced sessions and
# keep hard predictions plus class-1 scores for the plots that follow.
print("Training model...")

FEATURE_COLS = [
    'view_count', 'cart_count', 'session_duration',
    'avg_price', 'max_price', 'unique_items',
]

X = balanced.loc[:, FEATURE_COLS]
y = balanced.loc[:, 'label']

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

rf_params = dict(n_estimators=20, max_depth=5, random_state=42, n_jobs=-1)
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# Second probability column (index 1) is used as the purchase score.
y_prob = model.predict_proba(X_test)[:, 1]

print("Model trained!")

## 2. Confusion Matrix

In [None]:
# Confusion Matrix: heatmap of actual vs. predicted class on the test split,
# saved as a PNG, followed by the per-class classification report.

# savefig does not create missing folders; make sure the output directory
# exists so the notebook also runs on a fresh checkout.
os.makedirs('../figures', exist_ok=True)

class_ids = [0, 1]
class_names = ['No Purchase', 'Purchase']

fig, ax = plt.subplots(figsize=(8, 6))

# Pin the label order so the matrix is always 2x2 and lines up with the tick
# labels, even if one class happens to be absent from the test split.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
            xticklabels=class_names,
            yticklabels=class_names)

ax.set_xlabel('Predicted', fontsize=12)
ax.set_ylabel('Actual', fontsize=12)
ax.set_title('Confusion Matrix - Intent Prediction Model', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('../figures/confusion_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nClassification Report:")
# Same pinned labels, so target_names always matches the reported classes.
print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

## 3. ROC Curve

In [None]:
# ROC Curve: test-split ROC with its AUC in the legend, drawn against the
# diagonal of a random classifier, and saved as a PNG.
curve_color = '#2ecc71'
chance_color = '#e74c3c'

fig, ax = plt.subplots(figsize=(8, 6))

fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

ax.plot(fpr, tpr, color=curve_color, lw=2.5,
        label=f'ROC Curve (AUC = {roc_auc:.4f})')
ax.plot([0, 1], [0, 1], color=chance_color, lw=1.5, linestyle='--',
        label='Random Classifier')

# Shade the area under the curve.
ax.fill_between(fpr, tpr, alpha=0.3, color=curve_color)

# A little headroom above 1.0 keeps the top of the curve off the frame.
ax.set(xlim=[0.0, 1.0], ylim=[0.0, 1.05])
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curve - Purchase Intent Prediction', fontsize=14, fontweight='bold')
ax.legend(loc='lower right', fontsize=11)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../figures/roc_curve.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Feature Importance

In [None]:
# Feature Importance: horizontal bar chart of the forest's importances
# (largest at the top), saved as a PNG, plus a printed ranking.
fig, ax = plt.subplots(figsize=(10, 6))

importance = pd.DataFrame({
    'feature': FEATURE_COLS,
    'importance': model.feature_importances_
})
# Ascending order so that barh draws the most important feature on top.
importance = importance.sort_values('importance', ascending=True)

n_features = len(importance)
colors = plt.cm.Blues(np.linspace(0.4, 0.9, n_features))
bars = ax.barh(importance['feature'], importance['importance'], color=colors)

# Write each value just to the right of its bar, vertically centred.
for bar, val in zip(bars, importance['importance']):
    bar_middle = bar.get_y() + bar.get_height() / 2
    ax.text(val + 0.01, bar_middle, f'{val:.3f}', va='center', fontsize=10)

ax.set_xlabel('Importance', fontsize=12)
ax.set_title('Feature Importance - Random Forest Model', fontsize=14, fontweight='bold')
# Extra room on the right for the value labels.
ax.set_xlim([0, importance['importance'].max() * 1.15])

plt.tight_layout()
plt.savefig('../figures/feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nFeature Importance Ranking:")
# Reverse the ascending frame to list the most important feature first.
for ranked in importance.iloc[::-1].itertuples(index=False):
    print(f"  {ranked.feature}: {ranked.importance:.4f}")

## 5. Pandas vs Spark Karşılaştırması

In [None]:
# Benchmark comparison data (copied from the Progress Report, not measured
# in this notebook). Each list is ordered [Pandas, Spark].
# NOTE(review): AUC and F1 are listed as identical for both frameworks —
# confirm against the report.
benchmark_data = dict([
    ('Framework', ['Pandas', 'Spark']),
    ('Preprocessing (sec)', [780, 30]),  # ~13 min vs ~0.5 min
    ('Training (sec)', [1.84, 485.49]),
    ('AUC', [0.9276, 0.9276]),
    ('F1-Score', [0.8366, 0.8366]),
])

benchmark_df = pd.DataFrame.from_dict(benchmark_data)
benchmark_df

In [None]:
# Time Comparison Chart: side-by-side bars of preprocessing and training
# time for the two frameworks, read from benchmark_df and saved as a PNG.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

colors = ['#3498db', '#e74c3c']  # Pandas, Spark
frameworks = ['Pandas', 'Spark']

# (benchmark_df column, panel title, bar-label format) per panel.
panels = [
    ('Preprocessing (sec)', 'Preprocessing Time', '%.0f sec'),
    ('Training (sec)', 'Training Time', '%.1f sec'),
]

for panel_ax, (column, panel_title, label_fmt) in zip(axes, panels):
    panel_bars = panel_ax.bar(frameworks, benchmark_df[column], color=colors)
    panel_ax.set_ylabel('Time (seconds)', fontsize=12)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.bar_label(panel_bars, fmt=label_fmt)

plt.suptitle('Pandas vs Spark - Performance Comparison (~6GB Data)', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../figures/benchmark_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Small Data Paradox Visualization: two hand-entered score curves over data
# size (log axis) with markers for this dataset and a typical RAM limit.
# The scores are literals defined here, not measurements.
data_sizes = [1, 5, 10, 50, 100, 500, 1000]  # GB
pandas_viable = [1, 1, 0.8, 0, 0, 0, 0]  # Viability score
spark_advantage = [0.2, 0.3, 0.5, 0.8, 0.9, 1.0, 1.0]  # Advantage score

fig, ax = plt.subplots(figsize=(10, 6))

# Shaded zones, drawn in the same order as they appear in the legend.
zones = [
    (pandas_viable, '#3498db', 'Pandas Viable Zone'),
    (spark_advantage, '#e74c3c', 'Spark Advantage Zone'),
]
for scores, zone_color, zone_label in zones:
    ax.fill_between(data_sizes, scores, alpha=0.3, color=zone_color, label=zone_label)

# Vertical reference lines.
ax.axvline(x=6, color='green', linestyle='--', lw=2, label='Our Dataset (~6GB)')
ax.axvline(x=16, color='orange', linestyle=':', lw=2, label='Typical RAM Limit (16GB)')

ax.set_xscale('log')
ax.set_xlabel('Data Size (GB)', fontsize=12)
ax.set_ylabel('Score', fontsize=12)
ax.set_title('"Small Data Paradox" - When to Use Which Framework?', fontsize=14, fontweight='bold')
ax.legend(loc='center right')
ax.set_xlim([1, 1000])
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../figures/small_data_paradox.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Event Distribution (Funnel Analysis)

In [None]:
# Funnel data (copied from the Progress Report): share of each event type.
# 'Count' is kept for reference only; the charts below plot 'Percentage'.
funnel_data = {
    'Event': ['View', 'Cart', 'Purchase'],
    'Percentage': [96.1, 2.2, 1.7],
    'Count': [40787794, 933346, 727624]  # Approximate
}
event_names = funnel_data['Event']
event_share = funnel_data['Percentage']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
pie_ax, bar_ax = axes

# Left panel: pie chart, with the Purchase slice pulled out slightly more.
colors_pie = ['#3498db', '#f39c12', '#2ecc71']
explode = (0.02, 0.02, 0.05)
pie_ax.pie(event_share, labels=event_names, autopct='%1.1f%%',
           colors=colors_pie, explode=explode, shadow=True, startangle=90)
pie_ax.set_title('Event Type Distribution', fontsize=14, fontweight='bold')

# Right panel: the same percentages as horizontal bars with value labels.
y_pos = range(len(event_names))
bars = bar_ax.barh(y_pos, event_share, color=colors_pie)
bar_ax.set_yticks(y_pos)
bar_ax.set_yticklabels(event_names)
bar_ax.set_xlabel('Percentage (%)', fontsize=12)
bar_ax.set_title('E-Commerce Funnel', fontsize=14, fontweight='bold')
bar_ax.bar_label(bars, fmt='%.1f%%')
# Upper limit above 100 leaves room for the label of the longest bar.
bar_ax.set_xlim([0, 105])

plt.tight_layout()
plt.savefig('../figures/funnel_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Summary Dashboard

In [None]:
# Final Summary Dashboard: a 2x3 grid that repeats the earlier plots next to
# a text panel of headline numbers, saved as a single PNG.
# The text panel and the timing/funnel numbers are literals typed in here;
# the confusion matrix, ROC curve and importances come from this notebook's
# model (y_test, y_pred, y_prob, importance).
title_style = dict(fontsize=14, fontweight='bold')

fig = plt.figure(figsize=(16, 10))

# Create grid
gs = fig.add_gridspec(2, 3, hspace=0.3, wspace=0.3)

# 1. Metrics Summary (Text)
# NOTE(review): Recall and Accuracy are listed with the same value — confirm.
metrics_text = """
MODEL METRICS
─────────────
AUC:      0.9276
F1-Score: 0.8366
Recall:   0.8385
Accuracy: 0.8385

DATASET
─────────────
Events:   42.4M
Size:     ~6 GB
Sessions: ~9M
Conv.Rate: 6.8%
"""
ax1 = fig.add_subplot(gs[0, 0])
ax1.axis('off')
text_box = dict(boxstyle='round', facecolor='lightblue', alpha=0.5)
ax1.text(0.1, 0.5, metrics_text, fontsize=12, fontfamily='monospace',
         verticalalignment='center', bbox=text_box)
ax1.set_title('Summary', **title_style)

# 2. Confusion Matrix
ax2 = fig.add_subplot(gs[0, 1])
cm = confusion_matrix(y_test, y_pred)
short_names = ['No', 'Yes']
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax2,
            xticklabels=short_names, yticklabels=short_names)
ax2.set(xlabel='Predicted', ylabel='Actual')
ax2.set_title('Confusion Matrix', **title_style)

# 3. ROC Curve
ax3 = fig.add_subplot(gs[0, 2])
fpr, tpr, _ = roc_curve(y_test, y_prob)
ax3.plot(fpr, tpr, color='#2ecc71', lw=2, label=f'AUC = {auc(fpr, tpr):.4f}')
ax3.plot([0, 1], [0, 1], 'r--', lw=1)
ax3.fill_between(fpr, tpr, alpha=0.3, color='#2ecc71')
ax3.set(xlabel='FPR', ylabel='TPR')
ax3.legend(loc='lower right')
ax3.set_title('ROC Curve', **title_style)

# 4. Feature Importance
ax4 = fig.add_subplot(gs[1, 0])
importance_sorted = importance.sort_values('importance', ascending=True)
ax4.barh(importance_sorted['feature'], importance_sorted['importance'], color='steelblue')
ax4.set_xlabel('Importance')
ax4.set_title('Feature Importance', **title_style)

# 5. Time Comparison (seconds converted to minutes; order is Pandas, Spark)
ax5 = fig.add_subplot(gs[1, 1])
x = np.arange(2)
width = 0.35
preprocessing_min = [780/60, 30/60]
training_min = [1.84/60, 485.49/60]
ax5.bar(x - width/2, preprocessing_min, width, label='Preprocessing (min)', color='#3498db')
ax5.bar(x + width/2, training_min, width, label='Training (min)', color='#e74c3c')
ax5.set_xticks(x)
ax5.set_xticklabels(['Pandas', 'Spark'])
ax5.set_ylabel('Time (minutes)')
ax5.legend()
ax5.set_title('Framework Comparison', **title_style)

# 6. Funnel
ax6 = fig.add_subplot(gs[1, 2])
ax6.pie([96.1, 2.2, 1.7], labels=['View', 'Cart', 'Purchase'], autopct='%1.1f%%',
        colors=['#3498db', '#f39c12', '#2ecc71'], explode=(0.02, 0.02, 0.05))
ax6.set_title('Event Distribution', **title_style)

plt.suptitle('E-Commerce Intent Prediction - Results Dashboard', fontsize=18, fontweight='bold', y=1.02)
plt.savefig('../figures/dashboard.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n✓ All visualizations saved to ../figures/")

---
## Sonuç

Bu notebook'ta:
- **Confusion Matrix** ile model tahminlerini görselleştirdik
- **ROC Curve** ile AUC=0.93 performansını gösterdik  
- **Feature Importance** ile cart_count'un en önemli özellik olduğunu doğruladık
- **Pandas vs Spark** karşılaştırması ile "Small Data Paradox"u gösterdik

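**Key Insight:** ~6GB veri için Pandas model eğitiminde çok daha hızlı (1.84 sn vs 485.49 sn), ancak ön işlemede Spark daha hızlı (~30 sn vs ~780 sn); Spark ayrıca fault tolerance ve scalability sağlıyor.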