# FairLend Kenya - Model Training
## Training Credit Risk Models on Original vs Synthetic Data

This notebook trains credit risk models and compares their performance and fairness.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('../src')

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

from data_processing.bias_detector import BiasDetector

# Use the 'seaborn-v0_8-darkgrid' matplotlib style sheet for all figures below
plt.style.use('seaborn-v0_8-darkgrid')
# IPython magic (not plain Python): render matplotlib figures inline in the notebook
%matplotlib inline

import warnings
# Suppress every warning for the rest of the session; this also hides
# warnings raised by the libraries used in later cells
warnings.filterwarnings('ignore')

## 1. Load and Prepare Data

In [None]:
# Read the original loan-application data and report its (rows, columns) shape
original_csv_path = '../data/sample_data.csv'
df_original = pd.read_csv(original_csv_path)
print(f"Original dataset: {df_original.shape}")

# Only the original data is used in this notebook so far. Once a synthetic
# dataset has been generated, it would be loaded here:
# df_synthetic = pd.read_csv('../data/synthetic/fair_credit_data.csv')

# Last expression of the cell: display the first rows
df_original.head()

## 2. Feature Engineering

In [None]:
def prepare_features(df):
    """Build the modeling frame from raw application data.

    Works on a copy, so the caller's ``df`` is left untouched. Every step is
    skipped when the column(s) it needs are absent.

    Args:
        df: DataFrame of loan applications.

    Returns:
        A ``(frame, encoders)`` tuple: ``frame`` is the copy with the added
        ``*_encoded`` and derived columns, and ``encoders`` maps each encoded
        categorical column name to the ``LabelEncoder`` fitted on it.
    """
    frame = df.copy()

    def has(*names):
        # True only when every requested column exists in the frame
        return all(name in frame.columns for name in names)

    # Integer-encode each categorical column into a sibling '<name>_encoded'
    fitted = {}
    for column in ('location', 'gender', 'business_type', 'education_level'):
        if not has(column):
            continue
        encoder = LabelEncoder()
        frame[column + '_encoded'] = encoder.fit_transform(frame[column])
        fitted[column] = encoder

    # Derived features
    if has('monthly_income', 'loan_amount'):
        # The +1 keeps the denominator non-zero when monthly_income is 0
        income_plus_one = frame['monthly_income'] + 1
        frame['loan_to_income_ratio'] = frame['loan_amount'] / income_plus_one

    if has('mpesa_transaction_count'):
        frame['mpesa_activity_score'] = np.log1p(frame['mpesa_transaction_count'])

    if has('credit_history_months'):
        positive_history = frame['credit_history_months'] > 0
        frame['has_credit_history'] = positive_history.astype(int)

    return frame, fitted

# Build the modeling frame and list the columns that feature preparation added
df_prepared, encoders = prepare_features(df_original)
new_columns = [col for col in df_prepared.columns if col not in df_original.columns]
print(f"\nPrepared features: {df_prepared.shape}")
print(f"\nNew columns: {new_columns}")

## 3. Select Features and Target

In [None]:
# Candidate model inputs. The gender column is left out entirely, but note
# that location and business_type DO enter the model in encoded form even
# though the fairness analysis later in this notebook treats them as
# protected attributes.
raw_features = [
    'age',
    'monthly_income',
    'mpesa_transaction_count',
    'mpesa_avg_transaction',
    'sacco_member',
    'existing_loans',
    'credit_history_months',
    'loan_amount',
]
engineered_features = [
    'loan_to_income_ratio',
    'mpesa_activity_score',
    'has_credit_history',
]
encoded_features = [
    'location_encoded',
    'business_type_encoded',
    'education_level_encoded',
]

# Keep only the candidates that are actually present in the prepared frame
feature_cols = [
    name
    for name in raw_features + engineered_features + encoded_features
    if name in df_prepared.columns
]

# Feature matrix and binary target
X = df_prepared[feature_cols]
y = df_prepared['loan_approved']

target_counts = y.value_counts().to_dict()
print(f"Features: {len(feature_cols)}")
print(f"Target distribution: {target_counts}")

## 4. Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split reproducible
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

train_counts = y_train.value_counts().to_dict()
test_counts = y_test.value_counts().to_dict()

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"\nTrain target distribution: {train_counts}")
print(f"Test target distribution: {test_counts}")

## 5. Train Multiple Models

In [None]:
# Candidate classifiers, all seeded identically for reproducibility
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=5),
}

# Metrics computed from hard class predictions (AUC needs probabilities and
# is handled separately inside the loop)
label_metrics = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}

# results[name] holds the fitted model, its test-set metrics and predictions
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    model.fit(X_train, y_train)

    # Hard labels plus the predicted probability of the positive class
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    entry = {'model': model}
    for metric_name, metric_fn in label_metrics.items():
        entry[metric_name] = metric_fn(y_test, y_pred)
    entry['auc'] = roc_auc_score(y_test, y_pred_proba)
    entry['y_pred'] = y_pred
    entry['y_pred_proba'] = y_pred_proba
    results[name] = entry

    print(f"  Accuracy: {entry['accuracy']:.4f}")
    print(f"  AUC: {entry['auc']:.4f}")
    print(f"  F1 Score: {entry['f1']:.4f}")

## 6. Compare Model Performance

In [None]:
# Display name -> key in each results entry
metric_columns = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1 Score': 'f1',
    'AUC': 'auc',
}

# One row per trained model, one column per metric
comparison_rows = [
    {'Model': model_name, **{label: scores[key] for label, key in metric_columns.items()}}
    for model_name, scores in results.items()
]
comparison_df = pd.DataFrame(comparison_rows, columns=['Model', *metric_columns])

print("\nModel Performance Comparison:")
print(comparison_df.round(4))

# Grouped bar chart: one cluster of metric bars per model
fig, ax = plt.subplots(figsize=(12, 6))
scores_by_model = comparison_df.set_index('Model')[list(metric_columns)]
scores_by_model.plot(kind='bar', ax=ax)
ax.set_ylim(0, 1)
ax.set_ylabel('Score')
ax.set_title('Model Performance Comparison')
ax.legend(loc='lower right')
ax.grid(axis='y', alpha=0.3)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## 7. Select Best Model and Analyze

In [None]:
# The best model is the one with the highest F1 score
f1_by_model = comparison_df.set_index('Model')['F1 Score']
best_model_name = f1_by_model.idxmax()
best_model_results = results[best_model_name]
best_predictions = best_model_results['y_pred']

print(f"Best Model: {best_model_name}")
print("\nPerformance Metrics:")
for metric in ('accuracy', 'precision', 'recall', 'f1', 'auc'):
    print(f"  {metric.upper()}: {best_model_results[metric]:.4f}")

# Display names for the two target classes, in the order they are applied
# to the confusion-matrix axes and to the classification report
class_labels = ['Rejected', 'Approved']

# Confusion matrix heatmap (rows: actual, columns: predicted)
cm = confusion_matrix(y_test, best_predictions)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Per-class precision / recall / F1
print("\nDetailed Classification Report:")
print(classification_report(y_test, best_predictions, target_names=class_labels))

## 8. Feature Importance Analysis

In [None]:
# Rank the inputs of the selected model: impurity-style importances when the
# estimator exposes them, otherwise coefficient magnitudes for linear models
best_model = best_model_results['model']

if hasattr(best_model, 'feature_importances_'):
    unsorted_importance = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': best_model.feature_importances_
    })
    feature_importance = unsorted_importance.sort_values('Importance', ascending=False)
    top_features = feature_importance.head(10)

    print("\nTop 10 Most Important Features:")
    print(top_features)

    # Horizontal bars, most important feature at the top
    plt.figure(figsize=(10, 8))
    plt.barh(top_features['Feature'], top_features['Importance'])
    plt.xlabel('Importance')
    plt.title(f'Top 10 Feature Importances - {best_model_name}')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()
elif hasattr(best_model, 'coef_'):
    # Linear model: order by absolute coefficient, keep the sign in the table
    unsorted_coefficients = pd.DataFrame({
        'Feature': feature_cols,
        'Coefficient': best_model.coef_[0]
    })
    feature_importance = unsorted_coefficients.sort_values(
        'Coefficient', key=lambda values: values.abs(), ascending=False
    )

    print("\nTop 10 Most Important Features (by coefficient magnitude):")
    print(feature_importance.head(10))

## 9. Fairness Analysis of Model Predictions

In [None]:
# Attach the best model's predictions to the held-out rows.
# X_test.index holds row *labels*, so the rows are selected with the
# label-based .loc; the positional .iloc would only return the right rows
# while the labels happen to coincide with row positions.
test_df = df_prepared.loc[X_test.index].copy()
test_df['model_prediction'] = best_model_results['y_pred']
test_df['prediction_proba'] = best_model_results['y_pred_proba']

# Per-group comparison of actual vs predicted approval rates
print("="*60)
print("FAIRNESS ANALYSIS OF MODEL PREDICTIONS")
print("="*60)

for attr in ['gender', 'location', 'business_type']:
    if attr not in test_df.columns:
        continue

    print(f"\n{attr.upper()}:")

    # Mean of the 0/1 columns per group = approval rate per group
    group_rates = test_df.groupby(attr)[['loan_approved', 'model_prediction']].mean()

    fairness_df = group_rates.round(3)
    fairness_df.columns = ['Actual Approval Rate', 'Predicted Approval Rate']
    print(fairness_df)

    # Disparate impact of the predictions: lowest group rate divided by the
    # highest, computed on the unrounded rates. Reported as 0 when no group
    # receives any predicted approval (avoids dividing by zero).
    pred_rates = group_rates['model_prediction']
    di_pred = pred_rates.min() / pred_rates.max() if pred_rates.max() > 0 else 0
    print(f"  Disparate Impact (Predictions): {di_pred:.3f}")
    # A ratio below 0.8 is flagged as biased
    print(f"  Status: {'⚠️ BIASED' if di_pred < 0.8 else '✅ FAIR'}")

## 10. Visualize Fairness Metrics

In [None]:
# Side-by-side bar charts of actual vs model-predicted approval rates,
# by gender (left) and by location (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Gender
gender_comparison = test_df.groupby('gender').agg({
    'loan_approved': 'mean',
    'model_prediction': 'mean'
})

x = np.arange(len(gender_comparison))
width = 0.35

axes[0].bar(x - width/2, gender_comparison['loan_approved'], width, label='Actual', alpha=0.8)
axes[0].bar(x + width/2, gender_comparison['model_prediction'], width, label='Model Prediction', alpha=0.8)
axes[0].set_xlabel('Gender')
axes[0].set_ylabel('Approval Rate')
axes[0].set_title('Approval Rates by Gender: Actual vs Predicted')
axes[0].set_xticks(x)
axes[0].set_xticklabels(gender_comparison.index)
# NOTE(review): 0.8 is the disparate-impact *ratio* threshold used in the
# fairness analysis; here it is drawn on an approval-rate axis - confirm
# this is the intended reference line.
axes[0].axhline(y=0.8, color='red', linestyle='--', alpha=0.5, label='Fairness Threshold')
# Build the legend only after the threshold line exists, otherwise its
# label is left out of the legend
axes[0].legend()

# Location: the first five groups in groupby (sorted-key) order, not the
# five highest-rate locations
location_comparison = test_df.groupby('location').agg({
    'loan_approved': 'mean',
    'model_prediction': 'mean'
}).head(5)

x = np.arange(len(location_comparison))
axes[1].bar(x - width/2, location_comparison['loan_approved'], width, label='Actual', alpha=0.8)
axes[1].bar(x + width/2, location_comparison['model_prediction'], width, label='Model Prediction', alpha=0.8)
axes[1].set_xlabel('Location')
axes[1].set_ylabel('Approval Rate')
axes[1].set_title('Approval Rates by Location: Actual vs Predicted')
axes[1].set_xticks(x)
axes[1].set_xticklabels(location_comparison.index, rotation=45, ha='right')
axes[1].legend()

plt.tight_layout()
plt.show()

## 11. Save Model

In [None]:
import joblib
import os

# Make sure the output directory exists (no error if it already does)
os.makedirs('../models', exist_ok=True)

# Persist the selected estimator; the file name embeds the model's name,
# lower-cased with spaces turned into underscores
model_slug = best_model_name.lower().replace(" ", "_")
model_path = f'../models/best_credit_model_{model_slug}.pkl'
joblib.dump(best_model, model_path)

# Persist the feature column list and the fitted label encoders next to the
# model
joblib.dump(feature_cols, '../models/feature_columns.pkl')
joblib.dump(encoders, '../models/label_encoders.pkl')

print(f"\n✅ Model saved to {model_path}")
print("✅ Feature columns saved")
print("✅ Label encoders saved")

## Summary & Next Steps

### Key Findings:
1. **Best Model:** Selected by the highest F1 score on the test set (AUC is reported alongside for comparison)
2. **Model Performance:** Check accuracy, precision, recall metrics
3. **Fairness:** Analyzed disparate impact in model predictions

### Next Steps:
1. Generate synthetic fair data (see `04_synthetic_generation.ipynb`)
2. Retrain model on synthetic data
3. Compare fairness metrics before and after
4. Deploy fair model in production