In [4]:
# Cell 1: Initial Setup and Imports
import pandas as pd
import numpy as np
import os
import sys
# Resolve the project root as the parent of the current working directory.
# This assumes the notebook is launched from 'your_project/notebooks/'.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..'))

# Put the project root first on sys.path so that `src.*` imports resolve.
# Report what actually happened: previously the "Added" message was printed
# even when the path was already present (e.g. when the cell is re-run).
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    print(f"Added '{project_root}' to sys.path.")
else:
    print(f"'{project_root}' is already on sys.path.")
from src.feature_engineering import (
    create_feature_engineering_pipeline,
    RFMCalculator, # Needed for direct RFM calculation for clustering
    FeatureExtractor,
    AggregateFeatures,
    CustomEncoder,
    MissingValueHandler,
    FeatureScaler,
    clean_column_names 
)

# Import modeling libraries
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline


print("All necessary libraries and custom modules imported.")

# Location of the raw transactions file inside the project tree.
DATA_PATH = os.path.join(project_root, 'data', 'raw', 'data.csv')

# Read the raw data. A missing file is reported rather than raised, and
# df_raw is left as None so the later steps can detect it and skip themselves.
try:
    df_raw = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    df_raw = None
    print(f"Error: data.csv not found at {DATA_PATH}. Please ensure the data file is in the correct location: {DATA_PATH}")
else:
    print(f"Raw data loaded successfully. Shape: {df_raw.shape}")
    print("\nRaw data head:")
    display(df_raw.head())

if df_raw is not None:
    print("\n--- Task 4: Proxy Target Variable Engineering ---")
    print("\n--- Preparing Data for RFM Clustering ---")

    # Single-step pipeline around the project's RFMCalculator.
    # NOTE(review): RFMCalculator is defined in src.feature_engineering; the
    # aggregation below assumes its output keeps 'AccountId' and carries
    # 'Recency', 'Frequency' and 'Monetary' columns - confirm there.
    rfm_pipeline_step = Pipeline(steps=[('rfm_calculator', RFMCalculator())])

    # Fit/transform a copy so df_raw itself is not handed to the transformer.
    df_with_rfm = rfm_pipeline_step.fit_transform(df_raw.copy())

    # Collapse to one row per account, taking the first value of each metric.
    rfm_columns = ['Recency', 'Frequency', 'Monetary']
    customer_rfm_df = (
        df_with_rfm
        .groupby('AccountId')
        .agg({column: 'first' for column in rfm_columns})
        .reset_index()
    )

    # Missing Recency is filled with one more than the largest observed value
    # (0 when there are no rows at all); missing Frequency/Monetary become 0.
    if customer_rfm_df['Recency'].empty:
        recency_fill = 0
    else:
        recency_fill = customer_rfm_df['Recency'].max() + 1
    customer_rfm_df = customer_rfm_df.fillna(
        {'Recency': recency_fill, 'Frequency': 0, 'Monetary': 0}
    )

    print(f"Customer RFM data shape: {customer_rfm_df.shape}")
    print("\nCustomer RFM data head:")
    display(customer_rfm_df.head())
    print("\nCustomer RFM data info:")
    customer_rfm_df.info()

    # Keep the account ids on their own for later merging.
    customer_ids = customer_rfm_df[['AccountId']].copy()

    # Only the three RFM metrics are used as clustering inputs.
    rfm_features = customer_rfm_df[rfm_columns]

    print("\nRFM features for clustering (first 5 rows):")
    display(rfm_features.head())
else:
    print("Skipping Task 4: Raw data not loaded.")


if df_raw is not None:
    print("\n--- Scaling RFM Features ---")
    # Standardise each RFM metric before clustering.
    scaler = StandardScaler()
    rfm_scaled = scaler.fit_transform(rfm_features)
    rfm_scaled_df = pd.DataFrame(
        rfm_scaled,
        index=rfm_features.index,
        columns=rfm_features.columns,
    )

    print("\nScaled RFM features (first 5 rows):")
    display(rfm_scaled_df.head())

    print("\n--- Performing K-Means Clustering (K=3) ---")
    # n_init is passed explicitly to suppress the scikit-learn warning.
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(rfm_scaled_df)
    customer_rfm_df['Cluster'] = cluster_labels

    print("\nCustomer RFM data with cluster assignments (first 5 rows):")
    display(customer_rfm_df.head())

    print("\nCluster sizes:")
    print(customer_rfm_df['Cluster'].value_counts().sort_index())

    # Centroids in the standardised space the model was fitted in.
    print("\nCluster centroids (after scaling):")
    cluster_centroids_scaled = pd.DataFrame(
        kmeans.cluster_centers_, columns=rfm_features.columns
    )
    display(cluster_centroids_scaled)

    # The same centroids mapped back to the original RFM units.
    print("\nCluster centroids (original scale - inverse transformed):")
    cluster_centroids_original_scale = pd.DataFrame(
        scaler.inverse_transform(kmeans.cluster_centers_),
        columns=rfm_features.columns,
    )
    display(cluster_centroids_original_scale)

    # Visualize clusters (optional, for analysis): Frequency against Recency,
    # then Frequency against Monetary, coloured by cluster.
    scatter_specs = (
        ('Recency', 'Recency (Days)', 'Customer Clusters based on Recency and Frequency'),
        ('Monetary', 'Monetary Value', 'Customer Clusters based on Monetary and Frequency'),
    )
    for x_column, x_label, plot_title in scatter_specs:
        plt.figure(figsize=(12, 6))
        sns.scatterplot(
            data=customer_rfm_df,
            x=x_column,
            y='Frequency',
            hue='Cluster',
            palette='viridis',
            s=100,
            alpha=0.7,
        )
        plt.title(plot_title)
        plt.xlabel(x_label)
        plt.ylabel('Frequency (Transactions)')
        plt.grid(True)
        plt.show()
else:
    print("Skipping Task 4 K-Means: Raw data not loaded.")
if df_raw is None:
    print("Skipping Task 4 High-Risk Label: Raw data not loaded.")
else:
    print("\n--- Defining and Assigning 'High-Risk' Label ---")

    # K-Means label ids carry no meaning: which segment ends up as cluster 0
    # depends on initialisation and on the data. Instead of hard-coding an id,
    # pick the cluster whose average profile is the most disengaged: high
    # Recency together with low Frequency and low Monetary.
    cluster_profile = (
        customer_rfm_df
        .groupby('Cluster')[['Recency', 'Frequency', 'Monetary']]
        .mean()
    )

    # Standardise the per-cluster means so the three metrics are comparable.
    # A zero spread (all clusters equal on a metric) is replaced by 1 to avoid
    # dividing by zero; that metric then contributes nothing to the score.
    profile_spread = cluster_profile.std(ddof=0).replace(0, 1)
    profile_z = (cluster_profile - cluster_profile.mean()) / profile_spread
    disengagement_score = (
        profile_z['Recency'] - profile_z['Frequency'] - profile_z['Monetary']
    )
    high_risk_cluster_id = int(disengagement_score.idxmax())

    print("\nCluster RFM profile used to select the high-risk cluster:")
    display(cluster_profile.assign(disengagement_score=disengagement_score))

    # 1 for customers in the selected cluster, 0 for everyone else.
    customer_rfm_df['is_high_risk'] = (
        customer_rfm_df['Cluster'].eq(high_risk_cluster_id).astype('int64')
    )

    print(f"Assigned high-risk label (1) to Cluster {high_risk_cluster_id}.")
    print("\nDistribution of 'is_high_risk' label:")
    print(customer_rfm_df['is_high_risk'].value_counts())

    print("\nSample of customer_rfm_df with 'is_high_risk' label:")
    display(customer_rfm_df.sample(5))
# Cell 6: Integrate the Target Variable into the Main Dataset

if df_raw is not None:
    print("\n--- Integrating Target Variable into Main Processed Dataset ---")

    # Build the project's full feature-engineering pipeline and run it on a
    # copy of the raw data.
    full_pipeline = create_feature_engineering_pipeline(
        numerical_imputation_strategy='mean',
        categorical_encoding_method='onehot',
    )
    df_processed_main = full_pipeline.fit_transform(df_raw.copy())

    print(f"Shape of fully processed data before merging target: {df_processed_main.shape}")
    print("\nProcessed data columns before merge:")
    print(df_processed_main.columns.tolist())

    # Attach the per-account label to every processed row. The left join keeps
    # all processed rows; rows whose account has no label are set to 0.
    # NOTE(review): this requires the pipeline output to still contain an
    # 'AccountId' column - confirm against src.feature_engineering.
    account_labels = customer_rfm_df[['AccountId', 'is_high_risk']]
    df_processed_final = (
        df_processed_main
        .merge(account_labels, on='AccountId', how='left')
        .assign(is_high_risk=lambda frame: frame['is_high_risk'].fillna(0).astype(int))
    )

    print(f"Shape of final processed data with target variable: {df_processed_final.shape}")
    print("\nFinal processed data with 'is_high_risk' (first 5 rows):")
    display(df_processed_final.head())
    print("\nDistribution of final 'is_high_risk' label in the full dataset:")
    print(df_processed_final['is_high_risk'].value_counts())
    print("\nInfo of final processed data:")
    df_processed_final.info()

    print("\n--- Task 4 Completed ---")
else:
    print("Skipping Task 4 Integration: Raw data not loaded.")

if df_raw is None:
    print("Skipping Task 5: Raw data not loaded.")
else:
    from pathlib import Path

    print("\n--- Task 5: Model Training and Tracking ---")

    # --- MLflow Setup ---
    # Log to an 'mlruns' directory under the current working directory.
    # The previous URI "file:///./mlruns" has an empty host and the absolute
    # path "/./mlruns", i.e. it pointed at the filesystem (or drive) root
    # rather than a local folder. Building the URI from an absolute path
    # gives a well-formed file URI on every platform.
    tracking_uri = (Path.cwd() / "mlruns").as_uri()
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment("Credit_Risk_Scoring_Model")
    print(f"MLflow tracking URI set to: {mlflow.get_tracking_uri()}")
    print(f"MLflow experiment set to: {mlflow.get_experiment_by_name('Credit_Risk_Scoring_Model').experiment_id}")

    # Identifier columns carry no predictive signal and are removed.
    columns_to_drop = [
        'TransactionId', 'BatchId', 'SubscriptionId', 'CustomerId', # Identifiers
        'AccountId'
    ]

    # errors='ignore' already skips names that are absent from the frame, and
    # the target is not in the list, so no extra filtering is required.
    features_df = df_processed_final.drop(columns=columns_to_drop, errors='ignore')

    # Separate features (X) and target (y)
    # NOTE(review): 'is_high_risk' is derived from per-account RFM clusters.
    # If X still contains Recency/Frequency/Monetary-style columns the models
    # can reconstruct the label almost directly - confirm the column list.
    X = features_df.drop('is_high_risk', axis=1)
    y = features_df['is_high_risk']

    print(f"\nFeatures (X) shape: {X.shape}")
    print(f"Target (y) shape: {y.shape}")
    print("\nFeatures (X) head:")
    display(X.head())
    print("\nTarget (y) value counts:")
    print(y.value_counts())


    # --- Split Data ---
    print("\n--- Splitting Data into Training and Testing Sets ---")

    # 70/30 split, stratified on the target so both parts keep the same class
    # ratio; the fixed seed makes the split reproducible.
    # NOTE(review): the split is row-level while the label is account-level,
    # so rows from one account may land in both parts - consider a grouped
    # split if that matters for the evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    print(f"y_train value counts:\n{y_train.value_counts(normalize=True)}")
    print(f"y_test value counts:\n{y_test.value_counts(normalize=True)}")

    print("\n--- Model Training Setup Complete ---")
# Cell 8: Model Training, Hyperparameter Tuning, and Evaluation with MLflow

if df_raw is None:
    print("Skipping Task 5 Model Training: Raw data not loaded.")
else:
    print("\n--- Training, Tuning, and Evaluating Models ---")

    # Candidate estimators and the hyperparameter grid searched for each.
    models = {
        'Logistic Regression': {
            # liblinear supports both the l1 and l2 penalties in the grid.
            'model': LogisticRegression(solver='liblinear', random_state=42),
            'params': {
                'C': [0.1, 1.0, 10.0],
                'penalty': ['l1', 'l2']
            }
        },
        'Random Forest': {
            'model': RandomForestClassifier(random_state=42),
            'params': {
                'n_estimators': [50, 100, 200],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5]
            }
        },
        'Gradient Boosting': {
            'model': GradientBoostingClassifier(random_state=42),
            'params': {
                'n_estimators': [50, 100],
                'learning_rate': [0.05, 0.1, 0.2],
                'max_depth': [3, 5]
            }
        }
    }

    # --- Prepare the model inputs once; they are identical for every model ---
    print("\n--- Checking model inputs ---")
    print(f"X_train shape: {X_train.shape}")
    print(f"y_train shape: {y_train.shape}")

    # Temporary fix carried over from the original cell: these columns are
    # excluded from both splits so train and test stay consistent.
    # NOTE(review): presumably they are not numerically encoded by the
    # pipeline - confirm and fix upstream.
    problematic_cols = ['ProviderId', 'ProductId']
    X_train_processed = X_train.drop(columns=problematic_cols, errors='ignore').copy()
    X_test_processed = X_test.drop(columns=problematic_cols, errors='ignore').copy()

    current_X_train_for_fit = X_train_processed
    current_X_test_for_predict = X_test_processed

    # Report any non-numeric columns before attempting the conversion.
    non_numeric_cols = current_X_train_for_fit.select_dtypes(exclude=np.number).columns
    if not non_numeric_cols.empty:
        print(f"!!! Warning: Non-numeric columns found in current_X_train_for_fit: {non_numeric_cols.tolist()}")
        for col in non_numeric_cols:
            print(f"Column '{col}' dtypes: {current_X_train_for_fit[col].apply(type).value_counts()}")
            sample_values = current_X_train_for_fit[col][current_X_train_for_fit[col].apply(lambda x: not isinstance(x, (int, float, np.number)))].unique()
            print(f"  Sample non-numeric values in '{col}': {sample_values[:10]}")
    else:
        print("All columns in current_X_train_for_fit are numeric. Good.")

    # Coerce every column of both splits to a numeric dtype. A column that
    # cannot be converted is reported and the error is re-raised, since the
    # estimators cannot be fitted on it.
    for frame_name, frame in (
        ('current_X_train_for_fit', current_X_train_for_fit),
        ('current_X_test_for_predict', current_X_test_for_predict),
    ):
        for col in frame.columns:
            try:
                frame[col] = pd.to_numeric(frame[col], errors='raise')
            except (ValueError, TypeError) as e:
                print(f"Error converting column '{col}' in {frame_name} to numeric: {e}")
                print(f"  Sample values in problematic column: {frame[col].head()}")
                raise

    print("X_train and X_test successfully processed.")

    best_model = None
    best_model_name = None
    best_roc_auc = -1

    # Grid-search each model, evaluate it on the test split and log to MLflow.
    for model_name, config in models.items():
        with mlflow.start_run(run_name=f"{model_name}_GridSearch"):
            print(f"\n--- Training {model_name} ---")

            classifier = config['model']
            param_grid = config['params']

            # Log the full search grid (values stringified for MLflow).
            mlflow.log_params({f"param_{k}": str(v) for k, v in param_grid.items()})

            grid_search = GridSearchCV(classifier, param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)
            grid_search.fit(current_X_train_for_fit, y_train)

            # Best estimator found by the search (refit on the full train split).
            best_clf = grid_search.best_estimator_
            best_params = grid_search.best_params_
            best_score = grid_search.best_score_

            print(f"Best parameters for {model_name}: {best_params}")
            print(f"Best cross-validation ROC-AUC for {model_name}: {best_score:.4f}")

            mlflow.log_params({f"best_param_{k}": v for k, v in best_params.items()})
            mlflow.log_metric("best_cv_roc_auc", best_score)

            # Evaluate on the processed test split.
            y_pred = best_clf.predict(current_X_test_for_predict)
            y_proba = best_clf.predict_proba(current_X_test_for_predict)[:, 1]  # Probability of the positive class

            # zero_division=0 keeps the metrics defined (and silent) when the
            # model predicts no positives at all.
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, zero_division=0)
            recall = recall_score(y_test, y_pred, zero_division=0)
            f1 = f1_score(y_test, y_pred, zero_division=0)
            roc_auc = roc_auc_score(y_test, y_proba)

            print(f"Test Accuracy: {accuracy:.4f}")
            print(f"Test Precision: {precision:.4f}")
            print(f"Test Recall: {recall:.4f}")
            print(f"Test F1 Score: {f1:.4f}")
            print(f"Test ROC-AUC: {roc_auc:.4f}")

            mlflow.log_metric("test_accuracy", accuracy)
            mlflow.log_metric("test_precision", precision)
            mlflow.log_metric("test_recall", recall)
            mlflow.log_metric("test_f1_score", f1)
            mlflow.log_metric("test_roc_auc", roc_auc)

            # The signature must describe the columns the model was fitted on,
            # paired with predictions for those same rows. Previously it mixed
            # the unprocessed X_train with predictions made on the test split.
            mlflow.sklearn.log_model(
                best_clf,
                "model",
                signature=mlflow.models.infer_signature(
                    current_X_train_for_fit,
                    best_clf.predict(current_X_train_for_fit),
                ),
            )

            # Track the best model overall for later registration.
            # NOTE(review): the winner is chosen on the test ROC-AUC, so that
            # score is an optimistic estimate for the selected model.
            if roc_auc > best_roc_auc:
                best_roc_auc = roc_auc
                best_model = best_clf
                best_model_name = model_name

    print(f"\n--- Best Model Identified: {best_model_name} with ROC-AUC: {best_roc_auc:.4f} ---")

    # Register the overall best model in MLflow Model Registry
    if best_model is not None:
        with mlflow.start_run(run_name=f"Register_Best_Model_{best_model_name}"):
            registered_model_name = "CreditRisk_HighRisk_Model"

            # Tags are attached to the run: mlflow.sklearn.log_model only
            # accepts a `tags` argument in newer MLflow releases, whereas
            # mlflow.set_tags works regardless of version.
            mlflow.set_tags({
                "model_type": best_model_name,
                "task": "high_risk_prediction",
                "roc_auc": f"{best_roc_auc:.4f}",
            })

            # Use the processed training frame here: the model was fitted
            # without the dropped columns, so predicting on the raw X_train
            # would present it with features it has never seen.
            mlflow.sklearn.log_model(
                best_model,
                "best_model_artifact",  # Artifact path within the run
                registered_model_name=registered_model_name,
                signature=mlflow.models.infer_signature(
                    current_X_train_for_fit,
                    best_model.predict(current_X_train_for_fit),
                ),
            )
            print(f"Registered best model '{best_model_name}' as '{registered_model_name}' in MLflow Model Registry.")

    print("\n--- Model Training and Evaluation Completed ---")

Added 'D:\10academy\week5' to sys.path.
All necessary libraries and custom modules imported.
Error: data.csv not found at D:\10academy\week5\data\raw\data.csv. Please ensure the data file is in the correct location: D:\10academy\week5\data\raw\data.csv
Skipping Task 4: Raw data not loaded.
Skipping Task 4 K-Means: Raw data not loaded.
Skipping Task 4 High-Risk Label: Raw data not loaded.
Skipping Task 4 Integration: Raw data not loaded.
Skipping Task 5: Raw data not loaded.
Skipping Task 5 Model Training: Raw data not loaded.
