In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc
from ydata_profiling import ProfileReport
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Set plotly to work in notebook
# import plotly.io as pio
# pio.renderers.default = 'notebook'

import plotly.io as pio
pio.renderers.default = "browser"

In [3]:
# 1) Load the iris dataset into a DataFrame.
# `target` holds the integer class codes from sklearn; `species` is the
# human-readable label derived from those codes.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target
df['species'] = df['target'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})

print("Iris dataset shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nTarget distribution:")
print(df['species'].value_counts())

Iris dataset shape: (150, 6)

First 5 rows:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target species  
0       0  setosa  
1       0  setosa  
2       0  setosa  
3       0  setosa  
4       0  setosa  

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3

In [25]:
# 1.1) Build a ydata-profiling report for `df` and write it to an HTML file.
# The whole step is best-effort: any failure is reported and the notebook carries on.
try:
    from ydata_profiling import ProfileReport

    profile = ProfileReport(
        df,
        title="Iris Dataset Profiling Report",
        explorative=True,
        # Enable all three correlation measures explicitly.
        correlations={
            method: {"calculate": True}
            for method in ("pearson", "spearman", "kendall")
        },
    )
    profile.to_file("iris_dataset_profile.html")
    print("Pandas profiling report saved as 'iris_dataset_profile.html'")
    print("Note: Profile report saved to file. Open iris_dataset_profile.html to view the report.")
    # Drop the report object so the notebook does not try to render it inline
    # (inline display triggered nbformat issues).
    del profile
except Exception as exc:
    print(f"Warning: Could not generate profile report due to: {exc}")
    print("Continuing with analysis...")

100%|██████████| 6/6 [00:00<00:00, 120410.64it/s]0:00, 235.26it/s, Describe variable: species] 
Summarize dataset: 100%|██████████| 34/34 [00:00<00:00, 67.81it/s, Completed]                                    
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  2.10it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00, 31.45it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 89.86it/s]

Pandas profiling report saved as 'iris_dataset_profile.html'
Note: Profile report saved to file. Open iris_dataset_profile.html to view the report.





In [38]:
# # 2) Fit a GBDT model on the raw data
# # Prepare data
# X = df.drop(['target', 'species'], axis=1)
# y = df['target']

# # Split the data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# # Fit GBDT model (using XGBoost for better performance)
# gbdt_model = xgb.XGBClassifier(
#     n_estimators=100,
#     max_depth=3,
#     learning_rate=0.1,
#     random_state=42,
#     eval_metric='mlogloss'
# )

# gbdt_model.fit(X_train, y_train)

# # Make predictions
# y_pred = gbdt_model.predict(X_test)

# print("Model trained successfully!")
# print(f"Training accuracy: {gbdt_model.score(X_train, y_train):.4f}")
# print(f"Test accuracy: {gbdt_model.score(X_test, y_test):.4f}")

def train_model_iris(df: pd.DataFrame):
    """Train an XGBoost classifier on the iris frame and print hold-out metrics.

    The frame is split 70/30 (stratified, ``random_state=42``), the model is
    fitted on the training part, and train/test accuracy plus one-vs-rest
    PR AUC per class and their macro average are printed.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``'target'`` (integer class codes 0..2) and
        ``'species'``; every other column is used as a feature.

    Returns
    -------
    xgb.XGBClassifier
        The fitted model. The split and the predictions are not returned.
    """
    # Everything except the two label columns is a feature.
    X = df.drop(['target', 'species'], axis=1)
    y = df['target']

    # Stratified split so each class keeps its share in train and test.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    gbdt_model = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        random_state=42,
        eval_metric='mlogloss'
    )
    gbdt_model.fit(X_train, y_train)

    # Only class probabilities are needed below; accuracy comes from .score().
    y_pred_proba = gbdt_model.predict_proba(X_test)

    print("Model trained successfully!")
    print(f"Training accuracy: {gbdt_model.score(X_train, y_train):.4f}")
    print(f"Test accuracy: {gbdt_model.score(X_test, y_test):.4f}")

    # One-vs-rest PR AUC: class i is the positive label, column i its score.
    pr_auc_scores = []
    for i, class_name in enumerate(['setosa', 'versicolor', 'virginica']):
        precision, recall, _ = precision_recall_curve(y_test == i, y_pred_proba[:, i])
        pr_auc = auc(recall, precision)
        pr_auc_scores.append(pr_auc)
        print(f"PR AUC for {class_name}: {pr_auc:.4f}")

    # Unweighted mean over the classes.
    macro_pr_auc = sum(pr_auc_scores) / len(pr_auc_scores)
    print(f"Macro-averaged PR AUC: {macro_pr_auc:.4f}")
    return gbdt_model

# Train the baseline model on every feature column of `df`; the
# feature-importance cell further down reads `gbdt_model`.
gbdt_model = train_model_iris(df)

Model trained successfully!
Training accuracy: 1.0000
Test accuracy: 0.9333
PR AUC for setosa: 1.0000
PR AUC for versicolor: 0.9810
PR AUC for virginica: 0.9770
Macro-averaged PR AUC: 0.9860


In [None]:
# Suppress ProfileReport display to avoid nbformat issues
# The report has been saved to iris_dataset_profile.html

In [26]:
# 3) Plot the classification report as heatmap using plotly
# train_model_iris() keeps its split and predictions local, so `y_test` and
# `y_pred` do not exist at notebook scope on a fresh kernel. Rebuild the
# identical hold-out split (same test_size / random_state / stratify as the
# training function) and score it with the fitted model.
X = df.drop(['target', 'species'], axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
y_pred = gbdt_model.predict(X_test)

# Get classification report as dictionary
report_dict = classification_report(y_test, y_pred, target_names=iris.target_names, output_dict=True)

# Convert to DataFrame for easier plotting: one row per class / summary entry
report_df = pd.DataFrame(report_dict).transpose()

# Create heatmap data
metrics = ['precision', 'recall', 'f1-score', 'support']
# Take the row labels from the report itself so they always line up with `z`.
classes = report_df.index.tolist()

# Create the heatmap
fig = go.Figure(data=go.Heatmap(
    z=report_df[metrics].values,
    x=metrics,
    y=classes,
    colorscale='RdYlBu_r',
    text=np.round(report_df[metrics].values, 3),
    texttemplate='%{text}',
    textfont={"size": 12},
    hoverongaps=False
))

fig.update_layout(
    title='Classification Report Heatmap',
    xaxis_title='Metrics',
    yaxis_title='Classes',
    width=800,
    height=500,
    font=dict(size=14)
)

fig.show()

In [19]:
# 4) Get the feature importance
feature_importance = gbdt_model.feature_importances_
# The feature matrix `X` only exists inside train_model_iris(), so derive the
# feature names the same way that function builds its inputs (same columns,
# same order) instead of relying on a leaked notebook-level `X`.
feature_names = df.drop(['target', 'species'], axis=1).columns

# Create DataFrame for feature importance, most important feature first
feature_imp_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print("Feature Importance:")
print(feature_imp_df)

Feature Importance:
             feature  importance
3   petal width (cm)    0.492275
2  petal length (cm)    0.469418
0  sepal length (cm)    0.025505
1   sepal width (cm)    0.012802


In [20]:
# 5) Add running totals of feature importance.
# `feature_imp_df` is already ordered from most to least important, so the
# running sum shows how much importance the top-k features cover together.
feature_imp_df['cumulative_importance'] = feature_imp_df['importance'].cumsum()
# The same running total expressed as a percentage of the overall importance.
feature_imp_df['cumulative_percentage'] = (
    feature_imp_df['cumulative_importance']
    .div(feature_imp_df['importance'].sum())
    .mul(100)
)

print("Feature Importance with Cumulative Gain:")
print(feature_imp_df)

Feature Importance with Cumulative Gain:
             feature  importance  cumulative_importance  cumulative_percentage
3   petal width (cm)    0.492275               0.492275              49.227474
2  petal length (cm)    0.469418               0.961693              96.169312
0  sepal length (cm)    0.025505               0.987198              98.719826
1   sepal width (cm)    0.012802               1.000000             100.000000


In [21]:
# 6) & 7) Plot the cumulative feature importance (features ordered from most
# to least important) and mark the first feature at which the cumulative share
# reaches the threshold.

# Threshold in percent; the line, the marker and all labels are derived from it.
threshold_80 = 80.0
threshold_label = f"{threshold_80:g}%"
# Positional index of the first row whose cumulative share reaches the threshold.
idx_80 = np.where(feature_imp_df['cumulative_percentage'] >= threshold_80)[0][0]

# Create the plot
fig = go.Figure()

# Add cumulative importance line
fig.add_trace(go.Scatter(
    x=list(range(len(feature_imp_df))),
    y=feature_imp_df['cumulative_percentage'],
    mode='lines+markers',
    name='Cumulative Importance',
    line=dict(color='blue', width=3),
    marker=dict(size=8, color='blue'),
    hovertemplate='Feature: %{customdata}<br>Cumulative: %{y:.2f}%<extra></extra>',
    customdata=feature_imp_df['feature']
))

# Add horizontal threshold line spanning the whole x range
fig.add_trace(go.Scatter(
    x=[0, len(feature_imp_df)-1],
    y=[threshold_80, threshold_80],
    mode='lines',
    name=threshold_label + ' Threshold',
    line=dict(color='red', width=2, dash='dash'),
    hovertemplate=threshold_label + ' Threshold<extra></extra>'
))

# Add marker at the first feature that reaches the threshold.
# Plotly hover templates have no "%%" escape: a bare "%" is already literal,
# so a doubled one would be shown as "%%" in the tooltip.
fig.add_trace(go.Scatter(
    x=[idx_80],
    y=[feature_imp_df.iloc[idx_80]['cumulative_percentage']],
    mode='markers',
    name=threshold_label + ' Mark',
    marker=dict(size=15, color='red', symbol='diamond'),
    hovertemplate=threshold_label + ' Mark<br>Feature: %{customdata}<br>Cumulative: %{y:.2f}%<extra></extra>',
    customdata=[feature_imp_df.iloc[idx_80]['feature']]
))

# Update layout; x ticks show feature names instead of positions
fig.update_layout(
    title='Cumulative Feature Importance Gain',
    xaxis_title='Number of Features',
    yaxis_title='Cumulative Importance (%)',
    width=900,
    height=600,
    font=dict(size=14),
    showlegend=True,
    xaxis=dict(tickmode='array', tickvals=list(range(len(feature_imp_df))), ticktext=feature_imp_df['feature'])
)

# Rotate x-axis labels for better readability
fig.update_xaxes(tickangle=45)

fig.show()

In [22]:
# 8) Get all features after the above mark
# Rows are ordered by descending importance, so everything past position
# `idx_80` lies beyond the point where the threshold was reached.
less_important_features = feature_imp_df.iloc[idx_80 + 1:]

# Report the number of rows actually selected; `idx_80 + 1` is the number of
# features up to and including the mark, not the number after it.
print(f"Features after 80% cumulative importance mark ({len(less_important_features)} features):")
print(less_important_features[['feature', 'importance', 'cumulative_percentage']])

Features after 80% cumulative importance mark (2 features):
             feature  importance  cumulative_percentage
0  sepal length (cm)    0.025505              98.719826
1   sepal width (cm)    0.012802             100.000000


In [24]:
# 9) Show descriptive statistics for the less important features as a plotly
# table on a white canvas.
total_rows = len(df)
# Importance score per feature name, for direct lookup inside the helper.
importance_by_feature = less_important_features.set_index('feature')['importance']


def _feature_stats(feature):
    """Build one table row (all values pre-formatted) for a single feature column of `df`."""
    feature_data = df[feature]
    non_null_count = feature_data.count()
    null_count = total_rows - non_null_count
    # Guard against an empty frame so the percentage never divides by zero.
    null_percentage = (null_count / total_rows) * 100 if total_rows > 0 else 0
    q1 = feature_data.quantile(0.25)
    q3 = feature_data.quantile(0.75)
    return {
        'Feature Name': feature,
        'Feature Importance': f"{importance_by_feature[feature]:.4f}",
        'Non-null Values': non_null_count,
        'Null Values %': f"{null_percentage:.2f}%",
        'Mean': f"{feature_data.mean():.4f}",
        'Standard Deviation': f"{feature_data.std():.4f}",
        'Range': f"{feature_data.max() - feature_data.min():.4f}",
        'Median': f"{feature_data.median():.4f}",
        'IQR': f"{q3 - q1:.4f}",
        'Q1': f"{q1:.4f}",
        'Q3': f"{q3:.4f}"
    }


# One row per less important feature
stats_df = pd.DataFrame([_feature_stats(feature) for feature in less_important_features['feature']])

# Header captions, in the same order as the keys produced by _feature_stats()
column_titles = [
    'Feature Name', 'Feature Importance', 'Non-null Values', 'Null Values %',
    'Mean', 'Standard Deviation', 'Range', 'Median', 'IQR', 'Q1', 'Q3'
]

table = go.Table(
    columnwidth=[200, 150, 120, 120, 100, 150, 100, 100, 100, 100, 100],
    header=dict(
        values=column_titles,
        fill_color='lightblue',
        align='center',
        font=dict(size=12, color='black'),
        height=40
    ),
    cells=dict(
        values=[stats_df[col] for col in stats_df.columns],
        fill_color='white',
        # Feature name left-aligned, the ten numeric columns centred
        align=['left'] + ['center'] * 10,
        font=dict(size=11, color='black'),
        height=35,
        line_color='lightgray'
    )
)
fig = go.Figure(data=[table])

fig.update_layout(
    title=dict(
        text='Less Important Features Statistics<br><sup>Features contributing after 80% cumulative importance threshold</sup>',
        x=0.5,
        y=0.95,
        xanchor='center',
        yanchor='top',
        font=dict(size=16, color='black')
    ),
    width=1200,
    height=400,
    margin=dict(l=20, r=20, t=80, b=20),
    paper_bgcolor='white',
    plot_bgcolor='white'
)

fig.show()

In [34]:
from typing import List


def test_less_features(features_to_drop: List[str]):
    """Retrain the iris model without the given feature columns and print its metrics.

    This is the same experiment as ``train_model_iris`` (same split, same
    hyper-parameters, same printed metrics), run on the notebook-level ``df``
    with ``features_to_drop`` removed, so it delegates to that function
    instead of duplicating its body.

    Parameters
    ----------
    features_to_drop : List[str]
        Column names of ``df`` to exclude from the feature set. Any iterable
        of names is accepted; an empty one reproduces the baseline model.

    Returns
    -------
    xgb.XGBClassifier
        The model fitted on the reduced feature set.

    Raises
    ------
    KeyError
        If a name in ``features_to_drop`` is not a column of ``df``.
    """
    # train_model_iris() removes 'target' and 'species' itself, so only the
    # extra feature columns need to be dropped here.
    reduced_df = df.drop(columns=list(features_to_drop))
    return train_model_iris(reduced_df)

In [35]:
# Inspect the exact column labels of `df`; the feature names passed to
# test_less_features() below must match these strings.
df.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'target', 'species'],
      dtype='object')

In [36]:

# Retrain without the features that fall after the 80% cumulative-importance
# mark. Use the selection computed in step 8 rather than re-typing the names,
# so this cell cannot drift from the analysis above.
new_model = test_less_features(features_to_drop=less_important_features['feature'].tolist())

Model trained successfully!
Training accuracy: 0.9810
Test accuracy: 0.9556
PR AUC for setosa: 1.0000
PR AUC for versicolor: 0.9841
PR AUC for virginica: 0.9827
Macro-averaged PR AUC: 0.9889


In [40]:
def _evaluate_feature_set(name, features_to_drop):
    """Train one XGBoost model on `df` minus `features_to_drop` and return its metrics as a dict row."""
    X = df.drop(['target', 'species'] + features_to_drop, axis=1)
    y = df['target']

    # Same stratified 70/30 split for every configuration so results are comparable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    model = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        random_state=42,
        eval_metric='mlogloss'
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    report = classification_report(
        y_test, y_pred,
        target_names=['setosa', 'versicolor', 'virginica'],
        output_dict=True
    )

    # One-vs-rest PR AUC per class, then the unweighted mean.
    pr_auc_scores = []
    for i in range(3):
        precision, recall, _thresholds = precision_recall_curve(y_test == i, y_pred_proba[:, i])
        pr_auc_scores.append(auc(recall, precision))
    macro_pr_auc = sum(pr_auc_scores) / len(pr_auc_scores)

    return {
        'Model': name,
        'Features Used': list(X.columns),
        'PR AUC': macro_pr_auc,
        'Precision': report['macro avg']['precision'],
        'Recall': report['macro avg']['recall'],
        'F1 Score': report['macro avg']['f1-score'],
        'Accuracy': report['accuracy']
    }


def _change_color(pct_change):
    """Map a % change to an rgba bar colour: green for gains, red for losses, grey for no change."""
    # Colour intensity saturates at a 20% change in either direction.
    intensity = min(abs(pct_change) / 20, 1)
    channel = int(128 + intensity * 127)
    if pct_change > 0:
        return f'rgba(0, {channel}, 0, 0.8)'
    if pct_change < 0:
        return f'rgba({channel}, 0, 0, 0.8)'
    # Exactly zero (always true for the baseline itself) or NaN: neutral grey,
    # so an unchanged metric is not drawn as a regression.
    return 'rgba(128, 128, 128, 0.8)'


def compare_models_with_metrics():
    """Compare XGBoost models trained on several feature subsets of the notebook-level `df`.

    For each configuration a model is trained and evaluated on the same
    stratified hold-out split; macro PR AUC, macro precision/recall/F1 and
    accuracy are collected, and each metric's percentage change relative to
    the 'All Features' baseline is added. The changes are shown as a 2x2 grid
    of bar charts and the full results as a plotly table.

    Returns
    -------
    pd.DataFrame
        One row per configuration with the columns 'Model', 'Features Used',
        the five metrics and a '<metric> % Change' column for each of them.
    """
    # Configuration name -> feature columns to remove before training.
    # NOTE: 'Drop Sepal Features' and 'Only Petal Features' remove the same two
    # columns and therefore train identical models; both are kept so the
    # existing output keeps its rows.
    feature_sets = {
        'All Features': [],
        'Drop Sepal Features': ['sepal length (cm)', 'sepal width (cm)'],
        'Drop Petal Features': ['petal length (cm)', 'petal width (cm)'],
        'Only Sepal Length': ['sepal width (cm)', 'petal length (cm)', 'petal width (cm)'],
        'Only Petal Features': ['sepal length (cm)', 'sepal width (cm)']
    }

    results_df = pd.DataFrame([
        _evaluate_feature_set(name, features_to_drop)
        for name, features_to_drop in feature_sets.items()
    ])

    # Percentage change of every metric relative to the baseline row.
    all_metrics = ['PR AUC', 'Precision', 'Recall', 'F1 Score', 'Accuracy']
    baseline = results_df[results_df['Model'] == 'All Features'].iloc[0]
    for metric in all_metrics:
        results_df[f'{metric} % Change'] = ((results_df[metric] - baseline[metric]) / baseline[metric] * 100).round(2)

    # 2x2 grid of colour-coded bar charts, one panel per plotted metric.
    metrics_to_plot = ['PR AUC', 'Precision', 'Recall', 'F1 Score']
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=[f'{metric} % Change' for metric in metrics_to_plot],
        specs=[[{"type": "bar"}, {"type": "bar"}],
               [{"type": "bar"}, {"type": "bar"}]]
    )

    for i, metric in enumerate(metrics_to_plot):
        values = results_df[f'{metric} % Change']
        # Panels are filled row by row; plotly rows/cols are 1-based.
        row = i // 2 + 1
        col = i % 2 + 1

        fig.add_trace(go.Bar(
            x=results_df['Model'],
            y=values,
            name=metric,
            marker_color=[_change_color(val) for val in values],
            text=[f'{val:+.2f}%' for val in values],
            textposition='auto',
            showlegend=False
        ), row=row, col=col)
        fig.update_xaxes(title_text='Model Configuration', tickangle=45, row=row, col=col)
        fig.update_yaxes(title_text='% Change', row=row, col=col)

    fig.update_layout(
        title='Model Performance Comparison: % Change from Baseline (All Features)',
        height=800,
        showlegend=False
    )

    fig.show()

    # Also display the results table
    print("\nDetailed Results:")
    display_cols = ['Model', 'Features Used'] + all_metrics + [f'{metric} % Change' for metric in all_metrics]

    def _format_cell(value):
        """Render a table cell: join feature lists, 4-decimal floats, str() for the rest."""
        if isinstance(value, list):
            return ', '.join(value)
        if isinstance(value, float):
            return f'{value:.4f}'
        return str(value)

    table_fig = go.Figure(data=[go.Table(
        columnwidth=[150, 200, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80],
        header=dict(
            values=display_cols,
            fill_color='lightblue',
            align='center',
            font=dict(size=10, color='black'),
            height=40
        ),
        cells=dict(
            values=[results_df[col].apply(_format_cell) for col in display_cols],
            fill_color='white',
            # The two text columns left-aligned, the ten numeric columns centred.
            align=['left', 'left'] + ['center'] * 10,
            font=dict(size=9, color='black'),
            height=30
        )
    )])

    table_fig.update_layout(
        title='Detailed Model Comparison Results',
        width=1400,
        height=400,
        margin=dict(l=10, r=10, t=50, b=10)
    )

    table_fig.show()

    return results_df

# Run the comparison: trains one model per feature-set configuration, shows
# the bar-chart grid and the results table, and keeps the per-configuration
# metrics DataFrame in `comparison_results`.
comparison_results = compare_models_with_metrics()


Detailed Results:
