# Import libraries

In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

%matplotlib inline

## Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Feature selection
from sklearn.feature_selection import SelectFromModel

## Model evaluators
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import RocCurveDisplay

# Green colorscale used by the Plotly figures in this notebook, stored as
# [position, hex colour] stops at positions 0.0, 0.1, ..., 1.0.
# NOTE(review): the stops at 0.5 and 0.6 share the colour '#175132' — confirm this is intentional.
colors = [
    [stop / 10, shade]
    for stop, shade in enumerate((
        '#0b2819',
        '#0d301e',
        '#103823',
        '#124028',
        '#14482d',
        '#175132',
        '#175132',
        '#2e6246',
        '#45735a',
        '#5c856f',
        '#739684',
    ))
]


# Import and clean data

In [3]:
# Load the raw data ('df' is short for 'DataFrame').
df = pd.read_csv("app/data/diabetes.csv")

# Turn the two text-valued columns into booleans: True where the value is
# 'Positive' / 'Male', False for anything else.
df = df.assign(**{
    'class': df['class'].eq('Positive'),
    'Gender': df['Gender'].eq('Male'),
})

# Replace every remaining 'Yes' / 'No' value in the frame with True / False.
yesorno = dict(Yes=True, No=False)
df = df.replace(yesorno)

# Title-case the column names (e.g. 'class' becomes 'Class').
df.columns = df.columns.str.title()
df.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,Sudden Weight Loss,Weakness,Polyphagia,Genital Thrush,Visual Blurring,Itching,Irritability,Delayed Healing,Partial Paresis,Muscle Stiffness,Alopecia,Obesity,Class
0,40,True,False,True,False,True,False,False,False,True,False,True,False,True,True,True,True
1,58,True,False,False,False,True,False,False,True,False,False,False,True,False,True,False,True
2,41,True,True,False,False,True,True,False,False,True,False,True,False,True,True,False,True
3,45,True,False,False,True,True,True,True,False,True,False,True,False,False,False,False,True
4,60,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True


In [12]:
import numpy as np
import plotly.graph_objects as go


def nf_heatmaps_tri(df):
    """Build a lower-triangle correlation heatmap for a DataFrame.

    The pairwise correlations from ``df.corr()`` are rounded to three
    decimals. The diagonal and the upper triangle are then replaced with NaN
    so that each pair of columns appears only once in the figure.

    Parameters:
        df (pd.DataFrame): Data whose columns are correlated with each other.

    Returns:
        plotly.graph_objects.Figure: A heatmap titled 'Correlations' drawn
        with the module-level ``colors`` colorscale.
    """
    corr = df.corr().round(3)

    # Boolean matrix that is True on and above the diagonal; DataFrame.mask
    # swaps those cells for NaN and leaves the lower triangle untouched.
    upper_triangle = np.triu(np.ones_like(corr, dtype=bool))
    lower_corr = corr.mask(upper_triangle)
    labels = lower_corr.columns.tolist()

    # Create the heatmap trace
    trace = go.Heatmap(z=lower_corr.to_numpy(), x=labels, y=labels, colorscale=colors)

    # Create the layout
    layout = go.Layout(
        title_font=dict(
            size=34, family="Old Standard TT, serif", color="black"),
        title_text='Correlations',
        title_x=0.5,
        xaxis=dict(showgrid=False, zeroline=False, side="bottom"),
        # Reversed so the first column is at the top, matching the matrix layout.
        yaxis=dict(showgrid=False, zeroline=False, autorange='reversed'),
        plot_bgcolor='rgba(0,0,0,0)',
    )

    # No annotation clean-up is needed here: the layout above defines no
    # annotations, so there are no 'nan' labels to blank out.
    return go.Figure(data=[trace], layout=layout)

In [13]:
# Build the correlation heatmap for the cleaned data and render it.
heatmap = nf_heatmaps_tri(df)
heatmap.show()

In [8]:
# Target variable: the values of the 'Class' column
y = df['Class'].values

# Features: every column except the target
X = df.drop(columns="Class")

# Feature Pruning using Decision Tree
> We'll start by pruning our features with Recursive Feature Elimination with Cross-Validation (RFECV), using a Decision Tree Classifier as the underlying estimator. This will help us identify the most important features for predicting the target variable.

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFECV

# Decision tree used as the estimator inside RFECV. The fixed random_state
# (the same seed value used elsewhere in this notebook) makes the feature
# selection repeatable from run to run.
clf = DecisionTreeClassifier(random_state=42)

# Recursive Feature Elimination with Cross-Validation (RFECV).
# The tree is not fitted separately beforehand: fit_transform below does all
# the fitting the selector needs.
trans = RFECV(clf)

# Fit the selector and keep only the columns it selected
X_trans = trans.fit_transform(X, y)

# Names of the selected features. They are read from X itself, so the support
# mask always lines up with the columns that were passed to RFECV, wherever
# the target column sits in df.
selected_feature_names = X.columns[trans.get_support()].values

print('Selected Features after RFECV:', selected_feature_names)


Selected Features after RFECV: ['Age' 'Gender' 'Polyuria' 'Polydipsia' 'Sudden Weight Loss'
 'Genital Thrush' 'Irritability' 'Delayed Healing' 'Muscle Stiffness'
 'Alopecia' 'Obesity']


> Now that we have the selected features after RFECV, we can proceed with the pipeline using these features.

### Displaying Feature Importance Plot using Plotly
> Next, let's create a Plotly graph to visualize the feature importance based on the Decision Tree model.

In [45]:
import plotly.graph_objects as go

# Train the Decision Tree model on the entire dataset.
# random_state is fixed so the importances are the same on every run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X, y)

# Get feature importances from the trained model
feature_importance = clf.feature_importances_

# Sort the features based on importance in descending order.
# Names come from X (the frame the tree was fitted on) so they always line up
# with the importance values.
sorted_indices = feature_importance.argsort()[::-1]
sorted_features = X.columns[sorted_indices]
sorted_importance = feature_importance[sorted_indices]

# Upper bound shared by the colour scale and the y-axis: 0.5 as a baseline,
# raised only when an importance exceeds it so that no bar is cut off.
importance_cap = max(0.5, float(sorted_importance.max()))

# Create a Plotly bar chart to visualize feature importance
fig = go.Figure(go.Bar(
    x=sorted_features,
    y=sorted_importance,
    marker=dict(
        color=sorted_importance,
        colorscale=colors,
        cmin=0,
        cmax=importance_cap
    ),
))

# Customize the plot
fig.update_layout(
    title='Feature Importance',
    xaxis_title='Features',
    yaxis_title='Importance',
    yaxis=dict(range=[0, importance_cap]),
)

# Show the plot
fig.show()


> This Plotly graph will display the feature importance scores, helping us understand which features have the most significant impact on predicting the target variable.

In [48]:
# Save the fitted decision tree to disk. The context manager closes the file
# handle even if pickling fails.
with open('decision_tree.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

# Split data into training / test sets

In [11]:
# Seed NumPy's global random generator for reproducibility
np.random.seed(42)

# Split into train & test set, holding out 20% of the rows for testing:
#   X_trans - independent variables (the output of the RFECV transform)
#   y       - dependent variable
# NOTE(review): a later cell reassigns X_train / X_test / y_train / y_test from
# a freshly loaded CSV — confirm which split is meant to be used downstream.
X_train, X_test, y_train, y_test = train_test_split(
    X_trans,
    y,
    test_size=0.2,
)

# Compare 5 potential models

In [16]:
# Step 1: Import the dataset
# Read the diabetes dataset from the specified CSV file. Column names are used exactly as stored.
# NOTE(review): another cell in this notebook reads "app/data/diabetes_clean.csv" and title-cases
# the columns — confirm which file is the intended input.
import pandas as pd
df_new = pd.read_csv("app/data/diabetes_cleaned.csv")


# Step 2: Split the data into training and testing sets
# The 'Class' column is the target (y); every other column is a feature (X).
# 20% of the rows are held out for testing.
from sklearn.model_selection import train_test_split
y = df_new['Class']
X = df_new.drop(columns='Class')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Define hyperparameter grids for each classifier
# Each grid lists the candidate values the hyperparameter search may pick from.

# Random Forest Classifier hyperparameter grid
rf_param_grid = dict(
    # Number of trees in the forest: 10 evenly spaced values from 200 to 2000
    n_estimators=list(map(int, np.linspace(start=200, stop=2000, num=10))),
    # Maximum depth of the trees: 11 evenly spaced values from 10 to 110, plus None
    max_depth=list(map(int, np.linspace(10, 110, num=11))) + [None],
    # Minimum number of samples required to split an internal node
    min_samples_split=[2, 5, 10],
    # Minimum number of samples required to be at a leaf node
    min_samples_leaf=[1, 2, 4],
    # Whether bootstrap samples are used when building trees
    bootstrap=[True, False],
)

# Logistic Regression hyperparameter grid
logreg_param_grid = dict(
    # Inverse of regularization strength
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    # Regularization penalty ('l1' for L1 regularization, 'l2' for L2 regularization)
    penalty=['l1', 'l2'],
    # Maximum number of iterations for solver convergence
    max_iter=[100, 200, 300, 400, 500],
)

# Support Vector Machine Classifier hyperparameter grid
svc_param_grid = dict(
    # Regularization parameter
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    # Kernel function for the decision boundary
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],
)

# k-Nearest Neighbors Classifier hyperparameter grid
knn_param_grid = dict(
    # Number of neighbors to consider for classification
    n_neighbors=[3, 5, 7, 9],
    # Weight function used in prediction ('uniform' or 'distance')
    weights=['uniform', 'distance'],
    # Distance metric used for distance calculation
    metric=['euclidean', 'manhattan'],
)

# Gaussian Naive Bayes hyperparameter grid
# Left empty: no GaussianNB hyperparameters are tuned in this notebook.
gnb_param_grid = dict()


In [17]:
# Step 4: Perform randomized search for each classifier
# RandomizedSearchCV tunes each classifier by sampling settings from its hyperparameter grid.

def _randomized_search(estimator, param_grid, n_iter):
    """Return an unfitted RandomizedSearchCV with the settings shared by every search in this cell."""
    return RandomizedSearchCV(
        estimator=estimator,  # The base model to tune
        param_distributions=param_grid,  # Hyperparameter grid to sample from
        n_iter=n_iter,  # Number of parameter settings that are sampled
        cv=3,  # Cross-validation folds
        verbose=2,  # Verbosity level for output
        random_state=42,  # Random seed for reproducibility
        n_jobs=-1,  # Number of jobs to run in parallel (-1 means using all available cores)
    )


# Random Forest Classifier - RandomizedSearchCV
rf_random = _randomized_search(RandomForestClassifier(), rf_param_grid, n_iter=100)
rf_random.fit(X_train, y_train)  # Fit the RandomizedSearchCV on the training data

# Logistic Regression Classifier - RandomizedSearchCV
logreg_random = _randomized_search(LogisticRegression(solver='liblinear'), logreg_param_grid, n_iter=50)
logreg_random.fit(X_train, y_train)  # Fit the RandomizedSearchCV on the training data

# Support Vector Machine Classifier - RandomizedSearchCV
svc_random = _randomized_search(SVC(), svc_param_grid, n_iter=50)
svc_random.fit(X_train, y_train)  # Fit the RandomizedSearchCV on the training data

# k-Nearest Neighbors Classifier - RandomizedSearchCV
# This search is fitted on plain NumPy arrays rather than on the pandas objects.
knn_random = _randomized_search(KNeighborsClassifier(), knn_param_grid, n_iter=50)
knn_random.fit(X_train.to_numpy(), y_train.to_numpy())  # Fit the RandomizedSearchCV on the training data

# Gaussian Naive Bayes Classifier - no hyperparameters are tuned, so no RandomizedSearchCV is needed
gnb_random = GaussianNB()  # Create the GaussianNB model with default settings

# Step 5: Evaluate each classifier's performance using cross-validated metrics
# Create a dictionary of fresh classifier models built from the best settings found above.
# Gaussian Naive Bayes is not tuned, so the default model is used.

models = {
    "Random Forest": RandomForestClassifier(**rf_random.best_params_),  # Create RandomForestClassifier with best parameters
    # best_params_ only holds the searched keys (C, penalty, max_iter), so the
    # 'liblinear' solver used during the search has to be passed again here.
    # Without it the default solver would be used, which is not the model that
    # was tuned and which rejects the 'l1' penalty.
    "Logistic Regression": LogisticRegression(solver='liblinear', **logreg_random.best_params_),
    "Support Vector Machine": SVC(**svc_random.best_params_),  # Create SVC with best parameters
    "K-Nearest Neighbors": KNeighborsClassifier(**knn_random.best_params_),  # Create KNeighborsClassifier with best parameters
    "Gaussian Naive Bayes": GaussianNB()  # Use the default GaussianNB model (no tuning needed)
}


Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=2000; total time=   1.1s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=   0.4s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=   0.4s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=   0.4s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=   0.5s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=   0.4s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   0.1s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=800; tot


The total space of parameters 24 is smaller than n_iter=50. Running 24 iterations. For exhaustive searches, use GridSearchCV.



[CV] END ...............................C=100, kernel=linear; total time=   0.1s
[CV] END ...............................C=100, kernel=linear; total time=   0.2s
Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=5, weights=uniform; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=5, weights=uniform; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=7, weights=distance; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=7, wei


The total space of parameters 16 is smaller than n_iter=50. Running 16 iterations. For exhaustive searches, use GridSearchCV.



In [18]:
# Create a function to generate cross-validated metrics for multiple models

def cross_validated_metrics(models, X, y, cv=5, metrics=('accuracy', 'precision', 'recall')):
    """
    Calculate mean cross-validated scores for several models.

    Every model is scored once per metric with ``cross_val_score`` and the
    per-fold scores are averaged. NumPy's global random seed is reset to 42
    at the start of each call.

    Parameters:
        models (dict): Maps a display name to an estimator to evaluate.
        X (array-like or DataFrame): The feature data for model training and evaluation.
        y (array-like or Series): The target variable for model training and evaluation.
        cv (int): Passed to ``cross_val_score`` as ``cv``. Defaults to 5 (5-fold).
        metrics (iterable of str): Scoring names passed to ``cross_val_score``
            as ``scoring``. Defaults to 'accuracy', 'precision' and 'recall'.

    Returns:
        dict: ``{model name: {metric name: mean score}}``. An empty ``models``
              dict gives an empty result.
    """

    # Set a random seed for reproducible results
    np.random.seed(42)

    # Materialise once so a one-shot iterable can be reused for every model
    metric_names = list(metrics)

    # Create a dictionary to store model scores
    model_scores = {}

    # Loop through the models
    for name, model in models.items():
        # Mean of the cross-validated scores, one entry per metric
        model_scores[name] = {
            metric: np.mean(cross_val_score(model, X, y, cv=cv, scoring=metric))
            for metric in metric_names
        }

    return model_scores


# Score every model in `models` with cross-validation.
# X_trans is the feature matrix produced by the RFECV transform earlier in the notebook
# (the selected feature columns only); y is the target variable.
# NOTE(review): the hyperparameter searches were fitted on X_train taken from df_new,
# not on X_trans — confirm that evaluating on X_trans is intended.
model_metrics = cross_validated_metrics(
    models=models,
    X=X_trans,
    y=y,
)

# model_metrics maps each model name to its mean accuracy, precision and recall
# over the cross-validation folds.
# Example output: {'Random Forest': {'accuracy': 0.85, 'precision': 0.87, 'recall': 0.81}, ...}


In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import pickle

# Step 1: Import the dataset and title-case its column names
# NOTE(review): another cell in this notebook reads "app/data/diabetes_cleaned.csv" —
# confirm which file is the intended input.
df_new = pd.read_csv("app/data/diabetes_clean.csv")
df_new.columns = df_new.columns.str.title()
# Step 2: Split the data into training and testing sets ('Class' is the target, 20% held out)
X = df_new.drop(columns=['Class'])
y = df_new['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Define hyperparameter grids for each classifier
rf_param_grid = {
    'n_estimators': [int(x) for x in np.linspace(start=200, stop=2000, num=10)],
    'max_depth': [int(x) for x in np.linspace(10, 110, num=11)] + [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

logreg_param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'max_iter': [100, 200, 300, 400, 500]
}

svc_param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

gnb_param_grid = {}  # No GaussianNB hyperparameters are tuned

# Step 4: Perform randomized search for each classifier
# Every search uses 3-fold cross-validation, a fixed random_state and all available cores.
rf_random = RandomizedSearchCV(estimator=RandomForestClassifier(),
                               param_distributions=rf_param_grid,
                               n_iter=100, cv=3, verbose=2,
                               random_state=42, n_jobs=-1)
rf_random.fit(X_train, y_train)

logreg_random = RandomizedSearchCV(estimator=LogisticRegression(solver='liblinear'),
                                   param_distributions=logreg_param_grid,
                                   n_iter=50, cv=3, verbose=2,
                                   random_state=42, n_jobs=-1)
logreg_random.fit(X_train, y_train)

svc_random = RandomizedSearchCV(estimator=SVC(),
                                param_distributions=svc_param_grid,
                                n_iter=50, cv=3, verbose=2,
                                random_state=42, n_jobs=-1)
svc_random.fit(X_train, y_train)

knn_random = RandomizedSearchCV(estimator=KNeighborsClassifier(),
                                param_distributions=knn_param_grid,
                                n_iter=50, cv=3, verbose=2,
                                random_state=42, n_jobs=-1)
# This search is fitted on plain NumPy arrays rather than on the pandas objects.
knn_random.fit(X_train.to_numpy(), y_train.to_numpy())

gnb_random = GaussianNB()  # Not tuned, so no search is run for GaussianNB

# Step 5: Evaluate each classifier's performance using cross-validated metrics
# Fresh estimators are built from the best settings found by each search.
models = {
    "Random Forest": RandomForestClassifier(**rf_random.best_params_),
    # best_params_ only holds the searched keys (C, penalty, max_iter), so the
    # 'liblinear' solver used during the search has to be passed again here.
    # Without it the default solver would be used, which is not the model that
    # was tuned and which rejects the 'l1' penalty.
    "Logistic Regression": LogisticRegression(solver='liblinear', **logreg_random.best_params_),
    "Support Vector Machine": SVC(**svc_random.best_params_),
    "K-Nearest Neighbors": KNeighborsClassifier(**knn_random.best_params_),
    "Gaussian Naive Bayes": GaussianNB()
}



# Create function to generate cross validated metrics for multiple models
def cross_validated_metrics(models, X,  y):
    """Return the mean 5-fold cross-validated accuracy, precision and recall per model.

    NOTE(review): a function with this same name is also defined in an earlier
    cell of this notebook; once this cell has run, this definition is the one
    in effect.

    Parameters:
        models (dict): Maps a display name to an estimator to evaluate.
        X: Feature data handed to ``cross_val_score``.
        y: Target values handed to ``cross_val_score``.

    Returns:
        dict: ``{model name: {'accuracy': ..., 'precision': ..., 'recall': ...}}``.
    """
    # Reset NumPy's global random seed for reproducible results
    np.random.seed(42)

    scoring_names = ['accuracy', 'precision', 'recall']

    # One cross_val_score run per (model, scoring) pair; each run's fold
    # scores are averaged into a single number.
    return {
        label: {
            scoring_name: np.mean(
                cross_val_score(estimator, X, y, cv=5, scoring=scoring_name)
            )
            for scoring_name in scoring_names
        }
        for label, estimator in models.items()
    }

# Score the models on X_trans (the output of the RFECV transform) and display the result
model_metrics = cross_validated_metrics(models=models, X=X_trans, y=y)
model_metrics



Fitting 3 folds for each of 100 candidates, totalling 300 fits


[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=   0.4s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=   0.3s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=   0.4s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=   0.4s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=2000; total time=   0.8s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=   0.3s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   0.1s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=   0.4s
[CV] END bootstrap=False, max_depth=80, min_sam


The total space of parameters 24 is smaller than n_iter=50. Running 24 iterations. For exhaustive searches, use GridSearchCV.


The total space of parameters 16 is smaller than n_iter=50. Running 16 iterations. For exhaustive searches, use GridSearchCV.



[CV] END ...............................C=100, kernel=linear; total time=   0.1s
[CV] END ...............................C=100, kernel=linear; total time=   0.2s
Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=5, weights=uniform; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=7, weights=uniform; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=7, weights=distance; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=7, wei

{'Random Forest': {'accuracy': 0.9692307692307693,
  'precision': 0.9703010033444815,
  'recall': 0.984375},
 'Logistic Regression': {'accuracy': 0.9096153846153847,
  'precision': 0.9209093079872538,
  'recall': 0.9375},
 'Support Vector Machine': {'accuracy': 0.8846153846153847,
  'precision': 0.9316896325134312,
  'recall': 0.88125},
 'K-Nearest Neighbors': {'accuracy': 0.9346153846153846,
  'precision': 0.971151601278993,
  'recall': 0.921875},
 'Gaussian Naive Bayes': {'accuracy': 0.8692307692307694,
  'precision': 0.8978936985611068,
  'recall': 0.89375}}

In [20]:
import plotly.graph_objects as go

def plot_model_comparison(compare_metrics):
    """
    Draw a grouped bar chart with one group per row and one bar series per column.

    Parameters:
        compare_metrics (pd.DataFrame): Model metrics as columns and models as rows.
            Values are multiplied by 100 for the hover text, which shows them
            as percentages.

    Returns:
        plotly.graph_objects.Figure: A Plotly figure representing the model comparison.
    """

    def _metric_bar(column):
        # Hover labels show the score as a percentage with two decimals.
        percentages = (compare_metrics[column] * 100).round(2).tolist()
        return go.Bar(
            x=compare_metrics.index,
            y=compare_metrics[column],
            name=column,
            hovertext=[f"{column}: {pct:.2f}%" for pct in percentages],
        )

    # Start from an empty figure and add one bar trace per column
    fig = go.Figure()
    for column in compare_metrics.columns:
        fig.add_trace(_metric_bar(column))

    # Titles, grouped (side-by-side) bars, legend position and a transparent paper background
    layout_options = dict(
        title='Comparison of Model Metrics',
        xaxis_title='Models',
        yaxis_title='Metrics',
        barmode='group',
        legend=dict(x=1, y=1),
        paper_bgcolor='rgba(0,0,0,0)',
    )
    fig.update_layout(**layout_options)

    return fig


# model_metrics is keyed by model name, so pd.DataFrame(model_metrics) puts the
# models in the columns and the metrics in the rows. plot_model_comparison
# expects the opposite (models as rows, metrics as columns), hence the
# transpose. Without it the traces, hover text and 'Models' axis are mislabelled.
compare_metrics = pd.DataFrame(model_metrics).T
fig = plot_model_comparison(compare_metrics)
fig.show()


In [39]:
# Display the RandomizedSearchCV object created for the Random Forest search
rf_random

# Export the trained model to pickle

In [122]:
# Write the model to disk. The context manager closes the file handle even if
# pickling fails.
# NOTE(review): `rf` is not assigned in any visible cell of this notebook —
# confirm which fitted model is meant to be exported (for example
# rf_random.best_estimator_) before relying on this cell after a fresh restart.
with open('app/data/rf_model.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)