In [1]:
# Checking Performance of Classification Machine Learning Models

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def check_model_performance_classification(ModelClasses, data, dependent_var, drop_columns=[], test_size=0.2, scaler=MinMaxScaler(feature_range=(0, 1)), random_state=0):
    """
    Fit each classifier with its default settings and summarise its performance.

    The data is split into train/test sets, the features are scaled with
    ``scaler`` (fitted on the training split only), and every class in
    ``ModelClasses`` is instantiated without arguments, fitted on the training
    split and evaluated on both splits.

    Parameters:
    ModelClasses (iterable): Classifier classes (not instances).
    data (pd.DataFrame): Features plus the dependent variable.
    dependent_var (str): Column name of the dependent variable.
    drop_columns (list, optional): Extra columns to exclude from the features.
        ``None`` is treated as an empty list.
    test_size (float, optional): Test fraction passed to ``train_test_split``.
    scaler (scikit-learn scaler, optional): Scaler object; it is (re)fitted in
        place on the training features.
    random_state (int, optional): Seed for the train/test split only.

    Returns:
    pd.DataFrame: One row per model with the columns "Model", "Train Accuracy",
    "Test Accuracy", "Confusion Matrix" and "Classification Report" (a dict of
    macro-averaged metrics rounded to 3 decimals).
    """
    if drop_columns is None:
        drop_columns = []

    # Define the independent and dependent variables
    X = data.drop([dependent_var] + drop_columns, axis=1)
    y = data[dependent_var]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale the independent variables; the scaler only ever sees the training split.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    results = []
    for ModelClass in ModelClasses:
        model = ModelClass().fit(X_train_scaled, y_train)
        y_pred_train = model.predict(X_train_scaled)
        y_pred_test = model.predict(X_test_scaled)

        # zero_division=1 scores a class that was never predicted as 1 instead of warning.
        class_report = classification_report(y_test, y_pred_test, output_dict=True, zero_division=1)
        macro_avg = class_report['macro avg']

        simple_report = {
            'Accuracy': round(class_report['accuracy'], 3),
            'Precision': round(macro_avg['precision'], 3),
            'Recall': round(macro_avg['recall'], 3),
            'F1-score': round(macro_avg['f1-score'], 3),
            # NOTE(review): key is misspelled ("Support"); kept as-is so existing
            # consumers of the report dict keep working.
            'Supoort': round(macro_avg['support'], 3)
        }

        results.append(
            {
                "Model": ModelClass.__name__,
                "Train Accuracy": accuracy_score(y_train, y_pred_train),
                "Test Accuracy": accuracy_score(y_test, y_pred_test),
                "Confusion Matrix": confusion_matrix(y_test, y_pred_test),
                "Classification Report": simple_report
            }
        )
    return pd.DataFrame(results)

In [2]:
# ML_Classification_models_with_GridSearch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from joblib import dump, load
import os
# Classification models
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from xgboost import XGBRegressor, XGBClassifier
import warnings
# warnings.filterwarnings("ignore")
def ML_Classification_models_with_GridSearch(ModelClasses, hyperparameters, data, dependent_var, drop_columns=[],
                                           new_data=None, test_size=0.2, random_state=42, scaler=MinMaxScaler(feature_range=(0, 1)),
                                           scoring = 'accuracy', cv = 5, dim_reduction=None, n_components=None, save_model=False, save_dir=None):
    """
    Grid-search every classifier class and report the performance of its best model.

    Pipeline: train/test split -> optional PCA/LDA (fitted on the training split)
    -> scaling with ``scaler`` (fitted on the training split) -> ``GridSearchCV``
    per model class -> evaluation of the best estimator on both splits and,
    optionally, prediction for ``new_data``.

    Parameters:
    ModelClasses (list): A list of classifier classes (not instances).

    Example: Use the following code to create a list of model classes.
    ModelClasses = [
        LogisticRegression, DecisionTreeClassifier, RandomForestClassifier,
        GradientBoostingClassifier, AdaBoostClassifier, SVC,
        KNeighborsClassifier, MultinomialNB, BernoulliNB, XGBClassifier,
    ]

    hyperparameters (dict): Parameter grids for the grid search, keyed by the
        class name (``ModelClass.__name__``). Every class in ``ModelClasses``
        needs an entry.

    Example: use the following code to create a dictionary of hyperparameters.
    hyperparameters = {
        'LogisticRegression': {'C': [0.1, 1, 10], 'solver': ['newton-cg', 'lbfgs', 'saga']},
        'DecisionTreeClassifier': {'max_depth': [2, 4, 8], 'min_samples_leaf': [1, 2, 5]},
        'RandomForestClassifier': {'n_estimators': [50, 100, 150], 'max_depth': [2, 4, 8]},
        'GradientBoostingClassifier': {'n_estimators': [50, 100, 150],
                                       'learning_rate': [0.01, 0.1, 1], 'max_depth': [2, 4]},
        'AdaBoostClassifier': {'n_estimators': [25, 50, 100], 'learning_rate': [0.5, 1, 2]},
        'SVC': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
        'KNeighborsClassifier': {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance']},
        'MultinomialNB': {'alpha': [0.1, 0.5, 1]},
        'BernoulliNB': {'alpha': [0.1, 0.5, 1]},
        'XGBClassifier': {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 1],
                          'max_depth': [2, 4, 6], 'subsample': [0.5, 0.75, 1]},
    }

    data (pd.DataFrame): A DataFrame containing the data.
    dependent_var (str): Column name of the dependent variable.
    drop_columns (list, optional): Column names to be excluded from the analysis.
    new_data (pd.DataFrame, optional): DataFrame containing the new data. It must
        contain every feature column of ``data``; the ``dependent_var`` column is
        optional and only needed to compute residuals.
    test_size (float, optional): Test set size for splitting (default is 0.2).
    random_state (int, optional): Random seed for the train/test split (default is 42).
    scaler (scikit-learn scaler, optional): Scaler object for feature scaling
        (default is MinMaxScaler). It is (re)fitted in place.
    scoring (str, optional): Scoring method for grid search (default is 'accuracy').
    cv (int, optional): Number of folds for cross-validation (default is 5).
    dim_reduction (str, optional): Dimensionality reduction method - 'PCA' or 'LDA'
        (default is None). It is applied only when ``n_components`` is also given.
    n_components (int, optional): Number of components for PCA or LDA (default is None).
    save_model: bool, optional
        Whether to save the best model for each ModelClass (default False).
        Only the estimator is saved, not the fitted scaler or PCA/LDA transformer.
    save_dir: str, optional
        The directory to save the model files in, if save_model is True (default None,
        which means the current working directory).

    Returns:
    pd.DataFrame: One row per model class with the columns "Title", "Model",
    "Train Accuracy", "Test Accuracy", "Confusion Matrix", "Classification Report",
    "Best parameters", "New Predicted" and "New Residual" (the last two are None
    when no new data / no new labels are available).

    Raises:
    TypeError: If ModelClasses, hyperparameters, data or drop_columns has the wrong type.
    KeyError: If a model class has no entry in ``hyperparameters``, or ``new_data``
        lacks a feature column.
    ValueError: If ``dim_reduction`` is neither 'PCA' nor 'LDA'.
    """

    # Ignore the specific UserWarning raised when feature names do not match.
    warnings.filterwarnings('ignore', message="X does not have valid feature names")

    if drop_columns is None:
        drop_columns = []

    if not isinstance(ModelClasses, list):
        raise TypeError("ModelClasses must be a list of classifier classes.")
    if not isinstance(hyperparameters, dict):
        raise TypeError("hyperparameters must be a dictionary.")
    if not isinstance(data, pd.DataFrame):
        raise TypeError("data must be a pandas DataFrame.")
    if not isinstance(drop_columns, list):
        raise TypeError("drop_columns must be a list of column names.")

    # Fail before any (slow) fitting if a model class has no parameter grid.
    missing_grids = [cls.__name__ for cls in ModelClasses if cls.__name__ not in hyperparameters]
    if missing_grids:
        raise KeyError(f"hyperparameters has no entry for: {missing_grids}")

    # Define the independent and dependent variables
    X = data.drop([dependent_var] + drop_columns, axis=1)
    y = data[dependent_var]

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Apply PCA or LDA if specified. Note this happens BEFORE scaling, and only
    # when both a method and a component count are supplied.
    reducer = None
    if dim_reduction is not None and n_components is not None:
        if dim_reduction == "PCA":
            reducer = PCA(n_components=n_components)
            X_train = reducer.fit_transform(X_train)
        elif dim_reduction == "LDA":
            reducer = LinearDiscriminantAnalysis(n_components=n_components)
            X_train = reducer.fit_transform(X_train, y_train)
        else:
            raise ValueError(f"Invalid dim_reduction option: {dim_reduction}. Choose either 'PCA' or 'LDA'.")
        X_test = reducer.transform(X_test)

    # Scaling
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Prepare the new data once: the transformation is identical for every model.
    new_X_scaled = None
    new_y = None
    if new_data is not None:
        # Select by the training feature names so a different column order in
        # new_data cannot silently mis-assign features.
        new_X = new_data[X.columns]
        if dependent_var in new_data.columns:
            new_y = new_data[dependent_var].values
        if reducer is not None:
            new_X = reducer.transform(new_X)
        new_X_scaled = scaler.transform(new_X)

    if save_model and save_dir is not None:
        os.makedirs(save_dir, exist_ok=True)

    # Run the classification models
    results = []
    for ModelClass in ModelClasses:
        model_name = ModelClass.__name__

        # Perform grid search with the caller-supplied number of folds
        grid_search = GridSearchCV(ModelClass(), hyperparameters[model_name], scoring=scoring, cv=cv)
        grid_search.fit(X_train_scaled, y_train)

        # Predict using best model (refitted on the whole training split by GridSearchCV)
        best_model = grid_search.best_estimator_
        y_pred_train = best_model.predict(X_train_scaled)
        y_pred_test = best_model.predict(X_test_scaled)

        # Calculate performance metrics
        train_accuracy = accuracy_score(y_train, y_pred_train)
        test_accuracy = accuracy_score(y_test, y_pred_test)
        conf_mat = confusion_matrix(y_test, y_pred_test)
        class_report = classification_report(y_test, y_pred_test, output_dict=True, zero_division=1)

        # Predict values for new data
        new_y_pred = None
        new_y_residual = None
        if new_X_scaled is not None:
            new_y_pred = np.round(best_model.predict(new_X_scaled), 3)
            if new_y is not None:
                # Difference of encoded labels: 0 means the prediction was correct.
                new_y_residual = np.round(new_y - new_y_pred, 3)

        # Save the best model for the current ModelClass if save_model is True
        if save_model:
            model_file_name = f"{model_name}_best_classification_ML_model.joblib"
            if save_dir is not None:
                model_file_path = os.path.join(save_dir, model_file_name)
            else:
                model_file_path = model_file_name

            dump(best_model, model_file_path)
            print(f"Saved {model_file_name} to {model_file_path}")

        macro_avg = class_report['macro avg']
        simple_report = {
            'Accuracy': round(class_report['accuracy'], 3),
            'Precision': round(macro_avg['precision'], 3),
            'Recall': round(macro_avg['recall'], 3),
            'F1-score': round(macro_avg['f1-score'], 3),
            # NOTE(review): key is misspelled ("Support"); kept as-is so existing
            # consumers of the report dict keep working.
            'Supoort': round(macro_avg['support'], 3)
        }

        results.append(
            {
                "Title" :"ML Classification Models with GridSearch (Best Model)",
                "Model":  model_name,
                "Train Accuracy": round(train_accuracy,3),
                "Test Accuracy": round(test_accuracy,3),
                "Confusion Matrix": conf_mat,
                "Classification Report": simple_report,
                "Best parameters": grid_search.best_params_,
                "New Predicted": new_y_pred,
                "New Residual": new_y_residual
            }
        )
    return pd.DataFrame(results)

In [3]:
# Predict values for a new data set once the best machine-learning model has been chosen (improved version)

import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler


def prediction_of_new_data(ModelClass, data, dependent_var, new_data, drop_columns=[], scaler=MinMaxScaler()):
    """
    Train ``ModelClass`` on ``data`` and predict the dependent variable for ``new_data``.

    The model is instantiated with its default arguments and fitted on the whole
    of ``data`` (no train/test split); the scaler is fitted on the same features.

    Args:
        ModelClass (class): The model class to instantiate, fit and predict with.
        data (pandas.DataFrame): The dataset used to train the model.
        dependent_var (str): The name of the dependent variable column.
        new_data (pandas.DataFrame): New data for which to predict values. The
            dependent variable and ``drop_columns`` may be present or absent;
            the remaining columns must match the training features.
        drop_columns (list, optional): A list of column names to drop from the independent variables. Defaults to [].
        scaler (object, optional): The scaler to use for scaling the independent variables. Defaults to MinMaxScaler().

    Returns:
        pandas.Series: The predicted values, named ``'predicted_' + dependent_var``,
        with a default integer index (position i corresponds to row i of ``new_data``).

    Raises:
        ValueError: If the feature columns or their dtypes differ between
            ``data`` and ``new_data``.
    """
    if drop_columns is None:
        drop_columns = []

    # Prepare the original dataset
    X = data.drop([dependent_var] + drop_columns, axis=1)
    y = data[dependent_var]

    # Prepare new data. Genuinely new observations usually carry no label, so
    # columns that are already absent are simply skipped.
    new_X = new_data.drop([dependent_var] + drop_columns, axis=1, errors="ignore")
    if set(X.columns) != set(new_X.columns):
        raise ValueError("New data has different columns than original data")

    # Same column set but possibly another order: align to the training order so
    # the dtype comparison and the scaler both see matching columns.
    new_X = new_X[X.columns]
    if not (new_X.dtypes == X.dtypes).all():
        raise ValueError("New data has different data types than original data")

    # Fit the scaler on the original data only, then apply it to both sets.
    X_scaled = scaler.fit_transform(X)
    new_X_scaled = scaler.transform(new_X)

    # Train the model on the original dataset
    model = ModelClass().fit(X_scaled, y)

    # Predict values for the new data
    new_y_pred = model.predict(new_X_scaled)
    return pd.Series(new_y_pred, name='predicted_'+dependent_var)

In [21]:
# Candidate classifiers; the grid search looks up each one's grid by class name.
ModelClasses = [
    LogisticRegression, DecisionTreeClassifier, RandomForestClassifier,
    GradientBoostingClassifier, AdaBoostClassifier, SVC,
    KNeighborsClassifier, MultinomialNB, BernoulliNB, XGBClassifier,
]

# Grid-search candidates, keyed by classifier class name.
hyperparameters = {
    'LogisticRegression': {'C': [0.1, 1, 10], 'solver': ['newton-cg', 'lbfgs', 'saga']},
    'DecisionTreeClassifier': {'max_depth': [2, 4, 8], 'min_samples_leaf': [1, 2, 5]},
    'RandomForestClassifier': {'n_estimators': [50, 100, 150], 'max_depth': [2, 4, 8]},
    'GradientBoostingClassifier': {
        'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 1], 'max_depth': [2, 4],
    },
    'AdaBoostClassifier': {'n_estimators': [25, 50, 100], 'learning_rate': [0.5, 1, 2]},
    'SVC': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'KNeighborsClassifier': {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance']},
    'MultinomialNB': {'alpha': [0.1, 0.5, 1]},
    'BernoulliNB': {'alpha': [0.1, 0.5, 1]},
    'XGBClassifier': {
        'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 1],
        'max_depth': [2, 4, 6], 'subsample': [0.5, 0.75, 1],
    },
}

In [11]:
df = pd.read_csv('/home/young78703/Data_Science_Project/data/iris.csv')

# Encode the categorical target column as integer class codes.
# Define a dictionary to map the categorical values to integers
mapping = {'setosa': 0, 'versicolor': 1, 'virginica': 2}

# Use the map method to apply the dictionary to the column
df['species'] = df['species'].map(mapping)
# .map() turns any label missing from `mapping` into NaN; fail here rather than
# deep inside the model fitting.
assert df['species'].notna().all(), "iris.csv contains a species label missing from `mapping`"

# Seed the draw so the "New Predicted" / "New Residual" columns are reproducible.
# The rows are sampled from df itself, so they can overlap the training split.
new_data=df.sample(5, random_state=42)
results= ML_Classification_models_with_GridSearch(ModelClasses, hyperparameters, df, 'species', new_data=new_data, scoring='accuracy', test_size=0.2, random_state=42,
                                           scaler=MinMaxScaler(feature_range=(0, 1)), dim_reduction="LDA", n_components=2)
results.to_csv('ML_Classification_Model_results.csv')

In [12]:
# Show the per-model grid-search summary for the iris run.
results

Unnamed: 0,Title,Model,Train Accuracy,Test Accuracy,Confusion Matrix,Classification Report,Best parameters,New Predicted,New Residual
0,ML Classification Models with GridSearch (Best...,LogisticRegression,0.958,1.0,"[[10, 0, 0], [0, 9, 0], [0, 0, 11]]","{'Accuracy': 1.0, 'Precision': 1.0, 'Recall': ...","{'C': 10, 'solver': 'newton-cg'}","[1, 2, 0, 2, 1]","[0, -1, 0, 0, 0]"
1,ML Classification Models with GridSearch (Best...,DecisionTreeClassifier,0.983,0.967,"[[10, 0, 0], [0, 8, 1], [0, 0, 11]]","{'Accuracy': 0.967, 'Precision': 0.972, 'Recal...","{'max_depth': 2, 'min_samples_leaf': 1}","[1, 1, 0, 2, 1]","[0, 0, 0, 0, 0]"
2,ML Classification Models with GridSearch (Best...,RandomForestClassifier,0.975,0.967,"[[10, 0, 0], [0, 9, 0], [0, 1, 10]]","{'Accuracy': 0.967, 'Precision': 0.967, 'Recal...","{'max_depth': 2, 'n_estimators': 100}","[1, 2, 0, 2, 1]","[0, -1, 0, 0, 0]"
3,ML Classification Models with GridSearch (Best...,GradientBoostingClassifier,1.0,0.967,"[[10, 0, 0], [0, 8, 1], [0, 0, 11]]","{'Accuracy': 0.967, 'Precision': 0.972, 'Recal...","{'learning_rate': 1, 'max_depth': 2, 'n_estima...","[1, 1, 0, 2, 1]","[0, 0, 0, 0, 0]"
4,ML Classification Models with GridSearch (Best...,AdaBoostClassifier,0.975,0.967,"[[10, 0, 0], [0, 9, 0], [0, 1, 10]]","{'Accuracy': 0.967, 'Precision': 0.967, 'Recal...","{'learning_rate': 0.5, 'n_estimators': 25}","[1, 1, 0, 2, 1]","[0, 0, 0, 0, 0]"
5,ML Classification Models with GridSearch (Best...,SVC,0.975,1.0,"[[10, 0, 0], [0, 9, 0], [0, 0, 11]]","{'Accuracy': 1.0, 'Precision': 1.0, 'Recall': ...","{'C': 10, 'kernel': 'linear'}","[1, 2, 0, 2, 1]","[0, -1, 0, 0, 0]"
6,ML Classification Models with GridSearch (Best...,KNeighborsClassifier,0.983,1.0,"[[10, 0, 0], [0, 9, 0], [0, 0, 11]]","{'Accuracy': 1.0, 'Precision': 1.0, 'Recall': ...","{'n_neighbors': 3, 'weights': 'uniform'}","[1, 2, 0, 2, 1]","[0, -1, 0, 0, 0]"
7,ML Classification Models with GridSearch (Best...,MultinomialNB,0.708,0.7,"[[10, 0, 0], [0, 9, 0], [0, 9, 2]]","{'Accuracy': 0.7, 'Precision': 0.833, 'Recall'...",{'alpha': 0.1},"[1, 1, 0, 1, 1]","[0, 0, 0, 1, 0]"
8,ML Classification Models with GridSearch (Best...,BernoulliNB,0.358,0.333,"[[0, 10, 0], [0, 9, 0], [0, 10, 1]]","{'Accuracy': 0.333, 'Precision': 0.77, 'Recall...",{'alpha': 0.1},"[1, 1, 1, 1, 1]","[0, 0, -1, 1, 0]"
9,ML Classification Models with GridSearch (Best...,XGBClassifier,0.983,1.0,"[[10, 0, 0], [0, 9, 0], [0, 0, 11]]","{'Accuracy': 1.0, 'Precision': 1.0, 'Recall': ...","{'learning_rate': 0.01, 'max_depth': 4, 'n_est...","[1, 1, 0, 2, 1]","[0, 0, 0, 0, 0]"


In [14]:
from sklearn.preprocessing import LabelEncoder

# Load the breast-cancer data and replace the 'Class' labels with integer codes.
df = pd.read_csv('/home/young78703/Data_Science_Project/data/Breast_Cancer.csv').assign(
    Class=lambda frame: LabelEncoder().fit_transform(frame['Class'])
)

In [15]:
# Seed the draw so the "New Predicted" / "New Residual" columns are reproducible.
# The rows are sampled from df itself, so they can overlap the training split.
new_data=df.sample(5, random_state=42)
results_1= ML_Classification_models_with_GridSearch(ModelClasses, hyperparameters, df, 'Class', drop_columns = ['Sample code number'], new_data=new_data, scoring='accuracy', test_size=0.2, random_state=42,
                                           scaler=MinMaxScaler(feature_range=(0, 1)), dim_reduction='PCA', n_components=3)
# No print(results_1) here: the following cell renders the frame as a table.
results_1.to_csv('ML_Classification_Model_GridSearch_results.csv')


In [16]:
# Show the per-model grid-search summary for the breast-cancer run.
results_1

Unnamed: 0,Title,Model,Train Accuracy,Test Accuracy,Confusion Matrix,Classification Report,Best parameters,New Predicted,New Residual
0,ML Classification Models with GridSearch (Best...,LogisticRegression,0.973,0.956,"[[78, 1], [5, 53]]","{'Accuracy': 0.956, 'Precision': 0.961, 'Recal...","{'C': 10, 'solver': 'newton-cg'}","[1, 0, 0, 1, 1]","[0, 0, 0, 0, 0]"
1,ML Classification Models with GridSearch (Best...,DecisionTreeClassifier,0.974,0.985,"[[77, 2], [0, 58]]","{'Accuracy': 0.985, 'Precision': 0.983, 'Recal...","{'max_depth': 2, 'min_samples_leaf': 1}","[1, 0, 0, 1, 1]","[0, 0, 0, 0, 0]"
2,ML Classification Models with GridSearch (Best...,RandomForestClassifier,0.98,0.971,"[[77, 2], [2, 56]]","{'Accuracy': 0.971, 'Precision': 0.97, 'Recall...","{'max_depth': 4, 'n_estimators': 50}","[1, 0, 0, 1, 1]","[0, 0, 0, 0, 0]"
3,ML Classification Models with GridSearch (Best...,GradientBoostingClassifier,0.987,0.971,"[[77, 2], [2, 56]]","{'Accuracy': 0.971, 'Precision': 0.97, 'Recall...","{'learning_rate': 0.1, 'max_depth': 2, 'n_esti...","[1, 0, 0, 1, 1]","[0, 0, 0, 0, 0]"
4,ML Classification Models with GridSearch (Best...,AdaBoostClassifier,1.0,0.971,"[[77, 2], [2, 56]]","{'Accuracy': 0.971, 'Precision': 0.97, 'Recall...","{'learning_rate': 1, 'n_estimators': 50}","[1, 0, 0, 1, 1]","[0, 0, 0, 0, 0]"
5,ML Classification Models with GridSearch (Best...,SVC,0.973,0.956,"[[78, 1], [5, 53]]","{'Accuracy': 0.956, 'Precision': 0.961, 'Recal...","{'C': 1, 'kernel': 'linear'}","[1, 0, 0, 1, 1]","[0, 0, 0, 0, 0]"
6,ML Classification Models with GridSearch (Best...,KNeighborsClassifier,0.976,0.971,"[[78, 1], [3, 55]]","{'Accuracy': 0.971, 'Precision': 0.973, 'Recal...","{'n_neighbors': 7, 'weights': 'uniform'}","[1, 0, 0, 1, 1]","[0, 0, 0, 0, 0]"
7,ML Classification Models with GridSearch (Best...,MultinomialNB,0.804,0.737,"[[79, 0], [36, 22]]","{'Accuracy': 0.737, 'Precision': 0.843, 'Recal...",{'alpha': 0.1},"[1, 0, 0, 0, 1]","[0, 0, 0, 1, 0]"
8,ML Classification Models with GridSearch (Best...,BernoulliNB,0.672,0.577,"[[79, 0], [58, 0]]","{'Accuracy': 0.577, 'Precision': 0.788, 'Recal...",{'alpha': 0.1},"[0, 0, 0, 0, 0]","[1, 0, 0, 1, 1]"
9,ML Classification Models with GridSearch (Best...,XGBClassifier,1.0,0.985,"[[78, 1], [1, 57]]","{'Accuracy': 0.985, 'Precision': 0.985, 'Recal...","{'learning_rate': 1, 'max_depth': 2, 'n_estima...","[1, 0, 0, 1, 1]","[0, 0, 0, 0, 0]"
