In [None]:
import pandas as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score

# Missing Values Detection

In [None]:
import pandas as pd

# Read the bankruptcy workbook into the DataFrame used by this section.
bankruptcy_data = pd.read_excel('/content/bankruptcy data.xlsx')

# Number of null cells in each column.
missing_values_count = bankruptcy_data.isnull().sum()
print(missing_values_count)

# Same tally expressed as a percentage of the row count; the per-column
# counts computed above are reused instead of scanning the frame again.
missing_values_percentage = (missing_values_count / len(bankruptcy_data)) * 100
print(missing_values_percentage)


Bankrupt?                                                   0
 ROA(C) before interest and depreciation before interest    0
 ROA(A) before interest and % after tax                     0
 ROA(B) before interest and depreciation after tax          0
 Operating Gross Margin                                     0
                                                           ..
 Liability to Equity                                        0
 Degree of Financial Leverage (DFL)                         0
 Interest Coverage Ratio (Interest expense to EBIT)         0
 Net Income Flag                                            0
 Equity to Liability                                        0
Length: 96, dtype: int64
Bankrupt?                                                   0.0
 ROA(C) before interest and depreciation before interest    0.0
 ROA(A) before interest and % after tax                     0.0
 ROA(B) before interest and depreciation after tax          0.0
 Operating Gross Margin              

The missing-value check shows that no column contains any missing values. The dataset is therefore complete, and the analysis can proceed to the next step.

# Duplicate Detection

In [None]:
import pandas as pd

# Read the workbook fresh for the duplicate check.
file_path = '/content/bankruptcy data.xlsx'
bankruptcy_data = pd.read_excel(file_path)

# Count rows that are exact repeats of an earlier row and report the total.
duplicates_count = bankruptcy_data.duplicated().sum()
print(f"Number of duplicate rows: {duplicates_count}")

# Nothing to do when the count is zero; otherwise keep the first occurrence
# of each repeated row.
if duplicates_count <= 0:
    print("No duplicates were found in the dataset.")
else:
    bankruptcy_data = bankruptcy_data.drop_duplicates()
    print("Duplicates have been removed.")

# Continue with your analysis or preprocessing...


Number of duplicate rows: 0
No duplicates were found in the dataset.


The check for duplicate rows in the dataset indicates that there are no duplicate entries. Every row in your dataset is unique. Since there are no duplicates, no further action is required in this aspect of data cleaning.

# Outlier Detection

In [None]:
import pandas as pd
import numpy as np
from collections import Counter

# Read the workbook again; this rebinds `bankruptcy_data`, so any change an
# earlier cell made to the frame is not carried into the outlier check.
file_path = '/content/bankruptcy data.xlsx'
bankruptcy_data = pd.read_excel(file_path)

# Define a function to detect outliers using the IQR method
def detect_outliers(df, n, features, whis=1.5):
    """Return the index labels of rows that are IQR outliers in more than ``n`` columns.

    For every column in ``features`` a value counts as an outlier when it lies
    below ``Q1 - whis * IQR`` or above ``Q3 + whis * IQR``.  Quartiles are
    computed with NaNs ignored: with a plain percentile a single missing value
    turns both fences into NaN, every comparison becomes False, and the column
    silently reports no outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to screen.
    n : int
        A row is returned only when it is flagged in strictly more than ``n``
        columns.
    features : iterable
        Column labels of ``df`` to screen.
    whis : float, default 1.5
        Multiple of the IQR used as the fence distance.

    Returns
    -------
    list
        Index labels of the qualifying rows, in order of first detection.
    """
    # Maps index label -> number of columns in which that row was flagged.
    outlier_counts = Counter()

    for col in features:
        values = df[col]

        # NaN-aware 25th / 75th percentiles; identical to np.percentile when
        # the column has no missing values.
        q1, q3 = np.nanpercentile(values, [25, 75])
        outlier_step = whis * (q3 - q1)

        # NaN cells compare False on both sides, so they are never flagged.
        is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)
        outlier_counts.update(df.index[is_outlier.to_numpy()])

    # select observations containing more than n outliers
    return [label for label, hits in outlier_counts.items() if hits > n]

# Columns to screen for outliers: every numeric predictor.  The target column
# 'Bankrupt?' is a label rather than a feature, so it is left out of the scan.
# (It is a heavily imbalanced 0/1 flag; its IQR would be 0 and every bankrupt
# row would be counted as an "outlier" in that column.)
numerical_features = (
    bankruptcy_data.select_dtypes(include=[np.number])
    .columns.drop('Bankrupt?', errors='ignore')
    .tolist()
)

# Rows flagged in more than 2 of the screened columns.
outliers = detect_outliers(bankruptcy_data, 2, numerical_features)

print(f'Number of rows with more than 2 outliers: {len(outliers)}')

# Optionally, remove outliers
# bankruptcy_data = bankruptcy_data.drop(outliers, axis=0).reset_index(drop=True)


Number of rows with more than 2 outliers: 2667


# Feature Selection

In [None]:
pip install statsmodels




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif

# Load the dataset
file_path = '/content/bankruptcy data.xlsx'  # Replace with your file path
bankruptcy_data = pd.read_excel(file_path)

# Predictors are every column except the target.
X = bankruptcy_data.drop('Bankrupt?', axis=1)
Y = bankruptcy_data['Bankrupt?']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Balance the classes in the training portion only.
smote = SMOTE(random_state=42)
X_train_smote, Y_train_smote = smote.fit_resample(X_train, Y_train)

# A column with a single distinct value has no variance within either class,
# so its F statistic is 0/0: sklearn emits a RuntimeWarning at `f = msb / msw`
# and the score comes back NaN.  Such columns carry no information, so they
# are reported and left out of the test.
constant_features = X_train_smote.columns[X_train_smote.nunique() <= 1].tolist()
if constant_features:
    print(f"Constant features excluded from the ANOVA F-test: {constant_features}")
X_anova = X_train_smote.drop(columns=constant_features)

# Score every remaining feature with the ANOVA F-test (k='all' keeps them all).
selector_anova = SelectKBest(f_classif, k='all')
X_new_anova = selector_anova.fit_transform(X_anova, Y_train_smote)

# Get the ANOVA F-values for each feature
f_values = selector_anova.scores_

# Feature / F-value table, strongest feature first so the top-k can be read
# straight off the head of the table.
anova_table = (
    pd.DataFrame({'Feature': X_anova.columns, 'ANOVA F-value': f_values})
    .sort_values('ANOVA F-value', ascending=False)
    .reset_index(drop=True)
)

# Displaying the table
anova_table


  f = msb / msw


Unnamed: 0,Feature,ANOVA F-value
0,ROA(C) before interest and depreciation befor...,2124.615044
1,ROA(A) before interest and % after tax,2002.234667
2,ROA(B) before interest and depreciation after...,2021.126471
3,Operating Gross Margin,531.620464
4,Realized Sales Gross Margin,528.658848
...,...,...
90,Liability to Equity,77.504833
91,Degree of Financial Leverage (DFL),3.094475
92,Interest Coverage Ratio (Interest expense to ...,0.849351
93,Net Income Flag,


# Data Splitting (W/o SMOTE)

In [None]:
from sklearn.model_selection import train_test_split

# Start from a fresh copy of the workbook.
file_path = '/content/bankruptcy data.xlsx'
bankruptcy_data = pd.read_excel(file_path)

# The 30 predictor columns kept for modelling.  Names are written exactly as
# they must match the workbook's column labels, leading space included.
# NOTE(review): 'Â¥' looks like a mis-decoded '¥' — confirm against
# bankruptcy_data.columns before changing it.
significant_features = [
    ' Net worth/Assets',
    ' Debt ratio %',
    ' Persistent EPS in the Last Four Seasons',
    ' Net profit before tax/Paid-in capital',
    ' Per Share Net profit before tax (Yuan Â¥)',
    ' ROA(C) before interest and depreciation before interest',
    ' Net Value Per Share (B)',
    ' Net Value Per Share (A)',
    ' Net Income to Total Assets',
    ' Net Value Per Share (C)',
    ' ROA(B) before interest and depreciation after tax',
    ' ROA(A) before interest and % after tax',
    ' Working Capital to Total Assets',
    ' Retained Earnings to Total Assets',
    ' Current Liability to Assets',
    ' Operating Profit Per Share (Yuan Â¥)',
    ' Total income/Total expense',
    ' Operating profit/Paid-in capital',
    ' Tax rate (A)',
    ' CFO to Assets',
    ' Current Liability to Current Assets',
    ' Gross Profit to Sales',
    ' Operating Gross Margin',
    ' Realized Sales Gross Margin',
    ' Cash/Total Assets',
    ' Total expense/Assets',
    ' Cash Flow Per Share',
    ' Operating profit per person',
    ' Cash Flow to Total Assets',
    ' Equity to Liability'
]

# Stop early, naming the offenders, when a requested column is absent.
missing_features = [name for name in significant_features if name not in bankruptcy_data.columns]
if len(missing_features) > 0:
    raise ValueError(f"Features not found in the dataset: {missing_features}")

# Target first, then the reduced feature matrix (the two are independent).
Y = bankruptcy_data['Bankrupt?']
X_significant = bankruptcy_data[significant_features]

# 80/20 split on the selected columns only, seeded for repeatability.
X_train_sig, X_test_sig, Y_train, Y_test = train_test_split(
    X_significant,
    Y,
    random_state=42,
    test_size=0.2,
)

# Shapes of the four pieces, as a quick sanity check.
print(f"Training features shape: {X_train_sig.shape}")
print(f"Testing features shape: {X_test_sig.shape}")
print(f"Training labels shape: {Y_train.shape}")
print(f"Testing labels shape: {Y_test.shape}")


Training features shape: (2704, 30)
Testing features shape: (676, 30)
Training labels shape: (2704,)
Testing labels shape: (676,)


The output provided indicates the shape of the datasets after splitting the original dataset into training and testing sets:

- Training features shape: (2704, 30): This means that there are 2,704 rows and 30 features in the training dataset. The model will use this data to learn the patterns associated with bankruptcy.

- Testing features shape: (676, 30): There are 676 rows and the same 30 features in the testing dataset. This data will be used to evaluate the model's performance to ensure it generalizes well to unseen data.

- Training labels shape: (2704,): This is the shape of the target variable for the training set, which in your case is whether a company is bankrupt or not. There are 2,704 labels corresponding to each row in the training features.

- Testing labels shape: (676,): Similarly, this is the shape of the target variable for the testing set, with 676 labels for each row in the testing features.

This split allows for both training the model on known data and validating its predictive capabilities on data it hasn't seen during the training process. The ratio of the split (80% training and 20% testing) is a common practice in machine learning. The split does not by itself prevent overfitting or underfitting; rather, scoring the model on the held-out 20% exposes them, giving a more honest estimate of how the model will perform in practical scenarios.

# SMOTE (Data splitting + Imbalance Handling)

In [None]:
pip install pandas scikit-learn imbalanced-learn




In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pandas as pd

# Start from a fresh copy of the workbook.
file_path = '/content/bankruptcy data.xlsx'  # Replace with your file path
bankruptcy_data = pd.read_excel(file_path)

# The 30 predictor columns kept for modelling, spelled exactly as they must
# match the workbook's column labels (leading space included).
# NOTE(review): 'Â¥' looks like a mis-decoded '¥' — confirm against
# bankruptcy_data.columns before changing it.
significant_features = [
    ' Net worth/Assets',
    ' Debt ratio %',
    ' Persistent EPS in the Last Four Seasons',
    ' Net profit before tax/Paid-in capital',
    ' Per Share Net profit before tax (Yuan Â¥)',
    ' ROA(C) before interest and depreciation before interest',
    ' Net Value Per Share (B)',
    ' Net Value Per Share (A)',
    ' Net Income to Total Assets',
    ' Net Value Per Share (C)',
    ' ROA(B) before interest and depreciation after tax',
    ' ROA(A) before interest and % after tax',
    ' Working Capital to Total Assets',
    ' Retained Earnings to Total Assets',
    ' Current Liability to Assets',
    ' Operating Profit Per Share (Yuan Â¥)',
    ' Total income/Total expense',
    ' Operating profit/Paid-in capital',
    ' Tax rate (A)',
    ' CFO to Assets',
    ' Current Liability to Current Assets',
    ' Gross Profit to Sales',
    ' Operating Gross Margin',
    ' Realized Sales Gross Margin',
    ' Cash/Total Assets',
    ' Total expense/Assets',
    ' Cash Flow Per Share',
    ' Operating profit per person',
    ' Cash Flow to Total Assets',
    ' Equity to Liability'
]

# Target first, then the reduced feature matrix (the two are independent).
Y = bankruptcy_data['Bankrupt?']
X = bankruptcy_data[significant_features]

# 80/20 split on the selected columns only, seeded for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)

# Oversample the training portion only; the test portion (X_test, Y_test)
# is left as drawn so it can be used for evaluation.
smote = SMOTE(random_state=42)
X_train_smote, Y_train_smote = smote.fit_resample(X_train, Y_train)

# Shapes after oversampling, as a quick sanity check.
print(f"Oversampled training features shape: {X_train_smote.shape}")
print(f"Oversampled training labels shape: {Y_train_smote.shape}")


Oversampled training features shape: (5220, 30)
Oversampled training labels shape: (5220,)


# Model Testing

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load your dataset
file_path = '/content/bankruptcy data.xlsx'  # Replace with the path to your dataset
bankruptcy_data = pd.read_excel(file_path)

# Define your features and target variable based on the feature selection outcome
features = [' Net worth/Assets',
    ' Debt ratio %',
    ' Persistent EPS in the Last Four Seasons',
    ' Net profit before tax/Paid-in capital',
    ' Per Share Net profit before tax (Yuan Â¥)',
    ' ROA(C) before interest and depreciation before interest',
    ' Net Value Per Share (B)',
    ' Net Value Per Share (A)',
    ' Net Income to Total Assets',
    ' Net Value Per Share (C)',
    ' ROA(B) before interest and depreciation after tax',
    ' ROA(A) before interest and % after tax',
    ' Working Capital to Total Assets',
    ' Retained Earnings to Total Assets',
    ' Current Liability to Assets',
    ' Operating Profit Per Share (Yuan Â¥)',
    ' Total income/Total expense',
    ' Operating profit/Paid-in capital',
    ' Tax rate (A)',
    ' CFO to Assets',
    ' Current Liability to Current Assets',
    ' Gross Profit to Sales',
    ' Operating Gross Margin',
    ' Realized Sales Gross Margin',
    ' Cash/Total Assets',
    ' Total expense/Assets',
    ' Cash Flow Per Share',
    ' Operating profit per person',
    ' Cash Flow to Total Assets',
    ' Equity to Liability']
X = bankruptcy_data[features]
y = bankruptcy_data['Bankrupt?']  # Replace 'Bankrupt?' with the actual target column name

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample the training portion only; the test portion stays untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Candidate models.  Every estimator whose fit involves randomness is seeded,
# so rerunning the cell reproduces the same table (the split and SMOTE above
# are already seeded with 42).
models = {
    'Logistic Regression': LogisticRegression(),
    'Ridge Classifier': RidgeClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Neural Network': MLPClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42)
}

# One summary row (a plain dict) per model; the table is built once at the end
# instead of concatenating one-row DataFrames.
report_list = []

# Train on the oversampled data, evaluate on the untouched test split.
for name, model in models.items():
    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict(X_test)

    # Full sklearn report, kept under its own name so it is not overwritten
    # by the summary row built below.
    full_report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    weighted_avg = full_report['weighted avg']
    macro_avg = full_report['macro avg']

    # 'Precision' / 'Recall' / 'F1-Score' / 'Support' are the weighted averages.
    report_dict = {
        'Model': name,
        'Precision': weighted_avg['precision'],
        'Recall': weighted_avg['recall'],
        'F1-Score': weighted_avg['f1-score'],
        'Support': weighted_avg['support'],
        'Accuracy': accuracy,
        'Macro Avg Precision': macro_avg['precision'],
        'Macro Avg Recall': macro_avg['recall'],
        'Macro Avg F1-Score': macro_avg['f1-score'],
        'Macro Avg Support': macro_avg['support']
    }
    report_list.append(report_dict)

# Assemble the comparison table; columns follow the key order of the rows.
report_df = pd.DataFrame(report_list)

# Rich display keeps all columns on one line instead of print's wrapped text.
report_df




[LightGBM] [Info] Number of positive: 2610, number of negative: 2610
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001972 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 5220, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
                     Model  Precision    Recall  F1-Score  Support  Accuracy  \
0      Logistic Regression   0.975732  0.847633  0.899030      676  0.847633   
1         Ridge Classifier   0.969555  0.823964  0.884084      676  0.823964   
2            Decision Tree   0.966657  0.937870  0.950608      676  0.937870   
3            Random Forest   0.968252  0.952663  0.959612      676  0.952663   
4        Gradient Boosting   0.966957  0.914201  0.937140      676  0.914201   
5              Extra Trees   0.965570  0.960059  0.962682      676  0.960059   
6   

In [None]:
import pandas as pd

# Macro-averaged scores per model, one row each:
# (model, precision, recall, F1, support).
# NOTE(review): the figures are literals, presumably copied from a run of the
# model-comparison cell — confirm they are still current before publishing.
_report_columns = [
    'Model',
    'Macro Avg Precision',
    'Macro Avg Recall',
    'Macro Avg F1-Score',
    'Macro Avg Support',
]
_report_rows = [
    ('Logistic Regression', 0.559087, 0.860985, 0.564692, 676),
    ('Ridge Classifier', 0.539463, 0.757386, 0.528881, 676),
    ('Decision Tree', 0.580425, 0.693750, 0.608796, 676),
    ('Random Forest', 0.609701, 0.701326, 0.639923, 676),
    ('Gradient Boosting', 0.562493, 0.712121, 0.585416, 676),
    ('Extra Trees', 0.610651, 0.644129, 0.624869, 676),
    ('Support Vector Machine', 0.548044, 0.819129, 0.541437, 676),
    ('K-Nearest Neighbors', 0.564555, 0.791477, 0.585125, 676),
    ('Neural Network', 0.541457, 0.739015, 0.539397, 676),
    ('XGBoost', 0.661274, 0.767614, 0.699753, 676),
    ('LightGBM', 0.639735, 0.764583, 0.680378, 676),
]

# Pivot the rows into the column -> list-of-values mapping pandas expects.
report_data = {
    column: list(values)
    for column, values in zip(_report_columns, zip(*_report_rows))
}

# Tabulate and write next to the notebook as a comma-separated file.
df_report = pd.DataFrame(report_data)
csv_file_path = 'classification_report_model.csv'
df_report.to_csv(path_or_buf=csv_file_path, sep=',', index=False)

# Show the path of the file that was written.
csv_file_path


'classification_report_model.csv'

In [None]:
from google.colab import files

# Trigger a browser download of 'classification_report_model.csv' through
# Colab's files helper; the name is resolved relative to the working directory.
files.download('classification_report_model.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Baseline (untuned) metrics for the models selected for tuning.  Every entry
# carries the same five keys.
# Corrections relative to the earlier literals:
#   * Ridge Classifier accuracy is 0.823964 (the untuned comparison table and
#     the 0.82 in its classification report agree); 0.963018 was inconsistent
#     with the macro scores recorded in the same entry.
#   * LightGBM had its macro precision (0.639735) stored under the F1 key and
#     no precision/recall at all; all three macro scores are now recorded.
# NOTE(review): the remaining accuracies come from estimators that were fitted
# without a random_state, so they may not match a rerun — confirm.
model_metrics = {
    'Ridge Classifier': {
        'accuracy': 0.823964,
        'macro_avg_precision': 0.539463,
        'macro_avg_recall': 0.757386,
        'macro_avg_f1_score': 0.528881,
        'macro_avg_support': 676
    },
    'Random Forest': {
        'accuracy': 0.957101,
        'macro_avg_precision': 0.609701,
        'macro_avg_recall': 0.701326,
        'macro_avg_f1_score': 0.639923,
        'macro_avg_support': 676
    },
    'Extra Trees': {
        'accuracy': 0.955621,
        'macro_avg_precision': 0.610651,
        'macro_avg_recall': 0.644129,
        'macro_avg_f1_score': 0.624869,
        'macro_avg_support': 676
    },
    'LightGBM': {
        'accuracy': 0.955621,
        'macro_avg_precision': 0.639735,
        'macro_avg_recall': 0.764583,
        'macro_avg_f1_score': 0.680378,
        'macro_avg_support': 676
    },
    'XGBoost': {
        'accuracy': 0.930473,
        'macro_avg_precision': 0.661274,
        'macro_avg_recall': 0.767614,
        'macro_avg_f1_score': 0.699753,
        'macro_avg_support': 676
    }
}

# Single-number score blending macro precision, recall and F1
def custom_scorer(y_true, y_pred):
    """Return the unweighted mean of macro-averaged precision, recall and F1.

    Each metric is computed with ``average='macro'``; the three values are
    added in that order and divided by 3.
    """
    macro_scores = [
        metric(y_true, y_pred, average='macro')
        for metric in (precision_score, recall_score, f1_score)
    ]
    return sum(macro_scores) / 3  # equal weights; change here to re-balance

# Estimators to tune.  Each is seeded so that rerunning the search gives the
# same winner (Ridge's seed matters for the 'saga' solver in its grid).
models = {
    'Ridge Classifier': RidgeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

# Hyperparameter grids, defined once up front rather than on every loop pass.
param_grid_ridge = {
    'alpha': [0.1, 1, 10],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'saga']
}

param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

param_grid_et = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

param_grid_lgbm = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [-1, 10, 20],
    'num_leaves': [31, 41],
    'boosting_type': ['gbdt']
}

param_grid_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3]
}

# Model name -> grid.  A name without a grid raises KeyError immediately
# instead of reaching GridSearchCV with param_grid=None.
param_grids = {
    'Ridge Classifier': param_grid_ridge,
    'Random Forest': param_grid_rf,
    'Extra Trees': param_grid_et,
    'LightGBM': param_grid_lgbm,
    'XGBoost': param_grid_xgb
}

# The scorer does not depend on the model, so it is built once.
custom_scorer_value = make_scorer(custom_scorer, greater_is_better=True)

# NOTE(review): X_train_smote already contains SMOTE-generated rows, so each
# cross-validation fold is validated partly on synthetic samples and the CV
# scores are likely optimistic.  Consider fitting on X_train / y_train with
# SMOTE as a step of an imblearn Pipeline so resampling happens inside each fold.

# Tune hyperparameters and evaluate models
tuned_results = {}
for name, model in models.items():
    print(f"Tuning hyperparameters for {name}")
    param_grid = param_grids[name]

    # Exhaustive 5-fold search over the model's grid, using all cores.
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring=custom_scorer_value, n_jobs=-1, verbose=2)
    grid_search.fit(X_train_smote, y_train_smote)

    # Keep the refitted best estimator and report how it does on the test split.
    best_model = grid_search.best_estimator_
    tuned_results[name] = best_model
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    y_pred = best_model.predict(X_test)
    print(f"Classification Report for {name} (Tuned):")
    print(classification_report(y_test, y_pred))


Tuning hyperparameters for Ridge Classifier
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters for Ridge Classifier: {'alpha': 1, 'solver': 'auto'}
Classification Report for Ridge Classifier (Tuned):
              precision    recall  f1-score   support

           0       0.99      0.83      0.90       660
           1       0.09      0.69      0.16        16

    accuracy                           0.82       676
   macro avg       0.54      0.76      0.53       676
weighted avg       0.97      0.82      0.88       676

Tuning hyperparameters for Random Forest
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Classification Report for Random Forest (Tuned):
              precision    recall  f1-score   support

           0       0.99      0.96      0.97       660
           1       0.23      0.44      0.30        16

    accuracy                    

In [None]:
import pandas as pd

# Headline metrics of the tuned models.  Values are stored as numbers (not
# strings) so the frame can be sorted, compared and plotted directly.
classification_report_data = {
    'Model': ['Ridge Classifier', 'Random Forest', 'Extra Trees', 'LightGBM', 'XGBoost'],
    'Accuracy': [0.82, 0.95, 0.96, 0.96, 0.96],
    'Macro Avg Precision': [0.54, 0.61, 0.61, 0.65, 0.64],
    'Macro Avg Recall': [0.76, 0.70, 0.64, 0.74, 0.76],
    'Macro Avg F1-Score': [0.53, 0.64, 0.62, 0.69, 0.68],
    'Macro Avg Support': [676, 676, 676, 676, 676]
}

# Create a DataFrame
df_classification_report = pd.DataFrame(classification_report_data)

# Specify the file path for the CSV
csv_file_path = '/content/classification_report.csv'  # Change this path to your desired location

# Write the CSV.  float_format pins two decimals so the file text is the same
# as before (e.g. 0.70 rather than 0.7); the integer support column is unaffected.
df_classification_report.to_csv(csv_file_path, index=False, float_format='%.2f')

# Provide the file path for download
csv_file_path


'/content/classification_report.csv'

In [None]:
from google.colab import files

# Download the tuned-model report.  The absolute path is the one the CSV was
# written to, so the download no longer depends on the working directory
# happening to be /content.
files.download('/content/classification_report.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>