<a href="https://colab.research.google.com/github/zhou1126/Marquette_teaching_finance/blob/main/Taiwan_bankruptcy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
# Download the Kaggle dataset through kagglehub and keep the value returned by
# dataset_download (a local path, going by the variable name) for later use.
import kagglehub
fedesoriano_company_bankruptcy_prediction_path = kagglehub.dataset_download('fedesoriano/company-bankruptcy-prediction')

# Printed once the download call has returned.
print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Never truncate DataFrame output: show every row and every column.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Input data files are expected under the /kaggle/input directory.
# Walk it recursively and print the full path of every file found.

import os
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

import matplotlib.pyplot as plt

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os

# Locate data.csv. Prefer the directory returned by kagglehub.dataset_download
# in the import cell; if that cell was deleted (its header says it may be),
# fall back to the standard Kaggle input mount used originally.
data_dir = globals().get(
    'fedesoriano_company_bankruptcy_prediction_path',
    '/kaggle/input/company-bankruptcy-prediction',
)
df = pd.read_csv(os.path.join(data_dir, 'data.csv'))
# Remove leading and trailing spaces from column names
df.columns = df.columns.str.strip()

# One ratio per dimension of financial health:
#   Profitability: 'Operating Gross Margin'
#   Liquidity:     'Current Ratio'
#   Leverage:      'Total debt/Total net worth'
#   Efficiency:    'Total Asset Turnover'
#   Cash flow:     'Cash Flow to Total Assets'
#   Growth:        'Operating Profit Growth Rate'
feature_columns = [
    'Operating Gross Margin',
    'Current Ratio',
    'Total debt/Total net worth',
    'Total Asset Turnover',
    'Cash Flow to Total Assets',
    'Operating Profit Growth Rate',
]

# Keep only the target and the six selected ratios, then drop incomplete rows.
# NOTE: every other column of the raw file is discarded from df here.
df = df[['Bankrupt?'] + feature_columns]
df = df.dropna()
print(f'df has {df.shape[0]} rows')


# Per-column 10th / 90th percentile cut-offs (one value per feature).
features = df[feature_columns]
percentile_10 = features.quantile(0.1)
percentile_90 = features.quantile(0.9)

# Keep a row only if ALL six features lie within their own
# [10th percentile, 90th percentile] range.
within_range = (
    (features >= percentile_10).all(axis=1)
    & (features <= percentile_90).all(axis=1)
)
filtered_df = df[within_range]

print(f'filtered_df has {filtered_df.shape[0]} rows')


In [None]:
# Preview the first rows of the percentile-trimmed frame.
filtered_df.head()

In [None]:
# Show the per-column 10th and 90th percentile cut-offs used for the trim.
percentile_10, percentile_90

In [None]:
# Per-column median of the trimmed data, split by the "Bankrupt?" label.
# NOTE: despite the variable name, the statistic computed here is the median,
# not the mean.
grouped_averages = filtered_df.groupby("Bankrupt?").median()

# Display the per-group medians
grouped_averages.head()

In [None]:
# Print a structural summary of the trimmed frame.
filtered_df.info()

In [None]:
# List the column names of the trimmed frame.
filtered_df.columns

# Cluster Analysis

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Ratios used as clustering inputs.
columns_for_clustering = [
    'Operating Gross Margin',
    'Current Ratio',
    'Total debt/Total net worth',
    'Total Asset Turnover',
    'Cash Flow to Total Assets',
    'Operating Profit Growth Rate',
]
# Take an explicit copy: later cells add columns to X, and assigning into a
# slice of filtered_df would trigger pandas' SettingWithCopyWarning.
X = filtered_df[columns_for_clustering].copy()


In [None]:
# Preview the first rows of the clustering input matrix.
X.head()

In [None]:
# Elbow Method / Silhouette sweep
# Always fit on the named feature columns, so that re-running this cell after
# later cells have added 'Cluster', 'Bankrupt?' or PCA columns to X does not
# feed those columns into the clustering.
cluster_features = X[columns_for_clustering]

inertia = []
silhouette_scores = []
range_n_clusters = range(2, 11)  # Test for 2 to 10 clusters

for k in range_n_clusters:
    kmeans = KMeans(n_clusters=k, random_state=20).fit(cluster_features)
    inertia.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(cluster_features, kmeans.labels_))

# Plot Inertia and Silhouette Score side by side
fig, (elbow_ax, silhouette_ax) = plt.subplots(1, 2, figsize=(12, 5))

elbow_ax.plot(range_n_clusters, inertia, marker='o')
elbow_ax.set_title('Elbow Method')
elbow_ax.set_xlabel('Number of Clusters')
elbow_ax.set_ylabel('Inertia')

silhouette_ax.plot(range_n_clusters, silhouette_scores, marker='o', color='green')
silhouette_ax.set_title('Silhouette Score')
silhouette_ax.set_xlabel('Number of Clusters')
silhouette_ax.set_ylabel('Silhouette Score')

fig.tight_layout()
plt.show()

# Elbow Method Plot (Inertia vs. Number of Clusters):
# Look for the "elbow," which is the point where the decrease in inertia slows down significantly. This suggests the optimal number of clusters.

# Silhouette Score Plot (Silhouette Score vs. Number of Clusters):
# The peak value of the Silhouette Score indicates the best number of clusters, as it represents the best separation and compactness of the clusters.

# It looks like 6 is a good number of clusters.
# NOTE(review): the next cell sets optimal_clusters = 3, which contradicts this
# reading of the plots - confirm which value is intended.

In [None]:
# NOTE(review): the closing comment of the elbow cell suggests 6 clusters,
# while 3 is used here - confirm which value is intended.
optimal_clusters = 3

# Work on an explicit copy so the column assignments below do not write into a
# slice of filtered_df (avoids pandas' SettingWithCopyWarning).
X = X.copy()

# K-Means Clustering
# Fit on the named feature columns only, so re-running this cell (after
# 'Cluster', 'Bankrupt?' or PCA columns were added to X) does not cluster on
# its own previous output.
kmeans = KMeans(n_clusters=optimal_clusters, random_state=20).fit(X[columns_for_clustering])
X['Cluster'] = kmeans.labels_

# Append "Bankrupt?" column to analyze (pandas aligns it on the row index)
X['Bankrupt?'] = df['Bankrupt?']

# Analyze each cluster: mean of every feature and of the bankruptcy flag,
# plus the number of rows assigned to the cluster.
summary_columns = columns_for_clustering + ['Bankrupt?']
cluster_analysis = X.groupby('Cluster')[summary_columns].mean()
cluster_analysis['Record Count'] = X['Cluster'].value_counts().sort_index()

print("Cluster Analysis (Means and Record Counts):\n", cluster_analysis)

In [None]:
# Render the per-cluster summary table as rich notebook output.
cluster_analysis

In [None]:
# Same display as the previous cell (duplicate cell).
cluster_analysis

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Features that are projected by the PCA
pca_columns = [
    'Operating Gross Margin',
    'Current Ratio',
    'Total debt/Total net worth',
    'Total Asset Turnover',
    'Cash Flow to Total Assets',
    'Operating Profit Growth Rate',
]

# Project the features onto two principal components for a 2-D view
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(X[pca_columns])

# Store the two component scores next to the cluster labels
X['PCA1'], X['PCA2'] = pca_result.T

# One scatter series per cluster, in order of first appearance in X
plt.figure(figsize=(10, 7))
for cluster_label, cluster_data in X.groupby('Cluster', sort=False):
    plt.scatter(
        cluster_data['PCA1'],
        cluster_data['PCA2'],
        label=f'Cluster {cluster_label}',
        s=50,
        alpha=0.7,
    )

# Titles, axis labels, legend and grid
ax = plt.gca()
ax.set_title('PCA Visualization of Clusters', fontsize=16)
ax.set_xlabel('PCA Component 1', fontsize=12)
ax.set_ylabel('PCA Component 2', fontsize=12)
ax.legend(title='Clusters')
ax.grid(alpha=0.5)
plt.tight_layout()
plt.show()


In [None]:
# Display X; when the cells above were run in order it also carries the
# 'Cluster', 'Bankrupt?', 'PCA1' and 'PCA2' columns added to it.
X

## Run EDA

In [None]:
# Ratio to plot. Column names were stripped of surrounding spaces when the
# data was loaded, so the name must not carry a leading space. 'Quick Ratio'
# is not among the columns the loading cell keeps in df, so fall back to
# 'Current Ratio' (the liquidity ratio that cell retains) instead of
# failing with a KeyError.
col = 'Quick Ratio' if 'Quick Ratio' in df.columns else 'Current Ratio'

# Only rows whose ratio is below this cut-off are plotted.
# NOTE(review): presumably chosen so extreme values do not flatten the boxes - confirm.
upper_limit = 0.05

# Create the axes up front and hand them to pandas: boxplot(by=...) otherwise
# opens its own figure, leaving an empty extra figure and ignoring figsize.
fig, ax = plt.subplots(figsize=(10, 6))

# Create the box plot
boxplot = df[df[col] < upper_limit].boxplot(
    column=col,
    by="Bankrupt?",
    ax=ax,
    grid=False,
    showfliers=True,
    patch_artist=True,
    widths=0.6,
)

# Customize the plot
ax.set_title(f"Distribution of {col} by Bankruptcy Status")
fig.suptitle("")  # Remove default 'Boxplot grouped by...' title
ax.set_xlabel("Bankruptcy Status (0 = No, 1 = Yes)")
ax.set_ylabel(col)

# No legend here: this plot has no labelled artists (the mean markers are not
# drawn in this cell), so a legend call would only emit a warning.

# Show grid for better readability
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Show the plot
plt.show()

In [None]:
# List all columns except "Bankrupt?" for comparison
columns_to_compare = [column for column in df.columns if column != "Bankrupt?"]

# Loop-invariant: the sorted label values and the x positions of their boxes
# (pandas draws the boxes at 1..n in sorted group order).
categories = sorted(df["Bankrupt?"].unique())
positions = list(range(1, len(categories) + 1))

for col in columns_to_compare:
    # Create the axes up front and hand them to pandas: boxplot(by=...)
    # otherwise opens its own figure, leaving an empty extra figure per column
    # and ignoring the requested figsize.
    fig, ax = plt.subplots(figsize=(10, 6))

    # Create the box plot
    boxplot = df.boxplot(
        column=col,
        by="Bankrupt?",
        ax=ax,
        grid=False,
        showfliers=True,
        patch_artist=True,
        widths=0.6,
    )

    # Add the per-group mean (NaNs skipped) at the corresponding box position
    group_means = df.groupby("Bankrupt?")[col].mean().reindex(categories)
    ax.scatter(positions, group_means.to_numpy(), color='red', zorder=3, label="Mean")

    # Customize the plot
    ax.set_title(f"Distribution of {col} by Bankruptcy Status")
    fig.suptitle("")  # Remove default 'Boxplot grouped by...' title
    ax.set_xlabel("Bankruptcy Status (0 = No, 1 = Yes)")
    ax.set_ylabel(col)

    # Add a legend for the mean
    ax.legend(loc="best")

    # Show grid for better readability
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Show the plot
    plt.show()

### Try a Logistic Regression Model

In [None]:
# List the columns currently held in df (target plus candidate features).
df.columns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score,
    roc_curve,
    accuracy_score
)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

# Use every column of df except the target as a feature.
selected_features = df.columns.drop("Bankrupt?")
target = "Bankrupt?"

# Features and target
X = df[selected_features]
y = df[target]

# Split data into training and testing sets (75% training, 25% testing),
# stratified so both parts keep the same share of bankrupt companies.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Standardize the features. The scaler is fitted on the training part only, so
# no information from the test set leaks into the model. Features on very
# different scales make the default solver slow to converge and make the
# coefficient sizes incomparable across features.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Initialize logistic regression model (raised iteration cap so the solver can
# converge instead of stopping at the default limit)
log_reg = LogisticRegression(max_iter=1000)

# Train the model
log_reg.fit(X_train, y_train)

# Generate predictions and probabilities (column 1 = probability of class 1)
y_pred = log_reg.predict(X_test)
y_pred_prob = log_reg.predict_proba(X_test)[:, 1]

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=["Not Bankrupt", "Bankrupt"], yticklabels=["Not Bankrupt", "Bankrupt"])
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Classification Report for precision, recall, F1-score.
# zero_division=0 reports 0 (without a warning) when a class is never predicted.
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# AUC (Area Under the Curve)
auc_score = roc_auc_score(y_test, y_pred_prob)
print("AUC:", auc_score)

# Plot ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {auc_score:.2f})", color="blue")
plt.plot([0, 1], [0, 1], "k--", label="Random Classifier")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="best")
plt.grid()
plt.show()


In [None]:
# Mean of the target column; with a 0/1 label this is the share of bankrupt
# companies (the plots above label it 0 = No, 1 = Yes).
print(df[target].mean())

### There are too few bankrupt companies. The classes are imbalanced.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score,
    roc_curve,
    accuracy_score
)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

# Use every column of df except the target as a feature.
selected_features = df.columns.drop("Bankrupt?")
target = "Bankrupt?"

# Features and target
X = df[selected_features]
y = df[target]

# Split data into training and testing sets (75% training, 25% testing),
# stratified so both parts keep the same share of bankrupt companies.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Standardize the features. The scaler is fitted on the training part only, so
# no information from the test set leaks into the model. Features on very
# different scales make the default solver slow to converge and make the
# coefficient sizes incomparable across features.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Initialize logistic regression model.
# class_weight='balanced' re-weights the classes to counter the small share of
# bankrupt companies; max_iter is raised so the solver can converge.
log_reg = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)


# Train the model
log_reg.fit(X_train, y_train)

# Generate predictions and probabilities (column 1 = probability of class 1)
y_pred = log_reg.predict(X_test)
y_pred_prob = log_reg.predict_proba(X_test)[:, 1]

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=["Not Bankrupt", "Bankrupt"], yticklabels=["Not Bankrupt", "Bankrupt"])
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Classification Report for precision, recall, F1-score.
# zero_division=0 reports 0 (without a warning) when a class is never predicted.
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# AUC (Area Under the Curve)
auc_score = roc_auc_score(y_test, y_pred_prob)
print("AUC:", auc_score)

# Plot ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {auc_score:.2f})", color="blue")
plt.plot([0, 1], [0, 1], "k--", label="Random Classifier")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="best")
plt.grid()
plt.show()


## Variable Selection

In [None]:
# Signed logistic-regression coefficient of every feature, indexed by feature name
coefficients = pd.DataFrame({"Coefficient": log_reg.coef_[0]}, index=X.columns)

# Add the magnitude of each coefficient and rank the features by it, largest first
coefficients = (
    coefficients
    .assign(abs_coeff=coefficients["Coefficient"].abs())
    .sort_values(by="abs_coeff", ascending=False)
)

# Display the ranked table (largest-magnitude coefficients on top)
print(coefficients[['Coefficient', 'abs_coeff']])

In [None]:
from sklearn.linear_model import LogisticRegressionCV

# Use cross-validated Logistic Regression with L1 regularization, which can
# shrink coefficients exactly to zero and so acts as a feature selector.
# random_state is fixed (liblinear shuffles the data) so the selection is
# reproducible, matching the seeds used elsewhere in this notebook.
# NOTE(review): no class_weight is set here although the classes are
# imbalanced - confirm whether 'balanced' is wanted as in the model above.
lasso_log_reg = LogisticRegressionCV(penalty='l1', solver='liblinear', max_iter=1000, random_state=42)
lasso_log_reg.fit(X_train, y_train)

# Get the features whose coefficient was not shrunk to zero
nonzero_mask = lasso_log_reg.coef_.ravel() != 0
selected_features_lasso = X.columns[nonzero_mask]
print("Selected Features by Lasso:", selected_features_lasso)