In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import boxcox, skew
import warnings
warnings.filterwarnings('ignore')

# Global plot appearance: whitegrid style sheet plus the viridis colour palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')

# Load the raw data set from the working directory.
file_path = 'american_bankruptcy.csv'
df = pd.read_csv(file_path)

# Sanity check: report the dimensions and preview the first rows.
dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")
print("\nFirst 5 rows:")
df.head()

Examining Data Skewness

Financial data typically exhibits significant positive skewness. This means the distribution has a long right tail, with a few very large values that can disproportionately influence statistical models. 


In [None]:
# The raw financial variables are the columns X1 .. X18.
main_variables = [f'X{idx}' for idx in range(1, 19)]

# Skewness of every raw variable, largest first.
skewness = df[main_variables].skew().sort_values(ascending=False)
print("Skewness of raw financial variables:")
print(skewness)

# Bar chart of the skewness values with reference lines at 0 and at +/-1.
plt.figure(figsize=(12, 6))
skewness.plot.bar(color='teal')
reference_lines = ((0, 'r', '-'), (1, 'y', '--'), (-1, 'y', '--'))
for level, line_color, line_style in reference_lines:
    plt.axhline(y=level, color=line_color, linestyle=line_style, alpha=0.3)
plt.title('Skewness of Financial Variables')
plt.xlabel('Variable')
plt.ylabel('Skewness')
plt.grid(True, alpha=0.3)
plt.show()

All of the financial variables are positively skewed, and many have skewness coefficients well above 1, which indicates severe right skew. This confirms the need for transformations.

NOTE - Guidelines for interpreting skewness:
0 = No skew (perfectly symmetrical)
Between -0.5 and 0.5 = Approximately symmetrical
Between -1 and -0.5 or 0.5 and 1 = Moderately skewed
Less than -1 or greater than 1 = Highly skewed

Below we visualize distributions of a few key variables:

In [None]:
# Market value, Total assets, Net income, Long-term debt
variables_to_plot = ['X8', 'X10', 'X6', 'X11']

# One histogram (with KDE overlay) per variable on a 2x2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))
axes = axes.flatten()

for ax, var in zip(axes, variables_to_plot):
    series = df[var]
    sns.histplot(series, kde=True, ax=ax)
    ax.set_title(f'Distribution of {var} (Skewness: {series.skew():.2f})')
    ax.set_xlabel(var)

plt.tight_layout()
plt.show()

Transforming Company Size Metrics

In this section we transform two key company-size metrics:

X8 (Market value) - The total market capitalization of the company

X10 (Total assets) - The total assets on the company's balance sheet

We test three common transformations for skewed financial data:
1. Logarithmic transformation
2. Square root transformation
3. Box-Cox transformation

The goal is to find which transformation most effectively reduces skewness toward zero.


In [None]:
def test_skewness(column):
    """Compare log, square-root and Box-Cox transformations of ``df[column]``.

    Adds three columns to the module-level ``df``: ``{column}_log``,
    ``{column}_sqrt`` and ``{column}_boxcox``. Prints the skewness of the
    original column and of each transformed column, then draws one histogram
    per variant.

    If the column contains negative values, it is first shifted so that its
    minimum becomes zero. For non-negative data no shift is applied, so the
    log and Box-Cox results are ``log(1 + x)`` and ``boxcox(x + 1)``.

    Args:
        column: Name of a numeric column in ``df``.
    """
    values = df[column]
    original_skew = values.skew()

    # Shift needed to make the smallest value zero (0.0 when already >= 0).
    # Without it, log1p/sqrt return NaN and boxcox raises on negative data.
    offset = max(0.0, -values.min())
    non_negative = values + offset

    # Log transformation: log(1 + x) so that zeros are handled.
    df[f'{column}_log'] = np.log1p(non_negative)
    log_skew = df[f'{column}_log'].skew()

    # Square-root transformation: milder than the log.
    df[f'{column}_sqrt'] = np.sqrt(non_negative)
    sqrt_skew = df[f'{column}_sqrt'].skew()

    # Box-Cox requires strictly positive input, hence the extra +1.
    df[f'{column}_boxcox'], _ = boxcox(non_negative + 1)
    boxcox_skew = df[f'{column}_boxcox'].skew()

    print(f"Skewness for {column}:")
    print(f"  Original: {original_skew:.3f}")
    print(f"  Log transformation: {log_skew:.3f}")
    print(f"  Square root transformation: {sqrt_skew:.3f}")
    print(f"  Box-Cox transformation: {boxcox_skew:.3f}")

    # (column to plot, panel title) for each variant, left to right.
    panels = [
        (column, f'Original {column} (Skewness: {original_skew:.3f})'),
        (f'{column}_log', f'Log-transformed {column} (Skewness: {log_skew:.3f})'),
        (f'{column}_sqrt', f'Square-root transformed {column} (Skewness: {sqrt_skew:.3f})'),
        (f'{column}_boxcox', f'Box-Cox transformed {column} (Skewness: {boxcox_skew:.3f})'),
    ]

    fig, axes = plt.subplots(1, len(panels), figsize=(6 * len(panels), 5))
    for ax, (plot_column, title) in zip(axes, panels):
        sns.histplot(df[plot_column], kde=True, ax=ax)
        ax.set_title(title)

    plt.tight_layout()
    plt.show()

# Run the comparison for both company-size metrics.
print("Testing transformations for company size metrics:")
for size_metric in ('X8', 'X10'):  # X8 = market value, X10 = total assets
    test_skewness(size_metric)

The Box-Cox transformation appears to be the most effective at reducing skewness for both market value (X8) and total assets (X10). Box-Cox estimates the power parameter (lambda) that makes the transformed data as close to normally distributed as possible.

For bankruptcy prediction, these size metrics are crucial because:
1. Company size is a significant predictor of bankruptcy risk (larger companies tend to be more resilient)
2. Size impacts how other financial ratios should be interpreted
3. The transformed metrics will be more suitable for linear and logistic regression models





Creating and Transforming Financial Ratios

Next, we create important financial ratios and then transform them to address skewness. Financial ratios are more informative than raw financial values because they control for company size and give standardized measures of financial health.

In [None]:
# Build the financial ratios used by the rest of the notebook.
print("Creating financial ratios...")

# Name the raw inputs once so each ratio reads like its textbook formula.
current_assets = df['X1']
inventory = df['X5']
net_income = df['X6']
market_value = df['X8']
net_sales = df['X9']
total_assets = df['X10']
long_term_debt = df['X11']
ebit = df['X12']
gross_profit = df['X13']
current_liabilities = df['X14']
retained_earnings = df['X15']
total_revenue = df['X16']
total_liabilities = df['X17']
equity = total_assets - total_liabilities  # book equity = assets - liabilities

# Liquidity
df['current_ratio'] = current_assets / current_liabilities
df['quick_ratio'] = (current_assets - inventory) / current_liabilities

# Leverage
df['debt_to_equity_ratio'] = total_liabilities / equity
df['long_term_debt_to_assets'] = long_term_debt / total_assets

# Profitability
df['profit_margin'] = net_income / total_revenue
df['gross_margin'] = gross_profit / total_revenue
df['return_on_assets'] = net_income / total_assets
df['return_on_equity'] = net_income / equity

# Efficiency
df['asset_turnover'] = net_sales / total_assets

# Components of the Altman Z-score
df['working_capital_to_assets'] = (current_assets - current_liabilities) / total_assets
df['retained_earnings_to_assets'] = retained_earnings / total_assets
df['ebit_to_assets'] = ebit / total_assets
df['market_value_to_liabilities'] = market_value / total_liabilities
df['sales_to_assets'] = net_sales / total_assets

# Altman Z-score: weighted sum of the five components above.
df['z_score'] = (
    1.2 * df['working_capital_to_assets']
    + 1.4 * df['retained_earnings_to_assets']
    + 3.3 * df['ebit_to_assets']
    + 0.6 * df['market_value_to_liabilities']
    + 1.0 * df['sales_to_assets']
)

# Market value relative to book equity
df['market_to_book'] = market_value / equity

print("\nSummary statistics of created financial ratios:")
created_ratios = [
    'current_ratio',
    'quick_ratio',
    'debt_to_equity_ratio',
    'long_term_debt_to_assets',
    'profit_margin',
    'return_on_assets',
    'return_on_equity',
    'z_score',
    'market_to_book',
]
df[created_ratios].describe().round(3)


Handling Problematic Values in Financial Ratios

Financial ratios can produce problematic values due to:
 - Division by zero
 - Very small denominators leading to extreme values
 - Negative equity values

In [None]:
# Ratio columns that also come from divisions but are not listed in
# `created_ratios`. They must be cleaned as well: the inf -> NaN replacement
# below touches the whole frame, and `working_capital_to_assets` is
# transformed and exported later in the notebook.
ratios_to_clean = list(created_ratios) + [
    col
    for col in (
        'gross_margin',
        'asset_turnover',
        'working_capital_to_assets',
        'retained_earnings_to_assets',
        'ebit_to_assets',
        'market_value_to_liabilities',
        'sales_to_assets',
    )
    if col in df.columns and col not in created_ratios
]

# Count infinite values (non-zero numerator divided by zero) per ratio.
inf_count = {col: np.isinf(df[col]).sum() for col in ratios_to_clean}

print("Number of infinite values in each ratio:")
for col, count in inf_count.items():
    print(f"{col}: {count}")

# Replace infinite values with NaN so they can be imputed like other gaps.
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Missing values now include the former infinities and any 0/0 results.
missing_values = df[ratios_to_clean].isna().sum()
print("\nMissing values in each ratio after replacing infinities:")
for col, count in missing_values.items():
    print(f"{col}: {count}")

# Impute each ratio with its own median.
for col in ratios_to_clean:
    df[col] = df[col].fillna(df[col].median())

# Verify that no gaps remain in any cleaned ratio.
print("\nRemaining missing values after imputation:")
print(df[ratios_to_clean].isna().sum().sum())

Testing Transformations for Financial Ratios

After creating financial ratios and handling problematic values, we need to transform them to address skewness. 

We'll test four transformations:

1. Logarithmic transformation - Effective for positive right-skewed data
2. Square root transformation - Milder than log, works for right-skewed data
3. Reciprocal transformation - Can be effective for certain ratio distributions
4. Original (no transformation) - Sometimes ratios are already well-distributed



In [None]:
def test_ratio_transformations(ratio):
    """Compare candidate transformations of ``df[ratio]`` and report the least skewed.

    Adds three columns to the module-level ``df``: ``{ratio}_log``,
    ``{ratio}_sqrt`` and ``{ratio}_reciprocal``. Prints the skewness of the
    original column and of every transformation, then plots the original
    distribution next to the option whose skewness is closest to zero.

    Args:
        ratio: Name of a numeric column in ``df``.

    Returns:
        ``"Original"``, ``"Log"``, ``"Square root"`` or ``"Reciprocal"``:
        the option with the smallest absolute skewness. Ties go to the
        earlier entry in that order.
    """
    min_val = df[ratio].min()

    # If the column has values <= 0, shift it so that its minimum becomes 1;
    # otherwise leave it untouched.
    shift = abs(min_val) + 1 if min_val <= 0 else 0
    shifted = df[ratio] + shift

    # (label, column suffix, transformed values). The suffix is stored
    # explicitly because it cannot be derived from the label:
    # "Square root" is written to the "<ratio>_sqrt" column.
    candidates = (
        ("Log", "log", np.log1p(shifted)),  # log(1 + x + shift)
        ("Square root", "sqrt", np.sqrt(shifted)),
        ("Reciprocal", "reciprocal", 1 / (shifted + 0.01)),  # +0.01 avoids 1/0
    )

    column_for = {"Original": ratio}
    skew_for = {"Original": df[ratio].skew()}
    for label, suffix, transformed in candidates:
        column_name = f'{ratio}_{suffix}'
        df[column_name] = transformed
        column_for[label] = column_name
        skew_for[label] = df[column_name].skew()

    # Pick the option whose skewness is closest to zero.
    abs_skew_for = {label: abs(value) for label, value in skew_for.items()}
    best_transform = min(abs_skew_for, key=abs_skew_for.get)
    best_abs_skew = abs_skew_for[best_transform]

    print(f"Transformations for {ratio}:")
    print(f"  Original skewness: {skew_for['Original']:.3f}")
    print(f"  Log transformation skewness: {skew_for['Log']:.3f}")
    print(f"  Square root transformation skewness: {skew_for['Square root']:.3f}")
    print(f"  Reciprocal transformation skewness: {skew_for['Reciprocal']:.3f}")
    print(f"  Best transformation: {best_transform} (skewness: {best_abs_skew:.3f})")

    # Original distribution on the left, best option on the right.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    sns.histplot(df[ratio], kde=True, ax=ax1)
    ax1.set_title(f'Original {ratio} (Skewness: {skew_for["Original"]:.3f})')

    sns.histplot(df[column_for[best_transform]], kde=True, ax=ax2)
    ax2.set_title(f'Best transformation: {best_transform} (Skewness: {best_abs_skew:.3f})')

    plt.tight_layout()
    plt.show()

    return best_transform

# Ratios whose transformations are compared below.
ratios_to_transform = [
    'quick_ratio',
    'debt_to_equity_ratio',
    'long_term_debt_to_assets',
    'return_on_equity',
    'working_capital_to_assets',
    'z_score',
    'market_to_book',
]

print("Testing transformations for financial ratios...")
# Map each ratio name to the label of its least-skewed transformation.
best_transforms = {
    ratio_name: test_ratio_transformations(ratio_name)
    for ratio_name in ratios_to_transform
}


Special Treatment for Profitability Measures

Profitability measures like profit margin and return on assets often require special handling because they can be negative. We'll create shifted versions of these variables and then test transformations:


In [None]:
profitability_measures = ['profit_margin', 'return_on_assets']

print("Creating shifted versions of profitability measures...")
for measure in profitability_measures:
    shifted_name = f'{measure}_shifted'
    lowest = df[measure].min()
    # Add |min| + 1 to every value and store the result in a new column.
    df[shifted_name] = df[measure] + (abs(lowest) + 1)

    print(f"Original {measure} min: {lowest:.3f}")
    print(f"Shifted {measure} min: {df[shifted_name].min():.3f}")

# Compare the candidate transformations on the shifted columns.
for shifted_measure in [f'{name}_shifted' for name in profitability_measures]:
    print(f"\nTesting transformations for {shifted_measure}:")
    best_transforms[shifted_measure] = test_ratio_transformations(shifted_measure)


Creating Binary Features



In [None]:
# left_censored: 1 if the company's first observed year equals the first year
# of the whole data set, else 0.
first_year_in_dataset = df['year'].min()
first_year_per_company = df.groupby('company_name')['year'].min()
# Mapping with a Series is vectorized and yields NaN (hence 0) for a missing
# company name instead of raising KeyError like a per-row lookup would.
company_first_year = df['company_name'].map(first_year_per_company)
df['left_censored'] = (company_first_year == first_year_in_dataset).astype(int)

# period_after_2008: 1 for observations dated strictly after 2008.
df['period_after_2008'] = (df['year'] > 2008).astype(int)

# bankruptcy: 1 when the Altman Z-score is below 1.81. This is a flag derived
# from the score, not the observed outcome stored in the data.
df['bankruptcy'] = (df['z_score'] < 1.81).astype(int)


binary_features = ['left_censored', 'period_after_2008', 'bankruptcy']
for feature in binary_features:
    print(f"\n{feature} distribution:")
    print(df[feature].value_counts())
    print(f"Percentage of 1's: {df[feature].mean() * 100:.2f}%")


Target Variable Refinement


In [None]:
# For every company with at least one 'failed' row, find the index label of
# the 'failed' row with the latest year.
failed_rows = df.loc[df['status_label'] == 'failed']
last_failed_indices = failed_rows.groupby('company_name')['year'].idxmax()

# The target is 0 everywhere except on those last 'failed' rows.
df['target_last'] = 0
df.loc[last_failed_indices, 'target_last'] = 1

# Report the class balance of the target.
target_counts = df['target_last'].value_counts()
print("\nTarget variable distribution:")
print(target_counts)
print(f"Bankruptcy rate: {df['target_last'].mean() * 100:.2f}%")


Selecting the Optimal Transformed Features

Based on our analysis, we'll select the optimal transformation for each feature. These transformed features will be used in our later bankruptcy prediction models:


In [None]:
# Company size (Box-Cox transformed market value).
size_features = ['X8_boxcox']

# One transformed column per financial ratio; the choices are hard-coded here.
ratio_features = [
    'quick_ratio_reciprocal',
    'debt_to_equity_ratio_log',
    'long_term_debt_to_assets_reciprocal',
    'return_on_equity_log',
    'working_capital_to_assets_sqrt',
    'z_score_log',
    'market_to_book_log',
]

# Binary indicators: present since the first year / observed after 2008.
indicator_features = ['left_censored', 'period_after_2008']

# Predictors first, target variable last.
transformed_features = (
    size_features + ratio_features + indicator_features + ['target_last']
)

final_df = df.loc[:, transformed_features].copy()

print("Final transformed dataset:")
final_df.head()

Correlation Analysis of Transformed Features

In [None]:
# Pairwise correlations between the predictors (everything but the target).
transformed_predictors = [
    name for name in transformed_features if name != 'target_last'
]
correlation_matrix = final_df[transformed_predictors].corr()

# Annotated heatmap on a fixed [-1, 1] colour scale centred at zero.
heatmap_options = dict(
    annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0, fmt='.2f'
)
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix of Transformed Features')
plt.tight_layout()
plt.show()

# Correlation of each predictor with the target, strongest positive first.
columns_with_target = transformed_predictors + ['target_last']
target_correlations = (
    df[columns_with_target]
    .corr()['target_last']
    .drop('target_last')
    .sort_values(ascending=False)
)

plt.figure(figsize=(12, 6))
target_correlations.plot.bar()
plt.title('Correlation of Transformed Features with Target')
plt.xlabel('Feature')
plt.ylabel('Correlation Coefficient')
plt.axhline(y=0, color='r', linestyle='-', alpha=0.3)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

Save the Transformed Dataset

In [None]:
# Write the selected features to disk without the index column.
output_path = 'bankruptcy_transformed_features.csv'
final_df.to_csv(output_path, index=False)
print(f"Transformed dataset saved successfully to '{output_path}'")


SUMMARY

In this notebook, we've applied several transformations to prepare financial data for bankruptcy prediction:

1. Company Size Transformations: Box-Cox transformation for market value (X8)

2. Financial Ratio Transformations:
- Quick Ratio → Reciprocal transformation
- Debt to Equity Ratio → Log transformation
- Long-term Debt to Assets → Reciprocal transformation
- Return on Equity → Log transformation
- Working Capital to Assets → Square root transformation
- Z-score → Log transformation
- Market to Book Ratio → Log transformation

3. Binary Feature Creation:
- Left Censored (indicates companies present from dataset beginning)
- Period after 2008 (captures post-financial crisis period)
- Bankruptcy (based on Z-score threshold of 1.81)

4. Target Variable Refinement:
- Created target_last to identify the final bankruptcy event for each company

These transformations have successfully:
- Reduced skewness in financial variables
- Handled problematic values (infinities, negative values)
- Created more normally distributed features for modeling
- Prepared the data for statistical and machine learning models