In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the enhanced dataset (path is relative to the working directory).
df = pd.read_csv('credit-risk-dataset/employment_enhanced_dataset.csv')

# The turnover-risk feature was previously read as 'turnover_risk' in the bar
# plot and as 'emp_turnover_risk' in the summary, so at least one of the two
# lookups failed unless the file carried both columns. Resolve the name once
# and use the same column everywhere; the 'emp_'-prefixed spelling is tried
# first because it matches the other feature columns used below.
_TURNOVER_CANDIDATES = ('emp_turnover_risk', 'turnover_risk')
turnover_col = next(
    (name for name in _TURNOVER_CANDIDATES if name in df.columns),
    None,
)
if turnover_col is None:
    raise KeyError(
        f"None of the turnover-risk columns {_TURNOVER_CANDIDATES} "
        "were found in the dataset"
    )


def _default_rate_pct(statuses):
    """Return the percentage of entries in *statuses* that equal 1.

    Used as the bar-plot estimator; the plot labels this value as the
    default rate, i.e. it treats ``loan_status == 1`` as a default.
    """
    return len(statuses[statuses == 1]) / len(statuses) * 100


# Create visualizations on an explicit 2x2 grid so every plot call names the
# axes it draws on instead of relying on pyplot's implicit "current axes".
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Plot 1: Employment Length Risk Score Distribution
ax = axes[0, 0]
sns.histplot(data=df, x='emp_length_risk_score', bins=20, ax=ax)
ax.set_title('Distribution of Employment Length Risk Scores')
ax.set_xlabel('Risk Score')
ax.set_ylabel('Count')

# Plot 2: Employment Length Ratio Distribution
ax = axes[0, 1]
sns.histplot(data=df, x='emp_length_ratio', bins=20, ax=ax)
ax.set_title('Distribution of Employment Length Ratios')
ax.set_xlabel('Length Ratio (Individual/Average)')
ax.set_ylabel('Count')

# Plot 3: Default rate (share of loan_status == 1) per turnover-risk value
ax = axes[1, 0]
sns.barplot(
    data=df,
    x=turnover_col,
    y='loan_status',
    estimator=_default_rate_pct,
    ax=ax,
)
ax.set_title('Default Rate by Turnover Risk')
ax.set_xlabel('Turnover Risk')
ax.set_ylabel('Default Rate (%)')

# Plot 4: Risk Score vs Loan Amount (alpha lowers opacity of overlapping points)
ax = axes[1, 1]
sns.scatterplot(
    data=df,
    x='emp_length_risk_score',
    y='loan_amnt',
    alpha=0.5,
    ax=ax,
)
ax.set_title('Risk Score vs Loan Amount')
ax.set_xlabel('Employment Length Risk Score')
ax.set_ylabel('Loan Amount')

plt.tight_layout()
plt.show()

# Print summary statistics
print("\nSummary Statistics for New Features:")
print("\nEmployment Length Risk Score:")
print(df['emp_length_risk_score'].describe())
print("\nEmployment Length Ratio:")
print(df['emp_length_ratio'].describe())
print("\nTurnover Risk Distribution:")
# normalize=True reports each value's share of rows rather than raw counts.
print(df[turnover_col].value_counts(normalize=True))
