In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings

warnings.filterwarnings('ignore')

np.random.seed(42)

# Set a random seed for reproducibility
np.random.seed(42)

# Number of samples
num_samples = 100

# Generating synthetic features
data = pd.DataFrame({
    'age': np.random.randint(18, 70, size=num_samples),
    'income': np.random.normal(50000, 15000, size=num_samples),
    'credit_score': np.random.randint(300, 850, size=num_samples),
    'loan_amount': np.random.normal(15000, 5000, size=num_samples),
    'unemployment_rate': np.random.uniform(3, 10, size=num_samples),
    'GDP_growth_rate': np.random.uniform(-2, 5, size=num_samples),
    'default': np.random.binomial(1, 0.1, size=num_samples)  # 10% default rate
})

# Display the first few rows of the dataset
print(data.head())

# Save the dataset to a CSV file
data.to_csv('synthetic_credit_card_data.csv', index=False)

data = pd.read_csv('synthetic_credit_card_data.csv')


features = ['age', 'income', 'credit_score', 'loan_amount', 'unemployment_rate', 'GDP_growth_rate']
target = 'default'


X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, y_pred_log)

# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

print(f"Logistic Regression Accuracy: {log_reg_accuracy}")
print(f"Random Forest Accuracy: {rf_accuracy}")

from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Train the model with best parameters
rf_best = RandomForestClassifier(**best_params)
rf_best.fit(X_train, y_train)
y_pred_rf_best = rf_best.predict(X_test)
rf_best_accuracy = accuracy_score(y_test, y_pred_rf_best)
print(f"Optimized Random Forest Accuracy: {rf_best_accuracy}")

import numpy as np

# Define scenarios
scenarios = {
    'baseline': {'unemployment': 5, 'GDP': 2},
    'stress': {'unemployment': 10, 'GDP': -5}
}

def simulate_scenario(data, scenario):
    modified_data = data.copy()
    modified_data['unemployment'] = scenario['unemployment']
    modified_data['GDP'] = scenario['GDP']
    return modified_data

# Stress test
stress_data = simulate_scenario(data, scenarios['stress'])

# Predict default probabilities under stress scenario
stress_data_scaled = scaler.transform(stress_data[features])
stress_preds = rf_best.predict_proba(stress_data_scaled)[:, 1]

print(f"Stress Scenario Default Probability: {np.mean(stress_preds)}")


   age        income  credit_score   loan_amount  unemployment_rate  \
0   56  21864.848417           597  17854.452553           9.008889   
1   69  29498.267917           562  20677.828201           6.002958   
2   46  59544.576625           443  19770.008817           8.256097   
3   32  36399.189971           645  18256.956257           8.281800   
4   60  57140.638811           301  13423.653777           3.721867   

   GDP_growth_rate  default  
0         2.085593        0  
1        -1.455858        0  
2         4.820764        0  
3         4.903475        0  
4         2.887132        0  
Logistic Regression Accuracy: 0.9666666666666667
Random Forest Accuracy: 0.9666666666666667
Best Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Optimized Random Forest Accuracy: 0.9666666666666667
Stress Scenario Default Probability: 0.0576
