In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, silhouette_score
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from scipy.stats import mode

In [2]:
df = pd.read_csv('german_credit_data.csv', index_col=0)

In [3]:
categorical_cols = ['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

In [4]:
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [5]:
numerical_cols = ['Age', 'Job', 'Credit amount', 'Duration']
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].median())

In [6]:
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

In [7]:
kmeans = KMeans(n_clusters=2, random_state=42)
gmm = GaussianMixture(n_components=2, random_state=42)
agg_clustering = AgglomerativeClustering(n_clusters=2)

kmeans_labels = kmeans.fit_predict(df_scaled)
gmm_labels = gmm.fit_predict(df_scaled)
agg_labels = agg_clustering.fit_predict(df_scaled)



In [8]:
labels_matrix = np.vstack([kmeans_labels, gmm_labels, agg_labels]).T
ensemble_labels, _ = mode(labels_matrix, axis=1)


In [9]:
df['Ensemble_Cluster'] = ensemble_labels.flatten()
df['Risk'] = df['Ensemble_Cluster'].apply(lambda x: 'Risk' if x == 1 else 'Non-Risk')

silhouette_avg = silhouette_score(df_scaled, ensemble_labels.flatten())
print(f"Silhouette Score for Ensemble Clustering: {silhouette_avg:.4f}")

Silhouette Score for Ensemble Clustering: 0.1287


In [10]:
X = df.drop(columns=['Ensemble_Cluster', 'Risk'])
y = df['Risk'].map({'Risk': 1, 'Non-Risk': 0}) 

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Training class distribution:", pd.Series(y_train).value_counts())
print("Test class distribution:", pd.Series(y_test).value_counts())

Training class distribution: Risk
0    639
1    161
Name: count, dtype: int64
Test class distribution: Risk
0    160
1     40
Name: count, dtype: int64


In [12]:
over = SMOTE(sampling_strategy=0.5, random_state=42) 
under = RandomUnderSampler(sampling_strategy=1.0, random_state=42) 
pipeline = Pipeline(steps=[('o', over), ('u', under)])

In [13]:
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)
print("Resampled training class distribution:", pd.Series(y_train_resampled).value_counts())

Resampled training class distribution: Risk
0    319
1    319
Name: count, dtype: int64


In [14]:
rf_clf = RandomForestClassifier(class_weight='balanced', random_state=42)
log_reg_clf = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
gb_clf = GradientBoostingClassifier(random_state=42)

param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}

In [15]:
grid_search_rf = GridSearchCV(RandomForestClassifier(class_weight='balanced', random_state=42),
                              param_grid_rf, cv=5, scoring='f1', n_jobs=-1)
grid_search_rf.fit(X_train_resampled, y_train_resampled)
best_rf = grid_search_rf.best_estimator_
print("Best Random Forest Parameters:", grid_search_rf.best_params_)

Best Random Forest Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}


In [16]:

param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5]
}
grid_search_gb = GridSearchCV(GradientBoostingClassifier(random_state=42),
                              param_grid_gb, cv=5, scoring='f1', n_jobs=-1)
grid_search_gb.fit(X_train_resampled, y_train_resampled)
best_gb = grid_search_gb.best_estimator_
print("Best Gradient Boosting Parameters:", grid_search_gb.best_params_)

Best Gradient Boosting Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}


In [None]:
ensemble_clf = VotingClassifier(
    estimators=[
        ('rf', best_rf),
        ('lr', log_reg_clf),
        ('gb', best_gb)
    ],
    voting='soft'  
)

cv_scores = cross_val_score(ensemble_clf, X_train_resampled, y_train_resampled, cv=5, scoring='f1')
print(f"Cross-Validation F1 Scores: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

In [None]:

ensemble_clf.fit(X_train_resampled, y_train_resampled)

y_pred = ensemble_clf.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)
f1 = f1_score(y_test, y_pred, pos_label=1)

print("\nTest Set Evaluation Metrics (SMOTE + Undersampling):")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

In [None]:
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Non-Risk', 'Risk']))

In [None]:
y_pred_proba = ensemble_clf.predict_proba(X_test)[:, 1]
threshold = 0.4  
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)

print("\nTest Set Evaluation Metrics (Threshold = 0.4):")
print(f"Accuracy: {accuracy_score(y_test, y_pred_adjusted):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_adjusted, pos_label=1):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_adjusted, pos_label=1):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred_adjusted, pos_label=1):.4f}")

print("\nConfusion Matrix (Threshold = 0.4):")
print(confusion_matrix(y_test, y_pred_adjusted))

In [None]:
adasyn = ADASYN(sampling_strategy='auto', random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)
print("ADASYN training class distribution:", pd.Series(y_train_adasyn).value_counts())

ensemble_clf.fit(X_train_adasyn, y_train_adasyn)
y_pred_adasyn = ensemble_clf.predict(X_test)

print("\nTest Set Evaluation Metrics (ADASYN):")
print(f"Accuracy: {accuracy_score(y_test, y_pred_adasyn):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_adasyn, pos_label=1):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_adasyn, pos_label=1):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred_adasyn, pos_label=1):.4f}")

print("\nConfusion Matrix (ADASYN):")
print(confusion_matrix(y_test, y_pred_adasyn))
print("\nClassification Report (ADASYN):")
print(classification_report(y_test, y_pred_adasyn, target_names=['Non-Risk', 'Risk']))

