<a href="https://www.kaggle.com/code/wuttipats/german-credit-risk-model-training-and-evaluating?scriptVersionId=150499084" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<div style="text-align: center;">
    <img src="https://img.freepik.com/free-vector/businessman-pushing-credit-score-speedometer-from-poor-good-tiny-person-improving-personal-credit-history-with-efforts-flat-vector-illustration-business-reputation-customer-loan-concept_74855-20943.jpg?w=740&t=st=1696525462~exp=1696526062~hmac=12cdc5dcb2b9e55f2cf928026560fbb7cbe4bb135e92e23b136bc99b026fa1ab" alt="Credit score illustration" width="500"/>
    <p style="text-align: center;"><a href="https://www.freepik.com/free-vector/businessman-pushing-credit-score-speedometer-from-poor-good-tiny-person-improving-personal-credit-history-with-efforts-flat-vector-illustration-business-reputation-customer-loan-concept_21683311.htm#query=credit%20score&position=4&from_view=search&track=ais">Image by freepik</a></p>
</div>

In [None]:
import os
# Truthy only when the KAGGLE_KERNEL_RUN_TYPE environment variable is set to a
# non-empty value (presumably only inside a Kaggle kernel).
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE')

# Read the data from the Kaggle input folder when flagged, otherwise from the
# current working directory.
path = '/kaggle/input/german-credit-data-with-risk' if iskaggle else os.getcwd()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Use matplotlib's 'ggplot' style sheet for the figures drawn below.
plt.style.use('ggplot')
# Colour assigned to each 'Risk' label; reused as the palette/colour list of the plots.
color_dict = {'good': 'limegreen', 'bad': 'salmon'}

In [None]:
# Load the dataset from `path`; the first CSV column becomes the DataFrame index.
data = pd.read_csv(os.path.join(path,'german_credit_data.csv'), index_col=0)

In [None]:
data.head()

In [None]:
data.info()

In [None]:
# 'Job' is stored as int64 but is handled as a categorical feature
data['Job'] = data['Job'].astype('category')

# Every object/category column is treated as categorical and stored with the
# 'category' dtype from here on
categorical_columns = data.select_dtypes(include=['object', 'category']).columns
as_category = data[categorical_columns].astype('category')
data[categorical_columns] = as_category

# Report the distinct values found in each categorical column
print('Unique values for every categorical columns in dataset\n')
for column in categorical_columns:
    print(f"\t{column}: {list(data[column].unique())}\n")


In [None]:
data.describe()

In [None]:
data.nunique()

In [None]:
# Boolean mask of missing cells, shared by the heatmap and the per-column totals
missing_mask = data.isna()
sns.heatmap(missing_mask)
plt.show()
print(missing_mask.sum())

In [None]:
data.duplicated().sum()

# Data Cleaning

In [None]:
# Clean a copy so the raw `data` frame stays as loaded.
df_cleaned = data.copy()

In [None]:
# Data Imputing: replace missing categorical values with the column's mode
for column in categorical_columns:
    # mode() returns a Series (there can be ties); take its first entry
    most_frequent = df_cleaned[column].mode()[0]
    # Assign the filled column back to the frame. Calling
    # fillna(..., inplace=True) on the df_cleaned[column] selection is a
    # chained in-place operation: with pandas Copy-on-Write it only modifies a
    # temporary object and df_cleaned keeps its NaN values.
    df_cleaned[column] = df_cleaned[column].fillna(most_frequent)

In [None]:
df_cleaned.info()

In [None]:
# Remove Outliers
def remove_outliers(df, column, whisker=1.5):
    """Return the rows of *df* whose *column* value lies inside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : label
        Name of the numeric column used to decide which rows to keep.
    whisker : float, default 1.5
        Multiplier applied to the inter-quartile range to place the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of *df* inside the fences, with their original index labels.
    """
    values = df[column]
    q1, q3 = values.quantile([0.25, 0.75])
    margin = whisker * (q3 - q1)

    # between() is inclusive on both ends, i.e. lower <= value <= upper
    inside_fences = values.between(q1 - margin, q3 + margin)
    return df.loc[inside_fences]

# Columns still stored as int64 are treated as the numerical features
# ('Job' was cast to 'category' earlier, so it is not selected here).
numerical_columns = df_cleaned.select_dtypes('int64').columns

# Apply to 'numerical' columns
# NOTE: the filter is cumulative -- df_cleaned is reassigned on every pass, so
# each column's quartiles are computed on the rows that survived the previous
# columns' filters.
for col in numerical_columns:
    df_cleaned = remove_outliers(df_cleaned, col)

In [None]:
df_cleaned.describe()

# Data Visualization

In [None]:
# Bar chart of the target's class counts, each bar drawn in its label's colour
risk_counts = df_cleaned['Risk'].value_counts()
ordered_colors = [color_dict[label] for label in risk_counts.index]
risk_counts.plot(kind='bar', color=ordered_colors)
plt.show()

In [None]:
# Good/bad risk split per 10-wide 'Age' bin over 0-80; multiple='fill' stacks the
# two classes and normalises every bin to a total of 1.
sns.histplot(data=df_cleaned, x='Age', hue='Risk', binrange=[0,80], binwidth=10, palette=color_dict, multiple='fill')
plt.show()

In [None]:
# Good/bad risk split per 1000-wide 'Credit amount' bin over 0-8000; multiple='fill'
# stacks the two classes and normalises every bin to a total of 1.
sns.histplot(data=df_cleaned, x='Credit amount', hue='Risk', binrange=[0,8000], binwidth=1000,palette=color_dict, multiple='fill')
plt.show()

In [None]:
# Good/bad risk split per 5-wide 'Duration' bin over 0-40; multiple='fill' stacks
# the two classes and normalises every bin to a total of 1.
sns.histplot(data=df_cleaned, x='Duration', hue='Risk', binrange=[0,40], binwidth=5,palette=color_dict, multiple='fill')
plt.show()

In [None]:
# Boxplot 'numerical' variables across all 'categorical' variables
def create_plot0(df, categorical_cols, numerical_cols ,hue):
    """Show one box plot per (categorical column, numerical column) pair.

    Each figure puts the categorical column on the x axis, the numerical
    column on the y axis and splits the boxes by *hue*, coloured with the
    module-level ``color_dict``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; the caller's frame is not modified.
    categorical_cols : iterable of labels
        Columns used, one at a time, for the x axis.
    numerical_cols : iterable of labels
        Columns used, one at a time, for the y axis.
    hue : label
        Column whose values split and colour the boxes.
    """
    for cat in categorical_cols:
        # sort_values returns a new frame, so only the local name is rebound
        df = df.sort_values(cat)
        for num in numerical_cols:
            sns.boxplot(x=cat, y=num, hue=hue, data=df, palette=color_dict)
            plt.xticks(rotation=90)
            # Called after rotating the tick labels so the layout accounts for
            # them (a bare `plt.tight_layout` without parentheses does nothing).
            plt.tight_layout()
            plt.show()
            print("\n\n")
            
create_plot0(df_cleaned, categorical_columns.drop('Risk'), numerical_columns, 'Risk')


In [None]:
# Plots Categories Columns
def create_plots_1(df, cat_feature, hue_feature):
    """Show a count plot and a 100% stacked bar chart for one categorical feature.

    Both panels are drawn side by side in a single figure: the left one counts
    the rows of every *cat_feature* value split by *hue_feature*, the right one
    shows the same split as row percentages. Colours come from the
    module-level ``color_dict``.
    """
    ordered = df.sort_values(cat_feature)
    fig, (count_ax, share_ax) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: absolute counts per category, split by the hue feature
    sns.countplot(x=cat_feature, hue=hue_feature, data=ordered, ax=count_ax, palette=color_dict)
    count_ax.set_title(f'Count Plot of {cat_feature} vs. {hue_feature}')
    count_ax.set_xticklabels(count_ax.get_xticklabels(), rotation=90)

    # Right panel: every category's row of the contingency table scaled to 100%
    counts = pd.crosstab(ordered[cat_feature], ordered[hue_feature])
    row_totals = counts.sum(axis=1)
    shares = counts.div(row_totals, axis=0) * 100
    bar_colors = [color_dict[label] for label in counts.columns]
    shares.plot(kind='bar', stacked=True, ax=share_ax, color=bar_colors)
    share_ax.set_ylabel('Percentage')
    share_ax.set_title(f'100% Stacked Bar Chart of {cat_feature} vs. {hue_feature}')

    plt.tight_layout()
    plt.show()


for col in categorical_columns.drop('Risk'):
    create_plots_1(df_cleaned, col, 'Risk')


# Data Preprocessing

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Separate the feature columns from the 'Risk' target.
X = df_cleaned.drop(columns=['Risk'])
y = df_cleaned['Risk']
# Scaling and encoding are applied to this copy, leaving X as it is.
X_transformed = X.copy()

In [None]:
# Feature Variables Scaling: min-max scale the numerical columns in place
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split
# made further down -- confirm this is intended.
scaler = MinMaxScaler()
X_transformed[numerical_columns] = scaler.fit_transform(X_transformed[numerical_columns])

In [None]:
X_transformed.head()

In [None]:
# One-hot encoding of the categorical features ('Risk' is the target, not a feature).
# get_dummies keeps the untouched columns first and appends the indicator
# columns, prefixed with their source column name, in the order listed.
feature_categoricals = [item for item in categorical_columns if item not in ['Risk']]
X_transformed = pd.get_dummies(X_transformed, columns=feature_categoricals)

In [None]:
X_transformed.head()

In [None]:
# Apply SMOTE
from imblearn.over_sampling import SMOTE

def smote(X, y, sampling_strategy='minority'):
    """Oversample (X, y) with SMOTE and print the label counts before and after.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names are reused for the returned frame.
    y : pandas.Series
        Target labels.
    sampling_strategy : default 'minority'
        Forwarded unchanged to ``SMOTE(sampling_strategy=...)``.

    Returns
    -------
    tuple
        ``(X_smote_df, y_smote)``: the resampled features as a DataFrame with
        the columns of *X*, and the resampled labels.
    """
    # Original label
    print('Before')
    print(f'{y.value_counts()}')

    # Forward the caller's strategy; the seed is fixed so runs are repeatable.
    # The sampler gets its own name so it does not shadow this function.
    sampler = SMOTE(sampling_strategy=sampling_strategy, random_state=21)

    # Apply SMOTE to our data
    X_smote, y_smote = sampler.fit_resample(X, y)

    # Convert the result back to a dataframe
    X_smote_df = pd.DataFrame(X_smote, columns=X.columns)

    # Transformed label
    print('\n\nAfter SMOTE')
    print(f'{y_smote.value_counts()}')

    return X_smote_df, y_smote

X_transformed, y = smote(X_transformed,y)



In [None]:
# Hold out 20% of the rows for testing, with a fixed seed.
# NOTE(review): X_transformed / y were already oversampled by smote() above, so
# synthetic rows can land in the test split -- consider splitting before
# oversampling.
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=21)

# Model Training and Evaluating

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_fscore_support

import time

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_validate


# Candidate classifiers, all with their default hyper-parameters
models = {
    'Logistic Regression': LogisticRegression(),
    'Gaussian NB': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0),  # verbose=0 silences the training log
}

# Stratified 5-fold splitter, shuffled with a fixed seed
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

# Metrics computed on every fold
scoring = ['roc_auc']

# NOTE(review): X_transformed / y are the SMOTE-resampled data at this point,
# so the validation folds contain synthetic rows.
for model_name, model in models.items():
    scores = cross_validate(model, X_transformed, y, cv=kfold, scoring=scoring)

    # Report mean and standard deviation of every entry except the timings
    for metric_name, score_values in scores.items():
        if metric_name in ('fit_time', 'score_time'):
            continue
        print(f"{model_name} {metric_name}: {np.mean(score_values):.2f} ± {np.std(score_values):.2f}")


In [None]:
def run_model_and_evaluate(model, X_train, X_test, y_train, y_test):
    """Fit *model*, evaluate it on the test split, plot and return the metrics.

    Prints the running time, accuracy, ROC AUC and the classification report,
    then shows a figure with the confusion matrix and the ROC curve.

    Class labels are read from ``model.classes_`` after fitting, so the
    function does not depend on the labels being named 'bad' and 'good'.
    The second entry of ``model.classes_`` is treated as the positive class,
    matching column 1 of ``predict_proba``.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit``, ``predict``, ``predict_proba`` and
        ``classes_``; binary targets are assumed.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Test features and labels.

    Returns
    -------
    dict
        Keys ``running_time`` (seconds spent on fit plus both prediction
        calls), ``accuracy``, ``roc_auc``, and ``precision`` / ``recall`` /
        ``f1_score`` (one value per class, because ``average=None``).
    """
    start_time = time.time()  # Start the timer

    # Train the model
    model.fit(X_train, y_train)

    # Predict on test set
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability of the positive class

    # Stop the timer: the elapsed time covers training and prediction
    elapsed_time = time.time() - start_time

    # Labels as seen by the fitted model; index 1 matches predict_proba[:, 1]
    class_labels = list(model.classes_)
    positive_label = class_labels[1]

    # Evaluate classifier's performance
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)  # Use probabilities to compute ROC AUC

    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average=None)

    metrics_dict = {
        'running_time': elapsed_time,
        'accuracy': accuracy,
        'roc_auc': roc_auc,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    }

    print(f"Running Time: {elapsed_time:.2f} seconds")
    print(f"Accuracy: {accuracy}")
    print(f"ROC AUC: {roc_auc}")
    print(classification_report(y_test, y_pred))

    # Confusion matrix with rows/columns in the same order as the tick labels
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    # ROC curve; pos_label names the positive class explicitly, so no manual
    # label-to-integer mapping is needed
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba, pos_label=positive_label)

    fig, axes = plt.subplots(1, 2, figsize=(8, 4))

    # Confusion matrix
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', cbar=False,
                xticklabels=class_labels, yticklabels=class_labels, ax=axes[0])
    axes[0].set_xlabel('Predicted labels')
    axes[0].set_ylabel('True labels')
    axes[0].set_title('Confusion Matrix')

    # ROC curve
    axes[1].plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    axes[1].plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
    axes[1].set_xlabel('False Positive Rate')
    axes[1].set_ylabel('True Positive Rate')
    axes[1].set_title('Receiver Operating Characteristic (ROC) Curve')
    axes[1].legend(loc='lower right')
    axes[1].grid(alpha=0.2)

    plt.tight_layout()
    plt.show()

    return metrics_dict

In [None]:
lr = run_model_and_evaluate(LogisticRegression(), X_train, X_test, y_train, y_test)

In [None]:
nb = run_model_and_evaluate(GaussianNB(), X_train, X_test, y_train, y_test)

In [None]:
dt = run_model_and_evaluate(DecisionTreeClassifier(), X_train, X_test, y_train, y_test)

In [None]:
rf = run_model_and_evaluate(RandomForestClassifier(), X_train, X_test, y_train, y_test)

In [None]:
ada = run_model_and_evaluate(AdaBoostClassifier(), X_train, X_test, y_train, y_test)

In [None]:
gdb = run_model_and_evaluate(GradientBoostingClassifier(), X_train, X_test, y_train, y_test)

In [None]:
cat = run_model_and_evaluate(CatBoostClassifier(verbose=0), X_train, X_test, y_train, y_test)