# 1. Taiwan Dataset

# 1.1. Taiwan Feature Selection

In [4]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

# Load the dataset
data = pd.read_csv('UCI_Credit_Card.csv')

# Drop the specified features
features_to_drop = ['ID']  # Replace with the features you want to drop
data.drop(columns=features_to_drop, inplace=True)

# Split the dataset into features and target variable
X = data.drop(columns=['default.payment.next.month'])
y = data['default.payment.next.month']

# Anomaly detection using Isolation Forest before scaling
isolation_forest = IsolationForest(contamination=0.1, random_state=42)
outlier_preds = isolation_forest.fit_predict(X)

# Remove outliers
X_cleaned = X[outlier_preds == 1]
y_cleaned = y[outlier_preds == 1]

# Applying Robust Scaling after removing outliers
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_cleaned)

# Perform feature selection using information gain
selector = SelectKBest(score_func=mutual_info_classif, k=5)  # Select top 5 features
X_selected = selector.fit_transform(X_scaled, y_cleaned)

# Display selected feature names
selected_feature_names = X.columns[selector.get_support(indices=True)]
print("Selected Features:", selected_feature_names)



Selected Features: Index(['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_AMT1'], dtype='object')


# 1.2. Taiwan Main Code

In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn.tree import DecisionTreeClassifier
from rotation_forest import RotationForestClassifier
from sklearn.ensemble import RandomForestClassifier
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from imblearn.combine import SMOTEENN
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from keras.models import Sequential
from keras.layers import Dense, LSTM
# Load the Taiwan Credit Risk dataset
df = pd.read_csv('UCI_Credit_Card.csv')
df.drop(['PAY_6',  'PAY_5', 'PAY_AMT4', 'PAY_AMT3', 'PAY_AMT6', 'LIMIT_BAL', 'PAY_AMT5', 'PAY_AMT2', 'BILL_AMT1', 'EDUCATION', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT5', 'BILL_AMT6', 'ID', 'BILL_AMT4', 'MARRIAGE', 'AGE', 'SEX'],axis=1, inplace=True)

# Split the dataset into features and target variable
X = df.drop(columns=['default.payment.next.month'])
y = df['default.payment.next.month']

# Rescale data using RobustScaler
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Handle class imbalance using SMOTEENN
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_scaled, y)

# Convert data to 3D for LSTM input
X_reshaped = X_resampled.reshape(X_resampled.shape[0], 1, X_resampled.shape[1])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y_resampled, test_size=0.2, random_state=42)

# Define LSTM architecture with multiple layers
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(units=40, return_sequences=True))
model.add(LSTM(units=30, return_sequences=True))
model.add(LSTM(units=20))  # Last layer doesn't need return_sequences=True
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

# Print accuracy and loss of each epoch
for epoch, acc in enumerate(history.history['accuracy']):
    print("Epoch", epoch+1, "- Accuracy:", acc, "- Loss:", history.history['loss'][epoch])

# Get predicted probabilities
y_pred_prob = model.predict(X_test)

# Convert probabilities to class labels
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate accuracy
overall_accuracy = accuracy_score(y_test, y_pred)
print("Overall Accuracy:", overall_accuracy)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)

# Calculate AUC score
auc = roc_auc_score(y_test, y_pred_prob)
print("AUC Score:", auc)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.2292320728302002
Test Accuracy: 0.9211705923080444
Epoch 1 - Accuracy: 0.868655264377594 - Loss: 0.36899593472480774
Epoch 2 - Accuracy: 0.9154581427574158 - Loss: 0.26318901777267456
Epoch 3 - Accuracy: 0.919495701789856 - Loss: 0.25123119354248047
Epoch 4 - Accuracy: 0.9211437106132507 - Loss: 0.2452741414308548
Epoch 5 - Accuracy: 0.9205669164657593 - Loss: 0.24243450164794922
Epoch 6 - Accuracy: 0.9218853116035461 - Loss: 0.23899047076702118
Epoch 7 - Accuracy: 0.9209789037704468 - Loss: 0.2362259328365326
Epoch 8 - Accuracy: 0.9222972989082336 - Loss: 0.23372304439544678
Epoch 9 - Accuracy: 0.9222972989082336 - Loss: 0.23182721436023712
Epoch 10 - Accuracy: 0.9227917194366455 - Loss: 0.2307872474193573
Overall Accuracy: 0.9211705773793831
F1 Score: 0.8715083798882682
AUC Score: 0.9447917658786851


# 2. Australian Dataset

# 2.1. Australian Feature Selection

In [9]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

# Load the dataset
df = pd.read_csv('australian.dat', sep=' ', header=None)

# Split the dataset into features and target variable
X = df.drop(columns=[14])
y = df[14]


# Anomaly detection using Isolation Forest before scaling
isolation_forest = IsolationForest(contamination=0.1, random_state=42)
outlier_preds = isolation_forest.fit_predict(X)

# Remove outliers
X_cleaned = X[outlier_preds == 1]
y_cleaned = y[outlier_preds == 1]

# Applying Robust Scaling after removing outliers
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_cleaned)

# Perform feature selection using information gain
selector = SelectKBest(score_func=mutual_info_classif, k=5)  # Select top 5 features
X_selected = selector.fit_transform(X_scaled, y_cleaned)

# Display selected feature names
selected_feature_names = X.columns[selector.get_support(indices=True)]
print("Selected Features:", selected_feature_names)

Selected Features: Int64Index([6, 7, 8, 9, 13], dtype='int64')


# 2.2. Australian Main Code

In [11]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn.tree import DecisionTreeClassifier
from rotation_forest import RotationForestClassifier
from sklearn.ensemble import RandomForestClassifier
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from imblearn.combine import SMOTEENN
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from keras.models import Sequential
from keras.layers import Dense, LSTM
# Load the dataset
df = pd.read_csv('australian.dat', sep=' ', header=None)
df.drop([0, 1, 2, 3, 4, 5,  10, 11, 12], axis=1, inplace=True)

# Split the dataset into features and target variable
X = df.drop(columns=[14])
y = df[14]

# Rescale data using RobustScaler
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Handle class imbalance using SMOTEENN
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_scaled, y)

# Convert data to 3D for LSTM input
X_reshaped = X_resampled.reshape(X_resampled.shape[0], 1, X_resampled.shape[1])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y_resampled, test_size=0.2, random_state=42)

# Define LSTM architecture with multiple layers
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(units=40, return_sequences=True))
model.add(LSTM(units=30, return_sequences=True))
model.add(LSTM(units=20))  # Last layer doesn't need return_sequences=True
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

# Print accuracy and loss of each epoch
for epoch, acc in enumerate(history.history['accuracy']):
    print("Epoch", epoch+1, "- Accuracy:", acc, "- Loss:", history.history['loss'][epoch])

# Get predicted probabilities
y_pred_prob = model.predict(X_test)

# Convert probabilities to class labels
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate accuracy
overall_accuracy = accuracy_score(y_test, y_pred)
print("Overall Accuracy:", overall_accuracy)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)

# Calculate AUC score
auc = roc_auc_score(y_test, y_pred_prob)
print("AUC Score:", auc)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.2862035930156708
Test Accuracy: 0.9569892287254333
Epoch 1 - Accuracy: 0.5286195278167725 - Loss: 0.6928650736808777
Epoch 2 - Accuracy: 0.5656565427780151 - Loss: 0.6912654042243958
Epoch 3 - Accuracy: 0.6969696879386902 - Loss: 0.6874521970748901
Epoch 4 - Accuracy: 0.8451178669929504 - Loss: 0.6771596670150757
Epoch 5 - Accuracy: 0.9057239294052124 - Loss: 0.6489147543907166
Epoch 6 - Accuracy: 0.932659924030304 - Loss: 0.5823029279708862
Epoch 7 - Accuracy: 0.9292929172515869 - Loss: 0.481110543012619
Epoch 8 - Accuracy: 0.9259259104728699 - Loss: 0.39432546496391296
Epoch 9 - Accuracy: 0.939393937587738 - Loss: 0.339498907327652
Epoch 10 - Accuracy: 0.9494949579238892 - Loss: 0.29572516679763794
Overall Accuracy: 0.956989247311828
F1 Score: 0.9591836734693877
AUC Score: 0.9814814814814814


# 3. German Dataset

# 3.1. German Feature Selection

In [3]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('german_credit_data.csv')

# Drop unnecessary columns
df.drop(['Unnamed: 0', 'Checking account'], axis=1, inplace=True)

# Convert categorical variables to numerical form using label encoding
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Housing'] = le.fit_transform(df['Housing'])
df['Saving accounts'] = le.fit_transform(df['Saving accounts'])
df['Purpose'] = le.fit_transform(df['Purpose'])

# Split the dataset into features and target variable
X = df.drop(columns=['Risk'])
y = df['Risk']

# Anomaly detection using Isolation Forest before scaling
isolation_forest = IsolationForest(contamination=0.1, random_state=42)
outlier_preds = isolation_forest.fit_predict(X)

# Remove outliers
X_cleaned = X[outlier_preds == 1]
y_cleaned = y[outlier_preds == 1]

# Applying Robust Scaling after removing outliers
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_cleaned)

# Perform feature selection using information gain
selector = SelectKBest(score_func=mutual_info_classif, k=5)  # Select top 5 features
X_selected = selector.fit_transform(X_scaled, y_cleaned)

# Display selected feature names
selected_feature_names = X.columns[selector.get_support(indices=True)]
print("Selected Features:", selected_feature_names)

Selected Features: Index(['Age', 'Sex', 'Saving accounts', 'Duration', 'Purpose'], dtype='object')




# 3.2. German Main Code

In [5]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from imblearn.combine import SMOTEENN
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from keras.models import Sequential
from keras.layers import Dense, LSTM
from sklearn.preprocessing import LabelEncoder

# Load the German Credit Risk dataset
df = pd.read_csv('german_credit_data.csv')

# Drop unnecessary columns
df.drop(['Unnamed: 0', 'Job','Housing','Checking account','Credit amount', 'Risk'], axis=1, inplace=True)

# Convert categorical variables to numerical form using label encoding
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
#df['Housing'] = le.fit_transform(df['Housing'])
#df['Risk'] = le.fit_transform(df['Risk'])
df['Saving accounts'] = le.fit_transform(df['Saving accounts'])
#df['Checking account'] = le.fit_transform(df['Checking account'])
df['Purpose'] = le.fit_transform(df['Purpose'])

# Split the dataset into input and output variables
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Rescale data using RobustScaler
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Handle class imbalance using SMOTEENN
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_scaled, y)

# Convert data to 3D for LSTM input
X_reshaped = X_resampled.reshape(X_resampled.shape[0], 1, X_resampled.shape[1])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y_resampled, test_size=0.2, random_state=42)

# Define LSTM architecture with multiple layers
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(units=40, return_sequences=True))
model.add(LSTM(units=30, return_sequences=True))
model.add(LSTM(units=20))  # Last layer doesn't need return_sequences=True
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

# Print accuracy and loss of each epoch
for epoch, acc in enumerate(history.history['accuracy']):
    print("Epoch", epoch+1, "- Accuracy:", acc, "- Loss:", history.history['loss'][epoch])

# Get predicted probabilities
y_pred_prob = model.predict(X_test)

# Convert probabilities to class labels
y_pred = (y_pred_prob > 0.5).astype(int)

# Convert y_test to binary labels
y_test_binary = (y_test > 0).astype(int)

# Calculate accuracy
overall_accuracy = accuracy_score(y_test_binary, y_pred)
print("Overall Accuracy:", overall_accuracy)

# Calculate F1 score
f1 = f1_score(y_test_binary, y_pred)
print("F1 Score:", f1)

# Calculate AUC score
auc = roc_auc_score(y_test_binary, y_pred_prob)
print("AUC Score:", auc)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: -25.509479522705078
Test Accuracy: 0.012578615918755531
Epoch 1 - Accuracy: 0.009842519648373127 - Loss: 0.6121997237205505
Epoch 2 - Accuracy: 0.003937007859349251 - Loss: 0.38130536675453186
Epoch 3 - Accuracy: 0.003937007859349251 - Loss: -0.12257333099842072
Epoch 4 - Accuracy: 0.003937007859349251 - Loss: -1.6974008083343506
Epoch 5 - Accuracy: 0.003937007859349251 - Loss: -6.393825054168701
Epoch 6 - Accuracy: 0.003937007859349251 - Loss: -12.540762901306152
Epoch 7 - Accuracy: 0.003937007859349251 - Loss: -17.228303909301758
Epoch 8 - Accuracy: 0.003937007859349251 - Loss: -20.30708122253418
Epoch 9 - Accuracy: 0.003937007859349251 - Loss: -22.353191375732422
Epoch 10 - Accuracy: 0.003937007859349251 - Loss: -23.8876895904541
Overall Accuracy: 0.8867924528301887
F1 Score: 0.9400000000000001
AUC Score: 0.4387312844759653
