In [39]:
import pandas as pd
import numpy as np
import aif360
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Read german_processed.csv (path is relative to the current working directory)
# into the frame that every later cell works on.
df = pd.read_csv(filepath_or_buffer='german_processed.csv')


In [40]:
# Display the frame exactly as loaded (Gender is still 'Male'/'Female' and
# GoodCustomer still uses -1/1 at this point); pandas renders a truncated preview.
df

Unnamed: 0,GoodCustomer,Gender,ForeignWorker,Single,Age,LoanDuration,PurposeOfLoan,LoanAmount,LoanRateAsPercentOfIncome,YearsAtCurrentHome,...,OtherLoansAtBank,OtherLoansAtStore,HasCoapplicant,HasGuarantor,OwnsHouse,RentsHouse,Unemployed,YearsAtCurrentJob_lt_1,YearsAtCurrentJob_geq_4,JobClassIsSkilled
0,1,Male,0,1,67,6,Electronics,1169,4,4,...,0,0,0,0,1,0,0,0,1,1
1,-1,Female,0,0,22,48,Electronics,5951,2,2,...,0,0,0,0,1,0,0,0,0,1
2,1,Male,0,1,49,12,Education,2096,2,3,...,0,0,0,0,1,0,0,0,1,0
3,1,Male,0,1,45,42,Furniture,7882,2,4,...,0,0,0,1,0,0,0,0,1,1
4,-1,Male,0,1,53,24,NewCar,4870,3,4,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,Female,0,0,31,12,Furniture,1736,3,4,...,0,0,0,0,1,0,0,0,1,0
996,1,Male,0,0,40,30,UsedCar,3857,4,4,...,0,0,0,0,1,0,0,0,0,1
997,1,Male,0,1,38,12,Electronics,804,4,4,...,0,0,0,0,1,0,0,0,1,1
998,-1,Male,0,1,23,45,Electronics,1845,4,4,...,0,0,0,0,0,0,0,0,0,1


In [41]:
# Recode Gender ('Male' -> 1, 'Female' -> 0) and the target (-1 -> 0), then
# discard any row containing a missing value and renumber the index from 0.
df = (
    df.assign(
        Gender=df['Gender'].replace({'Male': 1, 'Female': 0}),
        GoodCustomer=df['GoodCustomer'].replace({-1: 0}),
    )
    .dropna()
    .reset_index(drop=True)
)

In [42]:
# Spot-check the first two rows after recoding: Gender and GoodCustomer should
# now hold 0/1 values instead of strings / -1.
df.head(2)

Unnamed: 0,GoodCustomer,Gender,ForeignWorker,Single,Age,LoanDuration,PurposeOfLoan,LoanAmount,LoanRateAsPercentOfIncome,YearsAtCurrentHome,...,OtherLoansAtBank,OtherLoansAtStore,HasCoapplicant,HasGuarantor,OwnsHouse,RentsHouse,Unemployed,YearsAtCurrentJob_lt_1,YearsAtCurrentJob_geq_4,JobClassIsSkilled
0,1,1,0,1,67,6,Electronics,1169,4,4,...,0,0,0,0,1,0,0,0,1,1
1,0,0,0,0,22,48,Electronics,5951,2,2,...,0,0,0,0,1,0,0,0,0,1


In [43]:
from sklearn.preprocessing import LabelEncoder

# Text-valued columns that must become integer codes.
string_cols = ['PurposeOfLoan']

# A single encoder instance is reused for every listed column, so after the
# loop `le` only remembers the classes of the last column processed.
le = LabelEncoder()

# Learn the distinct values of each column, then overwrite the column with
# the corresponding integer codes.
# NOTE(review): 'PurposeOfLoan' is also listed in cat_cols and one-hot encoded
# in the feature-building cell, so the one-hot feature names carry these
# integer codes rather than the original purpose strings - confirm intended.
for col in string_cols:
    le.fit(df[col])
    df[col] = le.transform(df[col])

In [44]:
# List the column labels of the cleaned frame; the feature-building cell
# refers to these names when choosing categorical and numeric columns.
df.columns

Index(['GoodCustomer', 'Gender', 'ForeignWorker', 'Single', 'Age',
       'LoanDuration', 'PurposeOfLoan', 'LoanAmount',
       'LoanRateAsPercentOfIncome', 'YearsAtCurrentHome',
       'NumberOfOtherLoansAtBank', 'NumberOfLiableIndividuals', 'HasTelephone',
       'CheckingAccountBalance_geq_0', 'CheckingAccountBalance_geq_200',
       'SavingsAccountBalance_geq_100', 'SavingsAccountBalance_geq_500',
       'MissedPayments', 'NoCurrentLoan', 'CriticalAccountOrLoansElsewhere',
       'OtherLoansAtBank', 'OtherLoansAtStore', 'HasCoapplicant',
       'HasGuarantor', 'OwnsHouse', 'RentsHouse', 'Unemployed',
       'YearsAtCurrentJob_lt_1', 'YearsAtCurrentJob_geq_4',
       'JobClassIsSkilled'],
      dtype='object')

In [45]:
data = df

# Split data into features and target.
X_raw = data.drop('GoodCustomer', axis=1)
y = data['GoodCustomer']

# Columns that are one-hot encoded.
cat_cols = ['Gender', 'ForeignWorker', 'Single', 'PurposeOfLoan',
            'HasTelephone', 'CheckingAccountBalance_geq_0', 'CheckingAccountBalance_geq_200',
            'SavingsAccountBalance_geq_100', 'SavingsAccountBalance_geq_500', 'NoCurrentLoan',
            'CriticalAccountOrLoansElsewhere', 'HasCoapplicant', 'HasGuarantor', 'OwnsHouse',
            'RentsHouse', 'Unemployed', 'YearsAtCurrentJob_lt_1', 'YearsAtCurrentJob_geq_4', 'JobClassIsSkilled']

# Columns that are standardised (zero mean, unit variance).
num_cols = ['Age', 'LoanDuration', 'LoanAmount', 'LoanRateAsPercentOfIncome',
            'YearsAtCurrentHome', 'NumberOfOtherLoansAtBank', 'NumberOfLiableIndividuals',
            'MissedPayments', 'OtherLoansAtBank', 'OtherLoansAtStore']

# Choose the train/test partition BEFORE fitting any transformer. Fitting the
# scaler/encoder on all rows (as this cell used to) leaks test-set statistics
# into the training features. Splitting positional row indices with the same
# test_size / random_state is intended to select the same rows as splitting
# the feature matrix directly.
row_positions = np.arange(len(X_raw))
train_idx, test_idx = train_test_split(row_positions, test_size=0.3, random_state=42)
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Fit both transformers on the training rows only.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_raw.iloc[train_idx][cat_cols])
scaler = StandardScaler()
scaler.fit(X_raw.iloc[train_idx][num_cols])

# Apply the train-fitted transformers to every row. X_cat / X_num / X keep
# their original meaning (full-dataset arrays, numeric columns first).
X_cat = enc.transform(X_raw[cat_cols]).toarray()
X_num = scaler.transform(X_raw[num_cols])
X = np.concatenate((X_num, X_cat), axis=1)

# NOTE: class balancing (RandomOverSampler) is currently disabled. If it is
# re-enabled, resample the training split only, never the test split.

# Materialise the two splits from the shared partition.
X_train, X_test = X[train_idx], X[test_idx]

# Cheap invariants: rows and labels line up, and no row was lost or duplicated.
assert X_train.shape[0] == len(y_train) and X_test.shape[0] == len(y_test)
assert X_train.shape[0] + X_test.shape[0] == len(data)

# Save training and testing data
np.save('X_train.npy', X_train)
np.save('y_train.npy', y_train)
np.save('X_test.npy', X_test)
np.save('y_test.npy', y_test)

# Column names in the same order as the columns of X: numeric first, then the
# encoder's generated one-hot names.
feature_names = num_cols + enc.get_feature_names_out().tolist()
assert len(feature_names) == X.shape[1]
np.save("feature_names.npy", np.array(feature_names))

In [35]:
# Look up the name at position 11 of the combined feature list (the 10 numeric
# column names come first, followed by the one-hot names).
# NOTE(review): this cell's execution count (In [35]) is lower than that of the
# cell defining feature_names (In [45]); re-run after a kernel restart.
feature_names[11]

'Gender_1'

In [36]:
# Number of training labels.
# NOTE(review): the recorded output (800,) looks stale - the split cell uses
# test_size=0.3 and the frame shown elsewhere has 1000 rows, which would give
# 700 training rows. This cell (In [36]) ran before the current split cell
# (In [45]); re-run to refresh.
y_train.shape

(800,)

In [37]:
# Total of the training labels; with the target recoded to 0/1 this equals the
# number of GoodCustomer == 1 rows (assuming the raw target only held -1 and 1).
# NOTE(review): executed as In [37], before the current split cell (In [45]),
# so the recorded value may be stale.
y_train.sum()

559

In [26]:
# Inspect one of the columns that is later one-hot encoded; the recorded output
# shows an int64 column of length 1000.
# NOTE(review): execution count In [26] is out of order relative to the cells
# above; re-run after a kernel restart so outputs reflect the current state.
df['CriticalAccountOrLoansElsewhere']

0      1
1      0
2      1
3      0
4      0
      ..
995    0
996    0
997    0
998    0
999    1
Name: CriticalAccountOrLoansElsewhere, Length: 1000, dtype: int64