In [279]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

I used the German Credit data, which has no missing values and 20 features. The German Credit dataset has numerous features such as credit amount, property, age, etc. It's target feature is Bad credit (2) and good credit(1). It contains 1000 data points. The data mainly has qualitative features, so categorical encoding must be done. Since a logistic regression model is being used, the data is encoded with ordinal and one hot encoding techniques. Firstly, the data is put into a readable format and then encoded.

In [280]:
data_url = "/Users/trishanandakumar/Desktop/BURE/Datasets/statlog+german+credit+data/german.data"

columns = [
    'Checking Account Status', 'Duration', 'Credit History', 'Purpose', 'Credit Amount', 
    'Savings', 'Employment Since', 'Installment Rate', 'Personal Info', 
    'Debtors', 'Residence Since', 'Property', 'Age', 'Installment Plans', 
    'Housing', 'Existing Credits', 'Job', 'Liables', 'Telephone', 'Foreign', 
    'Target'
]

Attribute1 = {
    'A11': '< 0 DM',
    'A12': '0 <= ... < 200 DM',
    'A13': '>= 200 DM',
    'A14': 'no checking account'
}
Attribute3 = {
    'A30': 'no credits taken/all paid back duly',
    'A31': 'all credits paid back duly',
    'A32': 'existing credits paid back duly till now',
    'A33': 'delay in paying off in the past',
    'A34': 'critical account/other credits existing'
}
Attribute4 = {
    'A40': 'car (new)',
    'A41': 'car (used)',
    'A42': 'furniture/equipment',
    'A43': 'radio/television',
    'A44': 'domestic appliances',
    'A45': 'repairs',
    'A46': 'education',
    'A47': 'vacation',
    'A48': 'retraining',
    'A49': 'business',
    'A410': 'other'
}
Attribute6 = {
    'A61': '< 100 DM',
    'A62': '100 <= ... < 500 DM',
    'A63': '500 <= ... < 1000 DM',
    'A64': '>= 1000 DM',
    'A65': 'unknown/no savings account'
}
Attribute7 = {
    'A71': 'unemployed',
    'A72': '< 1 year',
    'A73': '1 <= ... < 4 years',
    'A74': '4 <= ... < 7 years',
    'A75': '>= 7 years'
}
Attribute9 = {
    'A91': 'male, divorced/separated',
    'A92': 'female, divorced/separated/married',
    'A93': 'male, single',
    'A94': 'male, married/widowed',
    'A95': 'female, single'
}
Attribute10 = {
    'A101': 'none',
    'A102': 'co-applicant',
    'A103': 'guarantor'
}
Attribute12 = {
    'A121': 'real estate',
    'A122': 'building society savings/life insurance',
    'A123': 'car or other',
    'A124': 'unknown / no property'
}
Attribute14 = {
    'A141': 'bank',
    'A142': 'stores',
    'A143': 'none'
}
Attribute15 = {
    'A151': 'rent',
    'A152': 'own',
    'A153': 'for free'
}
Attribute17 = {
    'A171': 'unemployed/unskilled - non-resident',
    'A172': 'unskilled - resident',
    'A173': 'skilled employee/official',
    'A174': 'management/self-employed/highly qualified'
}
Attribute19 = {
    'A191': 'none',
    'A192': 'yes, registered'
}
Attribute20 = {
    'A201': 'yes',
    'A202': 'no'
}


german_data = pd.read_csv(data_url, sep=' ', header=None, names=columns)

german_data['Checking Account Status'] = german_data['Checking Account Status'].map(Attribute1) 
german_data['Credit History'] = german_data['Credit History'].map(Attribute3)
german_data['Purpose'] = german_data['Purpose'].map(Attribute4)
german_data['Savings'] = german_data['Savings'].map(Attribute6)
german_data['Employment Since'] = german_data['Employment Since'].map(Attribute7)
german_data['Personal Info'] = german_data['Personal Info'].map(Attribute9)
german_data['Debtors'] = german_data['Debtors'].map(Attribute10)
german_data['Property'] = german_data['Property'].map(Attribute12)
german_data['Installment Plans'] = german_data['Installment Plans'].map(Attribute14)
german_data['Housing'] = german_data['Housing'].map(Attribute15)
german_data['Job'] = german_data['Job'].map(Attribute17)
german_data['Telephone'] = german_data['Telephone'].map(Attribute19)    
german_data['Foreign'] = german_data['Foreign'].map(Attribute20)


pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

german_data.head(5)
#german_data.columns


Unnamed: 0,Checking Account Status,Duration,Credit History,Purpose,Credit Amount,Savings,Employment Since,Installment Rate,Personal Info,Debtors,Residence Since,Property,Age,Installment Plans,Housing,Existing Credits,Job,Liables,Telephone,Foreign,Target
0,< 0 DM,6,critical account/other credits existing,radio/television,1169,unknown/no savings account,>= 7 years,4,"male, single",none,4,real estate,67,none,own,2,skilled employee/official,1,"yes, registered",yes,1
1,0 <= ... < 200 DM,48,existing credits paid back duly till now,radio/television,5951,< 100 DM,1 <= ... < 4 years,2,"female, divorced/separated/married",none,2,real estate,22,none,own,1,skilled employee/official,1,none,yes,2
2,no checking account,12,critical account/other credits existing,education,2096,< 100 DM,4 <= ... < 7 years,2,"male, single",none,3,real estate,49,none,own,1,unskilled - resident,2,none,yes,1
3,< 0 DM,42,existing credits paid back duly till now,furniture/equipment,7882,< 100 DM,4 <= ... < 7 years,2,"male, single",guarantor,4,building society savings/life insurance,45,none,for free,1,skilled employee/official,2,none,yes,1
4,< 0 DM,24,delay in paying off in the past,car (new),4870,< 100 DM,1 <= ... < 4 years,3,"male, single",none,4,unknown / no property,53,none,for free,2,skilled employee/official,2,none,yes,2


Below the one hot encoder is being

In [282]:
#print("Unique values in Personal Info column:")
#print(german_data['Personal Info'].value_counts())

one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
to_encode_columns = ['Purpose', 'Personal Info', 'Debtors', 'Job', 'Housing', 'Property', 'Installment Plans','Telephone','Foreign']

encoded_val = one_hot_encoder.fit_transform(german_data[to_encode_columns])
feature_names = one_hot_encoder.get_feature_names_out(to_encode_columns)

onehot_df = pd.DataFrame(encoded_val, columns=feature_names, index=german_data.index)

german_data = german_data.drop(to_encode_columns, axis=1)
german_data = pd.concat([german_data, onehot_df], axis=1)


Unnamed: 0,Checking Account Status,Duration,Credit History,Credit Amount,Savings,Employment Since,Installment Rate,Residence Since,Age,Existing Credits,Liables,Target,Purpose_car (new),Purpose_car (used),Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_other,Purpose_radio/television,Purpose_repairs,Purpose_retraining,"Personal Info_male, divorced/separated","Personal Info_male, married/widowed","Personal Info_male, single",Debtors_guarantor,Debtors_none,Job_skilled employee/official,Job_unemployed/unskilled - non-resident,Job_unskilled - resident,Housing_own,Housing_rent,Property_car or other,Property_real estate,Property_unknown / no property,Installment Plans_none,Installment Plans_stores,"Telephone_yes, registered",Foreign_yes
0,< 0 DM,6,critical account/other credits existing,1169,unknown/no savings account,>= 7 years,4,4,67,2,1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
1,0 <= ... < 200 DM,48,existing credits paid back duly till now,5951,< 100 DM,1 <= ... < 4 years,2,2,22,1,1,2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,no checking account,12,critical account/other credits existing,2096,< 100 DM,4 <= ... < 7 years,2,3,49,1,2,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,< 0 DM,42,existing credits paid back duly till now,7882,< 100 DM,4 <= ... < 7 years,2,4,45,1,2,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,< 0 DM,24,delay in paying off in the past,4870,< 100 DM,1 <= ... < 4 years,3,4,53,2,2,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


In [283]:
ord_encoder = OrdinalEncoder()

encoded_val_2 = ord_encoder.fit_transform(german_data[['Checking Account Status', 'Credit History']])

german_data[['Checking Account Status', 'Credit History']] = pd.DataFrame(
    encoded_val_2,
    columns=['Checking Account Status', 'Credit History'],
    index=german_data.index
)

savings_category = [['unknown/no savings account', '< 100 DM', '100 <= ... < 500 DM', 
                      '500 <= ... < 1000 DM', '>= 1000 DM']]

savings_encoder = OrdinalEncoder(categories=savings_category)
german_data['Savings'] = savings_encoder.fit_transform(german_data[['Savings']])

employment_category = [['unemployed', '< 1 year', '1 <= ... < 4 years','4 <= ... < 7 years', '>= 7 years']]
employment_encoder = OrdinalEncoder(categories=employment_category)
german_data['Employment Since'] = employment_encoder.fit_transform(german_data[['Employment Since']])


In [284]:
german_data.head(5)

Unnamed: 0,Checking Account Status,Duration,Credit History,Credit Amount,Savings,Employment Since,Installment Rate,Residence Since,Age,Existing Credits,Liables,Target,Purpose_car (new),Purpose_car (used),Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_other,Purpose_radio/television,Purpose_repairs,Purpose_retraining,"Personal Info_male, divorced/separated","Personal Info_male, married/widowed","Personal Info_male, single",Debtors_guarantor,Debtors_none,Job_skilled employee/official,Job_unemployed/unskilled - non-resident,Job_unskilled - resident,Housing_own,Housing_rent,Property_car or other,Property_real estate,Property_unknown / no property,Installment Plans_none,Installment Plans_stores,"Telephone_yes, registered",Foreign_yes
0,1.0,6,1.0,1169,0.0,4.0,4,4,67,2,1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
1,0.0,48,3.0,5951,1.0,2.0,2,2,22,1,1,2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,3.0,12,1.0,2096,1.0,3.0,2,3,49,1,2,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,1.0,42,3.0,7882,1.0,3.0,2,4,45,1,2,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1.0,24,2.0,4870,1.0,2.0,3,4,53,2,2,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
