# import and packages

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

train = pd.read_csv('Wilson_train.csv', low_memory = False, index_col = False)
test = pd.read_csv('Wilson_test.csv', low_memory = False, index_col = False)

# (training) cleaning null and strange value

In [4]:
# column 2 3 8 9 25
train = train.dropna()

# column 2 (only for training data)
train = train[train['Date_Of_Disbursement'].str.len() < 10]

# column 3
train = train[(train.iloc[:, 2] != '0')]

# column 6 (only for training data)
train.iloc[:, 5] = train.iloc[:, 5].str[:4]
train['Year_Of_Commitment '] = pd.to_numeric(train['Year_Of_Commitment '])

# column 7
train.iloc[:, 6] = train.iloc[:, 6].str[3:]
train['Guaranteed_Approved _Loan'] = pd.to_numeric(train['Guaranteed_Approved _Loan'])

# column 9
train = train[(train['Low_Documentation_Loan'] == 'Yes') | (train['Low_Documentation_Loan'] == 'No')]

# column 12
train.iloc[:, 11] = train.iloc[:, 11].str[3:]
train['ChargedOff_Amount '] = pd.to_numeric(train['ChargedOff_Amount '])

# column 18
train.iloc[:, 17] = train.iloc[:, 17].str[3:]
train['Loan_Approved_Gross'] = pd.to_numeric(train['Loan_Approved_Gross'])

# column 19
train.iloc[:, 18] = train.iloc[:, 18].str[3:]
train['Gross_Amount_Disbursed  '] = pd.to_numeric(train['Gross_Amount_Disbursed  '])

# column 23
train.iloc[:, 22] = train.iloc[:, 22] > 0
train['Code_Franchise'] = train['Code_Franchise']*1

# column 25
train['Revolving_Credit_Line'] = train['Revolving_Credit_Line'].str.replace('0' , 'No')
train['Revolving_Credit_Line'] = train['Revolving_Credit_Line'].str.replace('1' , 'Yes')
train['Revolving_Credit_Line'] = train['Revolving_Credit_Line'].str.replace('T' , 'Yes')
train = train[(train['Revolving_Credit_Line'] == 'Yes') | (train['Revolving_Credit_Line'] == 'No')]

# (training) new column: day difference = disbursement date - commitment date

In [7]:
# function for date difference
def days_difference(d1, d2):
    d1 = datetime.strptime(d1, '%d-%b-%y')
    d2 = datetime.strptime(d2, '%d-%b-%y')
    return abs((d2 - d1).days)

# creating new variable days difference
days_diff_train = [None]*train.shape[0]
for i in range(0, train.shape[0]):
    commit_date = train['Commitment_Date'].iloc[i]
    disburse_date = train['Date_Of_Disbursement'].iloc[i]
    days_diff_train[i] = days_difference(commit_date, disburse_date)

# add the new column to the original dataset
train['Days_Difference'] = days_diff_train
columns = list(train.columns)
columns[-2], columns[-1] = columns[-1], columns[-2]
train = train[columns]

# (training) converting categorical variable to numeric variable

In [9]:
# convert 'Date_Of_Disbursement' to days since the earliest
train['Date_Of_Disbursement'] = pd.to_datetime(train['Date_Of_Disbursement'], format='%d-%b-%y')
earliest_date = train['Date_Of_Disbursement'].min()
train['Date_Of_Disbursement'] = (train['Date_Of_Disbursement'] - earliest_date).dt.days

# convert 'Business' to categorical variable
train['Business'] = pd.Categorical(train['Business'])

# convert 'Low_Documentation_Loan' to categorical variable
train['Low_Documentation_Loan'] = pd.Categorical(train['Low_Documentation_Loan'])

# convert 'Demography' to categorical variable
train['Demography'] = pd.Categorical(train['Demography'])

# convert 'State_Of_Bank' to categorical variable
train['State_Of_Bank'] = pd.Categorical(train['State_Of_Bank'])

# convert 'Borrower_State' to categorical variable
train['Borrower_State'] = pd.Categorical(train['Borrower_State'])

# convert 'Commitment_Date' to days since the earliest
train['Commitment_Date'] = pd.to_datetime(train['Commitment_Date'], format='%d-%b-%y')
earliest_date = train['Commitment_Date'].min()
train['Commitment_Date'] = (train['Commitment_Date'] - earliest_date).dt.days

# convert 'Revolving_Credit_Line' to categorical variable
train['Revolving_Credit_Line'] = pd.Categorical(train['Revolving_Credit_Line'])

# (training) discard unwanted column

In [13]:
# drop columns
train = train.drop(['ID', 'Borrower_Name ', 'Borrower_City', 'Gross_Amount_Balance', 
                    'Classification_Code ', 'Primary_Loan_Digit', 'Name_Of_Bank'], axis=1)

# (testing) managing null and strange value

In [16]:
# column 7
test.iloc[:, 6] = test.iloc[:, 6].str[3:]
test['Guaranteed_Approved _Loan'] = pd.to_numeric(test['Guaranteed_Approved _Loan'])

# column 12
test.iloc[:, 11] = test.iloc[:, 11].str[3:]
test['ChargedOff_Amount '] = pd.to_numeric(test['ChargedOff_Amount '])

# column 18
test.iloc[:, 17] = test.iloc[:, 17].str[3:]
test['Loan_Approved_Gross'] = pd.to_numeric(test['Loan_Approved_Gross'])

# column 19
test.iloc[:, 18] = test.iloc[:, 18].str[3:]
test['Gross_Amount_Disbursed  '] = pd.to_numeric(test['Gross_Amount_Disbursed  '])

# column 23
test.iloc[:, 22] = test.iloc[:, 22] > 0
test['Code_Franchise'] = test['Code_Franchise']*1

# column 25
test['Revolving_Credit_Line'] = test['Revolving_Credit_Line'].str.replace('0' , 'No')
test['Revolving_Credit_Line'] = test['Revolving_Credit_Line'].str.replace('1' , 'Yes')
test['Revolving_Credit_Line'] = test['Revolving_Credit_Line'].str.replace('T' , 'Yes')

# (testing) new column: day difference = disbursement date - commitment date

In [19]:
# function for date difference
def days_difference(d1, d2):
    try:
        if isinstance(d1, float):
            d1 = datetime.fromordinal(int(d1))
        else:
            d1 = datetime.strptime(str(d1), '%d-%b-%y')
        if isinstance(d2, float):
            d2 = datetime.fromordinal(int(d2))
        else:
            d2 = datetime.strptime(str(d2), '%d-%b-%y')
        return abs((d2 - d1).days)
    except ValueError:
        return None

# creating new variable days difference
days_diff_test = [None]*test.shape[0]
for i in range(0, test.shape[0]):
    commit_date = test['Commitment_Date'].iloc[i]
    disburse_date = test['Date_Of_Disbursement'].iloc[i]
    days_diff_test[i] = days_difference(commit_date, disburse_date)

# add the new column to the original dataset
test['Days_Difference'] = days_diff_test
columns = list(test.columns)
columns[-2], columns[-1] = columns[-1], columns[-2]
test = test[columns]

# (testing) converting categorical variable to numeric variable

In [22]:
# convert 'Date_Of_Disbursement' to days since the earliest
test['Date_Of_Disbursement'] = pd.to_datetime(test['Date_Of_Disbursement'], format='%d-%b-%y')
earliest_date = test['Date_Of_Disbursement'].min()
test['Date_Of_Disbursement'] = (test['Date_Of_Disbursement'] - earliest_date).dt.days

# convert 'Business' to categorical variable
test['Business'] = pd.Categorical(test['Business'])

# convert 'Low_Documentation_Loan' to categorical variable
test['Low_Documentation_Loan'] = pd.Categorical(test['Low_Documentation_Loan'])

# convert 'Demography' to categorical variable
test['Demography'] = pd.Categorical(test['Demography'])

# convert 'State_Of_Bank' to categorical variable
test['State_Of_Bank'] = pd.Categorical(test['State_Of_Bank'])

# convert 'Borrower_State' to categorical variable
test['Borrower_State'] = pd.Categorical(test['Borrower_State'])

# convert 'Commitment_Date' to days since the earliest
test['Commitment_Date'] = pd.to_datetime(test['Commitment_Date'], format='%d-%b-%y')
earliest_date = test['Commitment_Date'].min()
test['Commitment_Date'] = (test['Commitment_Date'] - earliest_date).dt.days

# convert 'Revolving_Credit_Line' to categorical variable
test['Revolving_Credit_Line'] = pd.Categorical(test['Revolving_Credit_Line'])

# (testing) discard unwanted column

In [25]:
# drop columns
test = test.drop(['ID', 'Borrower_Name ', 'Borrower_City', 'Gross_Amount_Balance', 
                    'Classification_Code ', 'Primary_Loan_Digit', 'Name_Of_Bank'], axis=1)

# GA-BPNN 1

In [28]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, classification_report
from sklearn.neural_network import MLPClassifier
from deap import base, creator, tools, algorithms
import random

# 1. Data Preparation

# Define feature columns
numeric_features = ['Jobs_Reatained', 'Jobs_Created ', 'Year_Of_Commitment ', 'Guaranteed_Approved _Loan',
                    'ChargedOff_Amount ', 'Count_Employees', 'Loan_Approved_Gross', 'Gross_Amount_Disbursed  ',
                    'Loan_Term', 'Code_Franchise', 'Days_Difference']

categorical_features = ['Business', 'Low_Documentation_Loan', 'Demography', 'State_Of_Bank', 'Borrower_State',
                        'Revolving_Credit_Line']

date_features = ['Date_Of_Disbursement', 'Commitment_Date']

# Create preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Convert date features to numeric (number of days since a reference date)
reference_date = pd.to_datetime('1970-01-01')

for date_col in date_features:
    train[date_col] = pd.to_datetime(train[date_col])
    test[date_col] = pd.to_datetime(test[date_col])
    
    train[f'{date_col}_days'] = (train[date_col] - reference_date).dt.days
    test[f'{date_col}_days'] = (test[date_col] - reference_date).dt.days
    
    numeric_features.append(f'{date_col}_days')

# Prepare the data
X_train = preprocessor.fit_transform(train)
X_test = preprocessor.transform(test)

y_train = train['Default'].values
y_test = test['Default'].values

# 2. Define GA-BPNN

def create_mlp(hidden_layer_sizes):
    return MLPClassifier(hidden_layer_sizes=hidden_layer_sizes,
                         max_iter=1000,
                         early_stopping=True,
                         random_state=42)

def evaluate_mlp(hidden_layer_sizes):
    mlp = create_mlp(hidden_layer_sizes)
    mlp.fit(X_train, y_train)
    y_pred = mlp.predict(X_test)
    return f1_score(y_test, y_pred),

# Setup genetic algorithm
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("attr_int", random.randint, 1, 100)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_int, n=3)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("evaluate", evaluate_mlp)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutUniformInt, low=1, up=100, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)

# 3. Run Genetic Algorithm with progress tracking
def print_stats(gen, population, fits):
    length = len(population)
    mean = sum(fits) / length
    sum2 = sum(x*x for x in fits)
    std = abs(sum2 / length - mean**2)**0.5
    print(f"Gen {gen}: Max = {max(fits):.4f}, Avg = {mean:.4f}, Std = {std:.4f}")

population = toolbox.population(n=50)
ngen = 10
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("max", max)

for gen in range(ngen):
    offspring = algorithms.varAnd(population, toolbox, cxpb=0.5, mutpb=0.2)
    fits = toolbox.map(toolbox.evaluate, offspring)
    for fit, ind in zip(fits, offspring):
        ind.fitness.values = fit
    population = toolbox.select(offspring, k=len(population))
    record = stats.compile(population)
    print_stats(gen, population, [ind.fitness.values[0] for ind in population])

# Get the best individual
best_ind = tools.selBest(population, k=1)[0]
print(f"\nBest hidden layer sizes: {best_ind}")

# 4. Train and evaluate the best model
best_mlp = create_mlp(best_ind)
best_mlp.fit(X_train, y_train)

# Handle missing values in test set
X_test_clean = np.nan_to_num(X_test, nan=0)  # Replace NaN with 0

y_pred = best_mlp.predict(X_test_clean)

# 5. Print results
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nF1 Score:", f1_score(y_test, y_pred))

Gen 0: Max = 0.9865, Avg = 0.9853, Std = 0.0005
Gen 1: Max = 0.9866, Avg = 0.9858, Std = 0.0004
Gen 2: Max = 0.9868, Avg = 0.9861, Std = 0.0004
Gen 3: Max = 0.9868, Avg = 0.9864, Std = 0.0004
Gen 4: Max = 0.9868, Avg = 0.9866, Std = 0.0002
Gen 5: Max = 0.9871, Avg = 0.9867, Std = 0.0003
Gen 6: Max = 0.9871, Avg = 0.9868, Std = 0.0003
Gen 7: Max = 0.9871, Avg = 0.9870, Std = 0.0002
Gen 8: Max = 0.9871, Avg = 0.9871, Std = 0.0001
Gen 9: Max = 0.9871, Avg = 0.9871, Std = 0.0000

Best hidden layer sizes: [14, 64, 7]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     15161
           1       0.98      0.99      0.99      5839

    accuracy                           0.99     21000
   macro avg       0.99      0.99      0.99     21000
weighted avg       0.99      0.99      0.99     21000


F1 Score: 0.9870704321197686
