In [None]:
# Reduce the scaled cadastral data to three dimensions with PCA.
# NOTE(review): another cell drops a 'fraud' column from base_cadastral_scaled, so the
# target may be part of what is projected here - confirm that this is intended.
from sklearn.decomposition import PCA
pca_model = PCA(n_components=3)
base_cadastral_pca = pca_model.fit_transform(base_cadastral_scaled)
print(base_cadastral_pca.shape)

# Same three-dimensional reduction with UMAP (no random_state is passed).
import umap
umap_model = umap.UMAP(n_components=3)
base_cadastral_umap = umap_model.fit_transform(base_cadastral_scaled)
print(base_cadastral_umap.shape)

In [None]:
# Draw one 3-D scatter per embedding (PCA first, then UMAP), coloured by c_column.
for embedding, title in ((base_cadastral_pca, 'PCA'), (base_cadastral_umap, 'UMAP')):
    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(111, projection='3d')
    xs, ys, zs = embedding[:, 0], embedding[:, 1], embedding[:, 2]
    ax.scatter(xs, ys, zs, c=c_column, cmap='coolwarm')
    plt.title(title)
    plt.show()

In [None]:
# Score every feature against 'fraud' with a chi-squared test and keep the two best,
# to show that ddd and cep_2_dig are not good features.
from sklearn.feature_selection import SelectKBest, chi2
y = base_cadastral_scaled['fraud']
X = base_cadastral_scaled.drop(columns=['fraud'])
chi2_selector = SelectKBest(score_func=chi2, k=2)
X_kbest = chi2_selector.fit_transform(X, y)
print(X_kbest.shape)
# Column positions (within X) of the two selected features.
print(chi2_selector.get_support(indices=True))

In [None]:
def is_fraud_dependent_of_the_data_exclusive_to_pagamentos_table(info, pagamentos):
  """Test whether fraud is associated with payments whose key is missing from ``info``.

  Every row of ``pagamentos`` is flagged as coherent (1) when its
  (id_cliente, safra_ref) pair also occurs in ``info`` and 0 otherwise. A
  chi-squared test of independence is then run on the ``fraud`` column against
  that flag. The conclusion is printed, and the function returns whether
  p < 0.05. Neither input frame is modified.
  """
  from scipy.stats import chi2_contingency

  def pair_key(frame):
    # One string per row combining both key columns.
    return frame['id_cliente'].astype(str) + '_' + frame['safra_ref'].astype(str)

  payments = pagamentos.copy()
  payments['id_cliente_safra_ref'] = pair_key(payments)
  payments['coherent'] = payments['id_cliente_safra_ref'].isin(pair_key(info)).astype(int)

  # Chi-squared test of fraud vs. the coherent flag; only the p-value is used.
  _, p, _, _ = chi2_contingency(pd.crosstab(payments['fraud'], payments['coherent']))
  dependent = p < 0.05

  if dependent:
    print('The fraud is dependent of the data in pagamentos where its key does not appear in the info table.\n' +
          'This data should be kept in the pagamentos table')
  else:
    print('The fraud is independent of the data in pagamentos where its key does not appear in the info table\n' +
          'This data can be removed from the pagamentos table')
  return dependent

def clean_pagamentos_table(info, pagamentos):
  """Keep only the payments whose (id_cliente, safra_ref) key exists in ``info``.

  Prints how many rows were excluded (count and percentage) and returns the
  filtered frame. The inputs are not modified.

  The function works on its arguments; it previously ignored them and read the
  notebook globals instead.
  """
  # Distinct keys only, so that a repeated key in `info` cannot multiply payment rows.
  info_keys = info[['id_cliente', 'safra_ref']].drop_duplicates()

  # Inner merge keeps only the payments with a matching key.
  pagamentos_coherent = pd.merge(pagamentos, info_keys, on=['id_cliente', 'safra_ref'], how='inner')

  # Count the number of excluded rows (guarding the percentage against an empty table).
  total_rows = len(pagamentos)
  excluded_rows = total_rows - len(pagamentos_coherent)
  excluded_pct = excluded_rows / total_rows * 100 if total_rows else 0.0
  print(f'Number of excluded rows: {excluded_rows}')
  print(f'Percentage of excluded rows: {excluded_pct:.2f}%')

  # Save the coherent DataFrame to a new CSV file
  # pagamentos_coherent.to_csv('base_pagamentos_desenvolvimento_coherent.csv', index=False)

  print('The pagamentos table was cleaned successfully')
  return pagamentos_coherent


# Drop the payments without a matching info key only when the chi-squared test says
# fraud does not depend on them.
if not is_fraud_dependent_of_the_data_exclusive_to_pagamentos_table(base_info_date, base_pagamentos_date):
  base_pagamentos_date = clean_pagamentos_table(base_info_date, base_pagamentos_date)

In [None]:
# Histogram (with a KDE overlay and a log-scaled count axis) of late_payment for
# payments between 20 days early and 20 days late, both ends included.
late_payment_window = base_pagamentos_date.loc[
    base_pagamentos_date['late_payment'].between(-20, 20), 'late_payment'
]
plt.figure(figsize=(10, 6))
# One-day bins centred on each whole day. np.arange excludes its stop value, so the stop
# has to be 21.5 for the last edge to be 20.5; with a stop of 20.5 the last edge was 19.5
# and payments exactly 20 days late were left out of every bin.
sns.histplot(late_payment_window, bins=np.arange(-20.5, 21.5, 1), kde=True, log_scale=(False, True))
plt.title('Late payments')
plt.xlabel('Days')
plt.ylabel('Frequency')
plt.show()


In [None]:
# Bar chart of the number of base_info_date rows per safra_ref, ordered by safra_ref.
(
    base_info_date['safra_ref']
    .value_counts()
    .sort_index()
    .plot(kind='bar', figsize=(10, 6))
)

In [None]:
from sheets.create_sheets import create_spreadsheet, upload_csv_to_sheet

# Create a new spreadsheet and upload each CSV file into its own sheet.
spreadsheet_id = create_spreadsheet('Test Spreadsheet')
for csv_path, sheet_name in (
    ('base_pagamentos_sorted_id_emissao_pagamento.csv', 'Sheet1'),
    ('base_pagamentos_drop_dupl.csv', 'Sheet2'),
):
    upload_csv_to_sheet(spreadsheet_id, csv_path, sheet_name)
# print the spreadsheet link
print(f"https://docs.google.com/spreadsheets/d/{spreadsheet_id}/edit")

In [None]:
# # line chart of the 50th percentile of transactions per client, transactions per fraudulent client, and fraudulent transactions per fraudulent client
# plt.figure(figsize=(10, 6))

# def concatenate_descriptions(dataframes, names):
#     descriptions = []
    
#     for df, name in zip(dataframes, names):
#         description = df.describe().to_frame().T
#         description.index = [name]
#         descriptions.append(description)
    
#     concatenated_descriptions = pd.concat(descriptions).T
#     return concatenated_descriptions

# dataframes = [transacoes_por_cliente, transacoes_por_cliente_fraudulento, fraudulent_transacoes_por_cliente_fraudulento]
# names = ['transacoes_por_cliente', 'transacoes_por_cliente_fraudulento', 'fraudulent_transacoes_por_cliente_fraudulento']
# transacoes_descriptions = concatenate_descriptions(dataframes, names)

# # plot the line graph
# sns.lineplot(data=transacoes_descriptions.loc[['25%', '50%', '75%'], :], dashes=False)
# plt.title('Number of transactions per client')
# plt.xlabel('Percentile')
# plt.ylabel('Number of transactions')
# plt.show()

# transacoes_descriptions.head(10)
# # base_pagamentos_date.groupby("id_cliente")["fraud"].sum()
# # base_pagamentos_date[base_pagamentos_date["id_cliente"] == 209314261782935157]['fraud']

In [None]:
# Clients that appear in base_info_date but not in base_pagamentos_date, and vice versa
clientes_info = base_info_date['id_cliente'].unique()
clientes_pagamentos = base_pagamentos_date['id_cliente'].unique()
# Membership is tested against sets: testing `cliente not in <array>` inside the
# comprehension rescans the whole array for every client, which is quadratic overall.
clientes_info_set = set(clientes_info)
clientes_pagamentos_set = set(clientes_pagamentos)
clientes_info_not_in_pagamentos = [cliente for cliente in clientes_info if cliente not in clientes_pagamentos_set]
clientes_pagamentos_not_in_info = [cliente for cliente in clientes_pagamentos if cliente not in clientes_info_set]
print(f'Number of clientes in base_info but not in base_pagamentos: {len(clientes_info_not_in_pagamentos)}')
print(f'Percentage of clientes in base_info but not in base_pagamentos: {len(clientes_info_not_in_pagamentos) / len(clientes_info) * 100:.2f}%\n')
print(f'Number of clientes in base_pagamentos but not in base_info: {len(clientes_pagamentos_not_in_info)}')
print(f'Percentage of clientes in base_pagamentos but not in base_info: {len(clientes_pagamentos_not_in_info) / len(clientes_pagamentos) * 100:.2f}%\n')

# percentage of transactions marked as fraudulent
print(f'Percentage of transactions marked as fraudulent: {base_pagamentos_date["fraud"].sum() / len(base_pagamentos_date) * 100:.2f}%')

# percentage of clients with at least one transaction marked as fraudulent
print(f'Percentage of clients marked as fraudulent at least once: {base_pagamentos_date.groupby("id_cliente")["fraud"].any().sum() / len(clientes_pagamentos) * 100:.2f}%\n')

# number of distinct clients in base_pagamentos_date
print(f'Number of clients: {len(clientes_pagamentos)}\n')

# how many transactions each client has in base_pagamentos_date
transacoes_por_cliente = base_pagamentos_date.groupby('id_cliente').size().sort_values(ascending=False)
print(f'Statistics of the number of transactions per client:\n{transacoes_por_cliente.describe()}\n')

# how many transactions each fraudulent client (at least one fraud) has in base_pagamentos_date
transacoes_por_cliente_fraudulento = base_pagamentos_date.groupby('id_cliente').agg({'fraud': 'any', 'id_cliente': 'count'})
transacoes_por_cliente_fraudulento.columns = ['fraud', 'transacoes']
transacoes_por_cliente_fraudulento = transacoes_por_cliente_fraudulento[transacoes_por_cliente_fraudulento['fraud'] == 1]['transacoes'].sort_values(ascending=False)
print(f'Statistics of the number of transactions per fraudulent client:\n{transacoes_por_cliente_fraudulento.describe()}\n')

# how many fraudulent transactions each fraudulent client has in base_pagamentos_date
fraudulent_transacoes_por_cliente_fraudulento = base_pagamentos_date[base_pagamentos_date['fraud'] == 1].groupby('id_cliente').size().sort_values(ascending=False)
print(f'Statistics of the number of fraudulent transactions per fraudulent client:\n{fraudulent_transacoes_por_cliente_fraudulento.describe()}\n')

# percentage of clients whose first transaction (earliest data_emissao_documento) is fraudulent
print(f"Percentage of the first transaction that is fraudulent: {base_pagamentos_date.sort_values(['id_cliente', 'data_emissao_documento']).drop_duplicates('id_cliente').fraud.mean() * 100:.2f}%\n")

In [None]:
# Running counters per client, in order of issue date:
#   loan_count             - position of the loan among all of the client's loans
#   loan_count_non_expired - position among the client's loans paid on or before the
#                            due date (NaN on rows that were not paid on time)
# NOTE(review): the original header called this the number of late payments, but the
# filter keeps rows with data_pagamento <= data_vencimento, i.e. the ones that are not late.
base_pagamentos_date_test = (
    base_pagamentos_date
    .sort_values(['id_cliente', 'data_emissao_documento'])
    .reset_index(drop=True)  # unique row labels, used below to attach the counter
)
base_pagamentos_date_test['loan_count'] = base_pagamentos_date_test.groupby('id_cliente').cumcount() + 1
paid_on_time = base_pagamentos_date_test['data_pagamento'] <= base_pagamentos_date_test['data_vencimento']
# .copy() so the new column is written to an independent frame, not to a filtered view.
df_non_expired = base_pagamentos_date_test[paid_on_time].copy()
df_non_expired['loan_count_non_expired'] = df_non_expired.groupby('id_cliente').cumcount() + 1
# Attach the counter by row label. Merging on (id_cliente, data_emissao_documento)
# multiplied rows whenever a client had several documents issued on the same date.
base_pagamentos_date_test = base_pagamentos_date_test.join(df_non_expired['loan_count_non_expired'])
# base_pagamentos_date_test['loan_count_non_expired'] = base_pagamentos_date_test['loan_count_non_expired'].fillna(0)
base_pagamentos_date_test.tail(40)

In [None]:
# This function will calculate the compounded monthly growth rate
def calculate_growth_rate(group):
    """Add a per-row ``monthly_growth`` column to one client's records.

    The rows are sorted by ``safra_ref``. For every row after the first,
    ``monthly_growth`` is the compounded monthly growth of
    ``renda_mes_anterior`` relative to the previous row::

        (current / previous) ** (1 / months_between) - 1

    The value is NaN for the first row, when both rows fall in the same month,
    and when the previous income is 0 (growth is undefined).

    The earlier loop assigned a scalar to the whole column on each iteration,
    so every row ended up with the value of the last pair, and a one-row group
    got no column at all.

    Parameters:
        group: DataFrame with ``safra_ref`` (values exposing ``.year`` and
            ``.month``) and ``renda_mes_anterior`` columns.

    Returns:
        A sorted copy of ``group`` with the ``monthly_growth`` column added.
    """
    group = group.sort_values(by='safra_ref')

    income = group['renda_mes_anterior'].astype(float)
    previous_income = income.shift(1)

    # Months on a continuous scale, so the difference between rows is the number of periods.
    month_number = group['safra_ref'].map(lambda date: date.year * 12 + date.month).astype(float)
    months_passed = month_number.diff()

    # Rows where the rate is defined; everything else is masked to NaN before dividing,
    # so no division by zero is ever evaluated.
    valid = (months_passed > 0) & (previous_income != 0)
    ratio = income / previous_income.where(valid)
    group['monthly_growth'] = ratio ** (1 / months_passed.where(valid)) - 1
    return group

# Group by client_id and compute the growth rate within each client's records.
# group_keys=False stops pandas from adding id_cliente as an extra index level of the
# result. Without it the name can end up as both an index level and a column, and the
# notebook's later groupby('id_cliente') calls on this frame become ambiguous.
grouped = base_info_date.groupby('id_cliente', group_keys=False)
base_info_date = grouped.apply(calculate_growth_rate)
base_info_date


In [None]:
# No repeats are expected, since this is a primary key.
# Check whether any id_cliente in base_info_date has a repeated safra_ref.
def check_repeated_dates(df):
    """Return whether some id_cliente has fewer distinct safra_ref values than rows."""
    per_client = df.groupby('id_cliente')
    distinct_dates = per_client['safra_ref'].nunique()
    rows_per_client = per_client.size()
    # Fewer distinct dates than rows means at least one date occurs more than once.
    return distinct_dates.lt(rows_per_client).any()

# Report the outcome of the repeated-date check for base_info_date.
print(
    'There are repeated dates for some id_cliente in base_info_date'
    if check_repeated_dates(base_info_date)
    else 'There are no repeated dates for any id_cliente in base_info_date'
)

In [None]:
# Check that every document was issued in the same calendar month and year as its safra_ref.
(
    base_join
    .apply(
        lambda rec: rec['data_emissao_documento'].year == rec['safra_ref'].year
        and rec['data_emissao_documento'].month == rec['safra_ref'].month,
        axis=1,
    )
    .all()
)

## Model I

In [None]:
# Start an empty list for collecting training histories in this section.
histories = []

In [None]:
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from tensorflow.keras.optimizers import Adam
from keras.layers import BatchNormalization
from keras.metrics import AUC, Precision, Recall, BinaryAccuracy
from tensorflow.keras.losses import BinaryFocalCrossentropy

def create_layers(input_layer, units, activation='relu', dropout_rate=0.1):
    """Chain Dense -> BatchNormalization -> Dropout on top of ``input_layer``.

    Returns a three-item list with the output of each stage, in that order:
    [dense output, batch-norm output, dropout output].
    """
    # Factories are called one at a time so each layer is created right before it is applied.
    stages = (
        lambda: Dense(units, activation=activation),
        BatchNormalization,
        lambda: Dropout(dropout_rate),
    )
    outputs = []
    tensor = input_layer
    for make_layer in stages:
        tensor = make_layer()(tensor)
        outputs.append(tensor)
    return outputs

def fraud_detection_model(input_dim, units=15, learning_rate=0.001, weight_decay=0.0001, class_weight=None, loss_function='binary_crossentropy'):
    """Build and compile the binary fraud classifier.

    The network stacks six create_layers blocks of ``units`` neurons. Twice,
    the dropout output of a block is added to the batch-norm output of the block
    two steps earlier before being fed forward. A single sigmoid unit produces
    the prediction. The model is compiled with Adam and the AUC, Precision and
    Recall metrics.

    ``class_weight`` is accepted but not used here, which lets a parameter
    dictionary that contains it be unpacked into this call.
    """
    inputs = Input(shape=(input_dim, ))

    _, block_bn, block_drop = create_layers(inputs, units)
    for _ in range(2):
        _, _, skip_drop = create_layers(block_drop, units)
        # Skip connection: dropout output of the new block + batch-norm output of the previous one.
        _, block_bn, block_drop = create_layers(skip_drop + block_bn, units)
    _, _, head_drop = create_layers(block_drop, units)
    prediction = Dense(1, activation='sigmoid')(head_drop)

    model = Model(inputs=inputs, outputs=prediction)
    model.compile(
        optimizer=Adam(learning_rate=learning_rate, weight_decay=weight_decay),
        loss=loss_function,
        metrics=[AUC(), Precision(), Recall()],
    )
    return model

# Width of the model input: the size of X_train's second axis.
input_dim = X_train.shape[1]

# base_cadastral_train_eval = X_train.iloc[0:int(len(X_train)*0.85), :]
# base_cadastral_test = X_train.iloc[int(len(X_train)*0.85):, :]

# # use bayes optimization to find the best combination of lr, wd, class_weight, and loss function
# model = fraud_detection_model(input_dim=input_dim, units=15, learning_rate=0.001, weight_decay=0.0001, loss_function='binary_crossentropy')
# history = model.fit(X_train, y_train, epochs=30, batch_size=512, validation_split=0.2, class_weight={0:1, 1:1})

# # autoencoder.evaluate(base_cadastral_test, base_cadastral_test)

# y_pred = model.predict(X_train)


In [None]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

# Filled by objective(): one training history and one parameter dictionary per trial.
histories = []
params_list = []

# define the search space for hyperparameters
space = {
    'units': hp.choice('units', [15, 30, 60]), # units per layer: one of 15, 30 or 60
    'learning_rate': hp.loguniform('learning_rate', -4*np.log(10), -2*np.log(10)),  # log scale, bounds ln(1e-4) and ln(1e-2)
    'weight_decay': hp.loguniform('weight_decay', -5*np.log(10), -3*np.log(10)),    # log scale, bounds ln(1e-5) and ln(1e-3)
    'class_weight': hp.choice('class_weight', [{0:1, 1:1}, {0:1, 1:2}]),  # class 1 weighted 1x or 2x relative to class 0
    'loss_function': hp.choice('loss_function', ['binary_crossentropy', BinaryFocalCrossentropy(gamma=1)])  # cross-entropy or focal cross-entropy with gamma=1
}

def objective(params):
    """Train one model with the sampled ``params`` and score it for hyperopt.

    The training history and the parameters are appended to the notebook-level
    ``histories`` and ``params_list``. The reported loss is the mean validation
    loss over the last 5 epochs.
    """
    candidate = fraud_detection_model(input_dim=input_dim, **params)

    # 30 epochs with the last 20% of the training data held out for validation.
    run = candidate.fit(
        X_train,
        y_train,
        epochs=30,
        batch_size=512,
        validation_split=0.2,
        class_weight=params['class_weight'],
    )

    histories.append(run)
    params_list.append(params)

    return {'loss': np.mean(run.history['val_loss'][-5:]), 'status': STATUS_OK}

# Run 8 TPE evaluations of objective() over the search space; `trials` keeps the
# per-evaluation details and the fixed generator seed makes the sampling repeatable.
trials = Trials()
best = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=8,
    trials=trials,
    rstate=np.random.default_rng(1),
)

# NOTE(review): for hp.choice entries hyperopt is documented to report the index of the
# chosen option rather than its value - confirm, and consider hyperopt.space_eval.
print(best)


In [None]:
# histories.append(history)

# loss: 0.1509 - auc_8: 0.8766 - precision_8: 0.7465 - recall_8: 0.3551 - binary_accuracy: 0.9517 - val_loss: 0.6307 - val_auc_8: 0.8885 - val_precision_8: 0.0889 - val_recall_8: 0.8822 - val_binary_accuracy: 0.5962 before
# loss: 0.0440 - auc_9: 0.9928 - precision_9: 0.8875 - recall_9: 0.8423 - binary_accuracy: 0.9833 - val_loss: 0.0422 - val_auc_9: 0.9972 - val_precision_9: 0.9939 - val_recall_9: 0.6293 - val_binary_accuracy: 0.9835 as in 3 but extra layer
# loss: 0.0515 - auc_10: 0.8003 - precision_10: 0.7173 - recall_10: 0.1958 - binary_accuracy: 0.9444 - val_loss: 1.5347 - val_auc_10: 0.7662 - val_precision_10: 0.0447 - val_recall_10: 0.9942 - val_binary_accuracy: 0.0627 gamma=2
# loss: 0.0458 - auc_11: 0.9708 - precision_11: 0.8109 - recall_11: 0.6371 - binary_accuracy: 0.9677 - val_loss: 0.1299 - val_auc_11: 0.7191 - val_precision_11: 1.0000 - val_recall_11: 0.1699 - val_binary_accuracy: 0.9634 gamma=1
# loss: 0.5288 - auc_14: 0.9595 - precision_14: 0.2976 - recall_14: 0.9596 - binary_accuracy: 0.8545 - val_loss: 0.0925 - val_auc_14: 0.9189 - val_precision_14: 0.8729 - val_recall_14: 0.5039 - val_binary_accuracy: 0.9749 class_weight={0:1, 1:20}
# loss: 0.0699 - auc_15: 0.9925 - precision_15: 0.8263 - recall_15: 0.8908 - binary_accuracy: 0.9813 - val_loss: 0.3131 - val_auc_15: 0.9623 - val_precision_15: 0.4399 - val_recall_15: 0.8687 - val_binary_accuracy: 0.9455 class_weight={0:1, 1:2}




In [None]:
# Display the recorded parameter sets.
params_list

In [None]:
# plot training vs. validation loss, one panel per recorded history
fig = plt.figure(figsize=(20, 20))
# At least a 4x4 grid (the original layout); extra rows are added when there are more
# than 16 histories, which a fixed 4x4 grid could not place.
n_rows = max(4, -(-len(histories) // 4))
for i, history in enumerate(histories):
    ax = fig.add_subplot(n_rows, 4, i+1)
    ax.plot(history.history['loss'], label='train')
    ax.plot(history.history['val_loss'], label='validation')
    # ax.plot(history.history['recall'], label='recall')
    # ax.plot(history.history['val_recall'], label='val_recall')
    trial = params_list[i]
    title = f'u{trial["units"]}, lr {trial["learning_rate"]:.2e}, wd {trial["weight_decay"]:.2e}, cw {trial["class_weight"][1]}'
    # Axes-level calls target this panel explicitly instead of pyplot's current axes.
    ax.set_title(title)
    ax.set_ylabel('loss')
    ax.set_xlabel('epoch')
    # Labels come from the plotted lines, so the legend always matches what is drawn
    # (the previous label list named four series while only two were plotted).
    ax.legend()
plt.show()

# 1:   cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 256, epochs 100, Total params: 1,441
# 2:   cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 512, epochs 30, Total params: 1,441
# 3:   cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 512, epochs 30, Total params: 1,441
# 4:   cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.003, wd 0.0001, batch 512, epochs 30, Total params: 1,441
# 5:   cl 30, cl 30, cl 30, cl 30, cl 30 dense 1 sigmoid, lr 0.003, wd 0.0001, batch 512, epochs 30, Total params: 4,681
# 6:   cl 30, cl 30, cl 30, cl 30, cl 30 dense 1 sigmoid, lr 0.003, wd 0.001, batch 512, epochs 30, Total params: 4,681
# 7:   cl 15, cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 512, epochs 30, Total params: 1,741
# 8:   cl 15, cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 512, epochs 30, Total params: 1,741
# 9:   cl 15, cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 512, epochs 30, Total params: 1,741, gama=2
# 10:  cl 15, cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 512, epochs 30, Total params: 1,741, gama=1
# 11:  cl 15, cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 512, epochs 30, Total params: 1,741
# 12:  cl 15, cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 512, epochs 30, Total params: 1,741, gama=1class_weight={0:1, 1:20}
# 13:  cl 15, cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 512, epochs 30, Total params: 1,741, gama=1class_weight={0:1, 1:2}

# # save the collection of histories to a file
# import pickle
# with open('histories.pickle', 'wb') as f:
#     pickle.dump(histories, f)

# # load the collection of histories from a file
# import pickle
# with open('histories.pickle', 'rb') as f:
#     histories_loaded = pickle.load(f)


# loss: 0.3269 - mae: 0.3568 - val_loss: 0.4145 - val_mae: 0.3445
# loss: 0.5927 - mae: 0.4031
# model.summary()

## Model II

In [None]:
# Start an empty list for collecting training histories in this section.
histories = []

In [None]:
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from tensorflow.keras.optimizers import Adam
from keras.layers import BatchNormalization
from keras.metrics import AUC, Precision, Recall, BinaryAccuracy
from tensorflow.keras.losses import BinaryFocalCrossentropy

def create_layers(input_layer, units, activation='relu', dropout_rate=0.1):
    """Chain Dense -> BatchNormalization -> Dropout on top of ``input_layer``.

    Returns a three-item list with the output of each stage, in that order:
    [dense output, batch-norm output, dropout output].
    """
    # Factories are called one at a time so each layer is created right before it is applied.
    stages = (
        lambda: Dense(units, activation=activation),
        BatchNormalization,
        lambda: Dropout(dropout_rate),
    )
    outputs = []
    tensor = input_layer
    for make_layer in stages:
        tensor = make_layer()(tensor)
        outputs.append(tensor)
    return outputs

def fraud_detection_model(input_dim, units=15, learning_rate=0.001, weight_decay=0.0001, class_weight=None, loss_function='binary_crossentropy'):
    """Build and compile the binary fraud classifier.

    The network stacks six create_layers blocks of ``units`` neurons. Twice,
    the dropout output of a block is added to the batch-norm output of the block
    two steps earlier before being fed forward. A single sigmoid unit produces
    the prediction. The model is compiled with Adam and the AUC, Precision and
    Recall metrics.

    ``class_weight`` is accepted but not used here, which lets a parameter
    dictionary that contains it be unpacked into this call.
    """
    inputs = Input(shape=(input_dim, ))

    _, block_bn, block_drop = create_layers(inputs, units)
    for _ in range(2):
        _, _, skip_drop = create_layers(block_drop, units)
        # Skip connection: dropout output of the new block + batch-norm output of the previous one.
        _, block_bn, block_drop = create_layers(skip_drop + block_bn, units)
    _, _, head_drop = create_layers(block_drop, units)
    prediction = Dense(1, activation='sigmoid')(head_drop)

    model = Model(inputs=inputs, outputs=prediction)
    model.compile(
        optimizer=Adam(learning_rate=learning_rate, weight_decay=weight_decay),
        loss=loss_function,
        metrics=[AUC(), Precision(), Recall()],
    )
    return model

# Width of the model input: the size of X_train's second axis.
input_dim = X_train.shape[1]

# base_cadastral_train_eval = X_train.iloc[0:int(len(X_train)*0.85), :]
# base_cadastral_test = X_train.iloc[int(len(X_train)*0.85):, :]

# # use bayes optimization to find the best combination of lr, wd, class_weight, and loss function
# model = fraud_detection_model(input_dim=input_dim, units=15, learning_rate=0.001, weight_decay=0.0001, loss_function='binary_crossentropy')
# history = model.fit(X_train, y_train, epochs=30, batch_size=512, validation_split=0.2, class_weight={0:1, 1:1})

# # autoencoder.evaluate(base_cadastral_test, base_cadastral_test)

# y_pred = model.predict(X_train)


In [None]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

# Filled by objective(): one training history and one parameter dictionary per trial.
histories = []
params_list = []

# define the search space for hyperparameters
space = {
    'units': hp.choice('units', [15, 30, 60]), # units per layer: one of 15, 30 or 60
    'learning_rate': hp.loguniform('learning_rate', -4*np.log(10), -2*np.log(10)),  # log scale, bounds ln(1e-4) and ln(1e-2)
    'weight_decay': hp.loguniform('weight_decay', -5*np.log(10), -3*np.log(10)),    # log scale, bounds ln(1e-5) and ln(1e-3)
    'class_weight': hp.choice('class_weight', [{0:1, 1:1}, {0:1, 1:2}]),  # class 1 weighted 1x or 2x relative to class 0
    'loss_function': hp.choice('loss_function', ['binary_crossentropy', BinaryFocalCrossentropy(gamma=1)])  # cross-entropy or focal cross-entropy with gamma=1
}

def objective(params):
    """Train one model with the sampled ``params`` and score it for hyperopt.

    The training history and the parameters are appended to the notebook-level
    ``histories`` and ``params_list``. The reported loss is the mean validation
    loss over the last 5 epochs.
    """
    candidate = fraud_detection_model(input_dim=input_dim, **params)

    # 30 epochs with the last 20% of the training data held out for validation.
    run = candidate.fit(
        X_train,
        y_train,
        epochs=30,
        batch_size=512,
        validation_split=0.2,
        class_weight=params['class_weight'],
    )

    histories.append(run)
    params_list.append(params)

    return {'loss': np.mean(run.history['val_loss'][-5:]), 'status': STATUS_OK}

# Run 8 TPE evaluations of objective() over the search space; `trials` keeps the
# per-evaluation details and the fixed generator seed makes the sampling repeatable.
trials = Trials()
best = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=8,
    trials=trials,
    rstate=np.random.default_rng(1),
)

# NOTE(review): for hp.choice entries hyperopt is documented to report the index of the
# chosen option rather than its value - confirm, and consider hyperopt.space_eval.
print(best)


In [None]:
# histories.append(history)

# loss: 0.1509 - auc_8: 0.8766 - precision_8: 0.7465 - recall_8: 0.3551 - binary_accuracy: 0.9517 - val_loss: 0.6307 - val_auc_8: 0.8885 - val_precision_8: 0.0889 - val_recall_8: 0.8822 - val_binary_accuracy: 0.5962 before
# loss: 0.0440 - auc_9: 0.9928 - precision_9: 0.8875 - recall_9: 0.8423 - binary_accuracy: 0.9833 - val_loss: 0.0422 - val_auc_9: 0.9972 - val_precision_9: 0.9939 - val_recall_9: 0.6293 - val_binary_accuracy: 0.9835 as in 3 but extra layer
# loss: 0.0515 - auc_10: 0.8003 - precision_10: 0.7173 - recall_10: 0.1958 - binary_accuracy: 0.9444 - val_loss: 1.5347 - val_auc_10: 0.7662 - val_precision_10: 0.0447 - val_recall_10: 0.9942 - val_binary_accuracy: 0.0627 gamma=2
# loss: 0.0458 - auc_11: 0.9708 - precision_11: 0.8109 - recall_11: 0.6371 - binary_accuracy: 0.9677 - val_loss: 0.1299 - val_auc_11: 0.7191 - val_precision_11: 1.0000 - val_recall_11: 0.1699 - val_binary_accuracy: 0.9634 gamma=1
# loss: 0.5288 - auc_14: 0.9595 - precision_14: 0.2976 - recall_14: 0.9596 - binary_accuracy: 0.8545 - val_loss: 0.0925 - val_auc_14: 0.9189 - val_precision_14: 0.8729 - val_recall_14: 0.5039 - val_binary_accuracy: 0.9749 class_weight={0:1, 1:20}
# loss: 0.0699 - auc_15: 0.9925 - precision_15: 0.8263 - recall_15: 0.8908 - binary_accuracy: 0.9813 - val_loss: 0.3131 - val_auc_15: 0.9623 - val_precision_15: 0.4399 - val_recall_15: 0.8687 - val_binary_accuracy: 0.9455 class_weight={0:1, 1:2}




In [None]:
# Display the recorded parameter sets.
params_list

In [None]:
# plot training vs. validation loss, one panel per recorded history
fig = plt.figure(figsize=(20, 20))
# At least a 4x4 grid (the original layout); extra rows are added when there are more
# than 16 histories, which a fixed 4x4 grid could not place.
n_rows = max(4, -(-len(histories) // 4))
for i, history in enumerate(histories):
    ax = fig.add_subplot(n_rows, 4, i+1)
    ax.plot(history.history['loss'], label='train')
    ax.plot(history.history['val_loss'], label='validation')
    # ax.plot(history.history['recall'], label='recall')
    # ax.plot(history.history['val_recall'], label='val_recall')
    trial = params_list[i]
    title = f'u{trial["units"]}, lr {trial["learning_rate"]:.2e}, wd {trial["weight_decay"]:.2e}, cw {trial["class_weight"][1]}'
    # Axes-level calls target this panel explicitly instead of pyplot's current axes.
    ax.set_title(title)
    ax.set_ylabel('loss')
    ax.set_xlabel('epoch')
    # Labels come from the plotted lines, so the legend always matches what is drawn
    # (the previous label list named four series while only two were plotted).
    ax.legend()
plt.show()

# 1:   cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 256, epochs 100, Total params: 1,441
# 2:   cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 512, epochs 30, Total params: 1,441
# 3:   cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 512, epochs 30, Total params: 1,441
# 4:   cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.003, wd 0.0001, batch 512, epochs 30, Total params: 1,441
# 5:   cl 30, cl 30, cl 30, cl 30, cl 30 dense 1 sigmoid, lr 0.003, wd 0.0001, batch 512, epochs 30, Total params: 4,681
# 6:   cl 30, cl 30, cl 30, cl 30, cl 30 dense 1 sigmoid, lr 0.003, wd 0.001, batch 512, epochs 30, Total params: 4,681
# 7:   cl 15, cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 512, epochs 30, Total params: 1,741
# 8:   cl 15, cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 512, epochs 30, Total params: 1,741
# 9:   cl 15, cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 512, epochs 30, Total params: 1,741, gama=2
# 10:  cl 15, cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 512, epochs 30, Total params: 1,741, gama=1
# 11:  cl 15, cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 512, epochs 30, Total params: 1,741
# 12:  cl 15, cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 512, epochs 30, Total params: 1,741, gama=1class_weight={0:1, 1:20}
# 13:  cl 15, cl 15, cl 15, cl 15, cl 15, cl 15 dense 1 sigmoid, lr 0.001, wd 0.0001, batch 512, epochs 30, Total params: 1,741, gama=1class_weight={0:1, 1:2}

# # save the collection of histories to a file
# import pickle
# with open('histories.pickle', 'wb') as f:
#     pickle.dump(histories, f)

# # load the collection of histories from a file
# import pickle
# with open('histories.pickle', 'rb') as f:
#     histories_loaded = pickle.load(f)


# loss: 0.3269 - mae: 0.3568 - val_loss: 0.4145 - val_mae: 0.3445
# loss: 0.5927 - mae: 0.4031
# model.summary()

## Model Training II

In [None]:
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from tensorflow.keras.optimizers import Adam
from keras.layers import BatchNormalization
from keras.metrics import AUC, Precision, Recall, BinaryAccuracy
from tensorflow.keras.losses import BinaryFocalCrossentropy

def create_layers(input_layer, units, activation='relu', dropout_rate=0.1):
    """Chain Dense -> BatchNormalization -> Dropout on top of ``input_layer``.

    Returns a three-item list with the output of each stage, in that order:
    [dense output, batch-norm output, dropout output].
    """
    # Factories are called one at a time so each layer is created right before it is applied.
    stages = (
        lambda: Dense(units, activation=activation),
        BatchNormalization,
        lambda: Dropout(dropout_rate),
    )
    outputs = []
    tensor = input_layer
    for make_layer in stages:
        tensor = make_layer()(tensor)
        outputs.append(tensor)
    return outputs

def fraud_detection_model(input_dim, units=15, learning_rate=0.001, weight_decay=0.0001, loss_function='binary_crossentropy'):
    """Build and compile the binary fraud classifier.

    The network stacks six create_layers blocks of ``units`` neurons. Twice,
    the dropout output of a block is added to the batch-norm output of the block
    two steps earlier before being fed forward. A single sigmoid unit produces
    the prediction. The model is compiled with Adam and the AUC, Precision and
    Recall metrics.
    """
    inputs = Input(shape=(input_dim, ))

    _, block_bn, block_drop = create_layers(inputs, units)
    for _ in range(2):
        _, _, skip_drop = create_layers(block_drop, units)
        # Skip connection: dropout output of the new block + batch-norm output of the previous one.
        _, block_bn, block_drop = create_layers(skip_drop + block_bn, units)
    _, _, head_drop = create_layers(block_drop, units)
    prediction = Dense(1, activation='sigmoid')(head_drop)

    model = Model(inputs=inputs, outputs=prediction)
    model.compile(
        optimizer=Adam(learning_rate=learning_rate, weight_decay=weight_decay),
        loss=loss_function,
        metrics=[AUC(), Precision(), Recall()],
    )
    return model

# Width of the model input: the size of X_train's second axis.
input_dim = X_train.shape[1]

# # use bayes optimization to find the best combination of lr, wd, class_weight, and loss function

# model = fraud_detection_model(input_dim=input_dim, units=15, learning_rate=0.001, weight_decay=0.0001, loss_function='binary_crossentropy')
# history = model.fit(X_train, y_train, epochs=30, batch_size=512, validation_split=0.2, class_weight={0:1, 1:1})



In [None]:
from skopt import BayesSearchCV
from sklearn.model_selection import StratifiedKFold
from skopt.space import Real, Categorical, Integer
from keras.wrappers.scikit_learn import KerasClassifier

# define a function to create the model
def create_model(units=15, learning_rate=0.001, weight_decay=0.0, class_weight={0:1, 1:1}, loss_function='binary_crossentropy'):
    """Builder handed to KerasClassifier: returns a compiled fraud_detection_model.

    ``input_dim`` is read from the notebook namespace. ``class_weight`` is part
    of the signature but is not forwarded to the model builder.
    """
    build_kwargs = dict(
        units=units,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        loss_function=loss_function,
    )
    return fraud_detection_model(input_dim=input_dim, **build_kwargs)

# Wrap the Keras builder in a scikit-learn style classifier (30 epochs, batches of 512).
model = KerasClassifier(build_fn=create_model, epochs=30, batch_size=512, verbose=0)

# Hyperparameter ranges explored by the Bayesian search.
search_spaces = {
    'learning_rate': Real(1e-5, 1e-1, prior='log-uniform'),
    'weight_decay': Real(1e-5, 1e-1, prior='log-uniform'),
    # class 1 weighted from 1x to 5x relative to class 0
    'class_weight': Categorical([{0: 1, 1: positive_weight} for positive_weight in range(1, 6)]),
    'loss_function': Categorical(['binary_crossentropy', 'hinge']),
}

# Five stratified folds for cross-validation.
cv = StratifiedKFold(n_splits=5)

# 50 search iterations with a fixed random_state.
opt = BayesSearchCV(
    estimator=model,
    search_spaces=search_spaces,
    n_iter=50,
    cv=cv,
    n_jobs=1,  # a single job, i.e. no parallel evaluation
    random_state=42,
)

# perform the search
opt.fit(X_train, y_train)


In [None]:
from sklearn.ensemble import IsolationForest
from skopt import BayesSearchCV
from sklearn.base import BaseEstimator
from sklearn.metrics import f1_score, confusion_matrix
import numpy as np

class IsolationForestWrapper(BaseEstimator):
    """IsolationForest exposed as a 0/1 classifier that is scored with F1.

    The positive class (label 1) is the anomaly: ``predict`` returns 1 for the
    samples the forest flags as outliers and 0 for the inliers.

    Parameters:
        n_estimators, max_samples, contamination: forwarded unchanged to
            IsolationForest.
    """

    def __init__(self, n_estimators=10, max_samples='auto', contamination=0.1):
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.contamination = contamination
        # The wrapped forest is built here and kept in sync through set_params().
        self.model = IsolationForest(n_estimators=self.n_estimators, 
                                     max_samples=self.max_samples, 
                                     contamination=self.contamination)
        self.y_train = None

    def fit(self, X, y=None):
        """Fit the forest on X; y is not used for fitting, only remembered (as 0/1) for score()."""
        self.y_train = (y == 1).astype(int) if y is not None else None
        self.model.fit(X)
        return self

    def predict(self, X):
        """Return 1 for samples flagged as outliers and 0 for inliers."""
        pred = self.model.predict(X)
        # IsolationForest.predict yields -1 for outliers and +1 for inliers. The positive
        # label must therefore come from -1; comparing against +1 labelled the inliers
        # (the bulk of the data) as the positive class.
        return (pred == -1).astype(int)

    def score(self, X, y=None):
        """F1 of predict(X) against y, or against the labels remembered by fit() when y is None.

        Raises:
            ValueError: if no labels are available from either source.
        """
        if y is not None:
            y = (y == 1).astype(int)
            y_pred = self.predict(X)
            return f1_score(y, y_pred)
        elif self.y_train is not None:
            y_pred = self.predict(X)
            return f1_score(self.y_train, y_pred)
        else:
            raise ValueError("No y values were given during fitting, cannot calculate score.")

    def set_params(self, **params):
        """Update the wrapper's parameters and mirror them onto the wrapped forest."""
        super().set_params(**params)
        self.model.set_params(**params)
        return self

# Search over 20 evenly spaced contamination values (0.01 to 0.2) with 3-fold CV.
iso_forest = IsolationForestWrapper(n_estimators=30)
params = {'contamination': np.linspace(0.01, 0.2, 20)}
opt = BayesSearchCV(iso_forest, params, n_iter=25, cv=3, n_jobs=-1)
opt.fit(X_train, y_train)

# Evaluate the search's predictions on the evaluation split.
y_pred = opt.predict(X_eval)
print(confusion_matrix(y_eval, y_pred))

accuracy, precision, recall, f1, roc_auc = (
    metric(y_eval, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score)
)

# Append the report to results.txt, one line per entry.
report_lines = [
    f"Best parameters for IsolationForest: {opt.best_params_}",
    f"Classifier: IsolationForest",
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    f"ROC AUC: {roc_auc}",
    "------------------------",
]
with open('results.txt', 'a') as f:
    f.write('\n'.join(report_lines) + '\n')


In [None]:
# # loop to print the loss and the metrics of all the models history in the histories list
# for i in range(len(histories)):
#     with open('nn_results.txt', 'a') as f:
#         print(f"Parameters: {params_list[i]}", file=f)
#         history_keys = list(histories[i].history.keys())
#         print(f"Loss: {np.round(np.mean(histories[i].history[history_keys[0]][-3:]),4)} --> {np.round(np.mean(histories[i].history[history_keys[4]][-3:]),4)}", file=f)
#         print(f"AUC: {np.round(np.mean(histories[i].history[history_keys[1]][-3:]),4)} --> {np.round(np.mean(histories[i].history[history_keys[5]][-3:]),4)}", file=f)
#         print(f"Precision: {np.round(np.mean(histories[i].history[history_keys[2]][-3:]),4)} --> {np.round(np.mean(histories[i].history[history_keys[6]][-3:]),4)}", file=f)
#         print(f"Recall: {np.round(np.mean(histories[i].history[history_keys[3]][-3:]),4)} --> {np.round(np.mean(histories[i].history[history_keys[7]][-3:]),4)}", file=f)
#         print("------------------------", file=f)