In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta
import matplotlib.pyplot  as plt
import seaborn as sns

# Read the three raw tables and normalise every column name to lower case
base_info = pd.read_csv('base_info.csv').rename(columns=str.lower)
base_pagamentos = pd.read_csv('base_pagamentos_desenvolvimento.csv').rename(columns=str.lower)
base_cadastral = pd.read_csv('base_cadastral.csv').rename(columns=str.lower)

# Number the clients 1..N following registration-date order
# (data_cadastro is still a string at this point, so the sort is lexicographic)
base_cadastral = (
    base_cadastral
    .sort_values(by='data_cadastro')
    .reset_index(drop=True)
)
base_cadastral['new_id'] = base_cadastral.index + 1

# Keep the old-id -> new-id correspondence.
# to_dict() on a one-column frame nests the mapping under the column name:
# {'new_id': {id_cliente: new_id, ...}}
id_dict = base_cadastral.set_index('id_cliente')[['new_id']].to_dict()
# convert str to date
def convert_date(df, to_date_columns, format):
    """Return a copy of ``df`` with the given columns parsed as datetimes.

    Args:
        df: source DataFrame; it is not modified.
        to_date_columns: iterable of column names to parse.
        format: strptime-style format string handed to ``pd.to_datetime``.

    Returns:
        A new DataFrame in which every listed column holds datetime values.
    """
    converted = df.copy()
    # Parse everything first, then write the parsed series back column by column.
    parsed = {
        name: pd.to_datetime(converted[name], format=format)
        for name in to_date_columns
    }
    for name, series in parsed.items():
        converted[name] = series
    return converted

# Parse the date-like columns of every table (each call returns a new frame)
date_columns = ['data_emissao_documento', 'data_pagamento', 'data_vencimento']
base_pagamentos_date = convert_date(base_pagamentos, date_columns, '%Y-%m-%d')
base_pagamentos_date = convert_date(base_pagamentos_date, ['safra_ref'], '%Y-%m')
base_info_date = convert_date(base_info, ['safra_ref'], '%Y-%m')
base_cadastral_date = convert_date(base_cadastral, ['data_cadastro'], '%Y-%m-%d')

# Adjust values in base_cadastral_date.
# Columns are assigned with bracket syntax rather than attribute syntax so the
# assignment can never silently create a plain Python attribute instead.
base_cadastral_date['flag_pf'] = (base_cadastral_date['flag_pf'] == 'X').astype(int)
base_cadastral_date['segmento_industrial'] = base_cadastral_date['segmento_industrial'].fillna('NAN')
base_cadastral_date['dominio_email'] = base_cadastral_date['dominio_email'].fillna('NAN')
base_cadastral_date['porte'] = base_cadastral_date['porte'].fillna('NAN')
base_cadastral_date['cep_2_dig'] = base_cadastral_date['cep_2_dig'].fillna('NA')
base_cadastral_date['ddd'] = base_cadastral_date['ddd'].fillna('-1')
# Any ddd containing an opening parenthesis is replaced by the sentinel '-2'.
# A literal (non-regex) match avoids the invalid "\(" escape sequence that the
# previous non-raw pattern string relied on.
base_cadastral_date.loc[base_cadastral_date['ddd'].str.contains('(', regex=False), 'ddd'] = '-2'

# late_payment: days between payment and due date (negative when paid early).
# fraud: 1 when the payment came more than 5 days after the due date, else 0.
base_pagamentos_date['late_payment'] = (base_pagamentos_date['data_pagamento'] - base_pagamentos_date['data_vencimento']).dt.days
base_pagamentos_date['fraud'] = np.where(base_pagamentos_date['late_payment'] > 5, 1, 0)

In [None]:
## base_cadastral encodings

# date encoding
def encode_date(df, columns):
    """Replace each date column by the day count since that column's earliest date.

    For every name in ``columns`` a ``<name>_since_min`` column is appended,
    holding ``(value - column minimum)`` in days plus one, so the earliest date
    maps to 1. The original date column is removed.

    Args:
        df: source DataFrame; it is not modified.
        columns: iterable of datetime column names to encode.

    Returns:
        A new DataFrame with the encoded columns appended at the end.
    """
    result = df.copy()
    for name in columns:
        dates = result.pop(name)
        result[name + '_since_min'] = (dates - dates.min()).dt.days + 1
        # Other calendar features (year, month, day, day of week, quarter, ...)
        # could be derived from `dates` here; only the day offset is used.
    return result

# One-hot encoding for categorical variables
def encode_categorical(df, columns):
    """One-hot encode the given columns.

    Each listed column is replaced by its ``pd.get_dummies`` indicator columns,
    named ``<column>_<value>`` and appended at the end of the frame.

    Args:
        df: source DataFrame; it is not modified.
        columns: iterable of column names to encode.

    Returns:
        A new DataFrame without the listed columns and with their indicators.
    """
    encoded = df.copy()
    for name in columns:
        indicators = pd.get_dummies(encoded[name], prefix=name)
        encoded = pd.concat([encoded.drop(columns=name), indicators], axis=1)
    return encoded

# ordinal encoding for categorical variables
def encode_ordinal(df, columns, mapping):
    """Replace categorical columns by integer codes taken from lookup tables.

    Args:
        df: source DataFrame; it is not modified.
        columns: sequence of column names to encode.
        mapping: sequence of dicts, positionally matched with ``columns``
            (``mapping[i]`` is applied to ``columns[i]``).

    Returns:
        A new DataFrame where every listed column is removed and a
        ``<column>_encoded`` column is appended. Values absent from the
        lookup table come out as NaN, as ``Series.map`` does.
    """
    result = df.copy()
    for position, name in enumerate(columns):
        result[name + '_encoded'] = result.pop(name).map(mapping[position])
    return result

# Encoding pipeline: date offset -> one-hot -> ordinal, one named frame per stage
ordinal_encode_map = [{'NAN': 0, 'PEQUENO': 1, 'MEDIO': 2, 'GRANDE': 3}]
base_cadastral_date_encoded = encode_date(base_cadastral_date, ['data_cadastro'])
base_cadastral_one_hot_encoded = encode_categorical(
    base_cadastral_date_encoded,
    ['segmento_industrial', 'dominio_email'],
)
base_cadastral_ordinal_encoded = encode_ordinal(
    base_cadastral_one_hot_encoded,
    ['porte'],
    ordinal_encode_map,
)
# Drop the identifier and the two location codes; note that new_id is kept
# and therefore takes part in the scaled feature matrix.
base_cadastral_droped = base_cadastral_ordinal_encoded.drop(columns=['id_cliente', 'ddd', 'cep_2_dig'])

# Standardize every remaining column and keep the column names
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
base_cadastral_scaled = pd.DataFrame(
    scaler.fit_transform(base_cadastral_droped),
    columns=base_cadastral_droped.columns,
)
display(base_cadastral_scaled.head(40))

In [None]:
# Project the standardized features to 3-D with t-SNE
from sklearn.manifold import TSNE
# A fixed random_state makes the embedding reproducible across kernel restarts.
base_cadastral_tsne = TSNE(n_components=3, random_state=42).fit_transform(base_cadastral_scaled)
print(base_cadastral_tsne.shape)

# One 3-D scatter of the embedding per feature, coloured by that feature.
# The grid is at least 4x4 and grows when there are more than 16 features, so
# add_subplot never receives an index outside the grid.
n_features = len(base_cadastral_scaled.columns)
grid_side = max(4, int(np.ceil(np.sqrt(n_features))))
fig = plt.figure(figsize=(20, 20))
for i, col in enumerate(base_cadastral_scaled.columns):
    ax = fig.add_subplot(grid_side, grid_side, i + 1, projection='3d')
    ax.scatter(base_cadastral_tsne[:, 0], base_cadastral_tsne[:, 1], base_cadastral_tsne[:, 2], c=base_cadastral_scaled[col], cmap='coolwarm')
    ax.set_title(col)
plt.show()

In [None]:
# Wrap the t-SNE coordinates in a DataFrame, one named column per component
base_cadastral_tsne_df = pd.DataFrame(
    base_cadastral_tsne,
    columns=[f'tsne_{axis}' for axis in range(1, 4)],
)

In [None]:
# Collector for the Keras training histories of successive experiments.
# Re-running this cell discards everything collected so far.
histories = list()

In [None]:
# Residual dense autoencoder built inline with the Keras functional API.
# Every hidden stage is Dense -> BatchNormalization -> Dropout(0.1); `+` on two
# tensors forms a skip connection and needs both operands to have the same width.
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from tensorflow.keras.optimizers import Adam
from keras.layers import BatchNormalization

input_dim = base_cadastral_scaled.shape[1]
encoding_dim = 3

input_layer = Input(shape=(input_dim, ))
# make encoder
encoder0 = Dense(25, activation='relu')(input_layer)
encoder0 = BatchNormalization()(encoder0)
encoder0 = Dropout(0.1)(encoder0)
# This stage is summed with the raw input below, so it must be input_dim wide.
# It used to be hard-coded to 15, which only worked for exactly 15 features.
encoder0 = Dense(input_dim, activation='relu')(encoder0)
encoder0 = BatchNormalization()(encoder0)
encoder0 = Dropout(0.1)(encoder0)
# skip connection: raw input added back in
encoder0 = Dense(15, activation='relu')(encoder0 + input_layer)
encoder1 = BatchNormalization()(encoder0)
encoder1 = Dropout(0.1)(encoder1)
encoder1 = Dense(15, activation='relu')(encoder1)
encoder1 = BatchNormalization()(encoder1)
encoder1 = Dropout(0.1)(encoder1)
# skip connection: encoder0 here is the Dense output two stages back
encoder1 = Dense(15, activation='relu')(encoder1 + encoder0)
encoder1 = BatchNormalization()(encoder1)
encoder1 = Dropout(0.1)(encoder1)
# bottleneck: linear projection to encoding_dim values
encoder = Dense(encoding_dim, activation='linear')(encoder1)
# make decoder
decoder0 = Dense(15, activation='relu')(encoder)
decoder1 = BatchNormalization()(decoder0)
decoder1 = Dropout(0.1)(decoder1)
decoder1 = Dense(15, activation='relu')(decoder1)
decoder1 = BatchNormalization()(decoder1)
decoder1 = Dropout(0.1)(decoder1)
# skip connection: first decoder Dense output added back in
decoder = Dense(15, activation='relu')(decoder1 + decoder0)
decoder = BatchNormalization()(decoder)
decoder = Dropout(0.1)(decoder)
decoder = Dense(15, activation='relu')(decoder)
decoder = BatchNormalization()(decoder)
decoder = Dropout(0.1)(decoder)
decoder = Dense(25, activation='relu')(decoder)
decoder = BatchNormalization()(decoder)
decoder = Dropout(0.1)(decoder)
# linear reconstruction with the same width as the input
decoder = Dense(input_dim, activation='linear')(decoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)

adam = Adam(learning_rate=0.002, weight_decay=0.0001)
# NOTE(review): 'accuracy' is tracked next to an MSE reconstruction loss; it is
# presumably of limited meaning for this task - confirm before relying on it.
autoencoder.compile(optimizer=adam, loss='mean_squared_error', metrics=['accuracy'])

# The model is trained to reproduce its own input; the last 20% of the rows
# are held out for validation.
history = autoencoder.fit(base_cadastral_scaled, base_cadastral_scaled, epochs=200, batch_size=64, shuffle=True, validation_split=0.2)

# From here on `encoder` is rebound from the bottleneck tensor to a Model that
# maps inputs to their encoding_dim-dimensional code.
encoder = Model(inputs=input_layer, outputs=encoder)
encoded_data = encoder.predict(base_cadastral_scaled)

In [None]:
# Metrics noted from three earlier training runs
# (presumably the final-epoch lines copied from the Keras log - confirm):
# loss: 0.3845 - accuracy: 0.8669 - val_loss: 0.5378 - val_accuracy: 0.7452
# loss: 0.3414 - accuracy: 0.7405 - val_loss: 0.4845 - val_accuracy: 0.8745
# loss: 0.3297 - accuracy: 0.7357 - val_loss: 0.4748 - val_accuracy: 0.8745
# Print the layer-by-layer structure of the encoder model
encoder.summary()

In [None]:
# Put the 3-D codes next to the raw categorical fields from base_cadastral and
# the standardized registration-date offset. The added Series are aligned on
# the row index.
encoded_data_df = pd.DataFrame(
    encoded_data,
    columns=['encoded_1', 'encoded_2', 'encoded_3'],
).assign(
    segmento_industrial=base_cadastral['segmento_industrial'],
    porte=base_cadastral['porte'],
    dominio_email=base_cadastral['dominio_email'],
    data_cadastro_since_min=base_cadastral_scaled['data_cadastro_since_min'],
)
encoded_data_df

In [None]:
# Count how many rows share each 3-D code, most frequent first;
# a count above 1 means several rows received the same representation.
(
    encoded_data_df
    .groupby(['encoded_1', 'encoded_2', 'encoded_3'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

In [None]:
# Check whether the encoded data looks linearly separable: one 3-D scatter of
# the codes per feature, coloured by that feature.
# The grid is at least 4x4 and grows when there are more than 16 features, so
# add_subplot never receives an index outside the grid.
n_features = len(base_cadastral_scaled.columns)
grid_side = max(4, int(np.ceil(np.sqrt(n_features))))
fig = plt.figure(figsize=(20, 20))
for i, col in enumerate(base_cadastral_scaled.columns):
    ax = fig.add_subplot(grid_side, grid_side, i + 1, projection='3d')
    ax.scatter(encoded_data_df['encoded_1'], encoded_data_df['encoded_2'], encoded_data_df['encoded_3'], c=base_cadastral_scaled[col], cmap='coolwarm')
    ax.set_title(col)
plt.show()

In [None]:
# Record the latest training run. The identity check keeps the cell idempotent:
# re-running it does not append the same History object a second time.
if not any(recorded is history for recorded in histories):
    histories.append(history)

In [None]:
# Plot training vs. validation loss, one panel per recorded run.
# The grid is at least 4x4 and grows once more than 16 runs are recorded;
# a fixed 4x4 grid raised on the 17th run.
n_runs = len(histories)
grid_side = max(4, int(np.ceil(np.sqrt(n_runs))))
fig = plt.figure(figsize=(20, 20))
# The loop variable is deliberately not called `history`, so the global
# `history` of the latest training run is not overwritten by this cell.
for i, run_history in enumerate(histories):
    ax = fig.add_subplot(grid_side, grid_side, i + 1)
    ax.plot(run_history.history['loss'])
    ax.plot(run_history.history['val_loss'])
    ax.set_title('model loss')
    ax.set_ylabel('loss')
    ax.set_xlabel('epoch')
    ax.legend(['train', 'validation'])
plt.show()

In [None]:
# 1:  encoder 15, decoder 15, sigmoid
# 2:  encoder 15, decoder 15, linear
# 3:  encoder 10, 10, 3, decoder 10, 10, 15, sigmoid
# 4:  encoder 10, dropout, 10, dropout, 3, decoder 10, dropout, 10, dropout, 15, sigmoid
# 5:  encoder 10, dropout, 10, dropout, 3, decoder 10, dropout, 10, dropout, 15, linear
# 6:  encoder 15, dropout, 10, dropout, 10, dropout, 3, decoder 10, dropout, 10, dropout, 10, dropout, 15, linear, batch_size=64 (double)
# 7:  encoder 15, BN, dropout, 10, BN, dropout, 10, BN, dropout, 3, decoder 10, BN, dropout, 10, BN, dropout, 10, BN, dropout, 15, linear | 17: like 7 w_16 perks, but with BN an linear f. encoder yielded zero repeated representations, v_loss=0.58
# 8:  encoder 25, BN, dropout, 15, BN, dropout, 15, BN, dropout, 3, decoder 15, BN, dropout, 15, BN, dropout, 25, BN, dropout, 15, linear
# 9:  encoder 25, BN, dropout, 15, BN, dropout, 15, BN, dropout, 3, decoder 15, BN, dropout, 15, BN, dropout, 25, BN, dropout, 15, linear
# 10: encoder 25, BN, dropout, 15, BN, dropout, 15, BN, dropout, 3, decoder 15, BN, dropout, 15, BN, dropout, 25, BN, dropout, 15, linear (1/0 skip enc/dec) (double training)
# 11: enc.b 25, enc.b 15, enc.b 15, enc.b 15, enc.b 15, 3, dec.b 15, dec.b 15, dec.b 15, dec.b 25, 15, linear (2/1 skip enc/dec)
# 12: enc.b 25, enc.b 15, enc.b 15, enc.b 15, enc.b 15, 3, dec.b 15, dec.b 15, dec.b 15, dec.b 25, 15, linear (2/1 skip enc/dec) (double lr)
# 13: enc.b 25, enc.b 15, enc.b 15, enc.b 15, enc.b 15, 3, dec.b 15, dec.b 15, dec.b 15, dec.b 25, 15, linear (2/1 skip enc/dec) (L2 0.0001)
# 14: enc.b 25, enc.b 15, enc.b 15, enc.b 15, enc.b 15, 3, dec.b 15, dec.b 15, dec.b 15, dec.b 25, 15, linear (2/1 skip enc/dec) ???
# 15: enc.b 25, enc.b 25, enc.b 25, enc.b 25, enc.b 15, 3, dec.b 15, dec.b 25, dec.b 25, dec.b 25, 15, linear (2/1 skip enc/dec) (double depth, "double" units)
# 16: enc.b 25, enc.b 15, enc.b 15, enc.b 15, enc.b 15, 3, dec.b 15, dec.b 15, dec.b 15, dec.b 25, 15, linear (2/1 skip enc/dec) (13 but with linear act.fun. for last layer in the encoder) ==> yielded 11 repeated representations

# # save the collection of histories to a file
# import pickle
# with open('histories.pickle', 'wb') as f:
#     pickle.dump(histories, f)

# # load the collection of histories from a file
# import pickle
# with open('histories.pickle', 'rb') as f:
#     histories_loaded = pickle.load(f)




In [None]:
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from tensorflow.keras.optimizers import Adam
from keras.layers import BatchNormalization

def create_layers(input_layer, units, activation='relu', dropout_rate=0.1):
    """Stack Dense -> BatchNormalization -> Dropout on top of ``input_layer``.

    Args:
        input_layer: tensor the Dense layer is applied to.
        units: number of units of the Dense layer.
        activation: activation of the Dense layer.
        dropout_rate: rate passed to the Dropout layer.

    Returns:
        A three-element list with the output tensor of each stage, in order:
        ``[dense_output, batch_norm_output, dropout_output]``.
    """
    stage_outputs = []
    tensor = input_layer
    for layer in (
        Dense(units, activation=activation),
        BatchNormalization(),
        Dropout(dropout_rate),
    ):
        tensor = layer(tensor)
        stage_outputs.append(tensor)
    return stage_outputs

def autoencoder_model(input_dim, encoding_dim, learning_rate=0.002, weight_decay=0.0001):
    """Build and compile a residual dense autoencoder.

    The network is a chain of ``create_layers`` blocks (Dense ->
    BatchNormalization -> Dropout) with additive skip connections, a linear
    bottleneck of ``encoding_dim`` units and a linear reconstruction layer of
    ``input_dim`` units.

    Args:
        input_dim: number of input features (and of reconstructed outputs).
        encoding_dim: width of the bottleneck layer.
        learning_rate: learning rate of the Adam optimizer.
        weight_decay: weight decay of the Adam optimizer.

    Returns:
        The compiled Keras ``Model`` mapping inputs to their reconstruction.
    """
    input_layer = Input(shape=(input_dim, ))

    # encoder
    encoder = create_layers(input_layer, 25)
    # The output of this block is summed with the raw input on the next line,
    # so it has to be input_dim wide. It used to be hard-coded to 15 units,
    # which made the function fail for any input_dim other than 15.
    encoder = create_layers(encoder[2], input_dim)
    encoder = create_layers(encoder[2] + input_layer, 15)
    encoder_skip = create_layers(encoder[2], 15)
    # skip connection uses the batch-norm output (index 1) of the earlier block
    encoder = create_layers(encoder_skip[2] + encoder[1], 15)
    encoded = Dense(encoding_dim, activation='linear')(encoder[2])

    # decoder
    decoder = create_layers(encoded, 15)
    decoder_skip = create_layers(decoder[2], 15)
    decoder = create_layers(decoder_skip[2] + decoder[1], 15)
    decoder = create_layers(decoder[2], 15)
    decoder = create_layers(decoder[2], 25)
    decoded = Dense(input_dim, activation='linear')(decoder[2])

    autoencoder = Model(inputs=input_layer, outputs=decoded)

    adam = Adam(learning_rate=learning_rate, weight_decay=weight_decay)
    # NOTE(review): 'accuracy' is tracked next to an MSE reconstruction loss;
    # it is presumably of limited meaning here - confirm before relying on it.
    autoencoder.compile(optimizer=adam, loss='mean_squared_error', metrics=['accuracy'])

    return autoencoder

input_dim = base_cadastral_scaled.shape[1]
encoding_dim = 3

# Build the model and train it to reproduce its own input
# (the last 20% of the rows are held out for validation).
autoencoder = autoencoder_model(input_dim, encoding_dim)
history = autoencoder.fit(base_cadastral_scaled, base_cadastral_scaled, epochs=200, batch_size=64, shuffle=True, validation_split=0.2)

# Locate the bottleneck by what it is - the first Dense layer that has
# `encoding_dim` units - instead of by a hard-coded position such as
# layers[-18]. A positional index depends on the exact layer count and,
# presumably, on whether the installed Keras version lists the `+` skip
# connections as layers.
bottleneck = next(
    layer for layer in autoencoder.layers
    if isinstance(layer, Dense) and layer.units == encoding_dim
)
encoder = Model(inputs=autoencoder.input, outputs=bottleneck.output)
encoded_data = encoder.predict(base_cadastral_scaled)
