In [1]:
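## Balanced AutoEncoder
# Reference copy of the hyperparameter grid searched by this notebook
# (the live definitions are assigned in the trial-list cell further down):
# encoder_dense_layers_trial = [[10, 8], [12, 10], [14, 12], [16, 14], [18, 16], [20, 18] ,[22,20]]
# decoder_dense_layers_trial = [[8, 10], [10, 12], [12, 14], [14, 16], [16, 18], [18, 20], [20,22]]
# bottle_neck_trial = [8, 10, 12, 14, 16, 18]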

In [2]:
#pip install tf2onnx
from tensorflow import keras
import time
import pickle
import pandas as pd

In [3]:
# Input CSV; generate_autoencoder_reports re-reads it on every call.
FILE = "Combined_AM_Data.csv"

# One [model name, final train loss, final validation loss] row is appended
# here for every configuration that finishes training.
run_summary = list()

In [4]:
def _build_autoencoder(input_shape, encoder_dense_layers=(), bottle_neck=None,
                       decoder_dense_layers=(), decoder_activation='sigmoid'):
    """Assemble a fully connected autoencoder and its two halves.

    Args:
        input_shape: Number of input features (width of one sample).
        encoder_dense_layers: Unit counts of the ReLU layers placed before the
            bottleneck, in order.
        bottle_neck: Width of the latent layer. Defaults to ``input_shape // 2``.
        decoder_dense_layers: Unit counts of the ReLU layers placed after the
            bottleneck, in order.
        decoder_activation: Activation of the final reconstruction layer.

    Returns:
        Tuple ``(autoencoder, encoder, decoder)`` of ``keras.Model`` objects;
        ``autoencoder`` is the encoder and decoder chained end to end.
    """
    if bottle_neck is None:
        bottle_neck = input_shape // 2

    # Encoder: input -> hidden ReLU stack -> bottleneck.
    encoder_input = keras.Input(shape=(input_shape,), name="encoder")
    # Flatten leaves an already one-dimensional sample unchanged.
    x = keras.layers.Flatten()(encoder_input)
    for units in encoder_dense_layers:
        x = keras.layers.Dense(units, activation="relu")(x)
    encoder_output = keras.layers.Dense(bottle_neck, activation="relu")(x)
    encoder = keras.Model(encoder_input, encoder_output, name="encoder")

    # Decoder: bottleneck -> hidden ReLU stack -> reconstruction.
    decoder_input = keras.Input(shape=(bottle_neck,), name="decoder")
    x = decoder_input
    for units in decoder_dense_layers:
        x = keras.layers.Dense(units, activation="relu")(x)
    decoder_output = keras.layers.Dense(input_shape, activation=decoder_activation)(x)
    decoder = keras.Model(decoder_input, decoder_output, name="decoder")

    # End-to-end model used for training and reconstruction.
    autoencoder_input = keras.Input(shape=(input_shape,), name="input")
    decoded = decoder(encoder(autoencoder_input))
    autoencoder = keras.Model(autoencoder_input, decoded, name="autoencoder")

    return autoencoder, encoder, decoder


def generate_autoencoder_reports(encoder_dense_layers, bottle_neck, decoder_dense_layers):
    """Train one autoencoder on the class-0 rows of FILE and write its reports.

    The rows of ``FILE`` whose ``class`` column equals 0 are used (without the
    label) both as input and as reconstruction target. All artefacts are
    written to the current directory under a common prefix
    ``L<n_features>_E<encoder units>_B<bottleneck>_D<decoder units>``:

    * ``<prefix>_history.pickle``: the Keras training history dict.
    * ``<prefix>mean_std.csv``: per-column mean/std of the original and the
      reconstructed data.
    * ``<prefix>_Original_minority_data.csv``: training rows with ``class`` = 0.0.
    * ``<prefix>_Synthetic_minority_data.csv``: reconstructed rows with
      ``class`` = 0.0.

    The prefix and the final-epoch train/validation losses are printed and
    appended to the module-level ``run_summary`` list.

    Args:
        encoder_dense_layers: Unit counts of the encoder's hidden layers.
        bottle_neck: Width of the latent layer.
        decoder_dense_layers: Unit counts of the decoder's hidden layers.

    Raises:
        ValueError: If no class-0 rows remain after dropping missing values.
    """
    # This function is called once per grid configuration. Without a reset,
    # the models created by earlier calls stay referenced by Keras' global
    # state and memory grows for the whole sweep.
    keras.backend.clear_session()

    # Load Data
    df_am = pd.read_csv(FILE)

    columns_needed = ['income', 'road_dist', 'cooking', 'y_am_pef', 'tempin',
                      'humidin', 'pm25in', 'co2in', 'tempdiffin', 'humidiffin',
                      'pm25diffin', 'pm10', 'pm25', 'o3', 'no2', 'co', 'so2', 'temp',
                      'windsd', 'humid', 'varp', 'dewpt', 'airp', 'seap', 'solrhr',
                      'solramnt', 'grdt', 'class']

    # Fixed-seed shuffle so the tail slice taken by validation_split is the
    # same on every call; rows with any missing value are discarded afterwards.
    df = (
        df_am.filter(columns_needed)
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
        .dropna()
    )
    minority_class = df[df['class'] == 0]
    X_train = minority_class.drop('class', axis=1)
    if X_train.empty:
        raise ValueError(f"No rows with class == 0 left in {FILE} after dropna(); nothing to train on")

    # Define the Input shape
    INPUT_SHAPE = X_train.shape[1]
    FILE_NAME = f"L{INPUT_SHAPE}_E{'_'.join(map(str, encoder_dense_layers))}_B{bottle_neck}_D{'_'.join(map(str, decoder_dense_layers))}"

    # NOTE(review): the default sigmoid output can only produce values in
    # (0, 1); this presumably relies on the CSV features being pre-scaled to
    # that range. Confirm against the data preparation step.
    autoencoder, _, _ = _build_autoencoder(INPUT_SHAPE,
                                           encoder_dense_layers=encoder_dense_layers,
                                           bottle_neck=bottle_neck,
                                           decoder_dense_layers=decoder_dense_layers)
    opt = keras.optimizers.Adam(learning_rate=0.001)
    autoencoder.compile(opt, loss="mse")

    # NOTE(review): no framework seed is set here, so the reported losses
    # differ from run to run.
    history = autoencoder.fit(X_train, X_train, epochs=125, batch_size=16,
                              validation_split=0.25, verbose=0)

    # Losses of the final epoch are what gets reported.
    last_epoch_loss = history.history['loss'][-1]
    last_epoch_val_loss = history.history['val_loss'][-1]

    # Saving history
    with open(FILE_NAME + '_history.pickle', 'wb') as file:
        pickle.dump(history.history, file)

    # The "synthetic" rows are the autoencoder's reconstructions of the
    # training rows themselves (one output row per input row).
    generated_data = autoencoder.predict(X_train)
    df_generated = pd.DataFrame(generated_data, columns=X_train.columns)

    # Per-column mean/std of original (df1) vs. reconstructed (df2) data.
    # Iterating over the column index instead of a set keeps the CSV column
    # order identical across interpreter runs.
    comparison_df = pd.DataFrame({
        column: {'Mean_df1': X_train[column].mean(), 'Std_df1': X_train[column].std(),
                 'Mean_df2': df_generated[column].mean(), 'Std_df2': df_generated[column].std()}
        for column in X_train.columns
    })
    # The missing "_" separator is kept so existing readers of this file name
    # continue to work.
    comparison_df.to_csv(FILE_NAME + 'mean_std.csv', index=True)

    # Add back the class label on copies, leaving the training frame untouched.
    class_label = {'class': 0.0}
    X_train.assign(**class_label).to_csv(FILE_NAME + '_Original_minority_data.csv', index=False)
    df_generated.assign(**class_label).to_csv(FILE_NAME + '_Synthetic_minority_data.csv', index=False)
    print(FILE_NAME, last_epoch_loss, last_epoch_val_loss)

    run_summary.append([FILE_NAME, last_epoch_loss, last_epoch_val_loss])

In [5]:
def runner(encoder_dense_layers, bottle_neck, decoder_dense_layers):
    """Run the full train-and-report step for one configuration.

    Pass-through to ``generate_autoencoder_reports``; nothing is returned.
    """
    generate_autoencoder_reports(
        encoder_dense_layers=encoder_dense_layers,
        bottle_neck=bottle_neck,
        decoder_dense_layers=decoder_dense_layers,
    )

In [6]:
# Encoder candidates: two hidden layers narrowing by 2 units, first layer 10..22.
encoder_dense_layers_trial = [[width, width - 2] for width in range(10, 24, 2)]
# Latent widths 8, 10, ..., 18.
bottle_neck_trial = list(range(8, 20, 2))
# Decoder candidates are the encoder candidates mirrored (widening by 2 units).
decoder_dense_layers_trial = [layers[::-1] for layers in encoder_dense_layers_trial]

In [7]:
# Size of the full grid: every bottleneck paired with every encoder and decoder stack.
total_iterations = (
    len(bottle_neck_trial)
    * len(encoder_dense_layers_trial)
    * len(decoder_dense_layers_trial)
)
print("Total Model in Pipeline:", total_iterations)

# List each configuration in the order the training loop visits them
# (bottleneck outermost, decoder innermost).
for enc_layers, bn, dec_layers in (
    (enc_layers, bn, dec_layers)
    for bn in bottle_neck_trial
    for enc_layers in encoder_dense_layers_trial
    for dec_layers in decoder_dense_layers_trial
):
    print(enc_layers, bn, dec_layers)

Total Model in Pipeline: 294
[10, 8] 8 [8, 10]
[10, 8] 8 [10, 12]
[10, 8] 8 [12, 14]
[10, 8] 8 [14, 16]
[10, 8] 8 [16, 18]
[10, 8] 8 [18, 20]
[10, 8] 8 [20, 22]
[12, 10] 8 [8, 10]
[12, 10] 8 [10, 12]
[12, 10] 8 [12, 14]
[12, 10] 8 [14, 16]
[12, 10] 8 [16, 18]
[12, 10] 8 [18, 20]
[12, 10] 8 [20, 22]
[14, 12] 8 [8, 10]
[14, 12] 8 [10, 12]
[14, 12] 8 [12, 14]
[14, 12] 8 [14, 16]
[14, 12] 8 [16, 18]
[14, 12] 8 [18, 20]
[14, 12] 8 [20, 22]
[16, 14] 8 [8, 10]
[16, 14] 8 [10, 12]
[16, 14] 8 [12, 14]
[16, 14] 8 [14, 16]
[16, 14] 8 [16, 18]
[16, 14] 8 [18, 20]
[16, 14] 8 [20, 22]
[18, 16] 8 [8, 10]
[18, 16] 8 [10, 12]
[18, 16] 8 [12, 14]
[18, 16] 8 [14, 16]
[18, 16] 8 [16, 18]
[18, 16] 8 [18, 20]
[18, 16] 8 [20, 22]
[20, 18] 8 [8, 10]
[20, 18] 8 [10, 12]
[20, 18] 8 [12, 14]
[20, 18] 8 [14, 16]
[20, 18] 8 [16, 18]
[20, 18] 8 [18, 20]
[20, 18] 8 [20, 22]
[22, 20] 8 [8, 10]
[22, 20] 8 [10, 12]
[22, 20] 8 [12, 14]
[22, 20] 8 [14, 16]
[22, 20] 8 [16, 18]
[22, 20] 8 [18, 20]
[22, 20] 8 [20, 22]
[10, 

In [8]:
# Train every configuration of the grid and report the wall-clock time taken.
start_time = time.time()

# Same visiting order as three nested loops: bottleneck outermost,
# then encoder stack, then decoder stack.
configurations = (
    (enc_layers, bn, dec_layers)
    for bn in bottle_neck_trial
    for enc_layers in encoder_dense_layers_trial
    for dec_layers in decoder_dense_layers_trial
)
for enc_layers, bn, dec_layers in configurations:
    runner(enc_layers, bn, dec_layers)

end_time = time.time()
total_time = end_time - start_time
total_time_minutes = total_time / 60
print(f"Total time: {total_time_minutes} minutes")

L27_E10_8_B8_D8_10 0.01635112054646015 0.017381971701979637
L27_E10_8_B8_D10_12 0.019483068957924843 0.02041078358888626
L27_E10_8_B8_D12_14 0.01637430302798748 0.01734296791255474
L27_E10_8_B8_D14_16 0.016229839995503426 0.01712253876030445
L27_E10_8_B8_D16_18 0.016545629128813744 0.017699316143989563
L27_E10_8_B8_D18_20 0.016038134694099426 0.017354749143123627
L27_E10_8_B8_D20_22 0.014380166307091713 0.016145983710885048
L27_E12_10_B8_D8_10 0.017415393143892288 0.018367087468504906
L27_E12_10_B8_D10_12 0.012957308441400528 0.01372518576681614
L27_E12_10_B8_D12_14 0.016922010108828545 0.017908046022057533
L27_E12_10_B8_D14_16 0.01672045700252056 0.017842574045062065
L27_E12_10_B8_D16_18 0.01625795289874077 0.0171748585999012
L27_E12_10_B8_D18_20 0.01188098918646574 0.012830845080316067
L27_E12_10_B8_D20_22 0.01589135266840458 0.017331404611468315
L27_E14_12_B8_D8_10 0.018680056557059288 0.01959582231938839
L27_E14_12_B8_D10_12 0.014646124094724655 0.015776008367538452
L27_E14_12_B8_D

In [9]:
# Display the collected [model name, final train loss, final validation loss] rows.
run_summary

[['L27_E10_8_B8_D8_10', 0.01635112054646015, 0.017381971701979637],
 ['L27_E10_8_B8_D10_12', 0.019483068957924843, 0.02041078358888626],
 ['L27_E10_8_B8_D12_14', 0.01637430302798748, 0.01734296791255474],
 ['L27_E10_8_B8_D14_16', 0.016229839995503426, 0.01712253876030445],
 ['L27_E10_8_B8_D16_18', 0.016545629128813744, 0.017699316143989563],
 ['L27_E10_8_B8_D18_20', 0.016038134694099426, 0.017354749143123627],
 ['L27_E10_8_B8_D20_22', 0.014380166307091713, 0.016145983710885048],
 ['L27_E12_10_B8_D8_10', 0.017415393143892288, 0.018367087468504906],
 ['L27_E12_10_B8_D10_12', 0.012957308441400528, 0.01372518576681614],
 ['L27_E12_10_B8_D12_14', 0.016922010108828545, 0.017908046022057533],
 ['L27_E12_10_B8_D14_16', 0.01672045700252056, 0.017842574045062065],
 ['L27_E12_10_B8_D16_18', 0.01625795289874077, 0.0171748585999012],
 ['L27_E12_10_B8_D18_20', 0.01188098918646574, 0.012830845080316067],
 ['L27_E12_10_B8_D20_22', 0.01589135266840458, 0.017331404611468315],
 ['L27_E14_12_B8_D8_10', 0.

In [10]:
# Persist the per-model final losses as a table, one row per trained configuration.
summary_columns = ['Model', 'Train Loss', 'Validation Loss']
run_summary_df = pd.DataFrame(run_summary, columns=summary_columns)
run_summary_df.to_csv('run_summary.csv', index=False)