In [1]:
import pandas as pd
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

import pandas as pd
import numpy as np

from course_small_datasets import training, loader, model
from course_small_datasets import TestModel

In [2]:
# Relative directory that holds the project's data files; the CSV read below is resolved against it.
path_data = 'course_small_datasets/starter_code/part2-synthetic/data/'

In [3]:
# Load the full loan dataset (both 'Loan Status' classes) into a DataFrame.
df = pd.read_csv(path_data + 'loan_continuous.csv')

In [4]:
def describe(df: pd.DataFrame):
    """Build a compact per-column summary table for ``df``.

    Args:
        df: Frame to summarise. Every column must support ``min``, ``max``,
            ``mean`` and ``std`` (i.e. be numeric).

    Returns:
        A DataFrame with one row per column of ``df`` and the columns
        ``'col name'``, ``'dtype'``, ``'min'``, ``'max'``, ``'mean'``, ``'std'``.
    """
    def per_column(stat_name):
        # Look the reducer up by name on each column Series and call it.
        return [getattr(df[column], stat_name)() for column in df.columns]

    summary = {'col name': df.columns, 'dtype': list(df.dtypes)}
    for stat_name in ('min', 'max', 'mean', 'std'):
        summary[stat_name] = per_column(stat_name)
    return pd.DataFrame(summary)

In [5]:
# Show the per-column dtype/min/max/mean/std summary of the raw dataset as the cell output.
describe(df)

Unnamed: 0,col name,dtype,min,max,mean,std
0,Loan Amount,int64,1014.0,35000.0,16848.902776,8367.865726
1,Funded Amount,int64,1014.0,34999.0,15770.599114,8150.992662
2,Funded Amount Investor,float64,1114.590204,34999.75,14621.799323,6785.34517
3,Term,int64,36.0,59.0,58.173814,3.327441
4,Home Ownership,float64,14573.53717,406561.5,80541.502522,45029.120366
5,Debit to Income,float64,0.675299,39.62986,23.299241,8.451824
6,Delinquency - two years,int64,0.0,8.0,0.327127,0.800888
7,Inquires - six months,int64,0.0,5.0,0.145754,0.473291
8,Open Account,int64,2.0,37.0,14.266561,6.22506
9,Public Record,int64,0.0,4.0,0.081437,0.346606


In [6]:
# Keep only the rows with 'Loan Status' == 1, the under-represented class the VAE is trained on.
df_low_count = df.loc[df['Loan Status'].eq(1)]

In [7]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [8]:
# Build the two datasets from the 'Loan Status' == 1 rows only.
# NOTE(review): presumably DataBuilder splits and scales the frame internally and
# `train=False` selects the held-out part — confirm in course_small_datasets.loader.
trainset = loader.DataBuilder(df_low_count)
testset = loader.DataBuilder(df_low_count, train=False)

In [9]:
batch_size = 1024

# Reshuffle the training samples at every epoch so the mini-batch composition
# varies between epochs; the evaluation loader keeps a fixed order.
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)
testloader = DataLoader(testset, batch_size=batch_size)

In [10]:
# Baseline: run the provided classifier test on the original, un-augmented data.
# Reuse `path_data` so the dataset location is defined in exactly one place.
TestModel.test_model(path_data + 'loan_continuous.csv')



Best parameters found:
 {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (5, 10), 'learning_rate': 'constant', 'solver': 'sgd'}
Results on the test set:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     61222
           1       0.00      0.00      0.00      6241

    accuracy                           0.91     67463
   macro avg       0.45      0.50      0.48     67463
weighted avg       0.82      0.91      0.86     67463



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Precision**, **recall** and **f1-score** for `class 1` are all zero, which very likely stems from the class imbalance in the dataset (6,241 samples of class 1 versus 61,222 of class 0).

## Model Testing

In [11]:
# Architecture settings for the two VAE sizes; D_in is the column count of trainset.x.
shallow_params = dict(D_in=trainset.x.shape[1], hidden_dims=[50, 12], latent_dim=3)
deep_params = dict(D_in=trainset.x.shape[1], hidden_dims=[64, 32, 16], latent_dim=6)

This is a wrapper around the training routine. For evaluation we need to reload the model from the path returned by the function.

In [12]:
def train(network_params, save_path, train_loader, test_loader, device):
    """Build a VAE from ``network_params`` and hand it to ``training.train``.

    The network is moved to ``device`` and optimised with Adam (lr=1e-3) using
    the ``model.MSE_KLD`` loss.

    Args:
        network_params: Keyword arguments forwarded to ``model.VAE``.
        save_path: Forwarded to ``training.train`` as ``save_path``.
        train_loader: Loader passed to ``training.train`` for training.
        test_loader: Loader passed to ``training.train`` for validation.
        device: Device the network is moved to; also forwarded to ``training.train``.

    Returns:
        Whatever ``training.train`` returns; the notebook uses it as the path
        of the best saved model.
    """
    net = model.VAE(**network_params)
    net = net.to(device)
    adam = optim.Adam(net.parameters(), lr=1e-3)
    criterion = model.MSE_KLD()

    # NOTE(review): these look like early-stopping / overfit-detection knobs of
    # training.train — confirm their exact meaning in course_small_datasets.training.
    overfit_settings = dict(
        grace_for_overfit=500,
        patience_for_overfit=10,
        relative_val_loss_threshold_for_overfit=0.02,
    )

    # The first positional argument (3000) is presumably the epoch budget,
    # judging by the epoch numbers in the training log — TODO confirm.
    return training.train(
        3000, net, adam, criterion, train_loader, test_loader,
        save_every=-1, print_every=200, save_path=save_path,
        device=device,
        **overfit_settings,
    )



#### Shallow network **without** residual blocks

In [13]:
# Shallow VAE with shortcut connections disabled; keep the returned best-model path.
best_model_path_shallow = train(
    network_params=dict(shallow_params, use_shortcut=False),
    save_path='synthetic_data_shallow_vae',
    train_loader=trainloader,
    test_loader=testloader,
    device=device,
)

====> Epoch: 200 Average training loss 15.85
====> Epoch: 200 Average validation loss 15.50
====> Epoch: 400 Average training loss 15.38
====> Epoch: 400 Average validation loss 15.07
====> Epoch: 600 Average training loss 15.12
====> Epoch: 600 Average validation loss 15.02
====> Epoch: 800 Average training loss 14.99
====> Epoch: 800 Average validation loss 15.06
====> Epoch: 1000 Average training loss 14.90
====> Epoch: 1000 Average validation loss 15.07
====> Epoch: 1200 Average training loss 14.86
====> Epoch: 1200 Average validation loss 15.05
====> Epoch: 1400 Average training loss 14.80
====> Epoch: 1400 Average validation loss 15.06
====> Epoch: 1600 Average training loss 14.76
====> Epoch: 1600 Average validation loss 15.13
====> Epoch: 1800 Average training loss 14.75
====> Epoch: 1800 Average validation loss 15.11
====> Epoch: 2000 Average training loss 14.67
====> Epoch: 2000 Average validation loss 15.20
====> Epoch: 2200 Average training loss 14.67
====> Epoch: 2200 Aver

#### Shallow network with residual blocks

In [14]:
# Shallow VAE with shortcut connections enabled; keep the returned best-model path.
best_model_path_shallow_res = train(
    network_params=dict(shallow_params, use_shortcut=True),
    save_path='synthetic_data_shallow_vae_res',
    train_loader=trainloader,
    test_loader=testloader,
    device=device,
)

====> Epoch: 200 Average training loss 16.22
====> Epoch: 200 Average validation loss 16.04
====> Epoch: 400 Average training loss 15.72
====> Epoch: 400 Average validation loss 15.71
====> Epoch: 600 Average training loss 15.10
====> Epoch: 600 Average validation loss 15.21
====> Epoch: 800 Average training loss 14.96
====> Epoch: 800 Average validation loss 15.10
====> Epoch: 1000 Average training loss 14.83
====> Epoch: 1000 Average validation loss 15.10
====> Epoch: 1200 Average training loss 14.77
====> Epoch: 1200 Average validation loss 15.16
====> Epoch: 1400 Average training loss 14.76
====> Epoch: 1400 Average validation loss 15.12
====> Epoch: 1600 Average training loss 14.74
====> Epoch: 1600 Average validation loss 15.22
====> Epoch: 1800 Average training loss 14.62
====> Epoch: 1800 Average validation loss 15.16
====> Epoch: 2000 Average training loss 14.61
====> Epoch: 2000 Average validation loss 15.13
====> Epoch: 2200 Average training loss 14.56
====> Epoch: 2200 Aver

#### Deep network without residual blocks

In [15]:
# Deep VAE with shortcut connections disabled; keep the returned best-model path.
best_model_path_deep = train(
    network_params=dict(deep_params, use_shortcut=False),
    save_path='synthetic_data_deep_vae',
    train_loader=trainloader,
    test_loader=testloader,
    device=device,
)

====> Epoch: 200 Average training loss 15.63
====> Epoch: 200 Average validation loss 15.34
====> Epoch: 400 Average training loss 14.68
====> Epoch: 400 Average validation loss 14.34
====> Epoch: 600 Average training loss 14.43
====> Epoch: 600 Average validation loss 14.12
====> Epoch: 800 Average training loss 14.15
====> Epoch: 800 Average validation loss 13.92
====> Epoch: 1000 Average training loss 14.00
====> Epoch: 1000 Average validation loss 13.81
====> Epoch: 1200 Average training loss 13.85
====> Epoch: 1200 Average validation loss 13.84
====> Epoch: 1400 Average training loss 13.81
====> Epoch: 1400 Average validation loss 13.87
====> Epoch: 1600 Average training loss 13.73
====> Epoch: 1600 Average validation loss 13.88
====> Epoch: 1800 Average training loss 13.67
====> Epoch: 1800 Average validation loss 13.89
====> Epoch: 2000 Average training loss 13.63
====> Epoch: 2000 Average validation loss 13.97
====> Epoch: 2200 Average training loss 13.58
====> Epoch: 2200 Aver

#### Deep network with residual blocks

In [16]:
# Deep VAE with shortcut connections enabled; keep the returned best-model path.
best_model_path_deep_res = train(
    network_params=dict(deep_params, use_shortcut=True),
    save_path='synthetic_data_deep_vae_res',
    train_loader=trainloader,
    test_loader=testloader,
    device=device,
)

====> Epoch: 200 Average training loss 14.68
====> Epoch: 200 Average validation loss 13.47
====> Epoch: 400 Average training loss 14.16
====> Epoch: 400 Average validation loss 13.25
====> Epoch: 600 Average training loss 13.94
====> Epoch: 600 Average validation loss 13.09
====> Epoch: 800 Average training loss 13.79
====> Epoch: 800 Average validation loss 12.91
====> Epoch: 1000 Average training loss 13.61
====> Epoch: 1000 Average validation loss 12.94
====> Epoch: 1200 Average training loss 13.50
====> Epoch: 1200 Average validation loss 12.81
====> Epoch: 1400 Average training loss 13.47
====> Epoch: 1400 Average validation loss 12.95
====> Epoch: 1600 Average training loss 13.44
====> Epoch: 1600 Average validation loss 12.84
====> Epoch: 1800 Average training loss 13.26
====> Epoch: 1800 Average validation loss 12.79
====> Epoch: 2000 Average training loss 13.27
====> Epoch: 2000 Average validation loss 12.75
====> Epoch: 2200 Average training loss 13.29
====> Epoch: 2200 Aver

## Data generation and model evaluation

Each model run from the previous section produced saved model weights. In this section, we load these weights into their respective models and, using the provided `generate_fake` routine, generate `50_000` additional data points for class 1. We then evaluate the quality of the generated data using the provided test routines. 

With increased model capacity, `hidden_dims = [64, 32, 16]` and `latent_dim = 6`, the validation loss becomes smaller than the training loss. Since I don't understand this behaviour, the deep models are not treated as candidates here (they are still run through the same evaluation below, for comparison only). We are therefore left with two models: 
- Vanilla VAE with hidden layers 50 and 12 and latent dimension of 3
- VAE with residual blocks of the same dimensions as vanilla VAE. 

Next we read the best saved models of the two experiments. 

In [17]:
def generate_fake(model, loader, scaler, no_samples, device):
    """Sample synthetic rows from a trained VAE.

    Every batch in ``loader`` is encoded to collect the posterior parameters.
    A single diagonal Gaussian is then built from the per-dimension mean of
    ``mu`` and the per-dimension mean of ``sigma = exp(logvar / 2)``;
    ``no_samples`` latent vectors are drawn from it, decoded with
    ``model.decoder`` and mapped back with ``scaler.inverse_transform``.

    Args:
        model: Trained VAE module. Calling it must return a 3-tuple whose last
            two items are ``mu`` and ``logvar``; it must expose ``decoder``.
        loader: Iterable yielding input tensors (one batch per item).
        scaler: Object providing ``inverse_transform`` for a NumPy array.
        no_samples: Number of synthetic rows to generate.
        device: Device the batches are moved to before encoding.

    Returns:
        The result of ``scaler.inverse_transform`` applied to the decoded
        samples (one row per generated sample).

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    # Inference must not run in training mode: layers such as BatchNorm or
    # Dropout (if the model has any) would otherwise use batch statistics or
    # random masking. The previous mode is restored afterwards so the caller's
    # model state is left untouched.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            mus = []
            logvars = []
            for data in loader:
                _, mu, logvar = model(data.to(device))
                mus.append(mu)
                logvars.append(logvar)

            if not mus:
                raise ValueError('generate_fake: `loader` yielded no batches')

            mu = torch.cat(mus, dim=0)
            sigma = torch.exp(torch.cat(logvars, dim=0) / 2)

            # NOTE(review): all per-sample posteriors are collapsed into one
            # Gaussian (mean of mu, mean of sigma). Sampling from individual
            # posteriors or from the prior may give more diverse data; kept
            # as-is to preserve the provided routine's behaviour.
            q = torch.distributions.Normal(mu.mean(dim=0), sigma.mean(dim=0))
            z = q.sample(torch.Size([no_samples]))

            pred = model.decoder(z).cpu().numpy()
    finally:
        model.train(was_training)

    fake_data = scaler.inverse_transform(pred)
    return fake_data

We modify the tests slightly by increasing `max_iter` from 100 to 1000 to avoid convergence issues with the optimizers. 

In [18]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report


def run_test(x_df, y_df, test_size=None, random_state=None):
    """Grid-search an MLP classifier and print a classification report.

    Args:
        x_df: Feature matrix.
        y_df: Class labels aligned with ``x_df``.
        test_size: Optional hold-out fraction/size passed to
            ``train_test_split``. With the default ``None`` the classifier is
            fitted on all data and the report is computed on that same data
            (in-sample), which is what the original routine did.
        random_state: Seed for the hold-out split; ignored when ``test_size``
            is ``None``.

    Returns:
        None. The best parameters and the classification report are printed.
    """
    mlp = MLPClassifier(max_iter=1000)
    ## Feel free to play with these parameters if you want
    parameter_space = {
        # (12,) is a one-element tuple; a bare (12) is just the integer 12.
        'hidden_layer_sizes': [(5, 10), (12,), (2, 5, 10, 15)],
        'activation': ['tanh', 'relu', 'logistic'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.05, 0.01],
        'learning_rate': ['constant', 'adaptive'],
    }

    if test_size is None:
        # No hold-out: the report below is measured on the fitting data, so it
        # is labelled as in-sample rather than as a test-set result.
        x_fit, y_fit = x_df, y_df
        x_eval, y_eval = x_df, y_df
        report_title = 'Results on the training data (in-sample):'
    else:
        from sklearn.model_selection import train_test_split

        # Stratify so both splits keep the class proportions of y_df.
        x_fit, x_eval, y_fit, y_eval = train_test_split(
            x_df, y_df, test_size=test_size, stratify=y_df, random_state=random_state
        )
        report_title = 'Results on the test set:'

    clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
    clf.fit(x_fit, y_fit)

    print('Best parameters found:\n', clf.best_params_)

    # Compare actuals with predicted values
    y_pred = clf.predict(x_eval)

    print(report_title)
    print(classification_report(y_eval, y_pred))

The generated samples are then mixed with the original data, as requested by the project:

In [47]:
def load_model(model_params, model_path):
    """Rebuild a VAE and load saved weights into it for inference.

    Args:
        model_params: Keyword arguments forwarded to ``model.VAE``; they must
            describe the same architecture the checkpoint was trained with.
        model_path: Path of a checkpoint whose ``'model_state_dict'`` entry
            holds the network weights.

    Returns:
        The VAE with the loaded weights, on the CPU and in eval mode. Move it
        to another device with ``.to(device)`` as needed.
    """
    vae = model.VAE(**model_params)
    # map_location='cpu' lets a checkpoint saved on a GPU be loaded on a
    # machine without CUDA; the freshly built model lives on the CPU anyway.
    # NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
    checkpoint = torch.load(model_path, map_location='cpu')
    vae.load_state_dict(checkpoint['model_state_dict'])
    # The model is reloaded for evaluation/generation, so switch off
    # training-only behaviour (e.g. BatchNorm batch statistics, Dropout).
    vae.eval()
    return vae

In [54]:
def generate_fake_data_and_run_test(model_params, model_path, loader, scaler, num_samples, device, df_original,
                                    random_state=None):
    """Augment the original data with VAE samples and run the classifier test.

    Args:
        model_params: Keyword arguments used to rebuild the VAE.
        model_path: Path of the saved checkpoint to load.
        loader: Loader whose batches are encoded to parameterise the sampler.
        scaler: Scaler used to map generated rows back to the original scale.
        num_samples: Number of synthetic rows to generate.
        device: Device used for encoding/decoding.
        df_original: Original DataFrame; the generated rows take its columns.
        random_state: Optional seed for shuffling the augmented frame
            (default ``None`` keeps the shuffle unseeded, as before).

    Returns:
        The shuffled DataFrame containing the original and the synthetic rows.
    """
    # Use a name other than `model` so the imported `model` module is not shadowed.
    vae = load_model(model_params, model_path).to(device)
    fake_data = generate_fake(vae, loader, scaler, num_samples, device)
    # NOTE(review): this requires the generated array to have exactly as many
    # columns as df_original — presumably the scaler was fitted on all of them.
    df_fake = pd.DataFrame(fake_data, columns=df_original.columns)
    # set the 'Loan Status' to 1
    df_fake['Loan Status'] = 1
    df_augmented = (
        pd.concat([df_original, df_fake], ignore_index=True)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )
    print('Running GridSearchCV with max iteration = 1000 ...')
    run_test(*TestModel.load_xy(df_augmented))
    return df_augmented

In [55]:
# Pass 'use_shortcut': False explicitly so the rebuilt network matches the one
# trained for best_model_path_shallow instead of relying on the VAE default.
df_aug_shallow = generate_fake_data_and_run_test({**shallow_params, 'use_shortcut': False},
                                                 best_model_path_shallow,
                                                 testloader, trainset.scaler, 50_000, device, df)

Running GridSearchCV with max iteration = 1000 ...




Best parameters found:
 {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (5, 10), 'learning_rate': 'constant', 'solver': 'adam'}
Results on the test set:
              precision    recall  f1-score   support

           0       0.89      0.82      0.85     61222
           1       0.82      0.88      0.85     56241

    accuracy                           0.85    117463
   macro avg       0.85      0.85      0.85    117463
weighted avg       0.86      0.85      0.85    117463



In [56]:
# Augment with samples from the shallow VAE that uses shortcut connections, then run the test.
df_aug_shallow_res = generate_fake_data_and_run_test(
    model_params=dict(shallow_params, use_shortcut=True),
    model_path=best_model_path_shallow_res,
    loader=testloader,
    scaler=trainset.scaler,
    num_samples=50_000,
    device=device,
    df_original=df,
)

Running GridSearchCV with max iteration = 1000 ...




Best parameters found:
 {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (2, 5, 10, 15), 'learning_rate': 'adaptive', 'solver': 'adam'}
Results on the test set:
              precision    recall  f1-score   support

           0       0.81      0.89      0.85     61222
           1       0.87      0.78      0.82     56241

    accuracy                           0.83    117463
   macro avg       0.84      0.83      0.83    117463
weighted avg       0.84      0.83      0.83    117463



In [57]:
# Augment with samples from the deep VAE without shortcut connections, then run the test.
df_aug_deep = generate_fake_data_and_run_test(
    model_params=dict(deep_params, use_shortcut=False),
    model_path=best_model_path_deep,
    loader=testloader,
    scaler=trainset.scaler,
    num_samples=50_000,
    device=device,
    df_original=df,
)

Running GridSearchCV with max iteration = 1000 ...




Best parameters found:
 {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': 12, 'learning_rate': 'constant', 'solver': 'adam'}
Results on the test set:
              precision    recall  f1-score   support

           0       0.72      0.56      0.63     61222
           1       0.62      0.76      0.68     56241

    accuracy                           0.66    117463
   macro avg       0.67      0.66      0.66    117463
weighted avg       0.67      0.66      0.66    117463



In [58]:
# Augment with samples from the deep VAE that uses shortcut connections, then run the test.
df_aug_deep_res = generate_fake_data_and_run_test(
    model_params=dict(deep_params, use_shortcut=True),
    model_path=best_model_path_deep_res,
    loader=testloader,
    scaler=trainset.scaler,
    num_samples=50_000,
    device=device,
    df_original=df,
)

Running GridSearchCV with max iteration = 1000 ...




Best parameters found:
 {'activation': 'logistic', 'alpha': 0.05, 'hidden_layer_sizes': 12, 'learning_rate': 'constant', 'solver': 'adam'}
Results on the test set:
              precision    recall  f1-score   support

           0       0.73      0.57      0.64     61222
           1       0.62      0.77      0.69     56241

    accuracy                           0.67    117463
   macro avg       0.68      0.67      0.66    117463
weighted avg       0.68      0.67      0.66    117463



## Conclusion

In the work presented above, four networks were tested for generation of synthetic data:
1. Shallow network without residual connections
2. Shallow network with residual connections
3. Deep network without residual connections
4. Deep network with residual connections

The following observations have been made:
- The impact of residual connections on the quality of generated data, as indicated by F1 scores, has been inconclusive. For the shallow networks, the one without residual connections performed better, while for the deep network, the one with residual connections was slightly better.
- When comparing deep and shallow networks, the shallow networks significantly outperformed the deep ones, as evidenced by much higher F1 scores.
