## **Adversarial PCA** - Generate Synthetic Data

File:  gen_synthetic_data.ipynb

Author:  Billy Carson

Date written:  04-10-2021

Last modified:  04-11-2021

> Description: This script generates two synthetic datasets to use as examples for decomposition with adversarial Principal Component Analysis (aPCA), and to demonstrate the differences in behavior between aPCA variants.


### **Import modules**

In [1]:
# Import modules
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
from numpy.random import RandomState, SeedSequence
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale

# Import aPCA and utilities modules
sys.path.append('..')
from adv_pca import *
from utils import *


### **Set random seed**

In [2]:
# Fix the seed so that every run of the notebook draws the same synthetic samples
random_state = 4
rng = np.random.default_rng(seed=random_state)


## **Synthetic data example 1** - Concomitant data independent of class

### **Generate synthetic data**

In [3]:
# Synthetic dataset 1
# Primary data (X)     - Function of both class and concomitant data
# Concomitant data (Y) - Drawn from one zero-mean Gaussian for all samples, no relation to class

# Number of total samples
n_samp = 10000

# Number of samples in class 1 and class 2 (class 2 takes the remainder if n_samp is odd)
n_samp_1 = n_samp // 2
n_samp_2 = n_samp - n_samp_1

# Latent factor phi mean arrays (one 3-dimensional mean per class)
phi_1_mean = np.array([0.2, -0.1, 0.5])
phi_2_mean = np.array([-0.3, 0.1, -0.4])

# Latent factor phi covariance matrices (unit variances; the classes differ in
# their correlation structure)
# NOTE: an alternative setting with stronger correlations (0.9 in place of
# 0.7 and 0.6, 0.5 in place of 0.4) was left disabled in this cell.
phi_1_sigma = np.array([[1.0, 0.7, 0.0],
                        [0.7, 1.0, 0.1],
                        [0.0, 0.1, 1.0]])
phi_2_sigma = np.array([[1.0, 0.1, 0.6],
                        [0.1, 1.0, 0.4],
                        [0.6, 0.4, 1.0]])

# Generate latent data uncorrupted by noise
# (the order of the rng draws below determines the generated values; keep it fixed)
phi_1 = rng.multivariate_normal(phi_1_mean, phi_1_sigma, size=n_samp_1)
phi_2 = rng.multivariate_normal(phi_2_mean, phi_2_sigma, size=n_samp_2)

# Observed concomitant data is the sum of the true concomitant data and some Gaussian noise
Y_sigma = 7.0
Y_true = rng.normal(0.0, Y_sigma, size=(n_samp, 1))
noise_sigma = 0.2
Y_eps = rng.normal(0.0, noise_sigma, size=(n_samp, 1))
Y = Y_true + Y_eps  # addition already allocates a new array, so Y_true is left untouched

# Observed primary data is the sum of latent data phi, true concomitant data Y_true, and Gaussian noise
# Y_true and X_eps are single columns, so each is broadcast across all columns of phi.
# NOTE(review): because X_eps has shape (n_samp, 1), every feature of a sample
# receives the same noise value - confirm this is intended rather than
# independent per-feature noise.
X_eps = rng.normal(0.0, noise_sigma, size=(n_samp, 1))
X = np.concatenate((phi_1, phi_2), axis=0) + Y_true + X_eps

# Create primary data labels (0 for the first class block, 1 for the second),
# in the same row order as X
labels_1 = np.full(shape=n_samp_1, fill_value=0)
labels_2 = np.full(shape=n_samp_2, fill_value=1)
labels = np.hstack((labels_1, labels_2))

# Save synthetic data (np.save appends the '.npy' extension)
# Create the output directory first so saving does not fail when it is missing
os.makedirs('synthetic_data', exist_ok=True)
np.save('synthetic_data/X_synth_1', X)
np.save('synthetic_data/Y_synth_1', Y)
np.save('synthetic_data/labels_synth_1', labels)


## **Synthetic data example 2** - Concomitant data *dependent* on class

### **Generate synthetic data**

In [4]:
# Synthetic dataset 2
# Primary data (X)     - Function of both class and concomitant data
# Concomitant data (Y) - Function of class (class-dependent mean)

# Number of total samples
n_samp = 10000

# Number of samples in class 1 and class 2 (class 2 takes the remainder if n_samp is odd)
n_samp_1 = n_samp // 2
n_samp_2 = n_samp - n_samp_1

# Latent factor phi mean arrays (one 3-dimensional mean per class)
phi_1_mean = np.array([0.2, -0.1, 0.5])
phi_2_mean = np.array([-0.3, 0.1, -0.4])

# Latent factor phi covariance matrices (unit variances; the classes differ in
# their correlation structure)
phi_1_sigma = np.array([[1.0, 0.9, 0.0],
                        [0.9, 1.0, 0.1],
                        [0.0, 0.1, 1.0]])
phi_2_sigma = np.array([[1.0, 0.1, 0.9],
                        [0.1, 1.0, 0.5],
                        [0.9, 0.5, 1.0]])

# Generate latent data uncorrupted by noise
# (the order of the rng draws below determines the generated values; keep it fixed)
phi_1 = rng.multivariate_normal(phi_1_mean, phi_1_sigma, size=n_samp_1)
phi_2 = rng.multivariate_normal(phi_2_mean, phi_2_sigma, size=n_samp_2)

# Concomitant data now has mean dependent on class: +1.0 for class 1, -1.0 for class 2
Y_sigma = 7.0
Y_1_true = rng.normal(1.0, Y_sigma, size=(n_samp_1, 1))
Y_2_true = rng.normal(-1.0, Y_sigma, size=(n_samp_2, 1))
noise_sigma = 0.2
Y_eps = rng.normal(0.0, noise_sigma, size=(n_samp, 1))
# Stack class 1 on top of class 2 so the rows line up with phi and the labels
Y_true = np.concatenate((Y_1_true, Y_2_true), axis=0)
Y = Y_true + Y_eps

# Observed primary data is the sum of latent data phi, true concomitant data Y_true, and Gaussian noise
# Y_true and X_eps are single columns, so each is broadcast across all columns of phi.
# NOTE(review): because X_eps has shape (n_samp, 1), every feature of a sample
# receives the same noise value - confirm this is intended rather than
# independent per-feature noise.
X_eps = rng.normal(0.0, noise_sigma, size=(n_samp, 1))
X = np.concatenate((phi_1, phi_2), axis=0) + Y_true + X_eps

# Create primary data labels (0 for the first class block, 1 for the second),
# in the same row order as X
labels_1 = np.full(shape=n_samp_1, fill_value=0)
labels_2 = np.full(shape=n_samp_2, fill_value=1)
labels = np.hstack((labels_1, labels_2))

# Save synthetic data (np.save appends the '.npy' extension)
# Create the output directory first so saving does not fail when it is missing
os.makedirs('synthetic_data', exist_ok=True)
np.save('synthetic_data/X_synth_2', X)
np.save('synthetic_data/Y_synth_2', Y)
np.save('synthetic_data/labels_synth_2', labels)
