In [1]:
from GaussianCopulaImp.gaussian_copula import GaussianCopula
from GaussianCopulaImp.low_rank_gaussian_copula import LowRankGaussianCopula
from GaussianCopulaImp.helper_data_generation import generate_sigma, generate_mixed_from_gc, generate_LRGC
from GaussianCopulaImp.helper_evaluation import get_smae, get_scaled_error, grassman_dist
from GaussianCopulaImp.helper_mask import mask_types, mask
import numpy as np

In [2]:
# One seed is shared by every generation/masking step so the experiment is repeatable.
seed = 101

# Ground-truth copula correlation for p=15 variables; kept so the fitted
# correlation can be compared against it later.
copula_corr = generate_sigma(p=15, seed=seed)

# Complete (fully observed) data set of n=2000 rows drawn from that copula.
X = generate_mixed_from_gc(
    n=2000,
    sigma=copula_corr,
    seed=seed,
)

# Copy of X with entries hidden; the models below only ever see X_mask.
# NOTE(review): the exact meaning of mask_num=2 is defined by mask_types — confirm.
X_mask = mask_types(X, seed=seed, mask_num=2)

In [3]:
# Default Gaussian copula model; verbose=1 makes the fit print per-iteration progress.
model = GaussianCopula(verbose=1)

# Fit on the masked data and keep the returned matrix as the imputed data set.
Ximp = model.fit_transform(
    X=X_mask,
)

Iteration 1: copula correlation update ratio 0.126, likelihood -3.181
Copula correlation change ratio:  0.126
Iteration 2: copula correlation update ratio 0.086, likelihood -2.978
Copula correlation change ratio:  0.0856
Iteration 3: copula correlation update ratio 0.057, likelihood -2.817
Copula correlation change ratio:  0.0569
Iteration 4: copula correlation update ratio 0.038, likelihood -2.690
Copula correlation change ratio:  0.0382
Iteration 5: copula correlation update ratio 0.026, likelihood -2.591
Copula correlation change ratio:  0.0264
Iteration 6: copula correlation update ratio 0.019, likelihood -2.513
Copula correlation change ratio:  0.0188
Iteration 7: copula correlation update ratio 0.014, likelihood -2.451
Copula correlation change ratio:  0.0138
Iteration 8: copula correlation update ratio 0.010, likelihood -2.402
Copula correlation change ratio:  0.0105
Iteration 9: copula correlation update ratio 0.008, likelihood -2.363
Copula correlation change ratio:  0.0081
Co

In [4]:
# Score the imputation against the complete data X; X_mask tells get_smae
# which entries were hidden. The result is indexed by variable (column).
smae = get_smae(Ximp, X, X_mask)

# Columns come in three blocks of five, one per variable type. Reporting them
# through a single loop keeps the message format identical across groups
# (and fixes the misspelled "oridnal" label of the original report).
smae_groups = (
    ('5 exponential variables', smae[:5]),
    ('5 1-5 ordinal variables', smae[5:10]),
    ('5 boolean variables', smae[10:]),
)
for group_label, group_smae in smae_groups:
    print(f'The SMAE across {group_label} has: mean {group_smae.mean():.3f} and std {group_smae.std():.3f}')

The SMAE across 5 exponential variables has: mean 0.740 and std 0.048
The SMAE across 5 1-5 oridnal variables has: mean 0.812 and std 0.121
The SMAE across 5 boolean variables has: mean 0.590 and std 0.047


In [5]:
# Pull the fitted copula correlation out of the model's parameter dictionary.
fitted_params = model.get_params()
copula_corr_est = fitted_params['copula_corr']

# Compare the estimate with the ground-truth correlation used to simulate the data.
cor_error = get_scaled_error(copula_corr_est, copula_corr)
print(f'The scaled correlation error is: {cor_error:.3f}')

The scaled correlation error is: 0.149


In [6]:
# Same model class, but trained with the 'minibatch-offline' training mode.
model_minibatch = GaussianCopula(
    training_mode='minibatch-offline',
)

# Fit on the same masked data; Ximp is overwritten with this model's imputation.
Ximp = model_minibatch.fit_transform(
    X=X_mask,
)

In [7]:
# Score the minibatch-offline imputation against the complete data X; X_mask
# tells get_smae which entries were hidden. The result is indexed by variable.
smae = get_smae(Ximp, X, X_mask)

# Columns come in three blocks of five, one per variable type. Reporting them
# through a single loop keeps the message format identical across groups
# (and fixes the misspelled "oridnal" label of the original report).
smae_groups = (
    ('5 exponential variables', smae[:5]),
    ('5 1-5 ordinal variables', smae[5:10]),
    ('5 boolean variables', smae[10:]),
)
for group_label, group_smae in smae_groups:
    print(f'The SMAE across {group_label} has: mean {group_smae.mean():.3f} and std {group_smae.std():.3f}')

The SMAE across 5 exponential variables has: mean 0.738 and std 0.048
The SMAE across 5 1-5 oridnal variables has: mean 0.818 and std 0.124
The SMAE across 5 boolean variables has: mean 0.589 and std 0.048


In [8]:
# Pull the correlation fitted by the minibatch-offline model out of its parameters.
minibatch_params = model_minibatch.get_params()
copula_corr_est = minibatch_params['copula_corr']

# Compare the estimate with the ground-truth correlation used to simulate the data.
cor_error = get_scaled_error(copula_corr_est, copula_corr)
print(f'The scaled correlation error is: {cor_error:.3f}')

The scaled correlation error is: 0.151


In [9]:
# Flag per column: the first 5 columns are declared continuous, the other 10 are not.
is_continuous = [True] * 5 + [False] * 10

# 'minibatch-online' training mode with the variable types supplied up front.
model_online = GaussianCopula(
    training_mode='minibatch-online',
    cont_indices=is_continuous,
)

# Fit on the same masked data; Ximp is overwritten with this model's imputation.
Ximp = model_online.fit_transform(
    X=X_mask,
)

In [10]:
# Score the minibatch-online imputation against the complete data X; X_mask
# tells get_smae which entries were hidden. The result is indexed by variable.
smae = get_smae(Ximp, X, X_mask)

# Columns come in three blocks of five, one per variable type. Reporting them
# through a single loop keeps the message format identical across groups
# (and fixes the misspelled "oridnal" label of the original report).
smae_groups = (
    ('5 exponential variables', smae[:5]),
    ('5 1-5 ordinal variables', smae[5:10]),
    ('5 boolean variables', smae[10:]),
)
for group_label, group_smae in smae_groups:
    print(f'The SMAE across {group_label} has: mean {group_smae.mean():.3f} and std {group_smae.std():.3f}')

The SMAE across 5 exponential variables has: mean 0.821 and std 0.034
The SMAE across 5 1-5 oridnal variables has: mean 0.908 and std 0.067
The SMAE across 5 boolean variables has: mean 0.665 and std 0.033


In [11]:
# Pull the correlation fitted by the minibatch-online model out of its parameters.
online_params = model_online.get_params()
copula_corr_est = online_params['copula_corr']

# Compare the estimate with the ground-truth correlation used to simulate the data.
cor_error = get_scaled_error(copula_corr_est, copula_corr)
print(f'The scaled correlation error is: {cor_error:.3f}')

The scaled correlation error is: 0.192


In [12]:
# Online minibatch training again, this time with const_stepsize=None.
# NOTE(review): the variable name suggests None selects a decaying step size
# instead of a constant one — confirm against the library documentation.
online_decay_options = dict(
    training_mode='minibatch-online',
    cont_indices=[True] * 5 + [False] * 10,  # first 5 columns continuous, remaining 10 not
    const_stepsize=None,
    verbose=1,
)
model_online_decay = GaussianCopula(**online_decay_options)

# Fit on the same masked data; Ximp is overwritten with this model's imputation.
Ximp = model_online_decay.fit_transform(X=X_mask)

In [13]:
# Score the latest online imputation against the complete data X; X_mask
# tells get_smae which entries were hidden. The result is indexed by variable.
smae = get_smae(Ximp, X, X_mask)

# Columns come in three blocks of five, one per variable type. Reporting them
# through a single loop keeps the message format identical across groups
# (and fixes the misspelled "oridnal" label of the original report).
smae_groups = (
    ('5 exponential variables', smae[:5]),
    ('5 1-5 ordinal variables', smae[5:10]),
    ('5 boolean variables', smae[10:]),
)
for group_label, group_smae in smae_groups:
    print(f'The SMAE across {group_label} has: mean {group_smae.mean():.3f} and std {group_smae.std():.3f}')

The SMAE across 5 exponential variables has: mean 0.817 and std 0.037
The SMAE across 5 1-5 oridnal variables has: mean 0.896 and std 0.074
The SMAE across 5 boolean variables has: mean 0.657 and std 0.042


In [14]:
# Pull the correlation fitted by model_online_decay out of its parameters.
online_decay_params = model_online_decay.get_params()
copula_corr_est = online_decay_params['copula_corr']

# Compare the estimate with the ground-truth correlation used to simulate the data.
cor_error = get_scaled_error(copula_corr_est, copula_corr)
print(f'The scaled correlation error is: {cor_error:.3f}')

The scaled correlation error is: 0.196


In [15]:
# Seed for the low-rank experiment (same value as the first experiment).
seed = 101

# Column layout of the synthetic data: indices 0-99 continuous,
# 100-149 ordinal, 150-199 binary (200 columns in total).
var_types = {
    'cont': list(range(100)),
    'ord': list(range(100, 150)),
    'bin': list(range(150, 200)),
}

# Draw n=2000 rows from a rank-10 low-rank Gaussian copula. Two objects are
# returned: the complete data and the true W that grassman_dist is later
# evaluated against. Continuous columns are passed through a cubic transform.
Xtrue, Wtrue = generate_LRGC(
    var_types=var_types,
    rank=10,
    sigma=0.1,
    n=2000,
    cont_transform=lambda x: np.power(x, 3),
    seed=seed,
)

In [16]:
# Seed NumPy's global RNG in addition to handing the seed to mask explicitly;
# whether mask relies on the global state is not visible from this notebook.
np.random.seed(seed)

# Hide entries of Xtrue with mask_fraction=0.4; the model below only sees X_masked.
X_masked = mask(
    Xtrue,
    mask_fraction=0.4,
    seed=seed,
)

In [17]:
# Low-rank Gaussian copula model whose rank matches the rank=10 used to
# generate the data; verbose=1 makes the fit print per-iteration progress.
model_lrgc = LowRankGaussianCopula(
    verbose=1,
    rank=10,
)

In [18]:
# Fit the low-rank model on the masked matrix and keep what fit_transform
# returns; it is scored as the imputed data against Xtrue in the next step.
Ximp = model_lrgc.fit_transform(X=X_masked)

Ater initialization, W has shape (200, 10) and sigma is 0.21016169823236375
Interation 1: noise ratio estimate 0.136, copula parameter update ratio 0.063, likelihood 126.387
Interation 2: noise ratio estimate 0.114, copula parameter update ratio 0.017, likelihood 172.965
Interation 3: noise ratio estimate 0.106, copula parameter update ratio 0.008, likelihood 190.359
Interation 4: noise ratio estimate 0.103, copula parameter update ratio 0.005, likelihood 197.334
Interation 5: noise ratio estimate 0.102, copula parameter update ratio 0.004, likelihood 200.288
early stop because changed likelihood below 1%


In [19]:
# Score the low-rank imputation against the complete data Xtrue; X_masked
# tells get_smae which entries were hidden. The result is indexed by variable.
smae = get_smae(Ximp, Xtrue, X_masked)

# Group the columns exactly as var_types laid them out when the data was
# generated: 0-99 continuous, 100-149 ordinal, 150-199 binary. The original
# labels were copied from the 15-column experiment ("5 exponential", ...) and
# misreported both the number and the kind of variables in each group.
smae_groups = (
    ('100 continuous variables', smae[:100]),
    ('50 ordinal variables', smae[100:150]),
    ('50 binary variables', smae[150:]),
)
for group_label, group_smae in smae_groups:
    print(f'The SMAE across {group_label} has: mean {group_smae.mean():.3f} and std {group_smae.std():.3f}')

The SMAE across 5 exponential variables has: mean 0.500 and std 0.024
The SMAE across 5 1-5 oridnal variables has: mean 0.365 and std 0.036
The SMAE across 5 boolean variables has: mean 0.262 and std 0.033


In [20]:
# Pull the fitted factor loading out of the low-rank model's parameters.
lrgc_params = model_lrgc.get_params()
copula_factor_loading = lrgc_params['copula_factor_loading']

# Distance between the fitted loading and the true W; the result is indexable
# and only its first entry is reported.
cor_error = grassman_dist(copula_factor_loading, Wtrue)
print(f'The latent subspace estimation error is: {cor_error[0]:.3f}')

The latent subspace estimation error is: 0.159
