# Variational Learning of Posteriors for Gaussian Mixture Model using Mixture of Discrete Normalizing Flows

A Gaussian mixture model (GMM) is an example of a model with both discrete and continuous latent variables.
We train it using variational EM, i.e., by alternating between optimizing the continuous variables (M-step) and the discrete variables (E-step). In our implementation, we replace the E-step with gradient-based optimization of an approximation to the posterior cluster allocations, using a mixture of discrete normalizing flows (MDNF).

## Imports

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd

In [2]:
tf.__version__, tfp.__version__

('2.2.0', '0.9.0')

In [3]:
import scipy
import sklearn
scipy.__version__, sklearn.__version__

('1.2.1', '0.21.2')

In [4]:
import time
import gc
import collections

In [5]:
import sys
sys.path.append('../mdnf')

In [6]:
import aux
import time_profiling

import base_constructors
import flows_mixture
import inference
import gmvi

In [7]:
import logging
# Module-level logger. Each record is prefixed with %(relativeCreated)6d, which
# is why every log line in the outputs below starts with a number.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(relativeCreated)6d %(message)s')

In [8]:
# Initial NumPy seed; the generator is re-seeded later (with 0 before artificial
# data generation and with SEED before each model is built / fitted).
np.random.seed(1234)

## Configuration

In [9]:
# can be run as a script with args in format KEY=VAL,KEY=[STRVAL],...
args = aux.parse_script_args() 

  4239 parsing: <-f>


In [10]:
# All settings are read from `args` (KEY=VAL pairs when run as a script);
# the second argument of every args.get(...) is the default used otherwise.

# Seed passed to NumPy and TensorFlow before the models are built and fitted
SEED = args.get("SEED", 1)

DATA = args.get("DATA", "ARTIFICIAL")  # "GOOGLE" / "TRIP" / "ARTIFICIAL"

# If data set to ARTIFICIAL
NC = args.get("NC", 3) # how many clusters
M1 = args.get("CLUSTER_SIZE", 100) # nsamples per cluster in data
R = args.get("R", 2.0) # distance of clusters' centers to 0

# "D(0.01)" / "D(0.1)" / "D(1)" / "D(10)"
BASE_SPECIFICATION = args.get("BASE_SPECIFICATION", "p") # base distribution type

B = args.get("B", 5) # how many flows in mixture
FLOW_TYPE = args.get("FLOW_TYPE", "FU")  # flows' type
FACTORIZED_BASE = bool(args.get("FACTORIZED_BASE", 1)) # 1 means factorized flows (assume independence)

# number of components (=how many clusters to fit),
K = args.get("K", 3) # None means = true number of clusters (only if available)
# K used in flows. It is read from its own "FLOWS_K" key and falls back to K;
# previously it was read from the "K" key, so it could never differ from K.
FLOWS_K = args.get("FLOWS_K", K)

INIT = args.get("INIT", "random") # initialization: "random"/"kmeans"

# outer loop:
MIN_NITER = args.get("MIN_NITER", 10)
MAX_NITER = args.get("MAX_NITER", 100)
NOIMPROV_NITER = args.get("NOIMPROV_NITER", 10) # stop if no improvement in niters
EVAL_BATCH_SIZE = args.get("EVAL_BATCH_SIZE", 10240) # how many data pts per minibatch
EVAL_NSAMPLES = args.get("EVAL_NSAMPLES", 100) # how many samples to estimate ELBO (only if sampling used)

# inner E-step loop:
E_NITER = args.get("E_NITER", 1000) # how many iterations in E-step
E_NOIMPROV_NITER = args.get("E_NOIMPROV_NITER", 100) # stop if no improvement in niters
E_MIN_NITER = args.get("E_MIN_NITER", 0)
E_BATCH_SIZE = args.get("E_BATCH_SIZE", 10240) # how many data pts per minibatch
E_DROP_REMAINDER = bool(args.get("E_DROP_REMAINDER", 0))
NSAMPLES = args.get("NSAMPLES", 100) # how many samples for E-step
E_HYBRID_ELBO = bool(args.get("E_HYBRID_ELBO", 1))

INFERENCE = args.get("INFERENCE", 0) # 0=VIF, 1=BVIF
OPTIMIZER = args.get("OPTIMIZER", "RMS")
LR = args.get("LR", 0.1)

# Temperature hyperparameter
BASE_TEMP = args.get("BASE_TEMP", 10.0)
ANNEAL_RATE = args.get("ANNEAL_RATE", 0.01)
E_ANNEAL_RATE = args.get("E_ANNEAL_RATE", 0.01)
MIN_TEMP = args.get("MIN_TEMP", 0.001)

CLUSTER_MIN_WEIGHT = 1.0 # min probability mass to assume that cluster hasn't disappeared
VISUALIZATIONS = False # plot assignments

# Where to store results
OUT = args.get("OUT", "GMVI_%s_%s_%s.csv" % \
               (BASE_SPECIFICATION, B, SEED))
logger.info("Results output file: %s" % OUT)

  4261 Results output file: GMVI_p_5_1.csv


In [11]:
# Store & print configuration.
# Each entry pairs the column name written to the results file with its value,
# so a name and its value cannot drift apart positionally.
_CFG_ITEMS = [
    ("SEED", SEED),
    ("DATA", DATA),
    ("NUM_CLUSTERS", NC),
    ("CLUSTERS_SIZE", M1),
    ("CLUSTERS_DIST", R),
    ("BASE_SPECIFICATION", BASE_SPECIFICATION),
    ("FACTORIZED_BASE", int(FACTORIZED_BASE)),
    ("B", B),
    ("K", K),
    ("FLOW_TYPE", FLOW_TYPE),
    ("E_HYBRID_ELBO", E_HYBRID_ELBO),
    ("INFERENCE", INFERENCE),
    ("OPTIMIZER", OPTIMIZER),
    ("LR", LR),
    ("NSAMPLES", NSAMPLES),
    ("EVAL_BATCH_SIZE", EVAL_BATCH_SIZE),
    ("EVAL_NSAMPLES", EVAL_NSAMPLES),
    ("INIT", INIT),
    ("MIN_NITER", MIN_NITER),
    ("MAX_NITER", MAX_NITER),
    ("NOIMPROV_NITER", NOIMPROV_NITER),
    ("E_MIN_NITER", E_MIN_NITER),
    ("E_NITER", E_NITER),
    ("E_NOIMPROV_NITER", E_NOIMPROV_NITER),
    ("E_BATCH_SIZE", E_BATCH_SIZE),
    ("E_DROP_REMAINDER", E_DROP_REMAINDER),
    ("BASE_TEMP", BASE_TEMP),
    ("ANNEAL_RATE", ANNEAL_RATE),
    ("E_ANNEAL_RATE", E_ANNEAL_RATE),
    ("MIN_TEMP", MIN_TEMP),
]
# Kept as two parallel, mutable lists because later cells extend them with +=.
CFG = [value for _, value in _CFG_ITEMS]
CFGNAMES = [name for name, _ in _CFG_ITEMS]

_cfg_lines = ["%s=%s" % (name, val) for name, val in zip(CFGNAMES, CFG)]
logger.info("CONFIGURATION:\n "+"\n ".join(_cfg_lines))

  4273 CONFIGURATION:
 SEED=1
 DATA=ARTIFICIAL
 NUM_CLUSTERS=3
 CLUSTERS_SIZE=100
 CLUSTERS_DIST=2.0
 BASE_SPECIFICATION=p
 FACTORIZED_BASE=1
 B=5
 K=3
 FLOW_TYPE=FU
 E_HYBRID_ELBO=True
 INFERENCE=0
 OPTIMIZER=RMS
 LR=0.1
 NSAMPLES=100
 EVAL_BATCH_SIZE=10240
 EVAL_NSAMPLES=100
 INIT=random
 MIN_NITER=10
 MAX_NITER=100
 NOIMPROV_NITER=10
 E_MIN_NITER=0
 E_NITER=1000
 E_NOIMPROV_NITER=100
 E_BATCH_SIZE=10240
 E_DROP_REMAINDER=False
 BASE_TEMP=10.0
 ANNEAL_RATE=0.01
 E_ANNEAL_RATE=0.01
 MIN_TEMP=0.001


## Data selection / generation

In [12]:
# Build x_train (features) and z_train (labels) for the dataset chosen by DATA.
# The match is case-insensitive and prefix-based ("ART...", "TRIP...", "GOOGLE...").
if DATA.upper().startswith("ART"):
    # Fixed seed: the artificial data does not depend on SEED.
    np.random.seed(0)

    clusters = []
    cluster_nos = []
    # Cluster centers are placed at equally spaced angles on a circle of radius R.
    for cno, angle in enumerate(np.arange(0, 2*np.pi, 2*np.pi / NC)):
        # presumably guards against np.arange yielding an extra element due to
        # floating-point rounding of the step
        if cno>=NC: break
        x, y = R*np.sin(angle), R*np.cos(angle)

        # M1 two-dimensional standard-normal samples shifted to the center (x, y)
        c1 = np.random.normal(size=(M1, 2))
        c1 += np.array([x, y])
        clusters.append(c1)
        cluster_nos.append(np.ones(M1)*cno)
    x_train = np.vstack(clusters)
    # true cluster index of every sample
    z_train = np.hstack(cluster_nos).astype(int)

elif DATA.upper().startswith("TRIP"):
    d = pd.read_csv("datasets/tripadvisor_review.csv.gz", compression='gzip')
    FEATURES = ['Category 1', 'Category 2', 'Category 3', 'Category 4',
                'Category 5', 'Category 6', 'Category 7', 'Category 8', 'Category 9',
                'Category 10']
    x_train = d[FEATURES].to_numpy().astype('float64')
    # labels are set to all zeros here, i.e. a single placeholder label
    z_train = np.zeros(x_train.shape[0])

elif DATA.upper().startswith("GOOGLE"):
    d = pd.read_csv("datasets/google_review_ratings_clean.csv.gz", compression="gzip")
    FEATURES = ['Category 1', 'Category 2', 'Category 3', 'Category 4',
           'Category 5', 'Category 6', 'Category 7', 'Category 8', 'Category 9',
           'Category 10', 'Category 11', 'Category 12', 'Category 13',
           'Category 14', 'Category 15', 'Category 16', 'Category 17',
           'Category 18', 'Category 19', 'Category 20', 'Category 21',
           'Category 22', 'Category 23', 'Category 24']
    x_train = d[FEATURES].to_numpy().astype('float64')
    # labels are set to all zeros here, i.e. a single placeholder label
    z_train = np.zeros(x_train.shape[0])

else:
    raise ValueError("Wrong DATA=%s set!" % DATA)

In [13]:
# Report the data shape and how many samples carry each label.
logger.info("x_train=%s clusters=%s" % (x_train.shape, collections.Counter(z_train)))
# K=None means: use the number of distinct labels found in z_train.
# NOTE(review): for the TRIP/GOOGLE branches z_train is all zeros, so this
# would give K=1 - confirm that K is always set explicitly for those datasets.
if K is None:
    K = len(collections.Counter(z_train))
    logger.info("assigning K := %s" % K)

  4298 x_train=(300, 2) clusters=Counter({0: 100, 1: 100, 2: 100})


In [14]:
# PDF visualization grid: 100 x 100 points covering [-2R, 2R] in both
# coordinates; `x` lists the grid points one per row (two columns).
x0, x1 = np.meshgrid(np.linspace(-R*2, R*2, 100), np.linspace(-R*2, R*2, 100))
x = np.array([x0, x1]).reshape(2, -1).T

try: # fails when run on a cluster
    import matplotlib.pyplot as plt

    # Scatter plot of the first two feature columns, colored by label.
    plt.scatter(x_train[:, 0], x_train[:, 1], c=z_train)
    # xlim/ylim take only the two limits. The third positional argument (100)
    # that used to be passed here is not a limit and is rejected by Matplotlib
    # releases where the remaining parameters are keyword-only, which made the
    # whole plot fail with "Plotting failed".
    plt.xlim(-R*2, R*2)
    plt.ylim(-R*2, R*2)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.show()
except Exception as e:
    # Plotting is best-effort: report the problem and carry on.
    # logger.warning replaces the deprecated logger.warn alias.
    logger.warning("Plotting failed: %s" % e)

<Figure size 640x480 with 1 Axes>

## Accuracy of clustering as classification

In [15]:
def acc(allocations, true_zs):
    """Return clustering accuracy where each cluster is labelled with its dominant true label.

    Every sample is assigned to the cluster with the largest value in its row
    of `allocations`. Each cluster that received at least one sample is then
    mapped to the true label that occurs most often among its samples, and the
    fraction of samples whose true label equals their cluster's label is
    returned.

    Args:
        allocations: 2-D array-like, one row per sample and one column per cluster.
        true_zs: NumPy array with the true label of every sample.

    Returns:
        Fraction of correctly labelled samples, in [0, 1].
    """
    assigned = np.argmax(allocations, 1)
    num_correct = 0
    for cluster_no in set(assigned):
        members = true_zs[assigned == cluster_no]
        # Samples carrying the cluster's most frequent true label are the
        # correct ones; every other member of the cluster counts as an error.
        _, dominant_count = collections.Counter(members).most_common(1)[0]
        num_correct += dominant_count
    return num_correct / len(true_zs)

## Baseline: closed-form VI

In [16]:
# Seed NumPy and TensorFlow with SEED before the baseline model is created.
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [17]:
# Baseline model; its fit is the reference the flow-based fit is compared to.
vgmm0 = gmvi.VariationalGaussianMixtureELBO(n_components=K, elbo_nsamples=EVAL_NSAMPLES,
                                           initialization=INIT)

In [18]:
# Per-iteration metrics of the baseline fit are collected by a callback.

RESULTS = []
COLS = ["wall_time", "time", "iteration", "temp", "loss", "kl", "accurracy", "cdists", "rec_nclusters"]

start_time = time.time()
def callback_iter(status, step_no, elbo):
    """Record and log the metrics of one iteration of the baseline fit.

    Appends one row (laid out as in COLS) to the global RESULTS list and logs
    a summary line. Columns that have no meaning for the baseline are filled
    with constants: temp=-1, kl=-1, cdists=0.

    Args:
        status: object exposing the current allocations as `status.r`.
        step_no: iteration number reported by the fitting routine.
        elbo: current ELBO value; stored negated as the loss.
    """
    responsibilities = status.r
    hard_labels = np.argmax(responsibilities, 1)

    accurracy = acc(responsibilities, z_train)
    homogenity = sklearn.metrics.homogeneity_score(z_train, hard_labels)
    # clusters whose total allocated mass exceeds the threshold
    cluster_mass = np.sum(responsibilities, 0)
    recovered_nclusters = np.sum(cluster_mass > CLUSTER_MIN_WEIGHT)
    wall_time = time.time() - start_time

    # both time columns hold the wall time here
    row = (wall_time, wall_time,
           step_no, -1,
           float(-elbo), -1, accurracy,
           0., recovered_nclusters,)
    RESULTS.append(row)

    summary = (wall_time, step_no, elbo, accurracy, homogenity)
    logger.info("[VGM-ELBO.fit][%.2fs] iter=%s ELBO=%.2f acc=%.4f homo=%.4f" % summary)

In [19]:
# Fit the baseline model; callback_iter records the metrics of each iteration.
# The returned allocations are used below as the reference cluster assignment.
best_allocations_std = vgmm0.fit(x_train, callback_iter=callback_iter,
            min_niter=MIN_NITER, max_niter=MAX_NITER, noimprov_niter=NOIMPROV_NITER,
            batch_size=EVAL_BATCH_SIZE, e_batch_size=E_BATCH_SIZE)

  4535 [VGM-ELBO.fit][0.04s] iter=-1 ELBO=-196974.37 acc=0.3667 homo=0.0035
  4555 [VGM-ELBO.fit][0.06s] iter=0 ELBO=-1297.17 acc=0.5300 homo=0.1876
  4574 [VGM-ELBO.fit][0.08s] iter=1 ELBO=-1220.81 acc=0.5767 homo=0.2710
  4592 [VGM-ELBO.fit][0.10s] iter=2 ELBO=-1206.34 acc=0.6000 homo=0.3472
  4609 [VGM-ELBO.fit][0.12s] iter=3 ELBO=-1198.27 acc=0.6033 homo=0.3554
  4622 [VGM-ELBO.fit][0.13s] iter=4 ELBO=-1193.78 acc=0.6200 homo=0.4005
  4637 [VGM-ELBO.fit][0.15s] iter=5 ELBO=-1191.65 acc=0.6233 homo=0.4110
  4655 [VGM-ELBO.fit][0.16s] iter=6 ELBO=-1190.64 acc=0.6133 homo=0.3737
  4670 [VGM-ELBO.fit][0.18s] iter=7 ELBO=-1189.97 acc=0.6133 homo=0.3737
  4683 [VGM-ELBO.fit][0.19s] iter=8 ELBO=-1189.35 acc=0.6133 homo=0.3737
  4701 [VGM-ELBO.fit][0.21s] iter=9 ELBO=-1188.66 acc=0.6100 homo=0.3657
  4720 [VGM-ELBO.fit][0.23s] iter=10 ELBO=-1187.82 acc=0.6100 homo=0.3657
  4744 [VGM-ELBO.fit][0.25s] iter=11 ELBO=-1186.73 acc=0.6100 homo=0.3657
  4765 [VGM-ELBO.fit][0.27s] iter=12 ELBO=-118

In [20]:
# Turn the rows collected by callback_iter into a table with COLS as column names.
results_pd = pd.DataFrame(RESULTS).rename(columns=dict(enumerate(COLS)))
# Attach every configuration value as a constant, stringified column.
for name, val in zip(CFGNAMES, CFG):
    results_pd[name] = str(val)
# The baseline table goes to a separate file: OUT with a "_baseline" suffix.
logger.info("Storing baseline results to %s_baseline" % OUT)
results_pd.to_csv(OUT+"_baseline", header=True, index=False)

  5231 Storing baseline results to GMVI_p_5_1.csv_baseline


In [21]:
# Total allocated mass per cluster (column sums of the baseline allocations).
logger.info("Clusters' sums using the standard VI:\n%s" % np.round(np.sum(best_allocations_std, 0), 3))

  5249 Clusters' sums using the standard VI:
[2.08403e+02 4.00000e-03 9.15930e+01]


In [22]:
# Optional plot of the baseline fit: data colored by assigned cluster, with
# contours of the fitted density evaluated on the visualization grid.
if VISUALIZATIONS:
    try:
        plt.scatter(x_train[:, 0], x_train[:, 1], c=vgmm0.classify(x_train))
        plt.contour(x0, x1, vgmm0.pdf(x).reshape(100, 100))
        # xlim/ylim take only the two limits. The third positional argument
        # (100) that used to be passed here is not a limit and is rejected by
        # Matplotlib releases where the remaining parameters are keyword-only.
        plt.xlim(-R*2, R*2)
        plt.ylim(-R*2, R*2)
        plt.gca().set_aspect('equal', adjustable='box')
        plt.show()
    except Exception as e:
        # Plotting is best-effort: report the problem and carry on.
        # logger.warning replaces the deprecated logger.warn alias.
        logger.warning("Plotting failed: %s" % e)

## Configuration & results recording

In [23]:
# Append the baseline's final metrics to the configuration, so that they are
# written as constant columns next to the flow-based results:
# negated ELBO, number of clusters with mass above CLUSTER_MIN_WEIGHT, accuracy.
# NOTE(review): the cluster count uses best_allocations_std while the accuracy
# uses vgmm0.r - confirm that both hold the same allocations.
CFG += [float(-vgmm0.elbo(x_train, batch_size=EVAL_BATCH_SIZE)),
        np.sum(np.sum(best_allocations_std, 0)>CLUSTER_MIN_WEIGHT),
        acc(vgmm0.r, z_train)]
CFGNAMES += ["REFERENCE_LOSS", "REFERENCE_RECOVERED_NUM_CLUSTERS", "REFERENCE_ACCURRACY"]

In [24]:
def _store_results(RESULTS, COLS, CFG, CFGNAMES, OUT):
    """Write the result rows, annotated with the configuration, to a CSV file.

    Args:
        RESULTS: sequence of result rows (one tuple per recorded iteration).
        COLS: column names for the positions of a result row.
        CFG: configuration values, parallel to CFGNAMES.
        CFGNAMES: configuration names; each becomes an extra column.
        OUT: path of the CSV file to write (with header, without index).
    """
    positional_to_name = dict(enumerate(COLS))
    table = pd.DataFrame(RESULTS).rename(columns=positional_to_name)
    # each configuration entry is stored as a constant, stringified column
    for cfg_name, cfg_value in zip(CFGNAMES, CFG):
        table[cfg_name] = str(cfg_value)

    logger.info("Writing %i data rows to: %s" % (len(RESULTS), OUT))
    table.to_csv(OUT, header=True, index=False)

In [25]:
def minimal_matching(mu0, allocations0, muf, allocationsf):
    """Reorder the clusters of both fits so that corresponding clusters share an index.

    Baseline clusters whose total mass exceeds CLUSTER_MIN_WEIGHT ("retained")
    are moved to the front. The same number of heaviest flow clusters is moved
    to the front as well and then permuted according to
    gmvi.minimal_matching applied to the leading centers of both fits.

    Args:
        mu0: baseline cluster centers, one row per cluster.
        allocations0: baseline allocations, one column per cluster.
        muf: flow-based cluster centers, one row per cluster.
        allocationsf: flow-based allocations, one column per cluster.

    Returns:
        Tuple (mu0, allocations0, muf, allocationsf) in the new cluster order.
    """
    # baseline clusters that still hold enough mass
    kept_mask = np.sum(allocations0, 0) > CLUSTER_MIN_WEIGHT
    n_kept = sum(kept_mask)

    # flow clusters ranked by decreasing mass; the first n_kept are "heavy"
    flow_mass = np.sum(allocationsf, 0)
    ranked = sorted(enumerate(flow_mass), key=lambda pair: -pair[1])
    heavy_ids = [cluster_id for cluster_id, _ in ranked[:n_kept]]
    every_id = set(range(allocationsf.shape[1]))
    light_ids = sorted(every_id - set(heavy_ids))

    # baseline: retained clusters first, (nearly) empty ones last
    kept_ids = list(np.nonzero(kept_mask)[0])
    dropped_ids = list(np.nonzero(~kept_mask)[0])
    std_order = kept_ids + dropped_ids
    mu0 = mu0[std_order, :]
    allocations0 = allocations0[:, std_order]

    # flows: heavy clusters first, light ones last
    flow_order = heavy_ids + light_ids
    muf = muf[flow_order, :]
    allocationsf = allocationsf[:, flow_order]

    # pair up the leading centers; indices not returned by the matcher are
    # appended in ascending order so that all clusters are kept
    matching_order = gmvi.minimal_matching(mu0[:n_kept, :], muf[:n_kept, :])
    matching_order = list(matching_order)+sorted(every_id-set(matching_order))
    muf = muf[matching_order]
    allocationsf = allocationsf[:, matching_order]

    return mu0, allocations0, muf, allocationsf

In [26]:
# Fresh result storage for the flow-based fit (same row layout as COLS).
RESULTS = []
COLS = ["wall_time", "time", "iteration", "temp", "loss", "kl", "accurracy", "cdists", "rec_nclusters"]

start_time = time.time()
callbacks_total_time = 0.
def _callback(vgmmf, step_no, elbo):
    """Record and log the metrics of one step of the flow-based fit.

    The flow clusters are first aligned with the baseline clusters (see
    minimal_matching). One row laid out as in COLS is appended to the global
    RESULTS list; its "time" column is the wall time minus the time
    accumulated inside this callback so far.

    Args:
        vgmmf: the model being fitted; `mu`, `r` and `temperature` are read.
        step_no: step number reported by the fitting routine.
        elbo: current ELBO value; stored negated as the loss.
    """
    global callbacks_total_time
    entered_at = time.time()
    gc.collect()

    # align the parameters of both fits cluster by cluster
    mu0, allocations0, muf, allocationsf = minimal_matching(
        vgmm0.mu, best_allocations_std, vgmmf.mu, vgmmf.r)

    # number of clusters holding enough mass in each fit
    nonempty_nclusters = sum(np.sum(allocations0, 0)>CLUSTER_MIN_WEIGHT)
    flow_mass = np.sum(allocationsf, 0)
    recovered_nclusters = np.sum(flow_mass>CLUSTER_MIN_WEIGHT)

    # mean Euclidean distance between matched centers (non-empty clusters only)
    center_gaps = np.sqrt(np.sum((mu0-muf)**2, -1))[ : nonempty_nclusters]
    centers_mean_dist = np.mean(center_gaps)

    kl_mc = float('nan') # ignored

    # agreement with the true labels
    accurracy = acc(allocationsf, z_train)
    homogenity = sklearn.metrics.homogeneity_score(z_train, np.argmax(allocationsf, 1))

    logger.info(("""[VGMF.fit][%.2fs] step=%s. Elbo=%.2f KL=%.2f acc=%.4f homo=%.4f 
                     Temp=%.3f Avg-Dist-Mus:%.2f Rec-NClust:%s (min-weight=%.1f)""") % \
                (time.time()-start_time, step_no,
                 elbo, kl_mc, accurracy, homogenity,
                 vgmmf.temperature, centers_mean_dist, recovered_nclusters,
                 min(flow_mass), )
               )

    callbacks_total_time += time.time()-entered_at
    wall_time = time.time()-start_time
    row = (wall_time, wall_time-callbacks_total_time,
           step_no, vgmmf.temperature,
           float(-elbo), float(kl_mc), accurracy,
           centers_mean_dist, recovered_nclusters,)
    RESULTS.append(row)

    # optional: persist after every step
    #_store_results(RESULTS, COLS, CFG, CFGNAMES, OUT)

## Approximating family: bases and flows

In [27]:
# Seed NumPy and TensorFlow with SEED before the base distribution and flows are created.
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [28]:
# M = number of data points (rows of x_train); B is left as configured.
M = x_train.shape[0]
# fall back to the model's number of components when no separate value is set
if FLOWS_K is None:
    FLOWS_K = K
logger.info("FLOWS_K = %s" % FLOWS_K)

# base distribution of the approximating family
base = base_constructors.create_categorical_blocks(
    M, FLOWS_K, B, BASE_SPECIFICATION,
    independent_variables=FACTORIZED_BASE)

# mixture of B flows, each specified by the pair (FLOW_TYPE, M)
flow = flows_mixture.DiscreteFlowsMixture(
    M, FLOWS_K, B,
    temperature=BASE_TEMP,
    components_specification=[(FLOW_TYPE, M) for _ in range(B)])


  5347 FLOWS_K = 3
                       There is too many combinations of variables and categories! 
                       Generating diagonal and then random instead of all possible!


## Inference configuration

In [29]:
# Supported values of the OPTIMIZER option and the Keras classes they select.
OPTIMIZERS = {
    "RMS": tf.keras.optimizers.RMSprop,
    "ADAM": tf.keras.optimizers.Adam,
}
optimizer_class = OPTIMIZERS.get(OPTIMIZER)
if optimizer_class is None:
    raise ValueError("Unknown optimizer!")
optimizer = optimizer_class(learning_rate=LR)

logger.info("optimizer=%s lr=%s" % (optimizer, LR))

  5426 optimizer=<tensorflow.python.keras.optimizer_v2.rmsprop.RMSprop object at 0x7f1bec5bab90> lr=0.1


In [30]:
# Inference classes selectable through the integer INFERENCE option.
ID2INFERENCE = {
    0: inference.VariationalInference,
    1: inference.BoostingVariationalInference,
}
# An INFERENCE value that is not a key above raises KeyError here.
inference_class = ID2INFERENCE[INFERENCE]

# The inference object is configured with the E_* options (inner E-step loop).
vi = inference_class(base=base, flow=flow, 
 optimizer=optimizer, nsamples=NSAMPLES, 
 max_niter=E_NITER, noimprov_niter=E_NOIMPROV_NITER, min_niter=E_MIN_NITER)
    
logger.info("VI inference type: %s" % vi)    

  5437 VI inference type: <inference.VariationalInference object at 0x7f1bec592050>


## Variational EM with Flows

In [31]:
# Reset the profiler and restart the clock that _callback measures wall time against.
time_profiling.reset()
start_time = time.time()

In [32]:
# Seed NumPy and TensorFlow with SEED before the flow-based model is created.
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [33]:
# E_HYBRID_ELBO chooses the model class: VariationalGaussianMixtureFlows when
# true, VariationalGaussianMixtureFlowsSamples otherwise.
vgmf_class = gmvi.VariationalGaussianMixtureFlows if E_HYBRID_ELBO else gmvi.VariationalGaussianMixtureFlowsSamples
vgmmf = vgmf_class(inference=vi, n_components=K, elbo_nsamples=EVAL_NSAMPLES, initialization=INIT)
logger.info("vgmmf = %s" % vgmmf)

  5458 vgmmf = VariationalGaussianMixtureFlows(
)


In [34]:
# Seed NumPy and TensorFlow with SEED once more, right before fitting.
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [35]:
def temperature_annealing(step_no, iteration):
    """Return the temperature for the given step and iteration.

    The temperature starts at BASE_TEMP, decays exponentially in `step_no`
    (rate ANNEAL_RATE) and in `iteration` (rate E_ANNEAL_RATE), and never
    drops below MIN_TEMP.

    Args:
        step_no: presumably the outer EM step - confirm against the fit routine.
        iteration: presumably the inner E-step iteration - confirm likewise.
    """
    decayed = BASE_TEMP*np.exp(-ANNEAL_RATE*step_no)*np.exp(-E_ANNEAL_RATE*iteration)
    return max(decayed, MIN_TEMP)

In [36]:
# Fit the flow-based model. Outer-loop limits mirror the baseline fit;
# _callback records the metrics of every reported step, and
# temperature_annealing supplies the temperature schedule.
best_allocations_flows = vgmmf.fit(x_train, 
            min_niter=MIN_NITER, max_niter=MAX_NITER, noimprov_niter=NOIMPROV_NITER, 
            batch_size=EVAL_BATCH_SIZE, 
            e_batch_size=E_BATCH_SIZE, e_batch_drop_remainder=E_DROP_REMAINDER,
            temperature_annealing=temperature_annealing, 
            callback_iter = _callback,            
            )

  5604 [VGMF.fit][0.16s] step=-1. Elbo=-196974.37 KL=nan acc=0.3667 homo=0.0035 
                     Temp=10.000 Avg-Dist-Mus:2.34 Rec-NClust:3 (min-weight=99.8)
 42779 [VariationalInference.fit] No improvement in recent 100 iterations. Stop.
 43210 [VGMF.fit][37.76s] step=0. Elbo=-1297.33 KL=nan acc=0.5300 homo=0.1876 
                     Temp=2.808 Avg-Dist-Mus:0.74 Rec-NClust:3 (min-weight=16.8)
128374 [VariationalInference.fit] No improvement in recent 100 iterations. Stop.
128767 [VGMF.fit][123.32s] step=1. Elbo=-1238.42 KL=nan acc=0.5833 homo=0.2842 
                     Temp=0.308 Avg-Dist-Mus:0.51 Rec-NClust:3 (min-weight=12.2)
183453 [VariationalInference.fit] No improvement in recent 100 iterations. Stop.
183842 [VGMF.fit][178.40s] step=2. Elbo=-1216.28 KL=nan acc=0.6033 homo=0.3383 
                     Temp=1.395 Avg-Dist-Mus:0.34 Rec-NClust:3 (min-weight=9.2)
256688 [VariationalInference.fit] No improvement in recent 100 iterations. Stop.
257202 [VGMF.fit][251.76s] step=

1032941 [VGMF.fit][1027.50s] step=33. Elbo=-1197.03 KL=nan acc=0.6100 homo=0.3680 
                     Temp=4.190 Avg-Dist-Mus:0.12 Rec-NClust:3 (min-weight=2.0)
1087475 [VariationalInference.fit] No improvement in recent 100 iterations. Stop.
1088211 [VGMF.fit][1082.77s] step=34. Elbo=-1196.32 KL=nan acc=0.6133 homo=0.3859 
                     Temp=3.499 Avg-Dist-Mus:0.12 Rec-NClust:3 (min-weight=1.7)
1119885 [VariationalInference.fit] No improvement in recent 100 iterations. Stop.
1120584 [VGMF.fit][1115.14s] step=35. Elbo=-1195.96 KL=nan acc=0.6133 homo=0.3859 
                     Temp=7.047 Avg-Dist-Mus:0.12 Rec-NClust:3 (min-weight=1.6)
1152633 [VariationalInference.fit] No improvement in recent 100 iterations. Stop.
1153364 [VGMF.fit][1147.92s] step=36. Elbo=-1196.02 KL=nan acc=0.6100 homo=0.3780 
                     Temp=2.541 Avg-Dist-Mus:0.12 Rec-NClust:3 (min-weight=1.9)
1190018 [VariationalInference.fit] No improvement in recent 100 iterations. Stop.
1190767 [VGMF.fit][1

In [37]:
# Write the rows recorded by _callback, together with the configuration, to OUT.
_store_results(RESULTS, COLS, CFG, CFGNAMES, OUT)

2064564 Writing 63 data rows to: GMVI_p_5_1.csv


In [38]:
# Optional plot of the flow-based fit: data colored by the most probable
# cluster, with contours of the fitted density on the visualization grid.
if VISUALIZATIONS:
    try:
        plt.scatter(x_train[:, 0], x_train[:, 1], c=np.argmax(best_allocations_flows, -1))
        plt.contour(x0, x1, vgmmf.pdf(x).reshape(100, 100))
        # xlim/ylim take only the two limits. The third positional argument
        # (100) that used to be passed here is not a limit and is rejected by
        # Matplotlib releases where the remaining parameters are keyword-only.
        plt.xlim(-R*2, R*2)
        plt.ylim(-R*2, R*2)
        plt.gca().set_aspect('equal', adjustable='box')
        plt.show()
    except Exception as e:
        # Plotting is best-effort: report the problem and carry on.
        # logger.warning replaces the deprecated logger.warn alias.
        logger.warning("Plotting failed: %s" % e)

In [39]:
# Show the profiling report gathered since time_profiling.reset(), rounded to 3 decimals.
time_profiling.get_report().round(3)

Unnamed: 0,func,count,total,median,mean,min,max,q=.8,#max
0,DiscreteFlowsMixture.call,9350,321.855,0.034,0.034,0.014,0.092,0.045,44
1,DiscreteFlowsMixture.reverse,8730,251.742,0.028,0.029,0.011,0.083,0.038,18
2,FactorizedCategoricalMixture.sample_extm,9350,86.535,0.009,0.009,0.004,0.035,0.012,10
3,FactorizedIndependentCategoricalMixture.log_pr...,8730,52.493,0.006,0.006,0.002,0.029,0.008,4
4,VariationalGaussianMixtureFlows.variational_ex...,62,2051.281,32.157,33.085,15.415,90.879,38.243,2


## Compare fits

In [40]:
print("Compare allocations (first half cols are std, then flows):")
# align the clusters of both fits so that the columns are comparable
mu0, allocations0, muf, allocationsf = minimal_matching(
    vgmm0.mu, best_allocations_std, vgmmf.mu, best_allocations_flows)
# one output line per data point: baseline allocations | flow allocations
for row0, rowf in zip(allocations0, allocationsf):
    left = "  ".join("%.1f" % v for v in row0)
    right = "  ".join("%.1f" % v for v in rowf)
    print(left+" | "+right)

Compare allocations (first half cols are std, then flows):
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
0.6  0.4  0.0 | 0.8  0.2  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
0.9  0.1  0.0 | 1.0  0.0  0.0
0.1  0.9  0.0 | 0.2  0.8  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0.0 | 1.0  0.0  0.0
1.0  0.0  0

In [41]:
# Cluster centers of both fits, in the matched order computed in the previous cell.
print("cluster means:\nstandard:\n%s\nflows:\n%s" % (mu0, muf))

cluster means:
standard:
[[-0.88086553  0.4737298 ]
 [ 1.71336377 -1.15105518]
 [-0.08544704 -0.02469083]]
flows:
[[-0.80970073  0.45104052]
 [ 1.73312647 -1.25603988]
 [ 1.92622034 -0.29875795]]
