In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and use mode 2, so imported modules are reloaded
# before each cell runs and edits to project code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
import pystan
import pandas as pd
import numpy as np
import pickle

In [None]:
import matplotlib
import matplotlib.pyplot as plt
# Global matplotlib text settings: serif family, normal weight, size 22,
# with all text rendered through LaTeX (usetex).
font = dict(family='serif', weight='normal', size=22)
matplotlib.rc('text', usetex=True)
matplotlib.rc('font', **font)

In [None]:
# Put the project checkout on sys.path (only once) so the `src` package imports below resolve.
module_path = os.path.abspath('/users/dli44/tool-presence')
if module_path not in sys.path:
    sys.path.append(module_path)

from src import constants as c
from src import utils
from src import visualization as v
from src import model as m

In [None]:
from scipy.stats import norm
from scipy.special import logsumexp


def _class_log_scores(result, test_labels):
    """Score every row of ``test_labels`` against the two mixture components.

    Parameters
    ----------
    result : mapping
        Must provide array-likes under ``'theta'``, ``'mu'`` and ``'sigma'``.
        ``mu``/``sigma`` are indexed as ``[:, k]`` for component ``k``.
        ``theta`` is averaged over axis 0; the average may be 2-D
        ``(m, 2)`` (``m`` must broadcast against the feature count) or
        1-D ``(2,)``.  Presumably these are the draws returned by
        ``fit.extract()`` -- confirm against the Stan model.
    test_labels : pandas.DataFrame
        Feature columns followed by two trailing non-feature columns.

    Returns
    -------
    numpy.ndarray of shape ``(len(test_labels), 2)``
        Column ``k`` holds ``logsumexp(log_mix_k + log N(x | mu_k, sigma_k))``
        taken over the feature columns of each row.
    """
    # Posterior-mean plug-in estimates.  None of them depends on the row
    # being scored, so they are computed once instead of once per row.
    # atleast_2d lets a plain (draws, 2) theta work as well.
    log_mix = np.atleast_2d(np.log(np.mean(result['theta'], axis=0)))

    # All columns except the last two.  The frame is sliced by position rather
    # than iterated with itertuples(): itertuples() puts the index in slot 0,
    # which made the old code score the index as a feature and drop the final
    # feature column.
    n_features = len(test_labels.columns) - 2
    features = test_labels.iloc[:, :n_features].to_numpy(dtype=float)

    scores = np.empty((len(test_labels), 2))
    for k in range(2):
        log_lik = norm.logpdf(features,
                              loc=np.mean(result['mu'][:, k]),
                              scale=np.mean(result['sigma'][:, k]))
        # NOTE(review): feature dimensions are combined with logsumexp (as in
        # the original code), not summed -- confirm this is the intended model.
        scores[:, k] = logsumexp(log_mix[:, k] + log_lik, axis=1)
    return scores


def get_inference_results(result, test_labels, metric=None, **kwargs):
    """Predict the component of every row and evaluate it with ``metric``.

    Parameters
    ----------
    result, test_labels
        See ``inference_prediction``.  ``test_labels`` must also contain a
        ``'Tool'`` column holding the reference labels.
    metric : callable
        Called as ``metric(y_true, y_pred, **kwargs)``.  Required despite the
        ``None`` default, which is kept only for signature compatibility.
    **kwargs
        Forwarded to ``metric`` unchanged.

    Returns
    -------
    Whatever ``metric`` returns.

    Raises
    ------
    TypeError
        If ``metric`` is ``None`` (raised up front instead of failing with
        "'NoneType' object is not callable" after all the work is done).
    """
    if metric is None:
        raise TypeError("get_inference_results() requires a callable `metric`")
    predictions = inference_prediction(result, test_labels)
    return metric(test_labels['Tool'].values, predictions, **kwargs)


def inference_prediction(result, test_labels, metric=None, **kwargs):
    """Return the predicted component (0.0 or 1.0) for every row.

    A row is labelled 1.0 when its component-1 score is strictly greater than
    its component-0 score, otherwise 0.0.  ``metric`` and ``**kwargs`` are
    accepted for signature compatibility and ignored.

    Returns
    -------
    numpy.ndarray of float, shape ``(len(test_labels),)``
    """
    scores = _class_log_scores(result, test_labels)
    return (scores[:, 0] < scores[:, 1]).astype(float)

In [None]:
# Base directory that the data, CSV and fit paths in later cells are joined onto.
# NOTE(review): hard-coded absolute path -- adjust when running on another machine.
root = '/users/dli44/tool-presence/'



In [None]:
# Directory the pickled fits of the "mmd_no_sigmoid" run are loaded from.
fits_dir = os.path.join(root, "mmd_no_sigmoid", "fits")

In [None]:
# Load the pickled model; the context manager closes the file handle, which the
# bare pickle.load(open(...)) form leaked.
# NOTE(review): presumably this must be unpickled before the fit pickles can be
# loaded -- confirm.  pickle can run arbitrary code, so only load trusted files.
with open(os.path.join(root, 'mmd_no_sigmoid', 'model.pkl'), 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
from sklearn import metrics


# Features for the beta=5.0 / zdim=20 run, joined column-wise (on the index) with
# the validation labels; rows with any missing value are dropped.
test = pd.read_csv(os.path.join(root, 'mmd_no_sigmoid/csv', 'beta_5.0_zdim_20_test.csv'), index_col=0)
# Keep the path under its own name so `test_labels` is only ever a DataFrame.
labels_path = os.path.join(root, "data/youtube_data/val/labels.csv")
test_labels = pd.concat([test, pd.read_csv(labels_path, index_col=0)], axis=1).dropna()

# Close the pickle file deterministically instead of leaking the handle.
# NOTE: pickle can run arbitrary code -- only load trusted files.
with open(os.path.join(fits_dir, "beta_5.0_zdim_20_nuts_fit.pkl"), 'rb') as fit_file:
    fit = pickle.load(fit_file)

result = fit.extract()

inf_results = get_inference_results(result, test_labels, metric=metrics.average_precision_score)
print(inf_results)

In [None]:
# Show the frame's size and a rich preview of the first rows instead of
# printing the whole DataFrame as plain text.
print(test_labels.shape)
test_labels.head()

In [None]:
# Histograms of the extracted `mu` draws (left panel) and `sigma` draws (right
# panel); column 0 is drawn in blue and column 1 in red, both half-transparent.
fig, ax = plt.subplots(1, 2, figsize=(20, 6))
fig.subplots_adjust(hspace=.3)

panels = (
    ('mu', r"$\mu_1, \mu_2$ vs Frequency", r"$\mu$"),
    ('sigma', r"$\sigma_1, \sigma_2$ vs Frequency", r"$\sigma$"),
)
clusters = (
    ('Cluster 1', [0, 0, 1, .5]),
    ('Cluster 2', [1, 0, 0, .5]),
)

for axis, (param, title, xlabel) in zip(ax, panels):
    for k, (label, face) in enumerate(clusters):
        axis.hist(result[param][:, k].flatten(), bins=100, label=label, fc=face)
    axis.set_title(title)
    axis.set_xlabel(xlabel)
    axis.set_ylabel("Frequency")
    axis.legend()

# Write the figure to disk.
plt.savefig("learned_params_beta_5_zdim_20.pdf", dpi=100, bbox_inches='tight')

In [None]:
# Average mu and sigma over axis 0.  Note the variance is the square of the
# averaged sigma, not the average of sigma squared.
means = np.mean(result['mu'], axis=0)
sigma_bar = np.mean(result['sigma'], axis=0)
variances = sigma_bar * sigma_bar
means, variances

In [None]:
import scipy.stats as stats

# 95% Student-t intervals around the mean of the mu / sigma draws of each component.
# NOTE(review): stats.sem treats the draws as independent; if they come from an
# autocorrelated MCMC chain these intervals are likely too narrow -- confirm intent.
for i in range(2):
    mu, sigma = result['mu'][:, i], result['sigma'][:, i]
    mu_ci = stats.t.interval(0.95, len(mu) - 1, loc=np.mean(mu), scale=stats.sem(mu))
    sigma_ci = stats.t.interval(0.95, len(sigma) - 1, loc=np.mean(sigma), scale=stats.sem(sigma))
    # LaTeX backslashes are doubled so they are no longer invalid escape sequences
    # (\h, \m, \s); "\t" deliberately stays a real tab.  The printed text is unchanged.
    print("$\\hat{{\\mu}}_{{Z_{}}}$\t: 95% confidence interval is [{:.4f}, {:.4f}]".format(i, *mu_ci))
    print("$\\hat{{\\sigma}}_{{Z_{}}}$\t: 95% confidence interval is [{:.4f}, {:.4f}]".format(i, *sigma_ci))

In [None]:
# plt.hist(posteriors[:,1], bins=50)

In [None]:
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

# history[zdim][beta] -> (precision, recall, fscore, support) for every NUTS fit.
history = {}

# The validation labels are identical for every fit, so read them once.
val_labels = pd.read_csv(os.path.join(root, "data/youtube_data/val/labels.csv"), index_col=0)

# sorted() makes the processing order reproducible; os.listdir order is arbitrary.
for f in sorted(os.listdir(fits_dir)):
    # Skip anything that is not a fit pickle.  Previously this guard only covered
    # the name parsing, so other files fell through and were unpickled with a
    # stale (or undefined) `data_name`.
    if not f.endswith("_fit.pkl"):
        continue

    # File names look like "beta_5.0_zdim_20_nuts_fit.pkl".
    params = f.split("_")[:4]
    data_name = "_".join(params)
    # NOTE(review): int(float(...)) truncates, so beta values below 1 all map to
    # key 0 -- confirm no such fits exist.
    beta, zdim = int(float(params[1])), int(params[3])
    if zdim not in history:
        history[zdim] = {}

    # Only NUTS fits are evaluated; decide before reading the CSV or unpickling
    # so skipped fits cost nothing.
    if 'nuts' not in f:
        continue

    test = pd.read_csv(os.path.join(root, 'mmd_no_sigmoid/csv', data_name + '_test.csv'), index_col=0)
    test_labels = pd.concat([test, val_labels], axis=1).dropna()

    # Close the pickle file deterministically instead of leaking the handle.
    # NOTE: pickle can run arbitrary code -- only load trusted files.
    with open(os.path.join(fits_dir, f), 'rb') as fit_file:
        fit = pickle.load(fit_file)
    result = fit.extract()

    print(f, beta, zdim)
    inf_results = get_inference_results(result, test_labels, metric=metrics.precision_recall_fscore_support, average='binary')
    history[zdim][beta] = inf_results
    precision, recall, fscore, support = inf_results
    print(precision, recall, fscore, support)

In [None]:
# Nested dict of metric results: the outer key is the integer parsed from the 4th
# "_"-separated token of each fit file name, the inner key the (truncated) 2nd token.
history

In [None]:
# Tabulate the results (outer keys of `history` become columns, inner keys rows)
# and drop every column that has a missing entry.
pd.DataFrame(history).dropna(axis=1)