In [None]:
import os
import argparse
import pystan
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle

In [None]:
from scipy.special import logsumexp
from scipy.stats import norm
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
import seaborn as sns

In [None]:
# Plot styling: raise the figure resolution and render text through LaTeX.
matplotlib.rcParams['figure.dpi'] = 200
matplotlib.rcParams['text.usetex'] = True
# Seed NumPy's global random number generator.
np.random.seed(101)

In [None]:
def confusion(result, test_labels):
    """Score the two-component mixture against labelled test data.

    Only the final posterior draw (index ``-1``) of ``theta``, ``mu`` and
    ``sigma`` is used. For each test row the two components are compared by
    the log-sum-exp, over feature dimensions, of
    ``log(theta[d, k]) + Normal.logpdf(x[d] | mu[k], sigma[k])``.

    Parameters
    ----------
    result : mapping
        Posterior samples with keys ``'theta'``, ``'mu'`` and ``'sigma'``.
        ``result['theta'][-1]`` is indexed as ``[dimension, component]``.
    test_labels : pandas.DataFrame
        Feature columns first (as many as ``theta`` has dimensions),
        plus a ``'Tool'`` column holding the true binary label.

    Returns
    -------
    tuple
        ``(confusion_matrix, accuracy, f1)`` as computed by scikit-learn.
    """
    # Posterior quantities do not depend on the row, so read them once.
    log_theta = np.log(result['theta'][-1])  # mixing probabilities
    mu = result['mu'][-1]
    sigma = result['sigma'][-1]
    n_dims = log_theta.shape[0]

    # Take the features positionally from the frame. Iterating with
    # itertuples() puts the index first in every row, so slicing such a row
    # with [:10] would score the index and drop the last feature.
    features = test_labels.values[:, :n_dims].astype(float)

    score0 = logsumexp(
        log_theta[:, 0] + norm.logpdf(features, loc=mu[0], scale=sigma[0]),
        axis=1)
    score1 = logsumexp(
        log_theta[:, 1] + norm.logpdf(features, loc=mu[1], scale=sigma[1]),
        axis=1)

    # NOTE(review): label 1 is predicted when component 0 scores higher;
    # confirm that component 0 is the one that corresponds to 'Tool'.
    predictions = (score0 > score1).astype(float)

    truth = test_labels['Tool'].values
    matrix = confusion_matrix(truth, predictions)
    accuracy = accuracy_score(truth, predictions)
    f1 = f1_score(truth, predictions)
    return matrix, accuracy, f1

In [None]:
# Run configuration, expressed as command-line style arguments.
# A notebook has no real argv, so the values for this run are handed to
# parse_args() explicitly instead of being patched onto the namespace.
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument('--root', type=str,
                    default=os.path.abspath('.'),
                    help='Root directory of tool-presence')
parser.add_argument('--train', type=str, default='',
                    help='Training CSV, relative to --root')
parser.add_argument('--test', type=str, default='',
                    help='Test CSV, relative to --root')
parser.add_argument('--model-path', type=str, default='',
                    help='Pickled compiled model, relative to --root')
parser.add_argument('--fit-path', type=str, default='',
                    help='Pickled fit, relative to --root')
parser.add_argument('-v', '--verbose', help="increase output verbosity",
                    action="store_true")

# "~" is expanded here because os.path.join() and the built-in open() leave
# it untouched, which makes every path built from the root unopenable.
args = parser.parse_args([
    '--root', os.path.expanduser('~/tool-presence/'),
    '--train', "mmd/csv/beta_1.0_zdim_10_train.csv",
    '--test', "mmd/csv/beta_1.0_zdim_10_test.csv",
    '--model-path', 'inference/prediction_model.pkl',
    '--fit-path', 'inference/prediction_fit.pkl',
    '--verbose',
])

In [None]:
# In verbose mode, echo every path derived from the configured root,
# one "label path" pair per line.
if args.verbose:
    print("\n".join(
        " ".join((label, os.path.join(args.root, relative)))
        for label, relative in (
            ("Reading train data from:", args.train),
            ("Reading test data from:", args.test),
            ("Saving inference model to:", args.model_path),
            ("Saving fit to:", args.fit_path),
        )
    ))

In [None]:
# Resolve every input/output location against the configured root directory.
# Order of targets: training CSV, test CSV, validation labels CSV,
# pickled compiled model, pickled fit.
(train_data_file,
 test_data_file,
 test_labels_file,
 compiled_model,
 sampled_fit) = (
    os.path.join(args.root, relative)
    for relative in (
        args.train,
        args.test,
        'data/youtube_data/val/labels.csv',
        args.model_path,
        args.fit_path,
    )
)

In [None]:
# Load the three CSV tables; the first column of each file becomes the index.
train, test, test_labels = (
    pd.read_csv(path, index_col=0)
    for path in (train_data_file, test_data_file, test_labels_file)
)
# Put the test features and their labels side by side (aligned on the index)
# and discard every row that has a missing value.
test_labels = pd.concat([test, test_labels], axis=1).dropna()

# Inputs for the Stan program: two latent groups (tool / no tool).
data = dict(
    N=train.shape[0],
    N2=test_labels.shape[0],
    x=train,
    x_test=test_labels.values[:, :10],  # first 10 columns are passed as test features
    K=2,
    D=train.shape[1],
)

# stan parameters
# NOTE(review): `iters` is not referenced by the sampling call in this
# notebook, which passes iter=5000 directly.
iters = 1000

In [None]:
# Stan program: each feature dimension d of an observation is modelled as a
# two-component normal mixture with dimension-specific weights theta[d] and
# component parameters (mu, sigma) shared across dimensions.
# NOTE(review): log_mix is written out for exactly two components, so the
# program only makes sense for K = 2.
# NOTE(review): per-dimension terms are combined with log_sum_exp rather
# than summed; confirm this is the intended likelihood.
model = """
    data {
    int N; // number of observations
    int N2; // number of test_observations
    int D; // dimension of observed vars
    int K; // number of clusters
    vector[D] x[N]; // training data
    vector[D] x_test[N2]; //test data
    }

    parameters {
    ordered[K] mu; // locations of hidden states
    vector<lower=0>[K] sigma; // scales (standard deviations) of hidden states
    simplex[K] theta[D]; // mixture components
    }

    model {
    // priors
    for(k in 1:K){
      mu[k] ~ normal(0,10);
      sigma[k] ~ inv_gamma(1,1); //prior of normal distribution
    }
    for (d in 1:D){
      theta[d] ~ dirichlet(rep_vector(5.0, K)); //prior of categorical distribution
    }
    // likelihood
    for(i in 1:N) {
      vector[D] increments;
      for(d in 1:D){
        increments[d]=log_mix(theta[d][1],
            normal_lpdf(x[i][d] | mu[1], sigma[1]), normal_lpdf(x[i][d] | mu[2], sigma[2]));
      }
      target += log_sum_exp(increments);
    }
    }

    generated quantities {
      vector[N2] log_p_y_tilde;
      for(i in 1:N2) {
          vector[D] increments;
          for(d in 1:D){
            // both components are evaluated on the held-out point x_test[i][d]
            increments[d]=log_mix(theta[d][1],
                normal_lpdf(x_test[i][d] | mu[1], sigma[1]), normal_lpdf(x_test[i][d] | mu[2], sigma[2]));
          }
          log_p_y_tilde[i] = log_sum_exp(increments);
      }
    }
    """

# Reuse the pickled compiled model only when it was built from exactly this
# program text; otherwise compile and refresh the cache. A pickle compiled
# from an older program would silently keep running the old code.
model_cache = os.path.expanduser(compiled_model)  # open() does not expand "~"
sm = None
if os.path.exists(model_cache):
    # SECURITY: pickle.load can execute arbitrary code; only load a cache
    # file that was produced by this notebook.
    with open(model_cache, 'rb') as f:
        sm = pickle.load(f)
    if getattr(sm, 'model_code', None) != model:
        sm = None  # stale cache
if sm is None:
    sm = pystan.StanModel(model_code=model)
    with open(model_cache, 'wb') as f:
        pickle.dump(sm, f)

In [None]:
# Draw posterior samples: 4 chains, 5000 iterations each, no thinning.
# The seed is passed explicitly because PyStan picks a random one when none
# is given, so the np.random.seed() call alone does not make the draws
# repeatable. 101 matches the NumPy seed used at the top of the notebook.
fit = sm.sampling(data=data, iter=5000, chains=4, thin=1, seed=101)

In [None]:
# Print the fit's text representation (for a PyStan fit this is presumably
# the posterior summary table — TODO confirm).
print(fit)

In [None]:
# Extract the posterior draws from the fit. Later cells index `result` by
# parameter name (result['theta'], result['mu'], result['sigma']).
result = fit.extract()

In [None]:
# Score the test set: c = confusion matrix, a = accuracy, f = F1.
c,a,f = confusion(result, test_labels)
print(a, f)
# Colour each cell by its share of the row total, but annotate it with the
# raw count.
sns.heatmap(c.astype('float') / c.sum(axis=1)[:, np.newaxis],
            cmap=sns.color_palette("Blues"),
            xticklabels=['No Tool', 'Tool'],
            yticklabels=['No Tool', 'Tool'],
            annot=c, annot_kws={"size": 28},
            fmt='g',cbar=False)

# scikit-learn's confusion matrix has the true labels on its rows and the
# predictions on its columns, so the y axis is "Actual" and the x axis is
# "Predictions".
plt.ylabel("Actual")
plt.xlabel("Predictions")
plt.title(r"$\beta$" "-VAE Confusion Matrix\n" +
          r"$\beta=1, z=10$");
plt.savefig('beta_vae_beta1_confusion.png')

In [None]:
# Histogram of every sampled mu value (all draws and components pooled),
# written to a PNG file.
fig = plt.figure()
plt.hist(result['mu'].reshape(-1), bins=50);
plt.gca().set(
    title="Posterior distribution\n" + r"$\beta$-VAE $\beta=1, z=10$",
    ylabel='Frequency',
    xlabel=r'$\mu$',
);
plt.savefig('mcmc_4chains_elbo_beta1.png')

In [None]:
# Accuracy noted so far (unverified, to be confirmed):
#
#   ELBO, beta = 1:    60.9%
#   MMD,  lambda = 1:  68%
#   MMD,  lambda = 10: 58%

In [None]:
# Hard-coded 2x2 integer matrix that overwrites the `c` computed by the
# confusion() cell above.
# NOTE(review): presumably a confusion matrix copied from an earlier run —
# confirm its origin or remove the cell.
c = np.array([[27,71],[50,159]])