## Setup

In [1]:
%run utils.ipynb

import collections
import time
import pickle

import gpflow
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

value.Reading Comprehension       3.88
value.Active Listening            4.12
value.Writing                     3.88
value.Speaking                    4.00
value.Mathematics Skill           2.62
                                  ... 
value.Law and Government          2.86
value.Telecommunications          2.94
value.Communications and Media    3.63
value.Transportation              2.66
work_num                          4.00
Name: (Accommodation service managers, 4), Length: 121, dtype: float64


W0730 15:39:57.912104 139766515070720 deprecation_wrapper.py:119] From /home/ubuntu/anaconda3/envs/python3/lib/python3.7/site-packages/gpflow/session_manager.py:31: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

W0730 15:39:57.916664 139766515070720 deprecation_wrapper.py:119] From /home/ubuntu/anaconda3/envs/python3/lib/python3.7/site-packages/gpflow/misc.py:27: The name tf.GraphKeys is deprecated. Please use tf.compat.v1.GraphKeys instead.

W0730 15:39:57.984275 139766515070720 deprecation_wrapper.py:119] From /home/ubuntu/anaconda3/envs/python3/lib/python3.7/site-packages/gpflow/saver/coders.py:80: The name tf.data.Iterator is deprecated. Please use tf.compat.v1.data.Iterator instead.



In [2]:
# Constants
datasets = {'agg': {'x': {'cont':         x_cont_agg, 
                          'disc':         x_disc_agg}, 
                    'y': {# 'abs':          y_abs_agg['y'],
                          # 'abs_binned':   y_abs_agg['binned_y'],
                          'share':        y_share_agg['y'],
                          'share_binned': y_share_agg['binned_y']}
                   },
            'ind': {'x': {'cont':         x_cont_ind,
                          'disc':         x_disc_ind},
                    'y': {# 'abs':          y_abs_ind,
                          # 'abs_binned':   y_abs_bin_ind,
                          'share':        y_share_ind,
                          'share_binned': y_share_bin_ind}
                   }
           }

kernels = ['Matern12', 'Matern32', 'Matern52', 'RBF']

agg_level = 'both'

## Functions

In [3]:
def get_kern(kern):
    if kern == 'Matern12': return gpflow.kernels.Matern12(1)
    if kern == 'Matern32': return gpflow.kernels.Matern32(1) 
    if kern == 'Matern52': return gpflow.kernels.Matern52(1)
    if kern == 'RBF':      return gpflow.kernels.RBF(1)

In [4]:
# create x and y arrays
def x_and_y(x, y):
    change = {'decrease':     2,
              'constant':     1, 
              'increase':     0,
              'fewer':        2,
              'same':         1,
              'more':         0,
              'not_increase': 1}
    x = np.array(x)
    y = np.array(pd.Series(y).replace(change).values.astype('int64')).reshape(y.shape[0], 1)
    return x, y

In [5]:
# get bin edges and likelihood
def create_likelihood(y):
    bin_edges = np.array(np.arange(np.unique(y).size + 1), dtype=float)
    # Need to check in on this, tutorial does the below which ends up with negative bins
    # bin_edges = bin_edges - bin_edges.mean()
    bin_edges = bin_edges - .5
    return gpflow.likelihoods.Ordinal(bin_edges)

In [6]:
# build a model with this likelihood
def build_model(x_train, y_train, likelihood, kernel):
    gaussian_model = gpflow.models.VGP(tf.cast(x_train, tf.float64), 
                                       y_train, 
                                       kern=kernel,
                                       likelihood=likelihood)
    # fit the model
    gpflow.train.ScipyOptimizer().minimize(gaussian_model)
    return gaussian_model

In [7]:
# get predictive densities
def test_model(model, x_test):
    densities = []
    # Predictive density for a single input x
    for x in x_test:
        ys = np.arange(np.max(model.Y.value+1)).reshape([-1, 1])
        x_new_vec = x*np.ones_like(ys)
        # for predict_density x and y need to have the same number of rows
        dens = np.exp(model.predict_density(x_new_vec, ys))
        densities.append(dens)
    return densities

In [8]:
# get accuracy
def accuracy(y_test, densities):
    score = 0
    for index, y in enumerate(y_test):
        if y == np.argmax(densities[index]): score += 1
    return score/len(y_test)

In [9]:
# do cross validation
def cv(x, y, x_all, y_all, kern, var_type):
    kf = KFold(n_splits=5)
    scores = []
    results = []

    for train_index, test_index in kf.split(x):
        start = time.time()
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        kernel = get_kern(kern)
        likelihood = create_likelihood(y_all)
        model = build_model(x_train, y_train, likelihood, kernel)
        densities = test_model(model, x_test)
        scores.append(accuracy(y_test, densities))
        results.append(create_results(x_test, densities))
        end = time.time()
        print('Fold tested in (sec):', end - start)
        del kernel, likelihood, model

    return {'Avg. score':  sum(scores)/len(scores),
            'All scores':  scores,
            'All results': results}

In [10]:
def pickle_results(results, params):
    with open('gp_results/{}.pkl'.format(params), 'wb') as f:
        pickle.dump(results, f)

In [11]:
def organize_results(results):
    best_result = {'Matern12': ['', 0],
                   'Matern32': ['', 0],
                   'Matern52': ['', 0],
                   'RBF':      ['', 0]}

    for kern in results:
        for agg_name in results[kern]:
            for var_type in results[kern][agg_name]:
                for result_dist in results[kern][agg_name][var_type]:
                    params = ' '.join([kern, agg_name, var_type, result_dist])
                    score = results[kern][agg_name][var_type][result_dist]['Avg. score']
                    if score == best_result[kern][1]: 
                        best_result[kern][0] += ', ' + params
                    if score > best_result[kern][1]: 
                        best_result[kern] = [params, score]
        
    return best_result

In [12]:
def grid_search(agg_level):
    def rec_dd():
        return collections.defaultdict(rec_dd)
    
    results = rec_dd()

    for kern in kernels:
        for agg_name, top_level in datasets.items():
            if agg_level != 'both' and agg_level != agg_name:
                continue
            for var_type, x_all in top_level['x'].items():
                for result_dist, y_all in top_level['y'].items():
                    params = '_'.join([kern, agg_name, var_type, result_dist])
                    print('Testing:', params)
                    start = time.time()
                    x, y = x_and_y(x_all, y_all)
                    result = cv(x, y, x_all, y_all, kern, var_type)
                    results[kern][agg_name][var_type][result_dist] = result
                    end = time.time()
                    print('Accuracy:', result['Avg. score'])
                    print('Time elapsed (sec):', end - start)
                    pickle_results(result, params)
                    print('Results saved to disk')
    
    return results

In [13]:
def create_results(x_test, densities):
    results = {}
    for index, x in enumerate(x_test):
        results[(tuple(x))] = densities[index]
    return results

## Run through

In [14]:
results = grid_search(agg_level)

W0730 15:39:58.209649 139766515070720 deprecation_wrapper.py:119] From /home/ubuntu/anaconda3/envs/python3/lib/python3.7/site-packages/gpflow/core/node.py:109: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0730 15:39:58.232968 139766515070720 deprecation_wrapper.py:119] From /home/ubuntu/anaconda3/envs/python3/lib/python3.7/site-packages/gpflow/params/parameter.py:388: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0730 15:39:58.237183 139766515070720 deprecation_wrapper.py:119] From /home/ubuntu/anaconda3/envs/python3/lib/python3.7/site-packages/gpflow/params/parameter.py:394: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.



Testing: Matern12_agg_cont_share


W0730 15:39:58.972429 139766515070720 deprecation.py:323] From /home/ubuntu/anaconda3/envs/python3/lib/python3.7/site-packages/tensorflow/python/ops/array_grad.py:199: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Fold tested in (sec): 3.0604515075683594
Fold tested in (sec): 3.0964407920837402
Fold tested in (sec): 4.364148855209351
Fold tested in (sec): 4.925512790679932
Fold tested in (sec): 5.727909326553345
Accuracy: 0.2416666666666667
Time elapsed (sec): 21.181183338165283
Results saved to disk
Testing: Matern12_agg_cont_share_binned
Fold tested in (sec): 6.94967794418335
Fold tested in (sec): 7.104523181915283
Fold tested in (sec): 8.55574655532837
Fold tested in (sec): 8.908057451248169
Fold tested in (sec): 9.60049819946289
Accuracy: 0.6666666666666666
Time elapsed (sec): 41.126593828201294
Results saved to disk
Testing: Matern12_agg_disc_share
Fold tested in (sec): 10.888101577758789
Fold tested in (sec): 12.252338886260986
Fold tested in (sec): 11.98802924156189
Fold tested in (sec): 14.063151836395264
Fold tested in (sec): 14.391908645629883
Accuracy: 0.23333333333333334
Time elapsed (sec): 63.587482929229736
Results saved to disk
Testing: Matern12_agg_disc_share_binned
Fold tested i

Fold tested in (sec): 124.21686816215515
Fold tested in (sec): 126.16918802261353
Fold tested in (sec): 125.201908826828
Fold tested in (sec): 126.81310081481934
Accuracy: 0.7416666666666666
Time elapsed (sec): 627.7611107826233
Results saved to disk
Testing: RBF_agg_disc_share
Fold tested in (sec): 126.556312084198
Fold tested in (sec): 128.01990509033203
Fold tested in (sec): 127.65163612365723
Fold tested in (sec): 129.75132179260254
Fold tested in (sec): 128.52014136314392
Accuracy: 0.23333333333333334
Time elapsed (sec): 640.5022282600403
Results saved to disk
Testing: RBF_agg_disc_share_binned
Fold tested in (sec): 129.8426640033722
Fold tested in (sec): 129.83310866355896
Fold tested in (sec): 131.13194608688354
Fold tested in (sec): 130.0711522102356
Fold tested in (sec): 131.87554669380188
Accuracy: 0.7333333333333332
Time elapsed (sec): 652.7569487094879
Results saved to disk
Testing: RBF_ind_cont_share
Fold tested in (sec): 395.1233284473419
Fold tested in (sec): 412.8745145

In [None]:
best_result = organize_results(results)

In [None]:
pickle_results(best_results, 'best')

## Scratchpad

In [None]:
with open('gp_results/Matern12_agg_cont_share.pkl', 'rb') as f:
    results = pickle.load(f)

In [None]:
len(results[2][1])

In [None]:
len(results[2])

In [None]:
df = pd.DataFrame(results[2][0][1])
df.head()

In [None]:
kern = gpflow.kernels.Matern32(1)

In [None]:
x = datasets[0][0][0] 
y = datasets[0][1][0]

In [None]:
x, y = x_and_y(x, y)

In [None]:
likelihood = create_likelihood(y_all)

In [None]:
kf = KFold(n_splits=5)
scores = []
all_densities = []

for train_index, test_index in kf.split(x):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = build_model(x_train, y_train, kern, likelihood)
    densities = test_model(model, x_test)
    scores.append(accuracy(y_test, densities))
    all_densities.append((x_test, densities))

In [None]:
for index, kern in enumerate(kernels):
    for top_level in datasets:
        for x_all in top_level[0]:
            for y_all in top_level[1]:
                print(index)
                x, y = x_and_y(x_all, y_all)
                x = [int(a) for a in x]
                y = [int(a) for a in y]
                print(pd.DataFrame(y).isstr().values.any())
                break

In [None]:
# construct ordinal likelihood - bin_edges is the same as unique(Y) but centered

bin_edges = np.array(np.arange(np.unique(y_share['y']).size + 1), dtype=float)
# Need to check in on this, tutorial does the below which ends up with negative bins
# bin_edges = bin_edges - bin_edges.mean()
bin_edges = bin_edges - .5
likelihood = gpflow.likelihoods.Ordinal(bin_edges)

# create X and y

X = np.array(x)
y = np.array(y_share['y'].replace(['decrease', 'constant', 'increase'], [0, 1, 2])).reshape(120,1)

In [None]:
x[:1].shape

In [None]:
y_abs['y'][0]

In [None]:
mu, var = gaussian_model.predict_y(x[:1])

In [None]:
mu

In [None]:
var

In [None]:
bin_edges

In [None]:
mu = []
var = []

for x_test in np.array(x):
    m, v = gaussian_model.predict_y([x_test])
    mu.append(m)
    var.append(v)

In [None]:
mu

In [None]:
gaussian_model

In [None]:
gaussian_model.Y.read_value()

In [None]:
import matplotlib
%matplotlib inline
plt = matplotlib.pyplot

In [None]:
m = gaussian_model

# Predictive density for a single input x=0.5
x_new = X[0]
ys = np.arange(np.max(m.Y.value+1)).reshape([-1, 1])
x_new_vec = x_new*np.ones_like(ys)
# for predict_density x and y need to have the same number of rows
dens_new = np.exp(m.predict_density(x_new_vec, ys))
fig = plt.figure(figsize=(8, 4))
plt.bar(x=ys.flatten(), height=dens_new.flatten())

In [None]:
dens_new

In [None]:
import time

print('Testing: Matern12 ind disc share_binned')
kern = 'Matern12'
start = time.time()
x, y = x_and_y(x_disc_ind, y_share_bin_ind)
result = cv(x, y, y_share_bin_ind, kern)
end = time.time()
print('Accuracy:', result[0])
print('Time elapsed (sec):', end - start)