In [1]:
import time
from collections import defaultdict

import gpflow
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score, confusion_matrix

from ordinal_likelihood import Ordinal

W0908 14:26:11.805272 4489426368 deprecation_wrapper.py:119] From /Users/rob/.pyenv/versions/brookfield/lib/python3.5/site-packages/gpflow/session_manager.py:31: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

W0908 14:26:11.809288 4489426368 deprecation_wrapper.py:119] From /Users/rob/.pyenv/versions/brookfield/lib/python3.5/site-packages/gpflow/misc.py:27: The name tf.GraphKeys is deprecated. Please use tf.compat.v1.GraphKeys instead.

W0908 14:26:12.030620 4489426368 deprecation_wrapper.py:119] From /Users/rob/.pyenv/versions/brookfield/lib/python3.5/site-packages/gpflow/training/tensorflow_optimizer.py:169: The name tf.train.AdadeltaOptimizer is deprecated. Please use tf.compat.v1.train.AdadeltaOptimizer instead.

W0908 14:26:12.031362 4489426368 deprecation_wrapper.py:119] From /Users/rob/.pyenv/versions/brookfield/lib/python3.5/site-packages/gpflow/training/tensorflow_optimizer.py:156: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1

In [3]:
# This class handles each dataset, both the x and the y. This is done mainly for grid search 
# reasons as it allows the creation of lists of datasets and the automatic transformation
# of them into the right form for GPFlow
class GaussianDataset():
    """Bundle of features, integer-encoded ordinal labels and descriptive metadata.

    Keeping x, y and the dataset characteristics together makes it possible to
    build lists of datasets (e.g. for a grid search) that are already in the
    array form the GPFlow model code consumes.
    """

    def __init__(self, x, y_labels, y_true_dist, group, agg_level, value_type, binned):
        """Convert the raw inputs into numpy arrays and record the metadata.

        x: DataFrame or Numpy array, the data features
        y_labels: 1-D sequence (Series, Numpy array or list) of class labels;
            string labels found in `data_transforms` are mapped to their
            integer class, every other value is left as-is and must be
            castable to int64
        y_true_dist: the true distribution over the labels, stored unchanged
        group: Array, the noc label for each skill vector
        agg_level: String, 'agg' or 'ind' for whether the data is aggregated
        value_type: String, 'disc' or 'cont' (discrete or continuous data)
        binned: String, 'binned' or 'not_binned' whether the data labels are binned
        """
        # Transformation of string data into integer data
        self.data_transforms = {'decrease':     2,
                                'constant':     1,
                                'increase':     0,
                                'fewer':        2,
                                'same':         1,
                                'more':         0,
                                'not_increase': 1}
        # Transformations necessary to get in proper TensorFlow format
        self.x = np.array(x)
        # The labels are wrapped in a pandas Series so the string -> int mapping
        # can be applied, cast to 'int64' and finally shaped into a column
        # vector with one row per label. reshape(-1, 1) infers the row count
        # from the data, so inputs without a `.shape` attribute (plain lists)
        # are accepted as well.
        self.y = (pd.Series(y_labels)
                  .replace(self.data_transforms)
                  .values.astype('int64')
                  .reshape(-1, 1))
        # Characteristics of the data
        self.y_true_dist = y_true_dist
        self.group = group
        self.agg_level = agg_level
        self.value_type = value_type
        self.binned = binned

In [4]:
# Handles one set of results from a testing cycle of the model
class Result():
    """Metrics for one set of predictions from a testing cycle of the model.

    All metrics are computed eagerly in the constructor and stored as
    attributes: `mse`, `mae`, `roc` and `confusion`.
    """

    # y_true_dist: Array of Tuples, true confidence in each label
    # y_pred_dist: Array of Tuples, predicted confidence in each label
    def __init__(self, y_true_dist, y_pred_dist):
        self.y_true_dist = y_true_dist
        self.y_pred_dist = y_pred_dist
        # The class of a row is the label with the highest confidence
        self.y_true_class = [np.argmax(dist) for dist in y_true_dist]
        self.y_pred_class = [np.argmax(dist) for dist in y_pred_dist]
        # Error metrics are computed on the distributions themselves
        self.mse = mean_squared_error(y_true_dist, y_pred_dist)
        self.mae = mean_absolute_error(y_true_dist, y_pred_dist)
        # Classification metrics are computed on the argmax classes
        self.roc = self.multiclass_roc_auc_score()
        self.confusion = confusion_matrix(self.y_true_class, self.y_pred_class)

    # From stackoverflow:
    # https://stackoverflow.com/questions/39685740/calculate-sklearn-roc-auc-score-for-multi-class
    # Baseline assumption is that the average of one-vs-all for each of the classes will be
    # similar to the ROC-AUC score for a binary label.
    def multiclass_roc_auc_score(self, average="macro"):
        """Return the unweighted mean of the one-vs-rest ROC-AUC scores.

        One score is computed for every class that occurs in the true labels,
        using the hard (argmax) class predictions rather than the confidences.

        average: forwarded unchanged to sklearn's `roc_auc_score`.

        NOTE(review): sklearn documents that ROC-AUC is undefined when only one
        class is present in y_true, which is what happens here when every true
        label is the same class - confirm how the installed version reports it.
        """
        class_roc_auc = []
        # Only classes present in the truth get a score; sorted for a stable order
        for per_class in sorted(set(self.y_true_class)):
            # Marking the current class as 1 and every other class as 0. The
            # comparison is done directly against the current class so that a
            # predicted class which never occurs in the true labels counts as
            # "other" instead of being treated as a positive for every class.
            new_actual_class = [1 if c == per_class else 0 for c in self.y_true_class]
            new_pred_class = [1 if c == per_class else 0 for c in self.y_pred_class]

            # Using the sklearn metrics method to calculate the roc_auc_score
            roc_auc = roc_auc_score(new_actual_class, new_pred_class, average=average)
            class_roc_auc.append(roc_auc)

        return sum(class_roc_auc)/len(class_roc_auc)

In [5]:
# Handles all of the aspects of the model from building to testing
class GaussianModel():
    """Builds, trains, tests and predicts with a GPFlow VGP ordinal model."""

    # Base kernel names understood by get_kern; each one is the name of a
    # class in gpflow.kernels.
    _BASE_KERNELS = ('Matern12', 'Matern32', 'Matern52', 'RBF')

    # x_train: x vector containing the training data
    # y_train: y vector containing the training labels
    # kernel: String, representing the kernel type ('xxx' or 'xxx_Linear',
    #         where xxx is one of Matern12, Matern32, Matern52 or RBF)
    # n_classes: Int, number of ordinal classes; sets the likelihood bin edges
    #            and the shape of the Gamma prior on the kernel variances
    def __init__(self, x_train, y_train, kernel, n_classes):
        self.x_train = x_train
        self.y_train = y_train
        self.kernel_name = kernel
        self.n_classes = n_classes
        # Populated by build_train() and test_model() respectively
        self.model = None
        self.results = None

    # Builds, trains and then tests the model. Expects y_test to be a vector of the
    # predicted distributions over the potential labels of y
    def build_train_test(self, x_test, y_test):
        """Train on the stored training data and return the Result for the test set."""
        self.build_train()
        return self.test_model(x_test, y_test)

    ### Functions for building and training the model ###

    # This builds and trains the model, could potentially be done asynchronously
    def build_train(self):
        """Build the VGP model, optimise it and store it in `self.model`."""
        # Building the components of the model; the kernel spans every feature column
        kernel = self.get_kern(self.x_train.shape[1])
        likelihood = self.create_likelihood()
        # Build the model itself from GPFlow. Again, TensorFlow is finicky and require the
        # x matrix to be cast to tf.float64 at this stage rather than any stage beforehand.
        gaussian_model = gpflow.models.VGP(tf.cast(self.x_train, tf.float64),
                                           self.y_train,
                                           kern=kernel,
                                           likelihood=likelihood)
        # Train the model using Scipy (recommended from GPFlow documentation)
        gpflow.train.ScipyOptimizer().minimize(gaussian_model)
        self.model = gaussian_model

    # Creates the kernel for the model, this is based off a string passed in as a param
    # during object creation. Can be expanded with more kernels offered by GPFlow.
    def get_kern(self, dims):
        """Create the kernel described by `self.kernel_name`.

        dims: Int, passed to the kernels as `input_dim`.

        Returns the base kernel, summed with a Linear kernel when the name has
        the form 'xxx_Linear'. Any other suffix is ignored.

        Raises ValueError when the base kernel name is not supported.
        """
        # Assumption: kernel will be 'xxx_yyy' where yyy is Linear or does not exist
        kern = self.kernel_name.split('_')
        # Validate up front: an unknown name used to leave `kernel` unassigned
        # and surface as an UnboundLocalError further down.
        if kern[0] not in self._BASE_KERNELS:
            raise ValueError("Unsupported kernel '{}': expected one of {}".format(
                kern[0], ', '.join(self._BASE_KERNELS)))
        with gpflow.defer_build():
            # Dims here defines the columns that the kernel looks at
            kernel = getattr(gpflow.kernels, kern[0])(input_dim=dims)
            # Prior decided because it performed the best and was used by Nesta
            kernel.variance.prior = gpflow.priors.Gamma(scale=1,shape=self.n_classes)
            # This handles the case where the kernels are summed together
            if len(kern) == 2 and kern[1] == 'Linear':
                linear = gpflow.kernels.Linear(input_dim=dims)
                linear.variance.prior = gpflow.priors.Gamma(scale=1,shape=self.n_classes)
                kernel += linear
        return kernel

    # Creation of the likelihood can be done with either GPFlow or the code from Johnathon.
    # Bin edges are important because they define the categories for the predictions since
    # a GP simply returns a predicted value on a continuous scale.
    def create_likelihood(self):
        """Return an Ordinal likelihood with bin edges -0.5, 0.5, ..., n_classes - 0.5."""
        bin_edges = np.array(np.arange(self.n_classes + 1), dtype=float)
        bin_edges = bin_edges - .5
        # This is the new ordinal likelihood from Johnathon
        return Ordinal(bin_edges)

    ### Functions for testing the model ###

    # This creates predictions for each of the rows in x_test and then creates a results
    # object that is saved in state.
    def test_model(self, x_test, y_test):
        """Predict a scaled distribution for every row of x_test and score it against y_test.

        The Result is stored in `self.results` and also returned.
        """
        densities = []
        # Predictive density (i.e. the confidence in each category) for a single input x.
        # np.asarray makes the loop walk over rows for DataFrame input as well;
        # iterating a DataFrame directly would yield its column labels.
        for x in np.asarray(x_test):
            # Predictions need to be scaled to 100% because there are theoretically infinite
            # classes that it could predict.
            densities.append(self.scale_pred(self.predictive_density(x)))
        self.results = Result(y_test, densities)
        return self.results

    # Create the prediction for each category, from:
    # https://gpflow.readthedocs.io/en/latest/notebooks/ordinal.html
    def predictive_density(self, x):
        """Return the (unscaled) predictive density of every class for one input row x."""
        # One candidate label per class, as a column vector.
        # NOTE(review): the number of classes is taken from the largest training
        # label, not from self.n_classes - confirm the two always agree.
        ys = np.arange(np.max(self.model.Y.value+1)).reshape([-1, 1])
        # Repeat x once per candidate label via broadcasting
        x_new_vec = x*np.ones_like(ys)
        # For predict_density x and y need to have the same number of rows
        densities = np.exp(self.model.predict_density(x_new_vec, ys))
        # Need to unpack the densities from [[i], [j], [k]]
        densities = [d for [d] in densities]
        return densities

    ### Functions for predicting on new data ###

    def predict_new_nocs(self, nocs):
        """Predict a category for every row of the DataFrame `nocs`.

        Returns a tuple (predictions, classes):
        predictions: dict keyed by the row index, each value holding the
            'Category' name and the scaled 'Distribution'
        classes: defaultdict counting how many rows fell into each category
        """
        mappings = {0: 'increase',
                    1: 'constant/not increase',
                    2: 'decrease'}
        predictions = {}
        classes = defaultdict(int)
        for index, row in nocs.iterrows():
            prediction = self.scale_pred(self.predictive_density(np.asarray(row)))
            # The most confident class decides the category
            category = mappings[np.argmax(prediction)]
            predictions[index] = {'Category':     category,
                                  'Distribution': prediction}
            classes[category] += 1
        return predictions, classes

    ### Utility functions ###

    # Scales the predictions to each 100%
    def scale_pred(self, pred):
        """Return `pred` as a tuple rescaled so that its entries sum to 1."""
        factor = 1/sum(pred)
        return tuple(p * factor for p in pred)