In [1]:
import numpy as np
from scipy.special import gammaln, betaln
from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin
from sklearn.metrics import pairwise_kernels
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import check_classification_targets
import csv
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split
import pandas as pd
import random

In [2]:
data_path = '../../data/output/normalized_data_X_5p.csv'
y_path = '../../data/output/y_5p.csv'

data = pd.read_csv(data_path)
y = pd.read_csv(y_path)
print(data.shape)
print(y.shape)
data.head()

(69884, 420)
(69884, 1)


Unnamed: 0,route,tripNum,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,gpsLon,...,alertTypeSB_ACCIDENT,alertTypeSB_CHIT_CHAT,alertTypeSB_HAZARD,alertTypeSB_JAM,alertTypeSB_NORMAL,alertTypeSB_POLICE,alertTypeSB_ROAD_CLOSED,jamBlockTypeSB_-,jamBlockTypeSB_NORMAL,jamBlockTypeSB_ROAD_CLOSED_EVENT
0,0.118068,0.0,0.084945,0.264128,0.4394,0.010487,0.0,6e-06,0.265656,0.439493,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.118068,0.0,0.084965,0.264428,0.443488,0.014304,0.0,9e-06,0.265956,0.444051,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.118068,0.0,0.084972,0.263625,0.448606,0.01803,0.0,1e-05,0.264915,0.448957,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.118068,0.0,0.084979,0.261608,0.458297,0.025182,0.0,1.4e-05,0.263352,0.456961,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.118068,0.0,0.084985,0.260228,0.463266,0.028953,0.0,0.0,0.261792,0.46317,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [3]:
# Making training and test data: 80% Training, 20% Test
random.seed(15) #to get always the same set
train_X, test_X, train_Y, test_Y = train_test_split(data, y, test_size=0.20, random_state=7)

In [4]:

"""
TODO:
- Large scale implementation (with support for sparse matrices)
- Change kernel radius definition from a multiple k of the nearest neighbour
  distance to the k-nearest neighbour distance.
"""


def robust_normalizer(X, one_sided_extrema=0.05, hypercube_edge_length=1.0):
    """Compute a robust translation and scale parameter."""
    m = np.median(X, axis=0)
    s = np.amax(np.abs(np.percentile(
        X, [one_sided_extrema, 1. - one_sided_extrema], axis=0
    ) - m), axis=0)
    s *= 2. / hypercube_edge_length
    s[s <= np.finfo(X.dtype).eps] = 1.
    return m, s


def ball_volume_loginvdthroot(d):
    """Returns log(V**(-1/d)) where V is the d-volume of a unit ball."""
    return -np.log(np.pi) / 2. + gammaln(d / 2. + 1.) / d


def nearestneighbour_distance_lowerbound(n, d):
    """Returns a lower bound on the expected nearest-neighbour distance of
    n points uniformly distributed over a d-dimensional hypercube."""
    return np.exp(ball_volume_loginvdthroot(d) + betaln((n + 1.) / 2., 1. / d)
                  - np.log(d))


def nearestneighbour_distance(n, d):
    """Returns an estimate of the expected nearest neighbour distance of a row
    in a real-world robustly normalized feature matrix of size n x d."""
    factor = 2. if d > 3 else 1.
    return nearestneighbour_distance_lowerbound(d, n) / factor


def knearestneighbour_distance(X, k=1, max_samples=1000):
    """Estimate the k-nearest neighbour distance."""
    S = X if X.shape[0] < max_samples else \
        X[np.random.choice(X.shape[0], max_samples, replace=False), :]
    X2 = (X ** 2).sum(axis=1)[:, np.newaxis]
    S2 = (S ** 2).sum(axis=1)[:, np.newaxis]
    dist = X2 + (S2.T - 2. * (X @ S.T))
    dist[dist <= np.sqrt(np.finfo(X.dtype).eps)] = np.inf
    dist.sort(axis=0)
    return np.median(np.sqrt(dist[k, :]))


def kernel_radius_to_gamma(kernel_radius, n, d, kernel_value_at_radius=0.5):
    """Converts a kernel radius into a gamma value.

    The kernel radius is defined as a multiple of the estimated nearest
    neighbour distance of a robustly normalized feature matrix of size n x d,
    and is the distance at which the kernel function attains the value
    kernel_value_at_radius.

    Gamma is hyperparameter of the RBF kernel exp(-gamma ||x-y||^2). Finding
    a good value for gamma can be hard to reason about, while setting it in
    terms of the kernel radius as a multiple of the nearest neighbour distance
    should be much more intuitive.

    To compute gamma given the kernel radius, we find:
        exp(-gamma (kernel_radius * nn_dist)^2) = kernel_value_at_radius
        gamma = -log(kernel_value_at_radius) (kernel_radius * nn_dist)^-2
    """
    nn_dist = nearestneighbour_distance(n, d)
    gamma = -np.log(kernel_value_at_radius) / (kernel_radius * nn_dist) ** 2.
    return gamma


class BaseAutoLSSVM(BaseEstimator, RegressorMixin):

    def __init__(self, kernel_radius=0.5, kernel_value_at_radius=0.5, mu=0.5):
        self.kernel_radius = kernel_radius
        self.kernel_value_at_radius = kernel_value_at_radius
        self.mu = mu

    def _normalize_X_y(self, X, y=None):
        """Remove median and scale to that 100*(1 - 2 * one_sided_extrema)%
        of the data is approximately between -0.5 and 0.5."""
        if not hasattr(self, 'X_m_'):
            self.X_m_, self.X_s_ = robust_normalizer(X)
        X = (X - self.X_m_) / self.X_s_
        if y is None:
            return X
        if not hasattr(self, 'y_m_'):
            self.y_m_, self.y_s_ = robust_normalizer(y)
        y = (y - self.y_m_) / self.y_s_
        return X, y

    def fit(self, X, y):
        # Validate input.
        X, y = check_X_y(X, y, accept_sparse=None, dtype='numeric')
        # Normalize input.
        self.n_, self.d_ = X.shape
        #X, y = self._normalize_X_y(X, y)
        self.gamma_ = kernel_radius_to_gamma(
            self.kernel_radius, self.n_, self.d_, self.kernel_value_at_radius)
        # Train model.
        self.K_ = pairwise_kernels(
            X, metric='rbf', gamma=self.gamma_, n_jobs=-1)
        return self

    def predict(self, X):
        # Validate input.
        check_is_fitted(self, 'K_')
        X = check_array(X, accept_sparse=None, dtype='numeric')
        # Predict with trained model.
        return self.K_.mean() * np.ones((X.shape[0],))


class AutoLSSVMRegressor(BaseAutoLSSVM):

    def __init__(self, gamma=1.0, eta=1.0):
        super(AutoLSSVMRegressor, self).__init__(gamma=gamma, eta=eta)

    def predict(self, X):
        y = super(AutoLSSVMRegressor, self).predict(X)
        return y

In [5]:
model = BaseAutoLSSVM()
model.fit(train_X, train_Y)
pred_array = model.predict(test_X, test_Y)

  y = column_or_1d(y, warn=True)


MemoryError: Unable to allocate array with shape (55907, 55907) and data type float64

In [None]:
# Bus Bunching
accuracy = accuracy_score(test_Y, pred_array)
precision = precision_score(test_Y, pred_array)
recall = recall_score(test_Y, pred_array)
f_measure = f1_score(test_Y, pred_array)

In [None]:
print("Accuracy: " + str(accuracy))
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F-measure: " + str(f_measure))