# pyod/utils/data.py

# -*- coding: utf-8 -*-
"""Utility functions for manipulating data
"""
# Author: Yue Zhao <yuezhao@cs.toronto.edu>
# License: BSD 2 clause

from __future__ import division
from __future__ import print_function

import numpy as np

from sklearn.utils import column_or_1d
from sklearn.utils import check_random_state
from sklearn.metrics import roc_auc_score

from .utility import precision_n_scores

# Largest value representable by a signed 32-bit integer (``np.int32``).
# Not referenced by the functions visible in this part of the module.
MAX_INT = np.iinfo(np.int32).max


def _generate_data(n_inliers, n_outliers, n_features, coef, offset,
                   random_state):
    """Draw one labelled batch of synthetic inliers and outliers.

    Inliers are ``random_state.randn`` draws scaled by ``coef`` and shifted
    by ``offset``. Outliers are ``random_state.uniform`` draws bounded by
    ``-offset`` and ``offset``. Inlier rows come first in the result,
    followed by the outlier rows.

    :param n_inliers: The number of inlier rows to draw.
    :type n_inliers: int

    :param n_outliers: The number of outlier rows to draw.
    :type n_outliers: int

    :param n_features: The number of columns of every row.
    :type n_features: int

    :param coef: Multiplier applied to the Gaussian draws.
    :type coef: float

    :param offset: Shift added to the inliers; also the bound (with its
        negation) of the uniform range used for the outliers.
    :type offset: int

    :param random_state: The random number generator to draw from.
    :type random_state: RandomState instance.

    :return: ``X`` of shape (n_inliers + n_outliers, n_features) and ``y``
        of shape (n_inliers + n_outliers,), where ``y`` is 0 for inlier
        rows and 1 for outlier rows.
    :rtype: tuple of two numpy arrays
    """
    # The order of the two draws is part of the behaviour: it fixes which
    # numbers a seeded generator hands out to inliers versus outliers.
    gaussian_rows = coef * random_state.randn(n_inliers, n_features) + offset
    uniform_rows = random_state.uniform(
        low=-offset,
        high=offset,
        size=(n_outliers, n_features),
    )

    samples = np.concatenate((gaussian_rows, uniform_rows), axis=0)
    labels = np.concatenate((np.zeros(n_inliers), np.ones(n_outliers)))

    return samples, labels


def generate_data(n_train=1000, n_test=500, n_features=2, contamination=0.1,
                  train_only=False, offset=10, random_state=None):
    """Generate synthetic train (and optionally test) data with outliers.

    A shift and a scale are sampled once from the random state and shared
    by the train and test sets. Inliers are produced from Gaussian draws
    and outliers from uniform draws by ``_generate_data``.

    :param n_train: The number of training points to generate.
    :type n_train: int, optional (default=1000)

    :param n_test: The number of test points to generate. Not used when
        ``train_only`` is true.
    :type n_test: int, optional (default=500)

    :param n_features: The number of features.
    :type n_features: int, optional (default=2)

    :param contamination: The proportion of outliers in each generated
        set; ``int(n * contamination)`` points of a set are outliers.
    :type contamination: float in (0., 0.5), optional (default=0.1)

    :param train_only: If true, generate and return the training data only.
    :type train_only: bool, optional (default=False)

    :param offset: Bound passed to ``random_state.randint`` to sample the
        shift/range actually used for the Gaussian and uniform draws.
    :type offset: int, optional (default=10)

    :param random_state: If int, random_state is the seed used by the
        random number generator; If RandomState instance, random_state is
        the random number generator; If None, the random number generator
        is the RandomState instance used by `np.random`.
    :type random_state: int, RandomState instance or None,
        optional (default=None)

    :return: ``(X_train, y_train)`` when ``train_only`` is true, otherwise
        ``(X_train, y_train, X_test, y_test)``.
    :rtype: tuple of ndarray (2 or 4 elements)
    """
    rng = check_random_state(random_state)

    # NOTE(review): randint is called with only ``low``; per the NumPy docs
    # that draws from [0, offset), so the sampled shift may be 0 -- confirm
    # this is intended before changing, as it affects seeded output.
    sampled_offset = rng.randint(low=offset)
    # The small constant keeps the scale from being exactly zero.
    sampled_coef = rng.random_sample() + 0.001

    def _draw_split(n_samples):
        # Split the requested size into outliers/inliers, then draw the set.
        n_out = int(n_samples * contamination)
        n_in = int(n_samples - n_out)
        return _generate_data(n_in, n_out, n_features, sampled_coef,
                              sampled_offset, rng)

    X_train, y_train = _draw_split(n_train)
    if train_only:
        return X_train, y_train

    # The test set is drawn after the training set from the same generator.
    X_test, y_test = _draw_split(n_test)
    return X_train, y_train, X_test, y_test


def get_color_codes(y):
    """
    Map binary ground-truth labels to plotting color codes.
    Inliers (0): blue ('b')
    Outliers (1): red ('r')

    :param y: The binary labels of the ground truth, where 0 is inlier
    :type y: list, array, numpy array of shape (n_samples,)

    :return: Color codes, one per sample, e.g. ['r', 'b', ..., 'b']
    :rtype: numpy array of str, shape (n_samples,)
    """
    labels = column_or_1d(y)

    # Start with every sample colored as an inlier (blue) ...
    colors = np.array(['b'] * len(labels), dtype=str)

    # ... then recolor the positions labelled 1 as outliers (red).
    outlier_positions = np.flatnonzero(labels == 1)
    colors[outlier_positions] = 'r'

    return colors


def evaluate_print(clf_name, y, y_pred):
    """
    Print the ROC AUC and precision @ rank n of a detector on one line.
    Both metrics are rounded to 4 decimals. Intended for the examples;
    internal use only.

    :param clf_name: The name of the detector
    :type clf_name: str

    :param y: The ground truth
    :type y: list or array of shape (n_samples,)

    :param y_pred: The predicted outlier scores
    :type y_pred: list or array of shape (n_samples,)
    """
    truth = column_or_1d(y)
    scores = column_or_1d(y_pred)

    # Compute both metrics up front, ROC first, then precision @ rank n.
    roc_value = np.round(roc_auc_score(truth, scores), decimals=4)
    prn_value = np.round(precision_n_scores(truth, scores), decimals=4)

    message = '{clf_name} ROC:{roc}, precision @ rank n:{prn}'.format(
        clf_name=clf_name, roc=roc_value, prn=prn_value)
    print(message)