src/library/stats/src/distance.c

/*
 *  R : A Computer Language for Statistical Data Analysis
 *  Copyright (C) 1998-2017  The R Core Team
 *  Copyright (C) 2002-2017  The R Foundation
 *  Copyright (C) 1995, 1996 Robert Gentleman and Ross Ihaka
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, a copy is available at
 *  https://www.R-project.org/Licenses/
 */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#include <float.h>

#include <R.h>
#include <Rmath.h>
#include "stats.h"
#ifdef _OPENMP
# include <R_ext/MathThreads.h>
#endif

#define both_FINITE(a,b) (R_FINITE(a) && R_FINITE(b))
#ifdef R_160_and_older
#define both_non_NA both_FINITE
#else
#define both_non_NA(a,b) (!ISNAN(a) && !ISNAN(b))
#endif

static double R_euclidean(double *x, int nr, int nc, int i1, int i2)
{
    double dev, dist;
    int count, j;

    count= 0;
    dist = 0;
    for(j = 0 ; j < nc ; j++) {
	if(both_non_NA(x[i1], x[i2])) {
	    dev = (x[i1] - x[i2]);
	    if(!ISNAN(dev)) {
		dist += dev * dev;
		count++;
	    }
	}
	i1 += nr;
	i2 += nr;
    }
    if(count == 0) return NA_REAL;
    if(count != nc) dist /= ((double)count/nc);
    return sqrt(dist);
}

static double R_maximum(double *x, int nr, int nc, int i1, int i2)
{
    double dev, dist;
    int count, j;

    count = 0;
    dist = -DBL_MAX;
    for(j = 0 ; j < nc ; j++) {
	if(both_non_NA(x[i1], x[i2])) {
	    dev = fabs(x[i1] - x[i2]);
	    if(!ISNAN(dev)) {
		if(dev > dist)
		    dist = dev;
		count++;
	    }
	}
	i1 += nr;
	i2 += nr;
    }
    if(count == 0) return NA_REAL;
    return dist;
}

static double R_manhattan(double *x, int nr, int nc, int i1, int i2)
{
    double dev, dist;
    int count, j;

    count = 0;
    dist = 0;
    for(j = 0 ; j < nc ; j++) {
	if(both_non_NA(x[i1], x[i2])) {
	    dev = fabs(x[i1] - x[i2]);
	    if(!ISNAN(dev)) {
		dist += dev;
		count++;
	    }
	}
	i1 += nr;
	i2 += nr;
    }
    if(count == 0) return NA_REAL;
    if(count != nc) dist /= ((double)count/nc);
    return dist;
}

static double R_canberra(double *x, int nr, int nc, int i1, int i2)
{
    double dev, dist, sum, diff;
    int count, j;

    count = 0;
    dist = 0;
    for(j = 0 ; j < nc ; j++) {
	if(both_non_NA(x[i1], x[i2])) {
	    sum = fabs(x[i1]) + fabs(x[i2]);
	    diff = fabs(x[i1] - x[i2]);
	    if (sum > DBL_MIN || diff > DBL_MIN) {
		dev = diff/sum;
		if(!ISNAN(dev) ||
		   (!R_FINITE(diff) && diff == sum &&
		    /* use Inf = lim x -> oo */ (dev = 1., TRUE))) {
		    dist += dev;
		    count++;
		}
	    }
	}
	i1 += nr;
	i2 += nr;
    }
    if(count == 0) return NA_REAL;
    if(count != nc) dist /= ((double)count/nc);
    return dist;
}

static double R_dist_binary(double *x, int nr, int nc, int i1, int i2)
{
    int total, count, dist;
    int j;

    total = 0;
    count = 0;
    dist = 0;

    for(j = 0 ; j < nc ; j++) {
	if(both_non_NA(x[i1], x[i2])) {
	    if(!both_FINITE(x[i1], x[i2])) {
		warning(_("treating non-finite values as NA"));
	    }
	    else {
		if(x[i1] != 0. || x[i2] != 0.) {
		    count++;
		    if( ! (x[i1] != 0. && x[i2] != 0.) ) dist++;
		}
		total++;
	    }
	}
	i1 += nr;
	i2 += nr;
    }

    if(total == 0) return NA_REAL;
    if(count == 0) return 0;
    return (double) dist / count;
}

static double R_minkowski(double *x, int nr, int nc, int i1, int i2, double p)
{
    double dev, dist;
    int count, j;

    count= 0;
    dist = 0;
    for(j = 0 ; j < nc ; j++) {
	if(both_non_NA(x[i1], x[i2])) {
	    dev = (x[i1] - x[i2]);
	    if(!ISNAN(dev)) {
		dist += R_pow(fabs(dev), p);
		count++;
	    }
	}
	i1 += nr;
	i2 += nr;
    }
    if(count == 0) return NA_REAL;
    if(count != nc) dist /= ((double)count/nc);
    return R_pow(dist, 1.0/p);
}

enum { EUCLIDEAN=1, MAXIMUM, MANHATTAN, CANBERRA, BINARY, MINKOWSKI };
/* == 1,2,..., defined by order in the R function dist */

void R_distance(double *x, int *nr, int *nc, double *d, int *diag,
		int *method, double *p)
{
    int i, j = 0; // suppress clang 9 warning
    size_t  ij;  /* can exceed 2^31 - 1 */
    double (*distfun)(double*, int, int, int, int) = NULL;
#ifdef _OPENMP
    int nthreads;
#endif

    switch(*method) {
    case EUCLIDEAN:
	distfun = R_euclidean;
	break;
    case MAXIMUM:
	distfun = R_maximum;
	break;
    case MANHATTAN:
	distfun = R_manhattan;
	break;
    case CANBERRA:
	distfun = R_canberra;
	break;
    case BINARY:
	distfun = R_dist_binary;
	break;
    case MINKOWSKI:
	if(!R_FINITE(*p) || *p <= 0)
	    error(_("distance(): invalid p"));
	// plus special case below because of extra argument
	break;
    default:
	error(_("distance(): invalid distance"));
    }
    int dc = (*diag) ? 0 : 1; /* diag=1:  we do the diagonal */
#ifdef _OPENMP
    if (R_num_math_threads > 0)
	nthreads = R_num_math_threads;
    else
	nthreads = 1; /* for now */
    if (nthreads == 1) {
	/* do the nthreads == 1 case without any OMP overhead to see
	   if it matters on some platforms */
	ij = 0;
	for(j = 0 ; j <= *nr ; j++)
	    for(i = j+dc ; i < *nr ; i++)
		d[ij++] = (*method != MINKOWSKI) ?
		    distfun(x, *nr, *nc, i, j) :
		    R_minkowski(x, *nr, *nc, i, j, *p);
    }
    else
	/* This produces uneven thread workloads since the outer loop
	   is over the subdiagonal portions of columns.  An
	   alternative would be to use a loop on ij and to compute the
	   i and j values from ij. */
#pragma omp parallel for num_threads(nthreads) default(none)	\
    private(i, j, ij)						\
    firstprivate(nr, dc, d, method, distfun, nc, x, p)
	for(j = 0 ; j <= *nr ; j++) {
	    ij = j * (*nr - dc) + j - ((1 + j) * j) / 2;
	    for(i = j+dc ; i < *nr ; i++)
		d[ij++] = (*method != MINKOWSKI) ?
		    distfun(x, *nr, *nc, i, j) :
		    R_minkowski(x, *nr, *nc, i, j, *p);
	}
#else
    ij = 0;
    for(j = 0 ; j <= *nr ; j++)
	for(i = j+dc ; i < *nr ; i++)
	    d[ij++] = (*method != MINKOWSKI) ?
		distfun(x, *nr, *nc, i, j) : R_minkowski(x, *nr, *nc, i, j, *p);
#endif
}

#include <Rinternals.h>

SEXP Cdist(SEXP x, SEXP smethod, SEXP attrs, SEXP p)
{
    SEXP ans;
    int nr = nrows(x), nc = ncols(x), method = asInteger(smethod);
    int diag = 0;
    R_xlen_t N;
    double rp = asReal(p);
    N = (R_xlen_t)nr * (nr-1)/2; /* avoid int overflow for N ~ 50,000 */
    PROTECT(ans = allocVector(REALSXP, N));
    if(TYPEOF(x) != REALSXP) x = coerceVector(x, REALSXP);
    PROTECT(x);
    R_distance(REAL(x), &nr, &nc, REAL(ans), &diag, &method, &rp);
    /* tack on attributes */
    SEXP names = getAttrib(attrs, R_NamesSymbol);
    for (int i = 0; i < LENGTH(attrs); i++)
	setAttrib(ans, install(translateChar(STRING_ELT(names, i))),
		  VECTOR_ELT(attrs, i));
    UNPROTECT(2);
    return ans;
}