### Numpy Transformations
#### How to summarize the mean value of variables:                           
* Arithemtic mean: for varaiables that can produce negative values
  + examples: efficacy assasy, and LogD
* Geometric mean: variables that span many orders of magnitude and can not be negative
  + examples: potency assays, clearance assays, and ratios such as efflux
* logit: variables that have values between [0, 1] or [0, 100%]
  + examples: protein binding
* Median: varaiables that have discrete values, 
  + example: a feature value corresponding to the max response/target variable
    + this feature value is a discrete value among several possible values
* no summary: 
  + example, calculated pKa, or categorical variables  

In [9]:
import numpy as np
from typing import List, Union, Tuple

#### Summary method
* Geometric mean
* logit

In [5]:
# geometric mean
def geometric_mean(input_array: np.array) -> float:
    input_array = input_array[input_array > 0]
    result = np.power(10, np.mean(np.log10(input_array))) if input_array.size >0 else None
    return result

# logit mean
def logit_mean(input_array: np.array) -> float:
    # logit mean is only for variable in [0, 1]
    input_array = input_array[np.logic_and(input_array >= 0, input_array <= 1)]
    if input_array.size == 0:
        return None
    
    # set value close to 1 and 0 as threshold values
    input_array[input_array > 0.999] = 0.999
    input_array[input_array < 0.001] = 0.001
    sigmoid = np.log(input_array/(1 - input_array))
    alpha = np.mean(sigmoid)
    
    return np.exp(alpha)/(1 + np.exp(alpha))

In [41]:
# transform methods

# transform error settings
transform_np_error_settings = {'divide': 'ignore', 'invalid': 'ignore'}

# transform input to log10
def transform_log10(value: Union[float, np.ndarray]) -> Union[float, np.ndarray]:
    # define a minimum between np.float 0 and 1
    minimum = float(np.log10(np.nextafter(np.float32(0), np.float32(1))))
    
    old_settings = np.seterr(**transform_np_error_settings)
    try:
        result = np.log10(value)
        if isinstance(result, np.ndarray):
            result[np.isinf(result)] = np.nan
            result[result < minimum] = minimum
        else:
            if np.isinf(result):
                result = float('nan')
            elif result < minimum:
                result = minimum
    finally:
        np.seterr(**old_settings)
    return result    

# inverse function of log10 transformation
def untransform_inv_log10(value: Union[float, np.ndarray]) -> Union[float, np.ndarray]:
    maximum = np.finfo(np.float32).max
    old_settings = np.seterr(**transform_np_error_settings)
    try:
        result = np.power(10.0, value)
        if isinstance(result, np.ndarray):
            result[np.isinf(result)] = np.nan
            result[result > maximum] = maximum
        else:
            if result > maximum:
                result = maximum
    finally:
        np.seterr(**old_settings)
    return result    

# logit transformation
def transform_logit(value: Union[float, np.ndarray]) -> Union[float, np.ndarray]:
    if isinstance(value, np.ndarray):
        value[value >=1.0] = 0.999
        value[value <=0.0] = 0.001
    else:
        if value >= 1.0:
            value = 0.999
        elif value <= 0.001:
            value = 0.001
    return np.log10(value/(1-value))        
        
# inverse function of logit transformation
def untransform_inv_logit(value: Union[float, np.ndarray]) -> Union[float, np.ndarray]:
    return np.power(10, value) / (np.power(10, value) + 1)
    

# log normal transformation (return the mu and sigma of tranformed normal distribution)
# https://en.wikipedia.org/wiki/Log-normal_distribution
def transform_log10normal(mu: Union[float, np.ndarray], sigma: Union[float, np.ndarray]) -> \
    Union[Tuple[float, float], Tuple[np.array, np.array]]:
    # define a minimum between np.float 0 and 1
    minimum = float(np.log10(np.nextafter(np.float32(0), np.float32(1))))
    
    old_settings = np.seterr(**transform_np_error_settings)
    try:
        log_mu = np.log10(mu **2 / np.sqrt(sigma ** 2 + mu ** 2))
        log_sigma = np.sqrt(np.log10(sigma ** 2 / mu **2 + 1))
        if isinstance(log_mu, np.ndarray):
            log_mu[log_mu < minimum] = minimum
            log_sigma[log_sigma < minimum] = minimum
        else:
            if log_mu < minimum:
                log_mu = minimum
            if log_sigma < minimum:
                log_sigma = minimum
    finally:
        np.seterr(**old_settings)
    return log_mu, log_sigma  
    

# inverse transformation of log normal
def untransform_inv_log10normal(log_mu: Union[float, np.ndarray], log_sigma: Union[float, np.ndarray]) -> \
    Union[Tuple[float, float], Tuple[np.array, np.array]]:
    
    maximum = np.finfo(np.float32).max
    old_settings = np.seterr(**transform_np_error_settings)
    try:
        mu = np.power(10.0, log_mu + 0.5 * np.power(log_sigma, 2))
        sigma = np.power(10.0, 2 * log_mu + np.power(log_sigma, 2)) * (np.power(10.0, np.power(log_sigma, 2)) -1)
        sigma = np.sqrt(sigma)
        if isinstance(mu, np.ndarray):
            mu[mu > maximum] = maximum
            sigma[sigma > maximum] = maximum
        else:
            if mu > maximum:
                mu = maximum
            if sigma > maximum:
                sigma = maximum
    finally:
        np.seterr(**old_settings)
    return mu, sigma 

In [42]:
test_array_log = np.array(np.linspace(0, 5))

In [43]:
test_array_log

array([0.        , 0.10204082, 0.20408163, 0.30612245, 0.40816327,
       0.51020408, 0.6122449 , 0.71428571, 0.81632653, 0.91836735,
       1.02040816, 1.12244898, 1.2244898 , 1.32653061, 1.42857143,
       1.53061224, 1.63265306, 1.73469388, 1.83673469, 1.93877551,
       2.04081633, 2.14285714, 2.24489796, 2.34693878, 2.44897959,
       2.55102041, 2.65306122, 2.75510204, 2.85714286, 2.95918367,
       3.06122449, 3.16326531, 3.26530612, 3.36734694, 3.46938776,
       3.57142857, 3.67346939, 3.7755102 , 3.87755102, 3.97959184,
       4.08163265, 4.18367347, 4.28571429, 4.3877551 , 4.48979592,
       4.59183673, 4.69387755, 4.79591837, 4.89795918, 5.        ])

In [44]:
log10_transform_array = transform_log10(test_array_log)

In [45]:
inv_log10_array = untransform_inv_log10(log10_transform_array)

In [46]:
inv_log10

array([       nan, 0.10204082, 0.20408163, 0.30612245, 0.40816327,
       0.51020408, 0.6122449 , 0.71428571, 0.81632653, 0.91836735,
       1.02040816, 1.12244898, 1.2244898 , 1.32653061, 1.42857143,
       1.53061224, 1.63265306, 1.73469388, 1.83673469, 1.93877551,
       2.04081633, 2.14285714, 2.24489796, 2.34693878, 2.44897959,
       2.55102041, 2.65306122, 2.75510204, 2.85714286, 2.95918367,
       3.06122449, 3.16326531, 3.26530612, 3.36734694, 3.46938776,
       3.57142857, 3.67346939, 3.7755102 , 3.87755102, 3.97959184,
       4.08163265, 4.18367347, 4.28571429, 4.3877551 , 4.48979592,
       4.59183673, 4.69387755, 4.79591837, 4.89795918, 5.        ])

In [47]:
test_array_logit = np.array(np.linspace(0, 1))
test_array_logit

array([0.        , 0.02040816, 0.04081633, 0.06122449, 0.08163265,
       0.10204082, 0.12244898, 0.14285714, 0.16326531, 0.18367347,
       0.20408163, 0.2244898 , 0.24489796, 0.26530612, 0.28571429,
       0.30612245, 0.32653061, 0.34693878, 0.36734694, 0.3877551 ,
       0.40816327, 0.42857143, 0.44897959, 0.46938776, 0.48979592,
       0.51020408, 0.53061224, 0.55102041, 0.57142857, 0.59183673,
       0.6122449 , 0.63265306, 0.65306122, 0.67346939, 0.69387755,
       0.71428571, 0.73469388, 0.75510204, 0.7755102 , 0.79591837,
       0.81632653, 0.83673469, 0.85714286, 0.87755102, 0.89795918,
       0.91836735, 0.93877551, 0.95918367, 0.97959184, 1.        ])

In [48]:
logit_transform_array = transform_logit(test_array_logit)

In [49]:
inv_logit_array = untransform_inv_logit(logit_transform_array)

In [50]:
inv_logit_array

array([0.001     , 0.02040816, 0.04081633, 0.06122449, 0.08163265,
       0.10204082, 0.12244898, 0.14285714, 0.16326531, 0.18367347,
       0.20408163, 0.2244898 , 0.24489796, 0.26530612, 0.28571429,
       0.30612245, 0.32653061, 0.34693878, 0.36734694, 0.3877551 ,
       0.40816327, 0.42857143, 0.44897959, 0.46938776, 0.48979592,
       0.51020408, 0.53061224, 0.55102041, 0.57142857, 0.59183673,
       0.6122449 , 0.63265306, 0.65306122, 0.67346939, 0.69387755,
       0.71428571, 0.73469388, 0.75510204, 0.7755102 , 0.79591837,
       0.81632653, 0.83673469, 0.85714286, 0.87755102, 0.89795918,
       0.91836735, 0.93877551, 0.95918367, 0.97959184, 0.999     ])

In [51]:
lognorm_mu, lognorm_sigma  = transform_log10normal(mu = test_array_log, sigma = test_array_logit)

In [54]:
mu, sigma = untransform_inv_log10normal(lognorm_mu, lognorm_sigma)

In [55]:
mu

array([3.40282347e+38, 1.02040816e-01, 2.04081633e-01, 3.06122449e-01,
       4.08163265e-01, 5.10204082e-01, 6.12244898e-01, 7.14285714e-01,
       8.16326531e-01, 9.18367347e-01, 1.02040816e+00, 1.12244898e+00,
       1.22448980e+00, 1.32653061e+00, 1.42857143e+00, 1.53061224e+00,
       1.63265306e+00, 1.73469388e+00, 1.83673469e+00, 1.93877551e+00,
       2.04081633e+00, 2.14285714e+00, 2.24489796e+00, 2.34693878e+00,
       2.44897959e+00, 2.55102041e+00, 2.65306122e+00, 2.75510204e+00,
       2.85714286e+00, 2.95918367e+00, 3.06122449e+00, 3.16326531e+00,
       3.26530612e+00, 3.36734694e+00, 3.46938776e+00, 3.57142857e+00,
       3.67346939e+00, 3.77551020e+00, 3.87755102e+00, 3.97959184e+00,
       4.08163265e+00, 4.18367347e+00, 4.28571429e+00, 4.38775510e+00,
       4.48979592e+00, 4.59183673e+00, 4.69387755e+00, 4.79591837e+00,
       4.89795918e+00, 5.00000000e+00])

In [56]:
sigma

array([3.40282347e+38, 2.04081633e-02, 4.08163265e-02, 6.12244898e-02,
       8.16326531e-02, 1.02040816e-01, 1.22448980e-01, 1.42857143e-01,
       1.63265306e-01, 1.83673469e-01, 2.04081633e-01, 2.24489796e-01,
       2.44897959e-01, 2.65306122e-01, 2.85714286e-01, 3.06122449e-01,
       3.26530612e-01, 3.46938776e-01, 3.67346939e-01, 3.87755102e-01,
       4.08163265e-01, 4.28571429e-01, 4.48979592e-01, 4.69387755e-01,
       4.89795918e-01, 5.10204082e-01, 5.30612245e-01, 5.51020408e-01,
       5.71428571e-01, 5.91836735e-01, 6.12244898e-01, 6.32653061e-01,
       6.53061224e-01, 6.73469388e-01, 6.93877551e-01, 7.14285714e-01,
       7.34693878e-01, 7.55102041e-01, 7.75510204e-01, 7.95918367e-01,
       8.16326531e-01, 8.36734694e-01, 8.57142857e-01, 8.77551020e-01,
       8.97959184e-01, 9.18367347e-01, 9.38775510e-01, 9.59183673e-01,
       9.79591837e-01, 9.99000000e-01])