# Deriving derivative of loss class and testing markdown conversion for equation.#

In [1]:
import sympy as sym
import numpy as np

In [2]:
# Setting up residual for replacement, so that weighted residuals can be applied in code.
r ,y, yhat = sym.symbols('r,y yhat')
r_eq = y-yhat

# 1. Square Loss class

In [3]:
square_loss=(y-yhat)**2
display(square_loss,square_loss.subs(r_eq,r))

(y - yhat)**2

r**2

In [4]:
first_derv = sym.diff(square_loss,yhat).simplify()
display(first_derv,first_derv.simplify().subs(r_eq,r))

-2*y + 2*yhat

-2*y + 2*yhat

In [5]:
-2*r

-2*r

In [6]:
scnd_derv= sym.diff(first_derv,yhat).simplify()
scnd_derv

2

# 2. Normal Loss
## 2.1 Logliklihood function

In [7]:
sigma, pi = sym.symbols('sigma pi')

Probability density function (PDF) of the normal distribution ${p}(y; \hat{y},k) = \frac{1}{\sqrt {2 \pi} \sigma}e^{-\frac {(y-\hat{y}) ^{2}}{2 \sigma ^{2}}$.
See Bolker, B. M. (2008). Negative Binomial. In Ecological Models in R (pp. 129–130). Princeton University Press.

In [9]:
normpdf=1/(sym.sqrt(2*pi)*sigma) * sym.E**(-((y-yhat)**2/(2*sigma**2)))
display(normpdf,normpdf.subs(r_eq,r))

sqrt(2)*exp(-(y - yhat)**2/(2*sigma**2))/(2*sqrt(pi)*sigma)

sqrt(2)*exp(-r**2/(2*sigma**2))/(2*sqrt(pi)*sigma)

In [10]:
normpdf.args

(1/2, sqrt(2), 1/sqrt(pi), 1/sigma, exp(-(y - yhat)**2/(2*sigma**2)))

In [11]:
#Our loss function is the negative of the logliklihood.
loglike_args= []
for arg in normpdf.args:
    loglike_args.append(sym.ln(arg).simplify())
    
loglike_args

[-log(2),
 log(2)/2,
 -log(pi)/2,
 log(1/sigma),
 log(exp(-(y - yhat)**2/(2*sigma**2)))]

In [12]:
logpdf_p1= loglike_args[0]
logpdf_p2= loglike_args[1]
logpdf_p3= loglike_args[2]
logpdf_p4= loglike_args[3]
logpdf_p5= -(y - yhat)**2/(2*sigma**2)
# logpdf_p5 has residual in:
logpdf_p5_alt= -(r)**2/(2*sigma**2)
logpdf_p5_alt

-r**2/(2*sigma**2)

In [13]:
logpdf = logpdf_p1+logpdf_p2+logpdf_p3+logpdf_p4+logpdf_p5
logpdf_alt = logpdf_p1+logpdf_p2+logpdf_p3+logpdf_p4+logpdf_p5_alt
display(logpdf,logpdf_alt)

-log(pi)/2 + log(1/sigma) - log(2)/2 - (y - yhat)**2/(2*sigma**2)

-r**2/(2*sigma**2) - log(pi)/2 + log(1/sigma) - log(2)/2

In [14]:
normloss=-logpdf
normloss

log(pi)/2 - log(1/sigma) + log(2)/2 + (y - yhat)**2/(2*sigma**2)

In [15]:
first_derv = sym.diff(normloss,yhat).simplify()
display(first_derv,first_derv.subs(r_eq,r))

(-y + yhat)/sigma**2

-r/sigma**2

In [16]:
scnd_derv = sym.diff(first_derv,yhat).simplify()
scnd_derv

sigma**(-2)

# 3.  Gamma loss class in terms of mean and shape

In [17]:
 a, s, y, yhat= sym.symbols('a s y yhat')

Probability density function (PDF) of the gamma distribution is $\frac{1}{s^a\Gamma(a)}x^{a-1}e^{-x/s}$. However we need this in terms of mean (here $\mu$), luckily we can subistitute in $s=\frac{\mu}{a}$ to get our likelihood function. But lets start with a log tranformation of the pdf.

See Bolker, B. M. (2008). Gamma. In Ecological Models in R (pp. 131–133). Princeton University Press.

In [18]:
pdf_gamma = 1/(s**a*sym.gamma(a))*(y**(a-1)*sym.E**(-y/s))
pdf_gamma

s**(-a)*y**(a - 1)*exp(-y/s)/gamma(a)

In [19]:
pdf_gamma.args

(s**(-a), y**(a - 1), 1/gamma(a), exp(-y/s))

In [20]:
log_pdf_gamma_p1 = -a*sym.ln(s)
log_pdf_gamma_p2 = (a-1)*sym.ln(y)
log_pdf_gamma_p3 = -sym.ln(sym.gamma(a))
log_pdf_gamma_p4 = -y/s
log_pdf_gamma= log_pdf_gamma_p1+log_pdf_gamma_p2+log_pdf_gamma_p3+log_pdf_gamma_p4
log_pdf_gamma

-a*log(s) + (a - 1)*log(y) - log(gamma(a)) - y/s

In [21]:
s_in_terms_mu_a = yhat/a
log_pdf_mu_a_gamma = log_pdf_gamma.subs(s,s_in_terms_mu_a) 
log_pdf_mu_a_gamma

-a*y/yhat - a*log(yhat/a) + (a - 1)*log(y) - log(gamma(a))

In [22]:
log_pdf_mu_a_gamma.args

(-log(gamma(a)), (a - 1)*log(y), -a*log(yhat/a), -a*y/yhat)

In [23]:
sym.print_latex(log_pdf_mu_a_gamma)

- \frac{a y}{\hat{y}} - a \log{\left(\frac{\hat{y}}{a} \right)} + \left(a - 1\right) \log{\left(y \right)} - \log{\left(\Gamma\left(a\right) \right)}


In [24]:
from scipy.special import gamma, factorial, gammaln
def gamma_mu_shape_logpdf(x, mu,shape):
    '''
    The log probability density function (pdf) of gamma distrbution in terms of mean and shape. 

    Parameters
    ----------
    x: array like observation.
    mu: mean or prediction.
    v: variance.
    See Bolker, B. M. (2008). Gamma. In Ecological Models in R (pp. 131–133). Princeton University Press.


    Returns
    -------
    log pdf, :math:`\\mathcal\\ln({p}(x; \\mu,a)) = - a \log{\left(\frac{\mu}{a} \right)} - \frac{a x}{\mu} + \left(a - 1\right) \log{\left(x \right)} - \log{\left(\Gamma\left(a\right) \right)}`
`

    '''

    logpdf_p1= -gammaln(shape)
    logpdf_p2= (shape - 1)*np.log(x)
    logpdf_p3= -shape*np.log(mu/shape)
    logpdf_p4= -shape*x/mu
    logpdf = logpdf_p1+logpdf_p2+logpdf_p3+logpdf_p4
    return logpdf

In [25]:
negloglikli_gamma_mu_a = -log_pdf_mu_a_gamma
negloglikli_gamma_mu_a

a*y/yhat + a*log(yhat/a) - (a - 1)*log(y) + log(gamma(a))

1st derivative of -Loglikelihood (gamma loss) with respect to $\mu$.

In [26]:
gammafirstderv= sym.diff(negloglikli_gamma_mu_a,yhat)
display(gammafirstderv,gammafirstderv.simplify())

-a*y/yhat**2 + a/yhat

a*(-y + yhat)/yhat**2

In [27]:
gammafirstderv=gammafirstderv.simplify()
gammafirstderv.subs(r_eq,r)

-a*r/yhat**2

In [28]:
sym.print_latex(gammafirstderv.simplify())

- \frac{a \left(y - \hat{y}\right)}{\hat{y}^{2}}


2nd derivative of -Loglikelihood (gamma loss) with respect to $\mu$.: 

In [29]:
gammasecderv = sym.diff(gammafirstderv,yhat).simplify()
display(gammasecderv,gammasecderv.simplify())

a*(2*y - yhat)/yhat**3

a*(2*y - yhat)/yhat**3

In [30]:
gammasecderv.subs(r_eq,r)

a*(2*y - yhat)/yhat**3

In [31]:
a*(y+r)/y**3

a*(r + y)/y**3

In [32]:
sym.print_latex(gammasecderv)

\frac{a \left(2 y - \hat{y}\right)}{\hat{y}^{3}}


# 4. Poisson

Probability density function (PDF) of the normal distribution ${p}(n;\lambda) = \frac{e^{-\lambda}\lambda^n}{n!}$.
See Bolker, B. M. (2008). Negative Binomial. In Ecological Models in R (pp. 1222-123). Princeton University Press.

In [33]:
poissonpmf = (sym.E**-yhat*yhat**y)/sym.factorial(y)
poissonpmf 

yhat**y*exp(-yhat)/factorial(y)

In [34]:
poissonpmf.args

(yhat**y, 1/factorial(y), exp(-yhat))

In [35]:
logpmf_p1 = sym.ln(yhat**y)
logpmf_p2 = -yhat
logpmf_p3 = -sym.factorial(y)
logpmf = logpmf_p1+logpmf_p2+logpmf_p3
logpmf

-yhat + log(yhat**y) - factorial(y)

In [36]:
possonloss=-logpmf

In [37]:
possonfirstderv= sym.diff(possonloss,yhat)
display(possonfirstderv,possonfirstderv.simplify())

-y/yhat + 1

(-y + yhat)/yhat

In [38]:
possonfirstderv.simplify().subs(r_eq,r)

-r/yhat

In [39]:
possonsecderv= sym.diff(possonfirstderv,yhat)
display(possonsecderv,possonsecderv.simplify())

y/yhat**2

y/yhat**2

# 5. Negative binomial loss class ##

In [40]:
k = sym.symbols('k')

Probability mass function (PMF) of negative binomial distribution ${p}(x; \mu,k) = \frac{\Gamma \left(k+x\right)}{\Gamma \left(k\right)x!}(\frac{k}{k+\mu})^{k}(\frac{\mu}{k+\mu})^{x}$ 
This definition of the negative binomial distribution is often refered to as negative binomial 2. This parameterisation takes the mean (usually refered as $\mu$, but in pygom $\hat{y}$ as we are looking at a prediction) and $k$ (an overdispersion parameter). The variance = $\mu+\frac{\mu^2}{k}$, some notation uses $\alpha$, ($k=\alpha^{-1}$). 
See Bolker, B. M. (2008). Negative Binomial. In Ecological Models in R (pp. 124–126). Princeton University Press.

In [41]:
nbpmf = (sym.gamma(k+y)/(sym.gamma(k)*sym.factorial(y)))*(k/(k+yhat))**k*(yhat/(k+yhat))**y
nbpmf

(k/(k + yhat))**k*(yhat/(k + yhat))**y*gamma(k + y)/(factorial(y)*gamma(k))

Due to the this PMF containing gamma functions and a factorial it is easier to calculate the sum of it's logged terms than to log it as one object (you end up with infinities otherwise).   

In [42]:
nbpmf.args

((k/(k + yhat))**k,
 (yhat/(k + yhat))**y,
 1/factorial(y),
 1/gamma(k),
 gamma(k + y))

In [43]:
logpmf_p1= k*(sym.ln(k)-sym.ln(k+yhat))
logpmf_p2= y*(sym.ln(yhat)-sym.ln(k+yhat))
logpmf_p3= -sym.ln(sym.factorial(y))
logpmf_p4= -sym.ln(sym.gamma(k))
logpmf_p5= sym.gamma(k+y)
logpmf = logpmf_p1+logpmf_p2+logpmf_p3+logpmf_p4+logpmf_p5
logpmf

k*(log(k) - log(k + yhat)) + y*(log(yhat) - log(k + yhat)) - log(factorial(y)) - log(gamma(k)) + gamma(k + y)

In [44]:
logpmf.args

(-log(factorial(y)),
 -log(gamma(k)),
 k*(log(k) - log(k + yhat)),
 y*(log(yhat) - log(k + yhat)),
 gamma(k + y))

In [45]:
from scipy.special import gammaln
def nb2logpmf(x, mu,k):
    '''
    The log probability mass function (pmf) of Negative Binomial 2 distribution. 

    Parameters
    ----------
    x: array like observation.
    mu: mean or prediction.
    k: overdispersion parameter (variance = mean(1+mean/k)). Note some notation uses $\alpha$, ($k=\alpha^{-1}$).
    See Bolker, B. M. (2008). Negative Binomial. In Ecological Models in R (pp. 124–126). Princeton University Press.

    Returns
    -------
    log pmf:
    math:`\\mathcal\\ln({p}(x; \\mu,k)) = \\ln(\\frac{\\Gamma \\left(k+x\\right)}{\\Gamma \\left(k\\right)x!}(\\frac{k}{k+\\mu})^{k}(\\frac{\\mu}{k+\\mu})^{x})`

    '''
    # note that we input k the overdispersion parameter here


    logpmf_p1= -gammaln(x+1) 
    logpmf_p2= -gammaln(k)
    logpmf_p3= k*(np.log(k) - np.log(k + mu)) 
    logpmf_p4= x*(np.log(mu) - np.log(k + mu))
    logpmf_p5= gammaln(k+x)
    logpmf = logpmf_p1+logpmf_p2+logpmf_p3+logpmf_p4+logpmf_p5
    return logpmf

Our loss function is the negative of the logliklihood above.

In [46]:
negloglikli=-logpmf

1st derivative of -Loglikelihood of negative binomial loss with respect to $\mu$.

In [47]:
nbfirstderv= sym.diff(negloglikli,yhat).simplify()
nbfirstderv

k*(-y + yhat)/(yhat*(k + yhat))

In [48]:
nbfirstderv.subs(r_eq,r)

-k*r/(yhat*(k + yhat))

1st derivative of -Loglikelihood of negative binomial loss with respect to yhat: 
$\frac{k(\mu-y)}{\mu(k + \mu)} $

In [49]:
nbsecderv = sym.diff(nbfirstderv,yhat).simplify()
nbsecderv.simplify()

k*(yhat*(k + yhat) + yhat*(y - yhat) + (k + yhat)*(y - yhat))/(yhat**2*(k + yhat)**2)

2nd derivative of -Loglikelihood of negative binomial loss with respect to yhat: 
$\frac{k(\mu(k + \mu) + \mu(y -\mu) + (k + \mu)(y - \mu)}{\mu^{2}(k + \mu)^{2}} $

In [50]:
nbsecderv_alt=nbsecderv.simplify().subs(r_eq,r)
nbsecderv_alt

k*(r*yhat + r*(k + yhat) + yhat*(k + yhat))/(yhat**2*(k + yhat)**2)

In [51]:
nbsecderv_alt.args

(k, yhat**(-2), (k + yhat)**(-2), r*yhat + r*(k + yhat) + yhat*(k + yhat))