In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pymc as pm
import scipy.stats as sps
import scipy.special as spc
sns.set()

In [None]:
### Demo: Poisson likelihood fitted to a single observed data point
data = [6]
lower, upper = 0, 20

# Uniform prior on the Poisson rate over [lower, upper]; the Poisson node is
# marked as observed and pinned to the data.
lambda_ = pm.Uniform('lambda_', lower, upper)
obs = pm.Poisson('obs', lambda_, value=data, observed=True)

model = pm.Model([lambda_, obs])
mcmc = pm.MCMC(model)
samples = mcmc.sample(iter=50000, burn=10000, thin=2)

# Collect the retained trace of lambda_ into a one-column frame.
post = pd.DataFrame({'lambda': mcmc.trace(lambda_)[:]})

In [None]:
# Summary statistics first, then the leading rows of the posterior lambda samples.
for table in (post.describe(), post.head()):
    print (table)

In [None]:
# Resample lambdas from the posterior (with replacement) and feed them to the
# Poisson sampler to obtain simulated data based on those lambdas.
nr_rows = 100000

rows = np.random.choice(post.index, size=nr_rows, replace=True)
sampled_lambdas = post.iloc[rows, 0]
posterior_samples = pm.rpoisson(sampled_lambdas)
posterior_samples

In [None]:
### HOW DOES MCMC FIT THE LIKELIHOOD TO DATA ?

# Brute-force illustration: try every integer lambda in [lower, upper] and, for
# each one, simulate `nr_tries` data sets from Poisson(lambda). The lambda whose
# simulations reproduce the observed data most often is the best fit.
#
# Stepping through every candidate lambda with the same number of tries plays
# the role of a (uniform) prior.
#
# Notes on this implementation:
# * the candidate is held in `candidate`, NOT `lambda_`, so the PyMC stochastic
#   `lambda_` defined in the model cell is not overwritten (re-running the
#   trace cell after this one keeps working);
# * draws are made one whole row at a time instead of one scalar at a time;
# * with more than one data point a try only counts as a match when every
#   simulated point equals its observed counterpart (the original running row
#   counter overran `out` in that case).

nr_tries = 100000
candidate_lambdas = np.arange(lower, upper + 1)
n_points = len(data)

# One row per candidate lambda; each row holds nr_tries simulated data sets of
# n_points draws each, stored back to back.
out = np.zeros((len(candidate_lambdas), n_points * nr_tries), dtype=int)

for row, candidate in enumerate(candidate_lambdas):
    out[row] = pm.rpoisson(candidate, size=n_points * nr_tries)

# Regroup each row into (try, data point) and require a match on every point.
simulated_sets = out.reshape(len(candidate_lambdas), nr_tries, n_points)
matches = (simulated_sets == np.asarray(data)).all(axis=2)
freq = np.count_nonzero(matches, axis=1)
freq

In [None]:
print (freq)

# Normalise the match counts so they sum to one.
dist = freq / freq.sum()

# Red bars: normalised match frequency per candidate lambda.
# Green histogram: the simulated values drawn from the PyMC posterior.
# (Histogramming post['lambda'] directly was an alternative the author left disabled.)
fig, ax = plt.subplots()
ax.bar(range(len(freq)), dist, color='red', alpha=0.5)
ax.hist(posterior_samples, bins=50, density=True, color='green', alpha=0.5)

In [None]:
from matplotlib.ticker import MaxNLocator

ax = plt.gca()

# x positions are the row indices of `dist`; y is its running total.
lambda_index = np.arange(len(dist)).astype(int)
ax.plot(lambda_index, dist.cumsum(), 'o--')
ax.set_title('Cumulative Probability')
ax.set_xlabel('lambda')
ax.set_ylabel('probability')

# Force integer tick positions on the x axis.
ax.xaxis.set_major_locator(MaxNLocator(integer=True))


In [None]:
#### https://sciencehouse.wordpress.com/2010/06/23/mcmc-and-fitting-models-to-data/
#### Demo of how to build a likelihood function to fit a model to data using Metropolis-Hastings MCMC.
#### The problem is to find the parameter value p of the Binomial distribution that best matches the actual data.
###
# Error function: sum of squared errors between our data and what our model generates,
# scaled by 1 / (2 * sigma**2), where sigma is an estimate of the error of the data.
# A larger sigma therefore gives a SMALLER error and a flatter likelihood.

def X(data,generated,sigma=0.5) :
    """Return the scaled sum of squared errors between `data` and `generated`.

    Computes ``sum((data - generated)**2 / (2 * sigma**2))``.

    Parameters
    ----------
    data : array-like of numbers
        Observed values.
    generated : array-like of numbers
        Model-generated values, broadcastable against `data`.
    sigma : float, default 0.5
        Estimated error (standard deviation) of the data.

    Returns
    -------
    float
        The scaled squared-error sum.

    Notes
    -----
    The previous expression ``... / 2 * sigma ** 2`` was parsed as
    ``(... / 2) * sigma ** 2``, i.e. it multiplied by sigma**2 instead of
    dividing by it. That is why larger sigmas used to blow the error up (and
    overflow the exponential downstream). With the old expression the default
    sigma=0.5 behaved like sigma=2.0 does now; pass ``sigma=2.0`` to reproduce
    earlier results.
    """
    return ((data - generated) ** 2 / (2 * sigma ** 2)).sum()
    

In [None]:
# Synthetic data to fit. Here we KNOW the value of the parameter p we are looking for.
# Observation from the author: the more data points there are, the clearer the
# difference around the peak between PYMC and the hand-rolled sampler becomes -
# possibly because the error function is not operating optimally.

SIZE = 5        # number of data points
N = 6           # number of pulls
true_p = 0.7    # p(success)

data = pm.rbinomial(size=SIZE, p=true_p, n=N)


In [None]:
# The model we want to fit, i.e. the likelihood function, with `param` standing in for p.
def generator(param,size=SIZE):
    """Draw `size` binomial samples with N pulls each and success probability `param`."""
    draws = pm.rbinomial(size=size, p=param, n=N)
    return draws

In [None]:
# The function used by Metropolis-Hastings to determine whether to accept a proposal.
# It uses the error values of the current and proposed parameter, X_current and X_proposed,
# to decide whether to move to the proposed new value for param p.

# It is an exponential form of the quotient proposed/current, that is,
# P(D|proposed_p) / P(D|current_p) == exp(-X_proposed**2 + X_current**2),
# based on the Gaussian likelihood function P(D|param) = exp(-X**2).

def likelihood_ratio(X_current,X_proposed):
    """Return exp(X_current**2 - X_proposed**2), the proposed/current likelihood ratio.

    Parameters
    ----------
    X_current : float
        Error value of the current parameter.
    X_proposed : float
        Error value of the proposed parameter.

    Returns
    -------
    float
        A value > 1 when the proposal has the smaller error. When the exponent
        is too large for a float the result saturates to ``inf`` (the caller
        then always accepts) without emitting an overflow RuntimeWarning.
    """
    # NOTE(review): if the error passed in is already a sum of SQUARED
    # residuals, squaring it again here may be unintended - confirm against
    # the referenced derivation before changing.
    exponent = X_current ** 2 - X_proposed ** 2
    with np.errstate(over='ignore'):
        return np.exp(exponent)



In [None]:
### Metropolis-Hastings MCMC algorithm for fitting binomial data. That is, we are using MCMC to search for
### the parameter p that best matches our data.

steps = 100000       # length of the MCMC random sampling walk
PROPOSAL_SD = 0.1    # standard deviation of the Gaussian random-walk step

walk = np.zeros(steps)          # accepted parameter value at every step
all_proposed = np.zeros(steps)  # raw proposal made at every step (may fall outside [0, 1])
all_current = np.zeros(steps)   # parameter value each proposal started from

walk[0] = 0.6  # initialise the first step with a dummy value to get the walk started

# the random walk
for i in range(1, steps):
    current = walk[i - 1]
    all_current[i] = current

    # Symmetric Gaussian step. The second argument is 1 / sd**2, i.e. pm.rnormal
    # is parameterised by precision rather than standard deviation.
    proposed = current + pm.rnormal(0, 1 / PROPOSAL_SD ** 2)
    all_proposed[i] = proposed

    # p must lie in [0, 1] for the Binomial distribution. A proposal outside
    # that range is simply rejected and the chain stays where it is. Redrawing
    # until the proposal lands inside [0, 1] (the previous behaviour) makes the
    # proposal distribution depend on how close `current` is to a boundary,
    # which breaks the symmetry the plain Metropolis acceptance rule relies on.
    if not (0 <= proposed <= 1):
        walk[i] = current
        continue

    X_current = X(data, generator(current))    # error of current generated data vs real data
    X_proposed = X(data, generator(proposed))  # error of proposed generated data vs real data

    A = likelihood_ratio(X_current, X_proposed)

    # A is the ratio of the likelihoods of the proposed and the current value.
    # If A >= 1 the proposal is always accepted; otherwise it is accepted with
    # probability A, so the smaller the ratio, the smaller the chance of a move.
    if pm.runiform(0, 1) < A:
        walk[i] = proposed  # accept
    else:
        walk[i] = current   # reject: repeat the current value
 


In [None]:
# Trace of the walk: accepted values as a connected line, every proposal as a faint dot.
plt.figure(figsize=(18,12))
plt.title('Metropolis-Hastings random sampling walk,True parameter value:{} nr of data points: {}'.format(true_p,len(data)))
plt.xlabel ('step number')
plt.ylabel('accepted and proposed parameter values')  # label typo ('acceepted') fixed
plt.plot(walk,'o--',label='accepted',color='navy',alpha=0.5)
# Index 0 of all_proposed is never filled by the walk, so it is skipped here.
plt.plot(range(1,len(all_proposed)),all_proposed[1:],'o',color='orange',label='proposed',alpha=0.11)
plt.legend(loc='upper left')

In [None]:
# Relative-frequency histogram of every value visited by the walk.
fig, ax = plt.subplots(figsize=(18,12))

title_template = ('MCMC Metropolis Hastings with Binomial Likelihood'
                  ' True Parameter value: {}\n nr of data points: {} nr of steps: {}')
ax.set_title(title_template.format(true_p, len(data), steps))
ax.set_xlabel('parameter value')
ax.set_ylabel('Relative Frequency')

# Equal weights of 1/len(walk) make the bar heights sum to one.
uniform_weights = np.ones_like(walk) / len(walk)
_ = ax.hist(walk, weights=uniform_weights)

In [None]:
### a look at the basics of a Gaussian
# Evaluate exp(-x^2) on the integers -11..11, plot it and print the sum of the values.
x = np.arange(-11, 12)
y = np.exp(-np.square(x))
plt.plot(x, y)
total = y.sum()
print (total)

In [None]:
# Use PYMC for the same fitting, with a uniform prior on p over [0, 1].

burn = 1   # samples discarded from the start of the chain
thin = 1   # keep every thin-th sample

prior = pm.Uniform('prior',0,1)
# n comes from the shared constant N so this model cannot drift away from the
# one used to generate `data` (it was hard-coded to 6 before).
obs = pm.Binomial('obs',n=N,p=prior,observed=True,value=data)

model = pm.Model([prior,obs])
mcmc = pm.MCMC(model)
# burn and thin were defined above but never handed to the sampler; pass them
# so that changing them actually has an effect.
samples = mcmc.sample(10000, burn=burn, thin=thin)

result = pd.DataFrame({'post_prior' :mcmc.trace(prior)[:]})


In [None]:
# Summary statistics of the PyMC posterior samples; left as the bare last
# expression so the notebook renders the resulting table.
result.describe()

In [None]:

# Overlay the PyMC posterior and the hand-rolled walk, dropping the first
# `burn` steps of the walk.
burn = 10000
fig, ax = plt.subplots(figsize=(18,12))

shared_style = dict(density=True, alpha=0.6)
ax.hist(result.post_prior, label='PYMC', color='blue', **shared_style)
ax.hist(walk[burn:], label='MCMC-hack', color='orange', bins=20, **shared_style)
ax.legend(loc='upper left')