In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal

import jax
import jax.numpy as jnp
from jax import grad, jit, vmap
from jax.scipy.stats.multivariate_normal import logpdf as jlogpdf

# Question 1: Reading and formatting dataset

In [None]:
#1.1 : Read the dataset with pandas
# The file is whitespace-delimited and has no header row, so pandas assigns
# integer column labels. The separator is a raw string because "\s" in a plain
# string literal is an invalid escape sequence and warns on recent Python versions.
dataset = pd.read_csv("GermanCredit.txt", sep=r"\s+", header=None)
dataset.head()

In [None]:
#1.2 : Creating ytrain. For convenience, we will use 0 and 1 as labels
# Subtracting the column minimum (instead of a hard-coded 1) gives the same
# result on the first run and is a no-op on later runs, so re-executing this
# cell cannot shift the labels to -1/0.
# NOTE(review): assumes both classes are present in column 24 - confirm.
dataset[24] = dataset[24] - dataset[24].min()
dataset.head()

In [None]:
# 1.2 Split into train and test set
M = 800  # train set size
d = 24   # label of the target column; feature columns are labelled 0..d-1
length = dataset.shape[0]

# Label-based slicing with .loc is inclusive on both ends, hence M-1 and d-1.
train_rows = dataset.loc[:M-1]
test_rows = dataset.loc[M:]

x_train, y_train = train_rows.loc[:, :d-1], train_rows[d]
x_test, y_test = test_rows.loc[:, :d-1], test_rows[d]

In [None]:
#1.3 Center and scale the features
from sklearn.preprocessing import StandardScaler

In [None]:
#1.3 : scaling xtrain
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply the same
# transformation to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)
x_train.shape

In [None]:
#1.4 Extend them with a column of ones, which is for logistic regression
ones_train = jnp.ones((M, 1))
ones_test = jnp.ones((length-M, 1))
# Only prepend the intercept column while x_train still has exactly d feature
# columns; re-running this cell would otherwise add a second column of ones.
if x_train.shape[1] == d:
    x_train = jnp.concatenate((ones_train, x_train), axis=1)
x_train.shape

In [None]:
# the shape changes to dimension 25
# Only prepend the intercept column while x_test still has exactly d feature
# columns; re-running this cell would otherwise add a second column of ones.
if x_test.shape[1] == d:
    x_test = jnp.concatenate((ones_test, x_test), axis=1)
x_test.shape

In [None]:
#make sure that the values are scaled
# Column-wise mean and standard deviation of the training design matrix.
feature_means = jnp.mean(x_train, axis=0)
feature_stds = jnp.std(x_train, axis=0)
feature_means, feature_stds

# Question 2: Model specification

##### 2.1 : Proof of log-odds ratio
Let us denote $$z= \beta_0 + \sum_{j = 1}^n \beta_j X_j.$$

We are to show that
$$ \log \frac{P(Y=1|\beta)}{1-P(Y=1|\beta)} = z.$$
We can rewrite the left hand side by splitting the fraction of the log
\begin{align*} 
\log \frac{P(Y=1|\beta)}{1-P(Y=1|\beta)} &=  \log \frac{1}{1+e^{-z}} - \log \frac{e^{-z}}{1+e^{-z}}\\ 
 &=   \log 1 - \log e^{-z}\\ 
 &=  - \log e^{-z}\\ 
 &= z
\end{align*}
Note that in the second step we used that the logarithm of each quotient splits into a difference of logarithms, so the two $\log(1+e^{-z})$ terms appear with opposite signs and cancel.

##### 2.2 : Parameters
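We can interpret these parameters as follows: by 2.1, $\beta_0$ is the log-odds of $Y=1$ when all features are zero (i.e. at their training means, since the features were centred in 1.3), and each $\beta_j$ is the change in the log-odds of $Y=1$ per unit increase in feature $X_j$ (one training standard deviation after scaling), holding the other features fixed.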

In [None]:
#2.3 : Decision boundary. TODO: decide whether to state it in terms of the probability (P(Y=1) >= 0.5) or of the linear predictor (z >= 0)

##### 2.4 : Proof of log-likelihood
Let us denote $$z_i= \beta_0 + \sum_{j = 1}^n \beta_j X_{i,j}.$$

If $y_i = 1$, we see that the right hand side is 
\begin{align*} 
\log \frac{1}{1+e^{-z_i}}  &= \log \frac{e^{z_i}}{1+e^{z_i}}\\
&= z_i - \log (1+e^{z_i})\\
&= y_i z_i - \log (1+e^{z_i}).
\end{align*}

If $y_i = 0$, we see that the right hand side is 
\begin{align*} 
\log \frac{e^{-z_i}}{1+e^{-z_i}}  &= \log \frac{1}{1+e^{z_i}}\\
&= 0 - \log (1+e^{z_i})\\
&= y_i z_i - \log (1+e^{z_i}).
\end{align*}

In [None]:
#Implementation - NumPy reference version (this is the one log_posterior calls)
def log_likelihood(beta, x=None, y=None):
    '''
    Bernoulli log-likelihood of a logistic regression model.

    Computes sum_i (y_i * z_i - log(1 + exp(z_i))) with z = x @ beta.

    Input – beta: coefficient vector with one entry per column of x
          – x: design matrix; defaults to the notebook-level x_train
          – y: 0/1 labels, one per row of x; defaults to the notebook-level y_train
    Output – log-likelihood: scalar
    '''
    if x is None:
        x = x_train
    if y is None:
        y = y_train
    x_beta = np.matmul(x, beta)
    # logaddexp(0, z) == log(1 + exp(z)) but does not overflow for large z,
    # where the naive form evaluates exp(z) to inf and the result to -inf.
    return np.sum(y * x_beta - np.logaddexp(0.0, x_beta))

math needed: Is there an error?


##### 2.5 : Jax-compatible log-likelihood

We are asked to calculate
$$\log P(y_1,...,y_m| \beta).$$

By independence of the training data points, this equals
$$\log \prod_{i=1}^m P(y_i| \beta).$$

We can now use the result from previous exercise to see that 

\begin{align*} 
\log \prod_{i=1}^m P(y_i| \beta) &=  \sum_{i=1}^m \log P(y_i| \beta) \\
&= \sum_{i=1}^m \left(y_i z_i - \log (1+e^{z_i})\right)\\
&= \sum_{i=1}^m y_i z_i -  \sum_{i=1}^m \log (1+e^{z_i}).
\end{align*}

In [None]:
#math and speed comparison needed
@jit
def log_likelihood_jax(beta, x=None, y=None):
    '''
    JAX-compatible (jit- and grad-friendly) Bernoulli log-likelihood.

    Computes sum_i (y_i * z_i - log(1 + exp(z_i))) with z = x @ beta.

    Input – beta: coefficient vector with one entry per column of x
          – x: design matrix; defaults to the notebook-level x_train
          – y: 0/1 labels, one per row of x; defaults to the notebook-level y_train
    Output – log-likelihood: scalar

    Note: when x / y are omitted, the notebook-level arrays are read while the
    function is traced, so the compiled function keeps using those values.
    '''
    if x is None:
        x = x_train
    if y is None:
        # Use a plain ndarray for the labels so the product below does not rely
        # on pandas operator dispatch with traced JAX values.
        y = np.asarray(y_train)
    x_beta = jnp.matmul(x, beta)
    # logaddexp(0, z) == log(1 + exp(z)) without overflowing exp(z); the naive
    # form returns -inf (and NaN gradients) once z exceeds the float range.
    return jnp.sum(y * x_beta - jnp.logaddexp(0.0, x_beta))

# log_likelihood_jax is already compiled by the @jit decorator, so wrapping it
# in jit a second time adds nothing; the old name is kept as an alias.
jit_likelihood_jax = log_likelihood_jax

In [None]:
#2.6 : Gradient
# speed comparison needed

# Compiled gradient of log_likelihood_jax with respect to beta (its first argument).
grad_log_likelihood = jax.jit(jax.grad(log_likelihood_jax))

In [None]:
#2.7 : Logprior Function
#speed comparison is needed

DIM = 25
# Prior covariance: Sigma = (pi^2 * M / (3 * DIM)) * (X^T X)^{-1}
gram = jnp.matmul(x_train.T, x_train)
constant = jnp.pi**2 * M / (3*DIM)
Sigma = constant * jnp.linalg.inv(gram)

@jit
def log_prior(beta):
    '''
    Log density of the zero-mean Gaussian prior N(0, Sigma) evaluated at beta.

    Input – beta: a vector of size DIM
    Output – log prior density: scalar
    '''
    zero_mean = jnp.zeros(DIM)
    return jlogpdf(beta, mean=zero_mean, cov=Sigma)

In [None]:
#2.8 : Gradient
#speed comparison is needed
# Compiled gradient of the log prior with respect to beta.
grad_log_prior = jax.jit(jax.grad(log_prior))

In [None]:
# 2.9 : Create the log posterior
def log_posterior(beta):
    '''
    Unnormalized log posterior: log prior plus log-likelihood at beta.

    Input – beta: a vector of size d+1
    Output – log posterior (up to an additive constant): scalar
    '''
    prior_term = log_prior(beta)
    likelihood_term = log_likelihood(beta)
    return prior_term + likelihood_term


In [None]:
# 2.10 : Evaluates the gradient of the unnormalized log-posterior density
 
def grad_log_posterior(beta):
    '''
    Gradient of the unnormalized log posterior at beta, obtained as the sum of
    the log-prior gradient and the log-likelihood gradient.

    Input – beta: a vector of size d+1
    Output – gradient of the log posterior: a vector of size d+1
    '''
    prior_grad = grad_log_prior(beta)
    likelihood_grad = grad_log_likelihood(beta)
    return prior_grad + likelihood_grad

# Section 3

In [None]:
# Q1: independent Metropolis-Hastings
def sample_prior():
    '''Draw one coefficient vector from the N(0, Sigma) prior.'''
    zero_mean = np.zeros(DIM)
    return multivariate_normal.rvs(mean=zero_mean, cov=Sigma)

N = 10000                        # number of MCMC iterations
n_accept = 0                     # accepted-proposal counter
store_beta = np.zeros((N, DIM))  # one row per iteration
current_beta = sample_prior()    # initial state of the chain

In [None]:
#run the loop
# Reset the counter here so that re-running this cell does not keep adding to
# the acceptance count of a previous run.
n_accept = 0

# Log densities of the current state. They only change when a proposal is
# accepted, so they are cached instead of being recomputed every iteration.
log_posterior_current = log_posterior(current_beta)
log_transition_current = log_prior(current_beta)

for n in range(N):
    #sample a proposed state (the proposal distribution is the prior)
    proposed_beta = sample_prior()

    #evaluate posterior density and proposal density at the proposed state
    log_posterior_proposed = log_posterior(proposed_beta)
    log_transition_proposed = log_prior(proposed_beta)

    #log acceptance prob
    log_accept_prob = (log_posterior_proposed + log_transition_current
                       - log_posterior_current - log_transition_proposed)

    #accept or reject
    uniform = np.random.rand() # sample a uniform on [0,1)
    if np.log(uniform) < log_accept_prob:
        current_beta = proposed_beta.copy() #accept
        log_posterior_current = log_posterior_proposed
        log_transition_current = log_transition_proposed
        n_accept += 1

    store_beta[n,:] = current_beta

In [None]:
# Fraction of the N proposals that were accepted (n_accept is incremented on each acceptance).
print("Acceptance rate: ", n_accept/N)

In [None]:
# Trace plot of the first four coefficients of the stored chain.
iteration = np.arange(1, N+1)
plt.figure()
for coord in range(4):
    plt.plot(iteration, store_beta[:, coord])
plt.xlabel('iteration')
plt.ylabel('beta')
plt.show()

In [None]:
#3.2 : Random Walk Metropolis–Hastings algorithm
s = 0.02                          # proposal standard deviation
SIG = jnp.eye(DIM) * s**2         # isotropic proposal covariance
# NOTE(review): rng is not used by the NumPy-based sampler below; kept in case
# other cells rely on it - confirm and remove if not.
rng = jax.random.PRNGKey(0)
n_accept = 0
store_beta = np.zeros((N,DIM))
beta = sample_prior()

# Log posterior of the current state; it only changes on acceptance, so it is
# cached rather than re-evaluated in every iteration.
pi_x = log_posterior(beta)

for n in range(N):
    # symmetric Gaussian random-walk proposal
    epsilon = np.random.multivariate_normal(mean=np.zeros(DIM), cov=SIG)
    proposed_state = beta + epsilon

    pi_y = log_posterior(proposed_state)

    # the proposal is symmetric, so its densities cancel in the ratio
    logacceptprob = float(pi_y - pi_x)

    #accept or reject
    uniform = np.random.rand() # sample a uniform on [0,1)
    if np.log(uniform) < logacceptprob:
        beta = proposed_state.copy() #accept
        pi_x = pi_y
        n_accept += 1
    store_beta[n,:] = beta
    

In [None]:
# Acceptance rate: accepted proposals (n_accept) divided by the number of iterations N.
n_accept/N

In [None]:
# Trace plot of the first four coefficients of the stored chain.
iteration = np.arange(1, N+1)
plt.figure()
for coord in range(4):
    plt.plot(iteration, store_beta[:, coord])
plt.xlabel('iteration')
plt.ylabel('beta')
plt.show()

In [None]:
# Running mean (cumulative sum divided by iteration count) of the first four coefficients.
iteration = np.arange(1, N+1)
plt.figure()
for coord in range(4):
    running_mean = np.cumsum(store_beta[:, coord]) / iteration
    plt.plot(iteration, running_mean)
plt.xlabel('iteration')
plt.ylabel('beta')
plt.show()

In [None]:
# auto-correlation function
from statsmodels.graphics.tsaplots import plot_acf

# plot_acf opens its own figure when no axes are passed, so a bare plt.figure()
# beforehand only leaves an empty figure behind. Draw both ACFs on explicit
# axes instead. The first 2000 iterations are discarded before computing the ACF.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for coord, ax in enumerate(axes):
    plot_acf(store_beta[2000:, coord], lags=30, alpha=None, ax=ax,
             title=f'Autocorrelation of beta[{coord}]')
plt.show()

In [None]:
# 3.3 : Metropolis-adjusted Langevin algorithm
import scipy
s = 0.08                          # Langevin step size
SIG = jnp.eye(DIM) * s**2         # proposal covariance s^2 * I
# NOTE(review): rng is not used by the NumPy-based sampler below; kept in case
# other cells rely on it - confirm and remove if not.
rng = jax.random.PRNGKey(0)
n_accept = 0
store_beta = np.zeros((N,DIM))
beta = sample_prior()

# Log posterior and Langevin proposal mean at the current state. Both only
# change on acceptance, so they are cached across iterations.
pi_x = log_posterior(beta)
mean_x = beta + s**2 / 2 * grad_log_posterior(beta)

for n in range(N):
    epsilon = np.random.multivariate_normal(mean=np.zeros(DIM), cov=SIG)

    # proposal: gradient drift from the current state plus Gaussian noise
    proposed_state = mean_x + epsilon

    pi_y = log_posterior(proposed_state)
    mean_y = proposed_state + s**2 / 2 * grad_log_posterior(proposed_state)

    # q_y = log q(proposed | current), q_x = log q(current | proposed).
    # The proposal is not symmetric, so both densities enter the ratio.
    q_y = jlogpdf(proposed_state, mean=mean_x, cov=SIG)
    q_x = jlogpdf(beta, mean=mean_y, cov=SIG)

    logacceptprob = float(pi_y + q_x - pi_x - q_y)

    #accept or reject
    uniform = np.random.rand() # sample a uniform on [0,1)
    if np.log(uniform) < logacceptprob:
        beta = proposed_state.copy() #accept
        pi_x, mean_x = pi_y, mean_y
        n_accept += 1
    store_beta[n,:] = beta
    

In [None]:
# Acceptance rate: accepted proposals (n_accept) divided by the number of iterations N.
n_accept/N

In [None]:
# Trace plot of the first four coefficients of the stored chain.
iteration = np.arange(1, N+1)
plt.figure()
for coord in range(4):
    plt.plot(iteration, store_beta[:, coord])
plt.xlabel('iteration')
plt.ylabel('beta')
plt.show()

In [None]:
# Running mean (cumulative sum divided by iteration count) of the first four coefficients.
iteration = np.arange(1, N+1)
plt.figure()
for coord in range(4):
    running_mean = np.cumsum(store_beta[:, coord]) / iteration
    plt.plot(iteration, running_mean)
plt.xlabel('iteration')
plt.ylabel('beta')
plt.show()

In [None]:
# auto-correlation function
from statsmodels.graphics.tsaplots import plot_acf

# plot_acf opens its own figure when no axes are passed, so a bare plt.figure()
# beforehand only leaves an empty figure behind. Draw both ACFs on explicit
# axes instead. The first 2000 iterations are discarded before computing the ACF.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for coord, ax in enumerate(axes):
    plot_acf(store_beta[2000:, coord], lags=30, alpha=None, ax=ax,
             title=f'Autocorrelation of beta[{coord}]')
plt.show()

In [None]:
#3.4 : Hamiltonian Monte Carlo algorithm
import numpy as np

s = 0.08                          # passed as the integrator step size below
SIG = s**2 * jnp.eye(DIM)
store_beta = np.zeros((N,DIM))    # one row per iteration
n_accept = 0
beta = sample_prior()             # initial state drawn from the prior

def hamiltonian_dynamics(current_state, current_velocity, stepsize, num_steps, gradlogdensity):
    """Simulate Hamiltonian dynamics with the leapfrog integrator.

    Input – current_state: starting position, a vector
          – current_velocity: starting velocity, same shape as current_state
          – stepsize: leapfrog step size
          – num_steps: number of leapfrog steps; for num_steps <= 0 the inputs
            are returned unchanged
          – gradlogdensity: function returning the gradient of the log target
            density at a position
    Output – (x, v): position and velocity after num_steps leapfrog steps
    """
    x = current_state
    v = current_velocity
    if num_steps <= 0:
        return (x, v)

    # initial half step for the velocity
    v = v + stepsize * np.asarray(gradlogdensity(x)) / 2
    for step in range(num_steps):
        # full step for the position
        x = x + stepsize * v
        # full step for the velocity, except after the last position update
        if step != (num_steps - 1):
            v = v + stepsize * np.asarray(gradlogdensity(x))
    # closing half step for the velocity; it belongs after the loop, not inside
    # the branch above, otherwise every inner step gets 1.5 kicks and the final
    # half step is never applied
    v = v + stepsize * np.asarray(gradlogdensity(x)) / 2
    return (x, v)

In [None]:
# Try out the integrator. `gradlogdensity` is not defined at notebook level,
# so pass the log-posterior gradient defined in 2.10.
# NOTE(review): the initial velocity is set to beta and the number of leapfrog
# steps to N; HMC usually draws a fresh Gaussian velocity and uses a short
# trajectory - confirm the intended values.
hamiltonian_dynamics(beta, beta, s, N, grad_log_posterior)

# Section 4

##### 4.1
Under the integral, we can see the expit function and a probability distribution. To estimate this integral, we sample $N$ points $\{ \beta^{(t)}, \space t \in \{1, .. , N\} \space \}$ from the distribution $P(\beta | y_1,..,y_m)$ and then add the obtained values $$\text{expit}(z^{(t)})$$ where 
$$z^{(t)}= \beta_0^{(t)} + \sum_{j = 1}^n \beta_j^{(t)} X_{j}.$$
We give every point $\beta^{(t)}$ a weight of $1/N$ so that the total weight is 1 and equals the integral of the probability function. This way, we obtain the method of the project.

In [None]:
#4.2 : Approximated predictive probabilities

In [None]:
#4.3 : Prediction rule

In [None]:
#4.4 : Misclassification rate

In [None]:
# 4.5 : Cost function

def adjust(y_test, y_pred):
    '''
    Per-sample misclassification cost.

    Cost is 5 where the true label is 0 and the prediction is 1,
    1 where the true label is 1 and the prediction is 0, and 0 otherwise.

    Input – y_test: vector of true labels
          – y_pred: vector of predicted labels, same shape as y_test
    Output – vector of costs, one per sample
    '''
    y_true = np.asarray(y_test)
    y_hat = np.asarray(y_pred)
    # Vectorised form of the cost table; unlike np.vectorize over a lambda it
    # also handles empty inputs.
    return np.where((y_true == 0) & (y_hat == 1), 5,
                    np.where((y_true == 1) & (y_hat == 0), 1, 0))


def average_cost(y_test):
    '''
    Average of the entries of y_test.

    Input – y_test: vector of values to average
            (presumably the per-sample costs returned by adjust – confirm)
    Output – average cost: scalar
    '''
    # The name and documented output are the average, so return the mean
    # rather than the sum of the entries.
    return jnp.mean(y_test)

In [None]:
#4.6 : Maximum Likelihood Estimator

In [None]:
#4.7 : Misclassification rate

In [None]:
#4.8 : Prediction accuracy