# Logistic regression

In [None]:
import sys
sys.path.append('../../Utilities/src')
sys.path.append('../../Utilities')

import pystan
import stan_utility

import arviz as az
import numpy as np
import scipy.stats as stats

import pandas as pd


In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
# Apply the seaborn "white" look to every figure in the notebook.
# `plt.style.context(...)` only builds a context manager; called outside a
# `with` statement it changes nothing, so `plt.style.use` is what is needed.
# The style was renamed to 'seaborn-v0_8-white' in newer matplotlib releases,
# so pick whichever name this installation provides (and skip it if neither
# is available rather than failing).
_white_style = next(
    (name for name in ('seaborn-v0_8-white', 'seaborn-white')
     if name in plt.style.available),
    None,
)
if _white_style is not None:
    plt.style.use(_white_style)
# Set after the style so the resolution is not overridden by it.
mpl.rcParams['figure.dpi'] = 200

In [None]:
from DA_tools.DA_tools import ribbon_plot
from DA_tools.DA_colors import *


In [None]:
# Load the data set; the first CSV column becomes the row index.
data = pd.read_csv('log_reg_data.csv',index_col=0)

In [None]:
# Preview the first rows of the table.
data.head()

In [None]:
# Per-column summary statistics.
data.describe()

### Prior selection


In [None]:
# Show the Stan program used for the first prior predictive check, then hand
# the same file to the stan_utility helper to build the model object.
ppc1_stan_path = 'logistic_regression_ppc.stan'
with open(ppc1_stan_path, 'r') as stan_file:
    stan_code = stan_file.read()
print(stan_code)
model_ppc1 = stan_utility.compile_model(ppc1_stan_path)

In [None]:
# Show the Stan program used for the second prior predictive check, then hand
# the same file to the stan_utility helper to build the model object.
ppc2_stan_path = 'logistic_regression_ppc2.stan'
with open(ppc2_stan_path, 'r') as stan_file:
    stan_code = stan_file.read()
print(stan_code)
model_ppc2 = stan_utility.compile_model(ppc2_stan_path)

In [None]:
# Number of iterations requested from each simulation (also used as the
# progress-refresh interval, so progress is reported once per run).
R = 1000

# Inputs for the prior predictive models: N rows, the first five columns of
# the table as the design matrix X (hence M = 5), and the prior scale sigma.
data_ppc = {
    'N': data.shape[0],
    'M': 5,
    'X': data.iloc[:, 0:5],
    'sigma': 10,
}

# Both models are run with identical sampler settings: a single chain, no
# warmup, the 'Fixed_param' algorithm and a fixed seed.
ppc_sampler_args = dict(
    iter=R,
    warmup=0,
    chains=1,
    refresh=R,
    algorithm='Fixed_param',
    seed=29042020,
)
sim_ppc1 = model_ppc1.sampling(data=data_ppc, **ppc_sampler_args)
sim_ppc2 = model_ppc2.sampling(data=data_ppc, **ppc_sampler_args)

In [None]:
# Grid of prior predictive histograms: one row per prior scale, left column
# for model_ppc1 (titled Normal), right column for model_ppc2 (titled t_5).
fig, axes = plt.subplots(3, 2, figsize=(7, 6), sharex=True)
sigmas = [10, 2, 0.75]

# Settings shared by every simulation and every histogram in the grid.
fixed_param_args = dict(iter=R, warmup=0, chains=1, refresh=R,
                        algorithm='Fixed_param', seed=29042020)
hist_style = dict(bins=100, color=DARK, edgecolor=DARK_HIGHLIGHT, density=True)

for k, sigma in enumerate(sigmas):
    # Re-run both models with the current prior scale.
    data_ppc['sigma'] = sigma
    sim_ppc1 = model_ppc1.sampling(data=data_ppc, **fixed_param_args)
    sim_ppc2 = model_ppc2.sampling(data=data_ppc, **fixed_param_args)

    row_panels = (
        (axes[k, 0], sim_ppc1, r'$\beta\sim Normal(0,{})$'),
        (axes[k, 1], sim_ppc2, r'$\beta\sim t_5(0,{})$'),
    )
    for ax, sim, title_template in row_panels:
        # Pool every simulated prob_ppc value into a single histogram.
        ax.hist(sim.extract()['prob_ppc'].flatten(), **hist_style)
        ax.set_yticks([])
        ax.set_title(title_template.format(sigma))

# Only the bottom row carries the shared x-axis label.
for ax in axes[2]:
    ax.set_xlabel(r'$\theta$')

fig.tight_layout()
plt.show()

### Posterior inference and simulation

In [None]:
# Display the Stan program of the logistic regression model.
with open('logistic_regression.stan', 'r') as stan_file:
    stan_code = stan_file.read()
print(stan_code)

In [None]:
# Build the model object from 'logistic_regression.stan' via the stan_utility helper.
model = stan_utility.compile_model('logistic_regression.stan')

In [None]:
# Inputs passed to the Stan model: N rows, the first five columns of the
# table as X (hence M = 5), the outcomes y, and the `hand` column together
# with N_hand = 2.
data_dict = {
    'N': data.shape[0],
    'M': 5,
    'X': data.iloc[:, 0:5],
    'y': data.y.values,
    'N_hand': 2,
    'hand': data.hand,
}
# The seed is fixed so the run can be repeated.
fit = model.sampling(data=data_dict, seed=4938483)

In [None]:
# Pull the draws out of the fit and place them side by side in one matrix:
# the beta columns first, then alpha as the last column.
params1 = fit.extract()
alpha_column = params1['alpha'][:, np.newaxis]
pars_mat = np.hstack((params1['beta'], alpha_column))

In [None]:
# One histogram per column of pars_mat, laid out on a 2 x 3 grid in the
# order given by names_of_pars.
fig, axes = plt.subplots(2, 3, figsize=(7, 6))
axes_flat = axes.flatten()
names_of_pars = [r'$\beta_1$',r'$\beta_2$',r'$\beta_3$',r'$\beta_4$',r'$\beta_5$',r'$\alpha$']
hist_style = dict(bins=20, color=DARK, edgecolor=DARK_HIGHLIGHT, density=True)
for k, ax in enumerate(axes_flat):
    ax.hist(pars_mat[:, k], **hist_style)
    ax.set_title(names_of_pars[k])
    # Density heights are not of interest here, only the shapes.
    ax.set_yticks([])
fig.tight_layout()

plt.show()

### Estimation of group parameters

In [None]:
# Histogram bins of width 1/counts whose first edge lies half a bin below 0,
# so that the multiples of 1/counts fall at bin centres.
counts = data['y'].shape[0]
bin_delta = 1.0 / counts
half_bin = 0.5 * bin_delta
bins = np.arange(-half_bin, 1 + 1.5 * bin_delta, bin_delta)



In [None]:
# Posterior predictive check for the first model: histogram of each
# replicated proportion with the observed value marked by a dashed line.
# The three panels share all drawing code, so they are described as data
# and drawn in one loop instead of three copy-pasted sections.
fig, axes = plt.subplots(1, 3, figsize=(7, 3))
ax1, ax2, ax3 = axes

# Observed proportions: overall, and for the rows with hand == 1 / hand == 2
# (plotted under the "Left" / "Right" titles respectively).
group_mean = data.y.mean()
left_mean = data[data.hand==1].y.mean()
right_mean = data[data.hand==2].y.mean()

# (panel title, key of the replicated statistic in params1, observed value)
ppc_panels = [
    ("Aggregate PPC", 'p_hat_ppc', group_mean),
    ("Left PPC", 'p_hat_left_ppc', left_mean),
    ("Right PPC", 'p_hat_right_ppc', right_mean),
]

for ax, (title, key, observed) in zip(axes, ppc_panels):
    h_counts = ax.hist(params1[key], bins=bins, color=DARK,
                       edgecolor=DARK_HIGHLIGHT, density=True, zorder=0)
    # Tallest bar; the axis and the marker line extend one unit above it.
    y_max = np.max(h_counts[0])
    ax.vlines(observed, 0, y_max+1, color='black', linestyle='--',
              linewidth=1.5, zorder=2)
    ax.set_title(title)
    ax.set_xlim([-bin_delta, 1 + bin_delta])
    ax.set_ylim([0, y_max + 1])
    # Ticks only at the ends of the unit interval and at the observed value.
    ax.set_xticks([0, observed, 1])
    ax.set_xticklabels([0, '{0:1.2f}'.format(observed), 1])
    ax.set_yticks([])

fig.tight_layout()

plt.show()

### Individual predictions

In [None]:
# Quartiles (25%, 50%, 75%) of prob_ppc computed along axis 0
# (presumably the posterior draws -- confirm against the Stan output),
# giving one (lo, med, hi) row per remaining entry.
med_prob_ppc = np.percentile(params1['prob_ppc'],[25,50,75],axis=0)
estimate_ppc_df = pd.DataFrame(med_prob_ppc.T,columns = ['lo','med','hi'])

# Attach the outcomes by position. The frame above has a fresh 0..N-1 index
# while `data` is indexed by its first CSV column, so assigning the Series
# itself would align on labels and could silently shift outcomes or insert
# NaN whenever the two indexes differ.
estimate_ppc_df['y'] = data.y.values
estimate_ppc_df.sort_values(by='med',inplace=True)

# Asymmetric error-bar lengths, shape (2, N): first row is the distance from
# the median down to 'lo', second row the distance up to 'hi'.
error_bar=np.array((
                    (estimate_ppc_df['med']-estimate_ppc_df['lo']).values,
                    (estimate_ppc_df['hi']-estimate_ppc_df['med']).values))


In [None]:
# Median predicted probability per row (sorted by median) with error bars
# given by error_bar; each point is shaded according to its y value.
fig, axes = plt.subplots(1,1, figsize=(7, 4),sharex=True)

# x positions follow the sorted order. Derived from the table length so the
# plot does not depend on the data set having exactly 500 rows.
x_positions = np.arange(len(estimate_ppc_df))
# Grey level taken from y: (0, 0, 0) for y = 0 up to (1, 1, 1) for y = 1.
point_colors = [(1.*k,1.*k,1.*k) for k in estimate_ppc_df.y]
lo_min = estimate_ppc_df['lo'].min()
hi_max = estimate_ppc_df['hi'].max()

axes.scatter(x_positions,estimate_ppc_df.med,marker='.',c=point_colors)
axes.errorbar(x_positions,estimate_ppc_df.med,yerr=error_bar,fmt='none',zorder=0,ecolor=DARK)
axes.set_title('Probability of individual outcome is not well captured')
axes.set_ylabel(r'$\theta_i$',rotation=0)
# Only the extremes of the plotted range are labelled on the y axis.
axes.set_yticks([lo_min,hi_max])
axes.set_yticklabels(['{0:1.2}'.format(lo_min),'{0:1.2}'.format(hi_max)])
axes.set_xlabel('i (sorted)')
plt.show()

### Introduction of grouping

In [None]:
# Show the Stan program of the grouped model, then hand the same file to the
# stan_utility helper to build the model object.
grouped_stan_path = 'grouped_logistic_regression.stan'
with open(grouped_stan_path, 'r') as stan_file:
    stan_code = stan_file.read()
print(stan_code)
model2 = stan_utility.compile_model(grouped_stan_path)

In [None]:
# Sample from model2 using data_dict, with a fixed seed.
fit2 = model2.sampling(data=data_dict, seed=4938483)

In [None]:
# Pull the draws out of the grouped fit and place the beta columns and the
# alpha columns side by side in one matrix.
params2 = fit2.extract()
beta_draws = params2['beta']
alpha_draws = params2['alpha']
pars_mat2 = np.hstack((beta_draws, alpha_draws))



In [None]:
# One histogram per column of pars_mat2 on a 3 x 3 grid: the first five
# panels, then the first two panels of the bottom row. The last panel of
# rows two and three is unused and removed from the figure.
fig, axes = plt.subplots(3, 3, figsize=(7, 6))
axes_flat = axes.flatten()
# Flattened positions 5 and 8 are the panels that get deleted below.
axes_sort = axes_flat[[0, 1, 2, 3, 4, 6, 7]]
names_of_pars = [r'$\beta_1$',r'$\beta_2$',r'$\beta_3$',r'$\beta_4$',r'$\beta_5$',r'$\alpha_\mathrm{left}$', r'$\alpha_\mathrm{right}$']
hist_style = dict(bins=20, color=DARK, edgecolor=DARK_HIGHLIGHT, density=True)
for k, column in enumerate(pars_mat2.T):
    ax = axes_sort[k]
    ax.hist(column, **hist_style)
    ax.set_title(names_of_pars[k])
    ax.set_yticks([])
for unused_ax in (axes[1, 2], axes[2, 2]):
    fig.delaxes(unused_ax)

fig.tight_layout()

plt.show()

In [None]:
# Posterior predictive check for the grouped model: histogram of each
# replicated proportion with the observed value marked by a dashed line.
# The three panels share all drawing code, so they are described as data
# and drawn in one loop instead of three copy-pasted sections.
fig, axes = plt.subplots(1, 3, figsize=(7, 3))
ax1, ax2, ax3 = axes

# Observed proportions: overall, and for the rows with hand == 1 / hand == 2
# (plotted under the "Left" / "Right" titles respectively).
group_mean = data.y.mean()
left_mean = data[data.hand==1].y.mean()
right_mean = data[data.hand==2].y.mean()

# (panel title, key of the replicated statistic in params2, observed value)
ppc_panels = [
    ("Aggregate PPC", 'p_hat_ppc', group_mean),
    ("Left PPC", 'p_hat_left_ppc', left_mean),
    ("Right PPC", 'p_hat_right_ppc', right_mean),
]

for ax, (title, key, observed) in zip(axes, ppc_panels):
    h_counts = ax.hist(params2[key], bins=bins, color=DARK,
                       edgecolor=DARK_HIGHLIGHT, density=True, zorder=0)
    # Tallest bar; the axis and the marker line extend one unit above it.
    y_max = np.max(h_counts[0])
    ax.vlines(observed, 0, y_max+1, color='black', linestyle='--',
              linewidth=1.5, zorder=2)
    ax.set_title(title)
    ax.set_xlim([-bin_delta, 1 + bin_delta])
    ax.set_ylim([0, y_max + 1])
    # Ticks only at the ends of the unit interval and at the observed value.
    ax.set_xticks([0, observed, 1])
    ax.set_xticklabels([0, '{0:1.2f}'.format(observed), 1])
    ax.set_yticks([])

fig.tight_layout()

plt.show()

In [None]:
# Quartiles (25%, 50%, 75%) of the grouped model's prob_ppc computed along
# axis 0 (presumably the posterior draws -- confirm against the Stan output),
# giving one (lo, med, hi) row per remaining entry.
med_prob_ppc = np.percentile(params2['prob_ppc'],[25,50,75],axis=0)
estimate_ppc_df = pd.DataFrame(med_prob_ppc.T,columns = ['lo','med','hi'])

# Attach the outcomes by position. The frame above has a fresh 0..N-1 index
# while `data` is indexed by its first CSV column, so assigning the Series
# itself would align on labels and could silently shift outcomes or insert
# NaN whenever the two indexes differ.
estimate_ppc_df['y'] = data.y.values
estimate_ppc_df.sort_values(by='med',inplace=True)

# Asymmetric error-bar lengths, shape (2, N): first row is the distance from
# the median down to 'lo', second row the distance up to 'hi'.
error_bar=np.array((
                    (estimate_ppc_df['med']-estimate_ppc_df['lo']).values,
                    (estimate_ppc_df['hi']-estimate_ppc_df['med']).values))


In [None]:
# Median predicted probability per row (sorted by median) for the grouped
# model, with error bars given by error_bar; each point is shaded according
# to its y value.
fig, axes = plt.subplots(1,1, figsize=(7, 4),sharex=True)

# x positions follow the sorted order. Derived from the table length so the
# plot does not depend on the data set having exactly 500 rows.
x_positions = np.arange(len(estimate_ppc_df))
# Grey level taken from y: (0, 0, 0) for y = 0 up to (1, 1, 1) for y = 1.
point_colors = [(1.*k,1.*k,1.*k) for k in estimate_ppc_df.y]
lo_min = estimate_ppc_df['lo'].min()
hi_max = estimate_ppc_df['hi'].max()

axes.scatter(x_positions,estimate_ppc_df.med,marker='.',c=point_colors)
axes.errorbar(x_positions,estimate_ppc_df.med,yerr=error_bar,fmt='none',zorder=0,ecolor=DARK)
axes.set_title('Grouping by handness improves individual predictions')
axes.set_ylabel(r'$\theta_i$',rotation=0)
# NOTE(review): 0.37 is a hand-picked reference tick between the extremes;
# where the value comes from is not visible in this notebook -- confirm.
axes.set_yticks([lo_min,.37,hi_max])
axes.set_yticklabels(['{0:1.2}'.format(lo_min),'0.37','{0:1.2}'.format(hi_max)])
axes.set_xlabel('i (sorted)')
plt.show()