In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pymc3 as pm
import arviz as az

sns.set()

### Data-set selection: these two flags decide which data the cells below process.
### - US_states = True        -> the US-state cells run and build `us_df`.
### - World_countries = True  -> the world cells run and build `world_facts`.
### - both False              -> the generic section loads 'spurious_correlation.pkl' instead.
### If both are True, the world data wins, because it is assigned to `xy` last.

### Historical note: from 2020-04-23 the US-state data stopped working because CSSE
### changed their format; this was fixed some time in early May.

US_states = False

### NOTE: create_world_df_for_regression MUST be run first
### (NOTE(review): presumably it produces 'world_facts.pkl', which is read below - confirm).
World_countries = True
###
### 

In [None]:
#### US STATES SPECIFIC FRAME ####

if US_states:

    title = 'US States'
    # Mapping of state name -> per-state timeline DataFrame.
    # NOTE(review): read_pickle executes arbitrary code on load; only use trusted files.
    dick = pd.read_pickle('us_state_dict.pkl')

    # Build every per-state frame first and concatenate once; growing a
    # DataFrame with concat inside the loop is quadratic in the number of states.
    state_frames = [
        pd.DataFrame(data=timeline,index=timeline.index).reset_index().assign(state=state)
        for state,timeline in dick.items()
    ]
    states_timeline = pd.concat(state_frames)

    # Negated density scaled by ONE random lognormal draw (a scalar), used to
    # test a regression of a variable on (a transform of) itself.
    states_timeline['density2'] = states_timeline['density'].copy() * -1 * np.random.lognormal(0.1,0.01)
    states_timeline['conf_log'] = np.log10(states_timeline['confirmed'])
    states_timeline['dead_log'] = np.log10(states_timeline['deceased'])
    states_timeline['density_log'] = np.log10(states_timeline['density'])

    # log10(0) yields -inf (not +inf), so both signs must be mapped to NaN
    # for dropna() to actually remove the zero-count rows.
    states_timeline.replace([np.inf,-np.inf],np.nan,inplace=True)
    states_timeline.dropna(inplace=True)

    # NOTE(review): a bare expression nested in an if-block is not rendered by
    # Jupyter; use display(states_timeline.head()) if output is wanted.
    states_timeline

In [None]:
# Load the lookup tables needed by the selected data-set.
if US_states:
    # Columns 0 and 3 of the CSV become the index ('state') and the label column ('abb').
    us_abb = pd.read_csv('us_states.csv',sep=';',header=None,index_col=0,usecols=[0,3],names=['state','abb'])


if World_countries:
    
    # Population per country; ',' is parsed as the thousands separator.
    world_pop = pd.read_csv('world_pop.csv',sep=';',header=None,thousands=',',index_col=0,names=['pop'])
    
    # Tab-separated latitude/longitude table, indexed by its 4th column.
    world_lat_lon = pd.read_csv('world_lat_lon.csv',sep='\t',index_col=3)
    
    world_facts = pd.read_pickle('world_facts.pkl')
    # The name `us_abb` is reused for countries so the plotting code can look up
    # text labels the same way for both data-sets.
    us_abb = pd.DataFrame(world_facts['abb'])
    world_facts['density_log'] = np.log10(world_facts['density'])
    world_facts = world_facts.join(world_lat_lon['lat'])
    # Absolute latitude: distance from the equator regardless of hemisphere.
    world_facts['lat'] = np.abs(world_facts['lat'])
    world_facts['lat_log'] = np.log10(world_facts['lat'])
    world_facts = world_facts.join(world_pop)
    # Deaths per million inhabitants.
    world_facts['dead_per_M'] = world_facts['deceased'] /(world_facts['pop'] / 1e6)
    # NOTE(review): log10 of 0 is -inf; rows with zero deaths (or lat == 0 above)
    # carry -inf into later steps unless filtered - confirm upstream filtering.
    world_facts['dead_per_M_log'] = np.log10(world_facts['dead_per_M'])
    print (world_facts.loc['Sweden'])
    print (world_facts.describe())


In [None]:
#world_facts.loc[world_facts['abb'] == 'KE']

In [None]:
#### US STATES SPECIFIC FRAME ####
# Collect the most recent row of every state timeline into one frame `df`.
if US_states:
    df = pd.DataFrame()

    def get_latest_numbers(df,statename):
        """Append the last row of `dick[statename]` to `df` and return the new frame.

        The row is tagged with a 'state' column holding `statename`. The passed-in
        `df` is not modified; a new concatenated frame is returned.
        """
        state = dick[statename]
        # iloc[-1] is a Series; wrap and transpose to get a one-row DataFrame.
        last_row = pd.DataFrame(state.iloc[-1]).T
        last_row['state'] = statename
        df = pd.concat([df,last_row],axis=0)
        return df

    # Only the keys are used; the timeline itself is looked up inside the helper.
    for key,data in dick.items():
        df = get_latest_numbers(df,key)
    
    #df.sort_values('dead_per_M',inplace=True,ascending=False)
    # NOTE(review): nested in an if-block, so this head() is not displayed by Jupyter.
    df.head(40)
    #df.to_pickle('US_states_latest_day.pkl')


In [None]:


# Median age per US state: columns 1 and 2 of a tab-separated file with no header row;
# column 1 becomes the index ('state').
# NOTE(review): this cell runs even when US_states is False, so 'us_states_age.csv'
# must exist for a Run-All in world mode as well.
us_age = pd.read_csv('us_states_age.csv',sep=r'\t',header=None,engine='python',usecols=[1,2],
                     names=['state','median_age'],index_col=0)

# Strip surrounding whitespace from the state names so they match other frames' indexes in joins.
us_age.index = us_age.index.str.strip()
us_age

In [None]:
# Build the per-state regression frame `us_df` from the latest-row frame `df`.
if US_states:
    #us_df = pd.read_pickle('US_states_latest_day.pkl')
    # NOTE: this is an alias, not a copy - the column assignments below also modify `df`.
    us_df = df
    us_df['gdp_per_capita'] = us_df['gdp']
    # Negated density scaled by one random lognormal draw (a scalar).
    us_df['density2'] = us_df['density'].copy() * -1 * np.random.lognormal(0.1,0.01) ## to test regress on itself
    us_df['conf_log'] = np.log10(us_df['confirmed'])
    us_df['dead_log'] = np.log10(us_df['deceased'])
    us_df['density_log'] = np.log10(us_df['density'])
    us_df['gdp_log'] = np.log10(us_df['gdp'])
    us_df['dead_per_M_log'] = np.log10(us_df['dead_per_M'])
    
    # Index by state name so the joins below align on it.
    us_df.set_index('state',inplace=True)
    us_df = us_df.join(us_age)
    
    lat_lon = pd.read_csv('us_lat_lon.csv',sep=';',index_col=0)
    us_df = us_df.join(lat_lon)
    us_df['lat_log'] = np.log10(us_df['lat'])
    print(us_df)
    



In [None]:
#### US STATES SPECIFIC FRAME ####
# Summary statistics of the per-state regression frame.
if US_states:
    print (us_df.describe())

In [None]:

def regression(x1,x2,y):
    """Fit a Bayesian linear regression with PyMC3 and return the posterior draws.

    The linear predictor depends on the notebook-level flag `multiple_regression`
    (read as a global, not passed in):
      * True  -> x1 * beta1 + x2 * beta2 + alpha
      * False -> x1 * beta1 + alpha   (x2 is ignored in the predictor)

    Parameters
    ----------
    x1, x2 : objects exposing `.values` (the notebook passes pandas Series)
        Predictor columns; the notebook passes standardized columns.
    y : passed as `observed=` to the Normal likelihood.

    Returns
    -------
    pandas.DataFrame
        One row per posterior draw with columns 'beta1_mu_post',
        'beta1_sigma_post', 'obs_sigma_post', 'alpha_post', 'beta1_post' and,
        when `multiple_regression` is True, also 'beta2_mu_post',
        'beta2_sigma_post' and 'beta2_post'.

    Side effects: draws an ArviZ trace plot and prints the ArviZ summary table.
    """
    
    model = pm.Model()
    with model:
    
        # Hyper-priors for the slope of predictor 1.
        beta1_mu = pm.Uniform('beta1_mu',-0.1,0.1)
        beta1_sigma = pm.Uniform('beta1_sigma',0,0.5)

        # Hyper-priors for the slope of predictor 2. These are created even when
        # multiple_regression is False, so they are still part of the model then.
        beta2_mu = pm.Uniform('beta2_mu',-0.1,0.1)
        beta2_sigma = pm.Uniform('beta2_sigma',0,0.5)

        # Observation noise.
        obs_sigma = pm.Uniform('obs_sigma',0,0.5)

        # Intercept.
        alpha = pm.Uniform('alpha',-1,1)

        beta1 = pm.Normal('beta1',mu=beta1_mu, sd=beta1_sigma)
        beta2 = pm.Normal('beta2',mu=beta2_mu, sd=beta2_sigma )


        if multiple_regression:
            
            linreq = x1.values * beta1 + x2.values*beta2 + alpha

        else:
            
            linreq = x1.values * beta1 + alpha

        # Likelihood; the variable is registered with the model by construction.
        obs = pm.Normal('obs',mu=linreq,sd=obs_sigma,observed=y)

        step1 = pm.Metropolis()
        
        # 10000 draws after 10000 tuning steps, using the Metropolis step method.
        trace = pm.sample(10000,step=step1,tune=10000)

        beta1_mu_post = trace['beta1_mu'][:]
        beta1_sigma_post = trace['beta1_sigma'][:]

        if multiple_regression:
            beta2_mu_post = trace['beta2_mu'][:]
            beta2_sigma_post = trace['beta2_sigma'][:]

        obs_sigma_post = trace['obs_sigma'][:]

        alpha_post = trace['alpha'][:]

        beta1_post = trace['beta1'][:]


        if multiple_regression:

            beta2_post = trace['beta2'][:]

            

            result = pd.DataFrame({'beta1_mu_post' : beta1_mu_post,
                                  'beta1_sigma_post' : beta1_sigma_post,
                                  'beta2_mu_post' : beta2_mu_post,
                                  'beta2_sigma_post' : beta2_sigma_post,
                                  'obs_sigma_post' : obs_sigma_post,
                                  'alpha_post' : alpha_post,
                                  'beta1_post' : beta1_post,
                                  'beta2_post' : beta2_post})

        else:
            result = pd.DataFrame({'beta1_mu_post' : beta1_mu_post,
                                  'beta1_sigma_post' : beta1_sigma_post,
                                  'obs_sigma_post' : obs_sigma_post,
                                  'alpha_post' : alpha_post,
                                  'beta1_post' : beta1_post})
        # Diagnostics: trace plot plus a summary table rounded to two decimals.
        _= az.plot_trace(trace)
            
        print (az.summary(trace,round_to=2))
            
        
        return result







In [None]:
# Display the world frame. NOTE(review): `world_facts` only exists when World_countries is True.
world_facts

In [None]:
# Three-letter country codes: columns 0 and 2 of the CSV become the index ('country') and 'abb3'.
# 'abb3' is later used as the join key for the Oxford stringency data.
abb = pd.read_csv('three_letter_country_abb.csv',sep=';',header=None,index_col=0,usecols=[0,2],names=['country','abb3'])
world_facts = world_facts.join(abb)

In [None]:
# Date window requested from the Oxford COVID-19 government response tracker API.
start_date = '2020-03-01'
end_date = '2020-07-25'

url = 'https://covidtrackerapi.bsg.ox.ac.uk/api/v2/stringency/date-range/{}/{}'.format(start_date,end_date)

# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
import requests
# 5-second timeout so an unreachable server does not hang the notebook.
r = requests.get(url,timeout=5.0)
# Displayed for a manual check. NOTE(review): the status is not enforced;
# later cells call r.json() regardless of the value shown here.
r.status_code

In [None]:
# One timestamp per calendar day in the requested window; used to walk the API payload by date.
keys = pd.date_range(start_date,end_date)
keys

In [None]:
# Decode the API response and pull out the two top-level entries used below.
# NOTE(review): the name `json` would shadow the standard-library module if it were imported.
json = r.json()
# Indexed below as data[date][country] -> record with 'confirmed', 'deaths', 'stringency'.
data = json['data']
# Iterated below to obtain the country keys looked up in `data`.
countries = json['countries']

In [None]:
# Flatten the nested API payload into (date, country, confirmed, deaths, stringency) tuples.
data_list = []

for k in keys:
    date = k.date().strftime('%Y-%m-%d')
    for c in countries:
        try:
            entry = data[date][c]
            row = (date,c,entry['confirmed'],entry['deaths'],entry['stringency'])
        except KeyError:
            # No record for this date/country (or a field is missing): skip it.
            continue
        data_list.append(row)


In [None]:
# Long-format frame of the API records, indexed by (country, date).
ox_df = pd.DataFrame(data_list,columns=['date','country','confirmed','dead','oxford_stringency'])
ox_df['date'] = pd.to_datetime(ox_df['date'])
ox_df = ox_df.set_index(['country','date'])
print (ox_df.head())

In [None]:
# Mean stringency per country over the whole date window (index level 0 is 'country').
# NOTE(review): this overwrites `ox_df` with a Series, so the cell cannot be re-run
# without first re-running the cell that builds the frame.
ox_df = ox_df.groupby(level=0).mean()['oxford_stringency']
ox_df

In [None]:
# Attach the mean stringency by matching each row's 'abb3' value against the index of `ox_df`.
world_facts = world_facts.join(ox_df,on='abb3')


In [None]:
# Keep only the countries for which a stringency value was found.
world_facts = world_facts.loc[~world_facts['oxford_stringency'].isna()]
world_facts

In [None]:
#################################### HERE STARTS THE GENERIC BIT #####################################

# Column names of: independent variable 1, independent variable 2, dependent variable.
x1_param = x1_param_org = 'oxford_stringency'
x2_param = x2_param_org = 'lat'
y_param = y_param_org = 'dead_per_M_log'

# Column names to use instead when running on the spurious-correlation demo data-set:
#x1_param = 'x_real'
#x2_param = 'x_spurious'
#y_param = 'y'

# Read as a global by regression(), plot() and plot_betas().
multiple_regression = True

# Rows whose standardized value exceeds this many standard deviations are dropped later.
outlier = 10

### REPLACE THE ASSIGNMENT TO xy BELOW WITH THE DATAFRAME YOU WANT TO RUN REGRESSION ON ###
### AND CHANGE x1_param,x2_param,y_param above to correct column names and set US_states in top frame to True/False ###

if US_states:
    xy = us_df[[x1_param,x2_param,y_param]].copy()
    title = 'US states'

if World_countries:
    # .copy() (as in the US branch) so the in-place dropna and the column
    # assignments that follow operate on an independent frame rather than on a
    # slice of world_facts (avoids SettingWithCopyWarning).
    xy = world_facts[[x1_param,x2_param,y_param]].copy()
    title = 'World countries'

# THIS IS A DATASET FOR UNDERSTANDING
if not (US_states or World_countries):
    xy = pd.read_pickle('spurious_correlation.pkl')
    # `title` is used when plotting; without it this mode fails with a NameError.
    title = 'Spurious correlation'
    
    

In [None]:
# log10 of 0 produces -inf, which isna() does not flag; a single infinite value
# would turn every standardized value into NaN. Treat +/-inf as missing too.
xy.replace([np.inf,-np.inf],np.nan,inplace=True)

if xy.isna().any().sum():
    print ('Warning - NaN values present! DROPPING....')
    xy.dropna(inplace=True)

In [None]:
def standardize(x):
    """Return `x` shifted by its mean and scaled by its standard deviation (z-scores)."""
    centered = x - x.mean()
    return centered / x.std()



######################################################################################################

# Standardize exactly the three model columns. Selecting them explicitly (rather
# than applying to every column of xy) keeps the cell re-runnable once the
# *_std columns already exist.
xy[['x1_std','x2_std','y_std']] = xy[[x1_param,x2_param,y_param]].apply(standardize)

# Drop rows that lie more than `outlier` standard deviations from the mean in
# EITHER direction (the previous one-sided test kept extreme negative values).
mask = (xy[['x1_std','x2_std','y_std']].abs() > outlier).any(axis=1)

xy = (xy[~mask]).copy()

# Mean / std on the original scale: columns whose name contains '_log' hold
# log10 values, so they are transformed back with 10**x first.
true_x1_mean = np.power(10,xy[x1_param]).mean() if ('_log' in x1_param) else xy[x1_param].mean()
true_x1_std = np.power(10,xy[x1_param]).std() if ('_log' in x1_param) else xy[x1_param].std()
true_x2_mean = np.power(10,xy[x2_param]).mean() if ('_log' in x2_param) else xy[x2_param].mean()
true_x2_std = np.power(10,xy[x2_param]).std() if ('_log' in x2_param) else xy[x2_param].std()
true_y_mean = np.power(10,xy[y_param]).mean() if ('_log' in y_param) else xy[y_param].mean()
true_y_std = np.power(10,xy[y_param]).std() if ('_log' in y_param) else xy[y_param].std()

print (xy)
print (xy.describe())

# Pristine copy; later cells overwrite columns of xy and restore from this.
xy_orig = xy.copy()


In [None]:


# Main fit on the standardized columns; `result` holds one row per posterior draw.
result = regression(xy['x1_std'],xy['x2_std'],xy['y_std'])
print ()
print ('result stats')
print (result.describe())
print()
# Covariance / correlation between the sampled parameters.
print ('result covariance matrix')
print (result.cov())
print()
print ('result correlation coeff')
print (result.corr())

In [None]:
def plot(result,x1,x2,y,title,n,df,create_figure=True,residuals = False,color='crimson'):
    """Scatter the data and overlay posterior regression lines and an 89% band.

    Parameters
    ----------
    result : DataFrame of posterior draws as returned by `regression` (needs
        'beta1_post', 'alpha_post', 'obs_sigma_post' and, for multiple
        regression, 'beta2_post').
    x1, x2, y : pandas Series to plot (x2 is only drawn for multiple regression).
    title : plot title; also used to build the output file name.
    n : value shown in front of the title (the notebook passes len(xy)).
    df : frame holding the columns named by the globals x1_param / x2_param /
        y_param; their mean and std are shown in the axis labels.
    create_figure : open a new 18x12 figure when True, else draw on the current one.
    residuals : when True the scatter legend entries are blank and generic
        axis labels are used (single-regression mode).
    color : colour of the x1 scatter points.

    Reads the notebook globals `multiple_regression`, `x1_param`, `x2_param`,
    `y_param`, `true_*_mean` / `true_*_std`, `US_states`, `World_countries`,
    `xy` and `us_abb`. Saves the figure as 'linear_regression_<title>.jpg'.
    """

    min_x = (np.minimum(x1,x2)).min()
    max_x = (np.maximum(x1,x2)).max()

    x1_mu = df[x1_param.replace('_std','')].mean()
    x1_sigma = df[x1_param.replace('_std','')].std()
    x2_mu = df[x2_param.replace('_std','')].mean()
    x2_sigma = df[x2_param.replace('_std','')].std()
    y_mu = df[y_param.replace('_std','')].mean()
    y_sigma = df[y_param.replace('_std','')].std()

    x1_param_beta = result.beta1_post.mean()
    alpha_param = result.alpha_post.mean()
    # 89% interval: 5.5th to 94.5th percentile.
    x1_param_89 = np.percentile(result.beta1_post,[5.5,94.5])

    if multiple_regression:

        x2_param_beta = result.beta2_post.mean()
        # Was [5.5,94,5] (comma typo), which reported the 94th percentile as the
        # upper bound instead of the 94.5th.
        x2_param_89 = np.percentile(result.beta2_post,[5.5,94.5])

    if create_figure:
        plt.figure(figsize=(18,12))
        #plt.ylim([-outlier,outlier])


    plt.title ('{} {}  '.format(
        n, title))

    if not residuals:
        label1 = x1_param + \
                    r' $\beta$: {:.2f} $\alpha$: {:.2f} 89%: [{:.2f} {:.2f}]'.format(
                    x1_param_beta,alpha_param,x1_param_89[0],x1_param_89[1])

        if multiple_regression:
            label2 = x2_param + \
                r' $\beta$: {:.2f} $\alpha$: {:.2f} 89%: [{:.2f} {:.2f}]'.format(
                x2_param_beta,alpha_param,x2_param_89[0],x2_param_89[1])
    else:

        label1 = ' '
        # Keep label2 defined so residuals=True also works with multiple regression.
        label2 = ' '

    plt.scatter(x1,y, color=color, label=label1)

    if multiple_regression:
        plt.scatter(x2,y,color='g',label=label2)

    X = np.linspace(min_x,max_x,1000)

    # Random posterior draws (with replacement), one per grid point.
    rows = np.random.choice(result.index,replace=True,size=len(X))

    beta1_samples = result.beta1_post[rows]

    if multiple_regression:
        beta2_samples = result.beta2_post[rows]

    alpha_samples = result.alpha_post[rows]

    if multiple_regression:
        lines = [X[i] * beta1_samples + X[i] * beta2_samples + alpha_samples for i in range(len (X))]


        # Simulated observations at every grid point, used for the 89% band.
        samples2 = np.array([pm.Normal.dist(X[i] * result.beta1_post[rows] + X[i] * result.beta2_post[rows] + result.alpha_post[rows],
                                      result.obs_sigma_post[rows]).random(size=len(X)) for i in range(len(X))])

        # Returned in percentile order (lower bound first); fill_between does
        # not care which of the two is larger.
        high2,low2 = np.percentile(samples2,[5.5,94.5],axis=1)

        plt.fill_between(X,high2,low2,color='orange',alpha=0.2)

        plt.plot(X,X * beta1_samples.mean() + X * beta2_samples.mean() + alpha_samples.mean(),color='k',ls='dashed')

    else:
        lines = [X[i] * beta1_samples  + alpha_samples for i in range(len (X))]
        rows = np.random.choice(result.index,replace=True,size=len(X))

        samples2 = np.array([pm.Normal.dist(X[i] * result.beta1_post[rows] + result.alpha_post[rows],
                                      result.obs_sigma_post[rows]).random(size=len(X)) for i in range(len(X))])

        high2,low2 = np.percentile(samples2,[5.5,94.5],axis=1)

        plt.fill_between(X,high2,low2,color='orange',alpha=0.2)

        plt.plot(X,X * beta1_samples.mean() + alpha_samples.mean(),color='k',ls='dashed')


    # Faint overlay of the individual posterior lines.
    plt.plot(X,lines,alpha=0.01,color='r')

    if multiple_regression:
        # Bracketed numbers are the statistics on the original (back-transformed) scale.
        plt.xlabel(x1_param + r' $\mu$: {:.2f} [ {:.2f} ] $\sigma$: {:.2f} [ {:.2f} ] '.format(
            x1_mu,true_x1_mean,x1_sigma,true_x1_std) + \
                   x2_param + r' $\mu$: {:.2f} [ {:.2f} ] $\sigma$: {:.2f} [ {:.2f} ]'.format(
            x2_mu,true_x2_mean,x2_sigma,true_x2_std))

        if not residuals:
            plt.ylabel(y_param + r' $\mu$: {:.2f} [ {:.2f} ] $\sigma$: {:.2f} [ {:.2f} ]'.format(
                y_mu,true_y_mean,y_sigma,true_y_std))


    else:
        if not residuals:
            plt.xlabel(x1_param + r' $\mu$: {:.2f} $\sigma$: {:.2f} '.format(x1_mu,x1_sigma) )
            plt.ylabel(y_param + r' $\mu$: {:.2f} $\sigma$: {:.2f}'.format(y_mu,y_sigma))
        else:
            plt.xlabel('predictor')
            plt.ylabel('other predictor residual')

    plt.legend(loc='upper left')

    # Annotate every point with its abbreviation (state or country).
    if not multiple_regression:

        if US_states or World_countries:
            for s in xy.index:
                plt.text(x1.loc[s] ,y.loc[s] ,us_abb.loc[s,'abb'],color='k')

    else:
        if US_states or World_countries:
            for s in xy.index:
                plt.text(x1.loc[s] ,y.loc[s] ,us_abb.loc[s,'abb'],color='k')
                plt.text(x2.loc[s] ,y.loc[s] ,us_abb.loc[s,'abb'],color='k')


    # Make the title safe to use inside a file name.
    filename = lambda x : x.replace(' ','_').replace('}','_').replace('{','_').replace('-','_')

    plt.savefig('linear_regression_' + filename(title) + '.jpg',format='jpg')

In [None]:
# Plot the main regression; the title embeds the data-set name and the predictor/outcome columns.
plot(result,xy['x1_std'],xy['x2_std'],xy['y_std'],
     'Bayesian Regression {} - {} as predictor for {}'.format(title, x1_param,y_param),len(xy),xy)



In [None]:
# Inspect the first 50 rows of the regression frame (raw and standardized columns).
print (xy.head(50))

In [None]:
def plot_betas(result,title):
    """Plot density histograms of the posterior slope(s).

    Parameters
    ----------
    result : DataFrame of posterior draws as returned by `regression`
        ('beta1_post', 'alpha_post' and, for multiple regression, 'beta2_post').
    title : text appended to 'Regression Betas ' in the figure title.

    Reads the notebook globals `multiple_regression`, `x1_param` and
    `x2_param`. Each legend entry shows the posterior mean, standard deviation
    and 89% interval (5.5th to 94.5th percentile) of the slope.
    """

    x1_param_beta = result.beta1_post.mean()
    x1_param_sigma = result.beta1_post.std()
    x1_param_89 = np.percentile(result.beta1_post,[5.5,94.5])

    if multiple_regression:

        x2_param_beta = result.beta2_post.mean()
        x2_param_sigma = result.beta2_post.std()
        # Was [5.5,94,5] (comma typo), which reported the 94th percentile as the
        # upper bound instead of the 94.5th.
        x2_param_89 = np.percentile(result.beta2_post,[5.5,94.5])

    plt.figure(figsize=(18,12))
    plt.title('Regression Betas ' + title)
    plt.hist(result.beta1_post,density=True,label=x1_param + \
                r' $\beta$: {:.2f} $\sigma$: {:.2f} 89%: [{:.2f} {:.2f}]'.format(
        x1_param_beta,x1_param_sigma,x1_param_89[0],x1_param_89[1]),
             color='r',alpha=0.7,histtype='stepfilled')

    if multiple_regression:
        plt.hist(result.beta2_post,density=True,label=x2_param + \
                    r' $\beta$: {:.2f} $\sigma$: {:.2f} 89%: [{:.2f} {:.2f}]'.format(
            x2_param_beta,x2_param_sigma,x2_param_89[0],x2_param_89[1]),
                 color='g',alpha=0.7,histtype='stepfilled')

    plt.legend(loc='upper left')

In [None]:
# Slope posteriors of the main fit. NOTE(review): 'bar' looks like a placeholder title suffix.
plot_betas(result,'bar')


In [None]:
# disable the burn params in mcmc.sample to see the convergence

# Layout of the 4x2 grids: (row, column, parameter name, needs multiple regression).
# Panels flagged True are left empty when `multiple_regression` is False.
trace_panels = [
    (0, 0, 'beta1_mu', False),
    (0, 1, 'beta1_sigma', False),
    (1, 0, 'beta2_mu', True),
    (1, 1, 'beta2_sigma', True),
    (2, 0, 'alpha', False),
    (2, 1, 'obs_sigma', False),
    (3, 0, 'beta1', False),
    (3, 1, 'beta2', True),
]

# Figure 1: the sampled value of every parameter, draw by draw.
fig,axes = plt.subplots(4,2,figsize=(18,12))

for panel_row, panel_col, param_name, needs_beta2 in trace_panels:
    if needs_beta2 and not multiple_regression:
        continue
    axes[panel_row,panel_col].plot(result[param_name + '_post'])
    axes[panel_row,panel_col].set_title(param_name)

plt.tight_layout()

# Figure 2: density histogram of every parameter's posterior draws.
fig,axes = plt.subplots(4,2,figsize=(18,12))

for panel_row, panel_col, param_name, needs_beta2 in trace_panels:
    if needs_beta2 and not multiple_regression:
        continue
    axes[panel_row,panel_col].hist(result[param_name + '_post'],density=True)
    axes[panel_row,panel_col].set_title(param_name)

plt.tight_layout()

In [None]:
# The residual analysis uses single-predictor fits; regression() and plot()
# read this flag as a global.
multiple_regression = False


#########
# Regress predictor 2 on predictor 1 (the second argument is ignored in
# single-regression mode).
result_x1_x2 = regression(xy['x1_std'],xy['x1_std'],xy['x2_std'])

# Expected x2 for every row given x1, using the posterior-mean slope and intercept.
mean_ys = xy['x1_std'] * result_x1_x2.beta1_post.mean() + result_x1_x2.alpha_post.mean()

# Signed residual (observed - expected), split into the one-sided error arrays
# that plt.errorbar expects. Computed on plain arrays: the earlier loop used
# mean_ys[i], an integer-key lookup on a label-indexed Series, which relies on
# deprecated positional fallback (and is label-based for integer indexes).
signed_residual = (xy['x2_std'] - mean_ys).values
low_error = np.where(signed_residual < 0, -signed_residual, 0.0)
high_error = np.where(signed_residual < 0, 0.0, signed_residual)

residuals_x1_x2 = pd.DataFrame({0 : -1 * low_error + high_error})

if US_states or World_countries:
    # Label residuals by state/country so they can be annotated in plots.
    residuals_x1_x2.index = xy.index

temp = pd.DataFrame({'predictor': xy['x1_std'],
                    'residual' : residuals_x1_x2[0]})

print()
print (temp.head(50))
print()
print()



# Regress the outcome on the x2 residuals (the part of x2 not explained by x1).
result_residuals_x1_x2 = regression(residuals_x1_x2[0],residuals_x1_x2[0], xy['y_std'])

# Plot the x1 -> x2 fit; residuals=True switches plot() to blank legend labels
# and generic axis labels.
plot(result_x1_x2,xy['x1_std'],xy['x1_std'],xy['x2_std'],
     'residuals x1 to x2',len(xy),xy,create_figure=True,residuals=True,color='g')


# Draw each residual as a one-sided dashed bar from the expected value.
plt.errorbar(x=xy['x1_std'],
             y=mean_ys,
             color='k',ls='dashed',fmt='none', yerr= np.array((low_error,high_error)))

# Saves the same figure again, including the error bars added after plot() saved it.
plt.savefig('residuals_x1_to_x2.jpg',format='jpg')
           

####
# Second figure: outcome versus the x2 residuals.
plt.figure(figsize=(18,12))
plt.title('residuals x1 --> x2 regression on outcome y')

plt.scatter(residuals_x1_x2,xy['y_std'],color='g',
            label=r'$\beta$: {:.2f} $\alpha$: {:.2f}'.format(result_residuals_x1_x2.beta1_post.mean(),
                                                            result_residuals_x1_x2.alpha_post.mean()))

plt.axhline(0,color='k',ls='dashed',label='Expectation Outcome')
plt.axvline(0,color='orange',ls='dashed',label='Expectation on predictor 2 from predictor 1')

if US_states or World_countries:
    # Small offset so the text does not sit exactly on the marker.
    for s in residuals_x1_x2.index:
        plt.text(residuals_x1_x2.loc[s,0] + 0.001,xy.loc[s,'y_std'] + 0.001,us_abb.loc[s,'abb'])
    
# Also reused by the x2 -> x1 section further down in this cell.
nr_lines = 1000

X = np.linspace(residuals_x1_x2[0].min(),residuals_x1_x2[0].max(),nr_lines)

# Random posterior draws (with replacement) for the faint line overlay.
rows = np.random.choice(result_residuals_x1_x2.index,replace=True,size=nr_lines)

lines = [X[i] * result_residuals_x1_x2.beta1_post[rows] + result_residuals_x1_x2.alpha_post[rows] for i in range(len(X))]


plt.plot(X,lines,alpha=0.01,color='r')  

plt.xlabel('x2_residuals with x1 as predictor')
plt.ylabel('y')

plt.legend(loc='upper left',framealpha=0.4)
plt.savefig('residuals_regression_outcome_plot_x1_x2.jpg',format='jpg')

#########
# Regress predictor 1 on predictor 2 (the second argument is ignored in
# single-regression mode).
result_x2_x1 = regression (xy['x2_std'],xy['x2_std'],xy['x1_std'])

# Expected x1 for every row given x2, using the posterior-mean slope and intercept.
mean_ys = xy['x2_std'] * result_x2_x1.beta1_post.mean() + result_x2_x1.alpha_post.mean()

# Signed residual (observed - expected), split into the one-sided error arrays
# that plt.errorbar expects. Computed on plain arrays: the earlier loop used
# mean_ys[i], an integer-key lookup on a label-indexed Series, which relies on
# deprecated positional fallback (and is label-based for integer indexes).
signed_residual = (xy['x1_std'] - mean_ys).values
low_error = np.where(signed_residual < 0, -signed_residual, 0.0)
high_error = np.where(signed_residual < 0, 0.0, signed_residual)

residuals_x2_x1 = pd.DataFrame({0 : -1 * low_error + high_error})

if US_states or World_countries:
    # Label residuals by state/country so they can be annotated in plots.
    residuals_x2_x1.index = xy.index

temp2 = pd.DataFrame({'predictor': xy['x2_std'],
                    'residual' : residuals_x2_x1[0]})

print()
print (temp2.head(50))
print()


###########################################################################


# Regress the outcome on the x1 residuals (the part of x1 not explained by x2).
result_residuals_x2_x1 = regression(residuals_x2_x1[0],residuals_x2_x1[0], xy['y_std'])

# Plot the x2 -> x1 fit; residuals=True switches plot() to blank legend labels
# and generic axis labels.
plot(result_x2_x1,xy['x2_std'],xy['x2_std'],xy['x1_std'],
     'residuals x2 to x1',len(xy),xy,create_figure=True,residuals=True)


# Draw each residual as a one-sided dashed bar from the expected value.
plt.errorbar(x=xy['x2_std'],
             y=mean_ys,
             color='k',ls='dashed',fmt='none', yerr= np.array((low_error,high_error)))

# Saves the same figure again, including the error bars added after plot() saved it.
plt.savefig('residuals_x2_to_x1.jpg',format='jpg')

            
####
# Second figure: outcome versus the x1 residuals.
plt.figure(figsize=(18,12))
plt.title('residuals x2 --> x1 to outcome y')

plt.scatter(residuals_x2_x1,xy['y_std'],color='red',
           label=r'$\beta$: {:.2f} $\alpha$: {:.2f}'.format(result_residuals_x2_x1.beta1_post.mean(),
                                                            result_residuals_x2_x1.alpha_post.mean()))
plt.axhline(0,color='k',ls='dashed')
plt.axvline(0,color='orange',ls='dashed',label='Expectation on predictor 1 from predictor 2')

if US_states or World_countries:
    for s in residuals_x2_x1.index:
        plt.text(residuals_x2_x1.loc[s,0] ,xy.loc[s,'y_std'] ,us_abb.loc[s,'abb'])
    
# `nr_lines` is defined earlier in this cell (x1 -> x2 section).
X = np.linspace(residuals_x2_x1[0].min(),residuals_x2_x1[0].max(),nr_lines)

# Random posterior draws (with replacement) for the faint line overlay.
rows = np.random.choice(result_residuals_x2_x1.index,replace=True,size=nr_lines)

lines = [X[i] * result_residuals_x2_x1.beta1_post[rows] + result_residuals_x2_x1.alpha_post[rows] for i in range(len(X))]

plt.plot(X,lines,alpha=0.01,color='r')  
    
plt.xlabel('x1_residuals with x2 as predictor')
plt.ylabel('y')
plt.legend(loc='upper left',framealpha=0.4)

plt.savefig('residuals_regression_outcome_plot_x2_x1.jpg',format='jpg')

In [None]:
# Horizontal bar chart of the x2 residuals (x2 regressed on x1), largest first.
if US_states or World_countries:
    plt.figure(figsize=(36,24))
    # NOTE(review): the title and file name say "State" even when world countries are plotted.
    plt.title('Linreg {} Residuals by State'.format(x2_param))
    # In-place sort: re-orders the shared residual frame for any later use as well.
    residuals_x1_x2.sort_values(0,inplace=True,ascending=False)
    b = sns.barplot(data=residuals_x1_x2,y=residuals_x1_x2.index,x=0,palette=['orange'])
    plt.xticks(rotation=90)
    plt.savefig('linreg_{}_residuals_by_state.jpg'.format(x2_param))
    print (residuals_x1_x2)

In [None]:
# Horizontal bar chart of the x1 residuals (x1 regressed on x2), largest first.
if US_states or World_countries:
    plt.figure(figsize=(36,24))
    plt.title('Linreg {} Residuals by State'.format(x1_param))
    # In-place sort: re-orders the shared residual frame for any later use as well.
    residuals_x2_x1.sort_values(0,inplace=True,ascending=False)
    sns.barplot(data=residuals_x2_x1,y=residuals_x2_x1.index,x=0,palette=['orange'])
    plt.xticks(rotation=90)
    plt.savefig('linreg_{}_residuals_by_state.jpg'.format(x1_param))
    # Print the frame that was actually plotted here (the copy-pasted line
    # printed residuals_x1_x2, i.e. the other direction's residuals).
    print (residuals_x2_x1)

In [None]:
### counterfactual plots

### x1 to mean values
# Overwrites the column in `xy` with a constant; the next cell restores xy from xy_orig.
# NOTE(review): the mean is only exactly zero if no rows were removed after standardizing.
xy['x1_std'] = xy['x1_std'].mean() # array of zeros since we deal with standardized variables

# Switch back to the two-predictor model (the residual cell set this flag to False).
multiple_regression = True
result_counterfactual_x1 = regression(xy['x1_std'], xy['x2_std'],xy['y_std'])

plot(result_counterfactual_x1,xy['x1_std'],xy['x2_std'],xy['y_std'],
     'counterfactuals_x1 to outcome',len(xy),xy,create_figure=True)

In [None]:
# Restore the untouched standardized frame before building the next counterfactual.
xy = xy_orig.copy()

### x2 to mean values
# NOTE(review): the mean is only exactly zero if no rows were removed after standardizing.
xy['x2_std'] = xy['x2_std'].mean() # array of zeros since we deal with standardized variables

multiple_regression = True
result_counterfactual_x2 = regression(xy['x1_std'], xy['x2_std'],xy['y_std'])

plot(result_counterfactual_x2,xy['x1_std'],xy['x2_std'],xy['y_std'],
     'counterfactuals_x2 to outcome',len(xy),xy,create_figure=True)



In [None]:
### Posterior prediction plot

# Restore the untouched standardized frame (the counterfactual cells overwrite columns).
xy = xy_orig.copy()

nr_samples = 10000

# Random posterior draws (with replacement) from the main two-predictor fit.
rows = np.random.choice(result.index,replace=True,size=nr_samples)
samples = result.iloc[rows]
print (samples)

alpha_samples = samples['alpha_post']
beta1_samples = samples['beta1_post']
beta2_samples = samples['beta2_post']
sigma_samples = samples['obs_sigma_post']

X1 = xy['x1_std']
X2 = xy['x2_std']

# Simulated outcomes for every data row. .iloc makes the positional access
# explicit: X1[i] with an integer key on a label-indexed Series relies on
# deprecated positional fallback (and is label-based for integer indexes).
posterior_samples = np.array([pm.Normal.dist(X1.iloc[i] * beta1_samples + X2.iloc[i] * beta2_samples + alpha_samples,
                                        sigma_samples ).random(size=nr_samples) for i in range(len(X1))])

print (posterior_samples.shape)

# 89% interval: 5.5th to 94.5th percentile (was 94.4, inconsistent with the
# interval used everywhere else in the notebook).
samples_89 = np.percentile(posterior_samples,[5.5,94.5],axis=1)
print (samples_89.mean(axis=1))
print ()

# Mean simulated outcome per data row, reused by every plot element below.
posterior_mean = posterior_samples.mean(axis=1)

plt.figure(figsize=(18,12))
plt.title('Posterior Prediction Plot  - actual outcome data points vs simulated')

plt.scatter(xy['y_std'],posterior_mean,color='orange')

plt.xlabel('outcome: {}'.format(y_param))
plt.ylabel('posterior mean outcome samples')


# Asymmetric error bars: distance from the mean to each end of the 89% interval.
plt.errorbar(x=xy['y_std'],
             y=posterior_mean,ecolor='lightgrey',capsize=5,
             yerr=np.array((np.abs(posterior_mean - samples_89[0]),
                            np.abs(posterior_mean - samples_89[1]))),fmt='none')

# Dashed identity line: points on it are predicted perfectly.
plt.plot(np.linspace(xy['y_std'].min(),
                     xy['y_std'].max(),100),np.linspace(xy['y_std'].min(),xy['y_std'].max(),100),
         color='k',ls='dashed')


# Abbreviation labels only exist for the US-state / world data-sets (same
# guard as the other plotting cells).
if US_states or World_countries:
    for i,s in enumerate(xy.index):
        plt.text(xy['y_std'].loc[s] ,posterior_mean[i] ,us_abb.loc[s,'abb'])



plt.savefig('regression_posterior_plot_{}.jpg'.format(y_param),format='jpg')
    