In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pymc as pm
from pymc.Matplot import plot as pmplot

# Apply seaborn's default styling to the matplotlib figures drawn below.
sns.set()

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so none of it
# runs; the cell merely evaluates to that string.
'''
### read data

df = pd.read_csv('../Stat_Rethink/Howell1.csv',sep=';')
df['height_std'] = df['height'].apply(lambda x : (x - df['height'].mean()) / df['height'].std())
df['weight_std'] = df['weight'].apply(lambda x : (x - df['weight'].mean()) / df['weight'].std())
df['age_std'] = df['age'].apply(lambda x : (x - df['age'].mean()) / df['age'].std())
df['male_std'] = df['male'].apply(lambda x : (x - df['male'].mean()) / df['male'].std())

df = df[df['age'] >=18]
df
'''

In [None]:
# Mapping looked up by state name in get_latest_numbers(); each value is
# indexed there with .iloc[-1].
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
dick = pd.read_pickle('us_state_dict.pkl')

# Accumulator that get_latest_numbers() extends by one row per state.
df = pd.DataFrame()

def get_latest_numbers(df,statename):
    """Return `df` with the last row of `statename`'s table appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows collected so far; it is not modified.
    statename : str
        Key into the module-level `dick` mapping; also written to the
        'state' column of the appended row.

    Returns
    -------
    pandas.DataFrame
        A new frame: `df` followed by the selected row.
    """
    # Selecting with a list keeps the row as a DataFrame, so each column keeps
    # its own dtype (transposing a row Series would force one common dtype,
    # typically object, onto every column).
    last_row = dick[statename].iloc[[-1]].copy()
    last_row['state'] = statename
    return pd.concat([df,last_row],axis=0)

# Gather the most recent row of every entry in `dick` into `df`.
for key in dick:
    df = get_latest_numbers(df,key)

# Order by the 'dead_per_M' column, largest first, and persist the snapshot
# that the next cell reloads.
df.sort_values('dead_per_M',inplace=True,ascending=False)
df.to_pickle('US_states_latest_day.pkl')

# Kept as the cell's last expression so the notebook actually renders the
# preview (mid-cell its value was silently discarded).
df.head(40)

In [None]:
# Reload the snapshot written by the previous cell.
us_df = pd.read_pickle('US_states_latest_day.pkl')

# NOTE(review): 'gdp_per_capita' is a plain copy of 'gdp' - confirm whether a
# division by population was intended.
us_df = us_df.assign(gdp_per_capita=us_df['gdp']).set_index('state')
us_df


In [None]:
# Summary statistics of the per-state snapshot.
us_df.describe()

In [None]:
# Module-level switch read by regression(), plot() and plot_betas():
# True  -> y is modelled on x1 and x2, False -> y is modelled on x1 only.
multiple_regression = True

def regression(x1,x2,y):
    """Fit a Bayesian linear regression of `y` on `x1` (and `x2`) by MCMC.

    The module-level flag `multiple_regression` selects the linear predictor:
    `x1 * beta1 + x2 * beta2 + alpha` when True, `x1 * beta1 + alpha` otherwise
    (in which case `x2` is ignored).

    Parameters
    ----------
    x1, x2 : array-like
        Predictor values; the callers in this notebook pass standardized columns.
    y : array-like
        Observed response values.

    Returns
    -------
    pandas.DataFrame
        One column per sampled variable, named '<variable>_post'. The beta2*
        columns are present only when `multiple_regression` is True.
    """
    # Hyper-priors for the mean and spread of each slope.
    beta1_mu = pm.Uniform('beta1_mu',-0.1,0.1)
    beta1_sigma = pm.Uniform('beta1_sigma',0,0.5)

    beta2_mu = pm.Uniform('beta2_mu',-0.1,0.1)
    beta2_sigma = pm.Uniform('beta2_sigma',0,0.5)

    # Observation noise and intercept.
    obs_sigma = pm.Uniform('obs_sigma',0,0.5)
    alpha = pm.Uniform('alpha',-1,1)

    # The second argument is 1 / sigma ** 2 - presumably because PyMC 2's
    # Normal is parameterised by precision (tau); confirm against the
    # installed pymc version.
    beta1 = pm.Normal('beta1',beta1_mu, 1 / beta1_sigma ** 2)
    beta2 = pm.Normal('beta2',beta2_mu, 1 / beta2_sigma ** 2)

    if multiple_regression:
        @pm.deterministic()
        def linreq(x1=x1,x2=x2,beta1=beta1,beta2=beta2,alpha=alpha):
            return x1 * beta1 + x2*beta2 + alpha

    else:
        @pm.deterministic()
        def linreq(x1=x1,beta1=beta1,alpha=alpha):
            return x1 * beta1 + alpha

    obs = pm.Normal('obs',linreq,1 / obs_sigma ** 2,observed = True, value=y)

    if multiple_regression:
        model = pm.Model([beta1_mu,beta1_sigma,beta2_mu,beta2_sigma,obs_sigma,alpha,beta1,beta2,obs])
        names = ['beta1_mu','beta1_sigma','beta2_mu','beta2_sigma',
                 'obs_sigma','alpha','beta1','beta2']
    else:
        model = pm.Model([beta1_mu,beta1_sigma,obs_sigma,alpha,beta1,obs])
        names = ['beta1_mu','beta1_sigma','obs_sigma','alpha','beta1']

    # Best effort: try a MAP fit before sampling. A failure is reported and
    # sampling proceeds anyway. Only Exception is caught so that
    # KeyboardInterrupt / SystemExit still stop the cell.
    try:
        map_ = pm.MAP(model)
        map_.fit()
    except Exception as err:
        print ('cant fit', err)

    mcmc = pm.MCMC(model)

    # Positional arguments; presumably iter=100000, burn=40000, thin=2
    # (PyMC 2 `MCMC.sample` order - confirm).
    mcmc.sample(100000,40000,2)

    # One trace per sampled variable, stored under '<name>_post' in `names` order.
    traces = {name + '_post' : mcmc.trace(name)[:] for name in names}

    pmplot(traces['beta1_post'],'beta1')
    if multiple_regression:
        pmplot(traces['beta2_post'],'beta2')

    return pd.DataFrame(traces)


# Columns of us_df used by the regression:
# first predictor, second predictor, response.

x1_param = 'growth'
x2_param = 'density'
y_param = 'conf_per_M'

def standardize(x):
    """Return `x` centred on its own mean and divided by its own standard deviation."""
    centre = x.mean()
    spread = x.std()
    return (x - centre) / spread

# Work on a copy so the helper columns added below never reach us_df.
xy = us_df[[x1_param,x2_param,y_param]].copy()

# Standardized versions of the three selected columns.
# NOTE(review): relies on pandas pairing the new column names with the
# standardized columns in order (x1, x2, y) - confirm on the installed version.
std_columns = ['x1_std','x2_std','y_std']
xy[std_columns] = xy.apply(standardize)

#### PARAM #####
outlier = 3

# Drop every row in which any standardized value exceeds `outlier`.
# The test is one-sided: values below -outlier are kept.
mask = xy[std_columns].gt(outlier).any(axis=1)

xy = xy.loc[~mask]

print (xy)
print (xy.describe())

### to produce CounterFactual plot
#xy['x2_std'] = np.zeros_like(xy['x2_std'])
###

result = regression(xy['x1_std'],xy['x2_std'],xy['y_std'])
print ()
print (result.describe())



In [None]:
def plot(result,x1,x2,y,title,n,df):
    """Scatter the data and overlay the posterior regression lines.

    Draws `x1` (and `x2` when the module-level `multiple_regression` flag is
    set) against `y`, 1000 regression lines built from resampled posterior
    draws, the posterior-mean line and a 5.5-94.5 percentile band of simulated
    observations, then saves the figure as 'linear_regression_generic.jpg'.

    Parameters
    ----------
    result : pandas.DataFrame
        Posterior samples; reads beta1_post, alpha_post, obs_sigma_post and,
        in multiple-regression mode, beta2_post.
    x1, x2 : array-like
        Predictor values plotted on the shared x axis.
    y : array-like
        Response values.
    title : str
        Text appended to the figure title.
    n : object
        Value shown in the figure title just before `title`.
    df : pandas.DataFrame
        Frame holding the columns named by the module-level x1_param,
        x2_param and y_param; used only for the mean / SD in the axis labels.
    """
    min_x = (np.minimum(x1,x2)).min()
    max_x = (np.maximum(x1,x2)).max()

    # Mean / SD of the columns named by the *_param globals, for the axis labels.
    x1_mu = df[x1_param.replace('_std','')].mean()
    x1_sigma = df[x1_param.replace('_std','')].std()
    x2_mu = df[x2_param.replace('_std','')].mean()
    x2_sigma = df[x2_param.replace('_std','')].std()
    y_mu = df[y_param.replace('_std','')].mean()
    y_sigma = df[y_param.replace('_std','')].std()

    # Posterior summaries shown in the legend.
    x1_param_beta = result.beta1_post.mean()
    alpha_param = result.alpha_post.mean()
    x1_param_89 = np.percentile(result.beta1_post,[5.5,94.5])

    if multiple_regression:
        x2_param_beta = result.beta2_post.mean()
        # 5.5 and 94.5 bound the central 89% interval ('94,5' was a typo that
        # requested the 5.5th, 94th and 5th percentiles instead).
        x2_param_89 = np.percentile(result.beta2_post,[5.5,94.5])

    plt.figure(figsize=(18,12))

    plt.title ('Bayesian Multi-Linear Regression for {} {}  '.format(
        n, title))

    plt.scatter(x1,y, color='red',label=x1_param + \
                r' $\beta$: {:.2f} $\alpha$: {:.2f} 89%: [{:.2f} {:.2f}]'.format(
        x1_param_beta,alpha_param,x1_param_89[0],x1_param_89[1]))

    if multiple_regression:
        plt.scatter(x2,y,color = 'g',label=x2_param + \
                    r' $\beta$: {:.2f} $\alpha$: {:.2f} 89%: [{:.2f} {:.2f}]'.format(
            x2_param_beta,alpha_param,x2_param_89[0],x2_param_89[1]))

    X = np.linspace(min_x,max_x,1000)

    # Resample posterior rows (with replacement), one per grid point.
    rows = np.random.choice(result.index,replace=True,size=len(X))

    beta1_samples = result.beta1_post[rows]
    alpha_samples = result.alpha_post[rows]

    if multiple_regression:
        beta2_samples = result.beta2_post[rows]

        # NOTE(review): both predictors are set to the same grid value X[i],
        # i.e. the fitted plane is evaluated along x1 == x2 only.
        lines = [X[i] * beta1_samples + X[i] * beta2_samples + alpha_samples for i in range(len (X))]

        # Simulated observations at every grid point; the second argument is
        # 1 / sigma ** 2, matching how the model was specified.
        samples2 = np.array([pm.rnormal(X[i] * beta1_samples + X[i] * beta2_samples + alpha_samples,
                                        1 / result.obs_sigma_post[rows] ** 2,size=len(X)) for i in range(len(X))])

        mean_line = X * beta1_samples.mean() + X * beta2_samples.mean() + alpha_samples.mean()

    else:
        lines = [X[i] * beta1_samples  + alpha_samples for i in range(len (X))]

        # A fresh set of posterior rows is drawn for the simulated observations.
        rows = np.random.choice(result.index,replace=True,size=len(X))

        samples2 = np.array([pm.rnormal(X[i] * result.beta1_post[rows] + result.alpha_post[rows],
                                        1 / result.obs_sigma_post[rows] ** 2,size=len(X)) for i in range(len(X))])

        mean_line = X * beta1_samples.mean() + alpha_samples.mean()

    # Band between the 5.5th and 94.5th percentile of the simulations per grid point.
    low2,high2 = np.percentile(samples2,[5.5,94.5],axis=1)
    plt.fill_between(X,low2,high2,color='orange',alpha=0.2)

    plt.plot(X,mean_line,color='k',ls='dashed')

    # Each column of `lines` is one sampled regression line across the grid.
    plt.plot(X,lines,alpha=0.01,color='r')

    if multiple_regression:
        # '{:.2f}' (two decimals) replaces the original '{:2f}', which set a
        # field width of 2 and printed six decimals.
        plt.xlabel(x1_param + r' $\mu$: {:.2f} $\sigma$: {:.2f} '.format(x1_mu,x1_sigma) + \
                   x2_param + r' $\mu$: {:.2f} $\sigma$: {:.2f} '.format(x2_mu,x2_sigma))

    else:
        plt.xlabel(x1_param + r' $\mu$: {:.2f} $\sigma$: {:.2f} '.format(x1_mu,x1_sigma) )

    plt.ylabel(y_param + r' $\mu$: {:.2f} $\sigma$: {:.2f}'.format(y_mu,y_sigma))
    plt.legend(loc='upper right')

    plt.savefig('linear_regression_generic.jpg',format='jpg')

In [None]:
# `n` in the figure title is the number of rows left in xy after outlier
# removal (no module-level `x1` exists, so len(x1) raised NameError).
plot(result,xy['x1_std'],xy['x2_std'],xy['y_std'],
     'US States - outliers above {} SD removed '.format(outlier),len(xy),xy)

In [None]:
def plot_betas(result,title):
    """Plot density histograms of the posterior slope samples.

    Always draws beta1; beta2 is added when the module-level
    `multiple_regression` flag is set. Each legend entry reports the sample
    mean, standard deviation and the 5.5-94.5 percentile interval.

    Parameters
    ----------
    result : pandas.DataFrame
        Posterior samples; reads beta1_post and, in multiple-regression mode,
        beta2_post.
    title : str
        Text appended to 'Regression Betas ' in the figure title.
    """
    x1_param_beta = result.beta1_post.mean()
    x1_param_sigma = result.beta1_post.std()
    x1_param_89 = np.percentile(result.beta1_post,[5.5,94.5])

    if multiple_regression:
        x2_param_beta = result.beta2_post.mean()
        x2_param_sigma = result.beta2_post.std()
        # 5.5 and 94.5 bound the central 89% interval ('94,5' was a typo that
        # requested the 5.5th, 94th and 5th percentiles instead).
        x2_param_89 = np.percentile(result.beta2_post,[5.5,94.5])

    plt.figure(figsize=(18,12))
    plt.title('Regression Betas ' + title)
    plt.hist(result.beta1_post,density=True,label=x1_param + \
                r' $\beta$: {:.2f} $\sigma$: {:.2f} 89%: [{:.2f} {:.2f}]'.format(
        x1_param_beta,x1_param_sigma,x1_param_89[0],x1_param_89[1]),
             color='r',alpha=0.7,histtype='stepfilled')

    if multiple_regression:
        plt.hist(result.beta2_post,density=True,label=x2_param + \
                    r' $\beta$: {:.2f} $\sigma$: {:.2f} 89%: [{:.2f} {:.2f}]'.format(
            x2_param_beta,x2_param_sigma,x2_param_89[0],x2_param_89[1]),
                 color='g',alpha=0.7,histtype='stepfilled')

    plt.legend(loc='upper left')

In [None]:
# Histogram the slope posteriors; 'bar' is appended to the figure title.
plot_betas(result,'bar')


In [None]:
# Disabled cell: the code below is wrapped in a string literal, so none of it
# runs; the cell merely evaluates to that string.
'''
plt.figure(figsize=(18,12))
binned_weights = pd.cut(df['weight'],[20,40,60,80,100])
binned_ages = pd.cut(df['age'],[10,20,30,40,50,60,70,80,90,100])

sns.violinplot(binned_ages,y='height',data=df,inner='quartiles',scale_hue=False,
               style='count',hue='male',split=True,palette=['r','b'])
'''

In [None]:
# disable the burn params in mcmc.sample to see the convergence

# Panel order, row by row over the 4x2 grid. Each name is the panel title and
# maps to the column '<name>_post' of `result`.
trace_names = ['beta1_mu','beta1_sigma','beta2_mu','beta2_sigma',
               'alpha','obs_sigma','beta1','beta2']

# Trace plots: sampled value against sample number.
fig,axes = plt.subplots(4,2,figsize=(18,12))

for ax,name in zip(axes.flat,trace_names):
    column = name + '_post'
    if column not in result.columns:
        # regression() omits the beta2* columns when multiple_regression is
        # False; hide the panel instead of raising AttributeError.
        ax.set_visible(False)
        continue
    ax.plot(result[column])
    ax.set_title(name)
plt.tight_layout()

# Density histograms of the same samples, in the same layout.
fig,axes = plt.subplots(4,2,figsize=(18,12))

for ax,name in zip(axes.flat,trace_names):
    column = name + '_post'
    if column not in result.columns:
        ax.set_visible(False)
        continue
    ax.hist(result[column],density=True)
    ax.set_title(name)
plt.tight_layout()