In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pymc as pm

# Apply seaborn's default styling to the matplotlib figures drawn later on.
sns.set()

In [None]:
# Read the semicolon-separated Howell1.csv file into a DataFrame and
# preview its first rows as the cell output.
df = pd.read_csv('Howell1.csv', delimiter=';')
df.head()


In [None]:
# Recode the `male` flag as a 1-based category index: 1 = male, 2 = female.
df['sex'] = np.where(df['male'].astype(bool), 1, 2)

# NOTE(review): 'Buddist' looks like a typo for 'Buddhist'; the label is kept
# as-is in case other cells match on the exact string.
religions = ['Christian','Muslim','Buddist','Jew']

# Synthetic categorical column. A dedicated, seeded generator keeps the draw
# reproducible across kernel restarts without touching the global RNG state.
religion_rng = np.random.RandomState(42)
df['religion'] = religion_rng.choice(religions,replace=True,size=len(df),p=[0.25,0.50,0.15,0.10])

#### create an index for a multivalued category ####
# 1-based position of each label in `religions`.
religion_codes = {label: position for position, label in enumerate(religions, start=1)}
df['rel'] = df['religion'].map(religion_codes)

# Keep rows with age above 18. `.copy()` detaches the result from the original
# frame so that later column assignments write to an independent DataFrame
# instead of a slice (avoids SettingWithCopyWarning).
df = df[df['age'] > 18].copy()
df.head()

In [None]:
def regression (cat1,x1,y,n_iter=50000,burn=10000,thin=2):
    """Fit a two-category linear regression by MCMC and return the posterior draws.

    Model, as built below (the calls used here appear to be the PyMC 2 API):

        a_mu, a_sigma, obs_sigma ~ Uniform(0, 1)
        a_cat1[k]  ~ Normal(mu=a_mu, tau=1 / a_sigma**2)   for k in {0, 1}
        beta1[k]   ~ Normal(mu=0, tau=1)                   for k in {0, 1}
        y[i]       ~ Normal(mu=beta1[c_i] * x1[i] + a_cat1[c_i],
                            tau=1 / obs_sigma**2),  with c_i = cat1[i] - 1

    A MAP fit is run before sampling, as in the original workflow.

    Parameters
    ----------
    cat1 : array-like of int
        1-based category code per observation. The parameter vectors have
        size 2, so the codes must be 1 or 2.
    x1 : array-like
        Predictor values, one per observation.
    y : array-like
        Observed response values, one per observation.
    n_iter : int, optional
        Total number of MCMC iterations (default 50000).
    burn : int, optional
        Number of initial iterations to discard (default 10000).
    thin : int, optional
        Thinning interval applied to the retained iterations (default 2).

    Returns
    -------
    pandas.DataFrame
        One row per retained draw with columns ``a_mu_post``,
        ``a_mu_sigma_post`` (the trace of ``a_sigma``), ``a_cat1_1_post``,
        ``a_cat1_2_post``, ``beta1_post_1``, ``beta1_post_2`` and
        ``obs_sigma_post``.
    """
    # Zero-based position of each observation's category in the size-2 vectors.
    cat_idx = cat1 - 1

    # Hyperpriors shared by the two per-category intercepts.
    a_mu = pm.Uniform('a_mu',0,1)
    a_sigma = pm.Uniform('a_sigma',0,1)

    # Per-category intercepts and slopes; Normal is parameterised by precision.
    a_cat1 = pm.Normal('a_cat1',mu=a_mu,tau=1 / a_sigma ** 2,size=2)
    beta1 = pm.Normal('beta1',mu=0,tau=1,size=2)
    obs_sigma = pm.Uniform('obs_sigma',0,1)

    @pm.deterministic
    def linreg(a1=a_cat1[cat_idx],beta1=beta1[cat_idx],x1=x1):
        # Expected response: category-specific slope times predictor plus
        # category-specific intercept.
        return beta1 * x1 + a1

    obs = pm.Normal('obs',mu=linreg, tau=1/obs_sigma ** 2,observed=True,value=y)

    model = pm.Model([a_mu,a_sigma,a_cat1,beta1,obs_sigma,linreg,obs])

    # MAP fit first, then sample.
    map_ = pm.MAP(model)
    map_.fit()

    mcmc = pm.MCMC(model)
    # The return value of sample() was never used, so it is not stored.
    mcmc.sample(n_iter,burn,thin)

    # Fetch each vector-valued trace once, then split it by category column.
    a_cat1_trace = mcmc.trace('a_cat1')[:]
    beta1_trace = mcmc.trace('beta1')[:]

    result = pd.DataFrame({'a_mu_post' : mcmc.trace('a_mu')[:],
                           'a_mu_sigma_post' : mcmc.trace('a_sigma')[:],
                           'a_cat1_1_post' : a_cat1_trace[:,0],
                           'a_cat1_2_post' : a_cat1_trace[:,1],
                           'beta1_post_1' : beta1_trace[:,0],
                           'beta1_post_2' : beta1_trace[:,1],
                           'obs_sigma_post' : mcmc.trace('obs_sigma')[:]})

    return result
    

In [None]:
def standardize(x):
    """Return ``x`` shifted by its mean and divided by its standard deviation.

    ``x`` is anything exposing ``.mean()`` and ``.std()`` and supporting
    arithmetic with their results (e.g. a pandas Series). The standard
    deviation is whatever ``x.std()`` returns for that type.
    """
    centered = x - x.mean()
    spread = x.std()
    return centered / spread

# Rescale the three numeric columns column-by-column with `standardize`.
scaled_columns = ['weight','age','height']
df[scaled_columns] = df[scaled_columns].apply(standardize)

# Fit the model with sex as the category, weight as the predictor and
# height as the response.
result = regression(cat1=df['sex'], x1=df['weight'], y=df['height'])

In [None]:
# Print the first posterior draws as text, then show per-column summary
# statistics as the cell output.
print(result.head())
result.describe()


In [None]:
import scipy.stats as sps

plt.figure(figsize=(18,12))

# Split the observations by the 1-based sex code (1 = male, 2 = female).
males = df[ df['sex'] == 1]
females = df[df['sex'] == 2]

plt.scatter(males.weight,males.height,color='navy')
plt.scatter(females.weight,females.height,color='crimson')

# Ordinary least-squares reference fits: one per sex and one on all rows.
male_slope,male_intercept,_,_,_ = sps.linregress(males.weight,males.height)
female_slope,female_intercept,_,_,_ = sps.linregress(females.weight,females.height)
combined_slope,combined_intercept,_,_,_ = sps.linregress(df.weight,df.height)

# Grid of predictor values over which every line is drawn.
X = np.linspace(-3,3,100)

plt.plot(X,X * male_slope + male_intercept,
         color='navy',
         ls='dashed',
         label='LSQ male slope {:.2f}, male intercept {:.2f}'.format(male_slope,male_intercept))

# The legend text previously said "male intercept" for the female fit.
plt.plot(X,X * female_slope + female_intercept,
         color='crimson',
         ls='dashed',
         label='LSQ female slope {:.2f}, female intercept {:.2f}'.format(female_slope,female_intercept))

plt.plot(X,X * combined_slope + combined_intercept,
         color='orange',
         ls='dashed',
         label='LSQ combined slope {:.2f}, combined intercept {:.2f}'.format(combined_slope,combined_intercept))

# weight and height are the standardized columns of df.
plt.xlabel('weight (standardized)')
plt.ylabel('height (standardized)')
plt.legend(loc='upper left')

# Number of posterior draws (sampled with replacement) to overlay as lines.
nr_rows = 500

sample_rows = np.random.choice(result.index,replace=True,size=nr_rows)
posterior_sample = result.iloc[sample_rows]

a_cat1_1_samples = posterior_sample.a_cat1_1_post
a_cat1_2_samples = posterior_sample.a_cat1_2_post
beta1_1_samples = posterior_sample.beta1_post_1
beta1_2_samples = posterior_sample.beta1_post_2

# One column per sampled draw, one row per grid point: X[i] * slope + intercept.
# Working on the raw values avoids arithmetic between Series that carry a
# duplicated index (rows are sampled with replacement).
lines_male = np.outer(X,beta1_1_samples.values) + a_cat1_1_samples.values

_= plt.plot(X,lines_male,color='navy',alpha=0.01)

lines_female = np.outer(X,beta1_2_samples.values) + a_cat1_2_samples.values

_= plt.plot(X,lines_female,color='crimson',alpha=0.01)


