In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pymc as pm
from pymc.Matplot import plot as pmplot

# Apply seaborn's default styling to the matplotlib figures created below.
sns.set()



In [None]:
def strip(x):
    """Return ``x`` with leading and trailing whitespace removed.

    Used as a ``read_csv`` converter, so ``x`` is expected to provide a
    ``.strip()`` method (e.g. a string).
    """
    trimmed = x.strip()
    return trimmed

# Read the semicolon-delimited rankings file (it has no header row).
# Column 0 is stripped of surrounding whitespace and used as the index;
# the data column is labelled 'US_news_rank'.
df = pd.read_csv(
    'joe_law.csv',
    sep=';',
    header=None,
    names=['US_news_rank'],
    index_col=0,
    converters={0: strip},
)

def standardize(x,mean,std):
    """Return the z-score of ``x``: its distance from ``mean`` in units of ``std``."""
    centered = x - mean
    return centered / std

# The WS rank is simply the position of each row in the file (1 = first row).
df['WS_rank'] = range(1,len(df) + 1)

# Temporarily order by US News rank; the frame is re-sorted by WS rank below.
df = df.sort_values('US_news_rank')

# Add a z-scored copy of each rank column as '<column>_std'.
for rank_col in ('WS_rank', 'US_news_rank'):
    ranks = df[rank_col]
    df[rank_col + '_std'] = standardize(ranks, ranks.mean(), ranks.std())

df = df.sort_values('WS_rank')
df

In [None]:
def inference(rank, rank2, n_iter=50000, n_burn=10000, thin=3, plot=True):
    """Fit a Bayesian linear regression of ``rank2`` on ``rank`` with PyMC.

    Model::

        alpha ~ Normal(mu=0, tau=1 / 0.5**2)      # i.e. sd = 0.5
        beta  ~ Normal(mu=0, tau=1 / 0.5**2)      # i.e. sd = 0.5
        sigma ~ Uniform(0, 1)
        rank2 ~ Normal(mu=alpha + beta * rank, tau=1 / sigma**2)

    Parameters
    ----------
    rank : array-like
        Predictor values.
    rank2 : array-like
        Observed outcome values, same length as ``rank``.
    n_iter : int, optional
        Total number of MCMC iterations (default 50000).
    n_burn : int, optional
        Number of initial iterations discarded as burn-in (default 10000).
    thin : int, optional
        Thinning interval applied to the retained iterations (default 3).
    plot : bool, optional
        When true (default), draw the PyMC diagnostic plot for each
        parameter's trace.

    Returns
    -------
    pandas.DataFrame
        One row per retained posterior draw, with columns ``alpha_post``,
        ``beta_post`` and ``sigma_post``.
    """
    # PyMC parameterises the normal by precision (tau = 1 / sd**2).
    prior_tau = 1 / 0.5 ** 2

    alpha = pm.Normal('alpha', mu=0, tau=prior_tau)
    beta = pm.Normal('beta', mu=0, tau=prior_tau)
    sigma = pm.Uniform('sigma', 0, 1)

    # The parents of the deterministic node are taken from the default
    # arguments: the fixed predictor plus the two stochastic coefficients.
    @pm.deterministic
    def mu(x=rank, alpha=alpha, beta=beta):
        return x * beta + alpha

    obs = pm.Normal('obs', mu=mu, tau=1 / sigma ** 2, observed=True, value=rank2)

    model = pm.Model([alpha, beta, sigma, mu, obs])

    # Fit the MAP estimate first so the sampler starts from a sensible point.
    map_ = pm.MAP(model)
    map_.fit()

    mcmc = pm.MCMC(model)
    mcmc.sample(n_iter, n_burn, thin)

    alpha_post = mcmc.trace('alpha')[:]
    beta_post = mcmc.trace('beta')[:]
    sigma_post = mcmc.trace('sigma')[:]

    if plot:
        pmplot(alpha_post, 'alpha')
        pmplot(beta_post, 'beta')
        pmplot(sigma_post, 'sigma')

    return pd.DataFrame({'alpha_post': alpha_post,
                         'beta_post': beta_post,
                         'sigma_post': sigma_post})



In [None]:
# Regress the standardized US News rank on the standardized WS rank.
result = inference(rank=df['WS_rank_std'],
                   rank2=df['US_news_rank_std'])




In [None]:
print (result.describe())

# Grid of standardized WS ranks over which the fit is visualised.
X = np.linspace(-2,2,1000)

# Resample posterior draws with replacement, as many as there are grid points.
# Row *positions* are drawn so the selection does not depend on index labels.
rows = np.random.choice(len(result),replace=True,size=len(X))

lines = result.iloc[rows]

# Extract the resampled parameters once instead of re-indexing the frame on
# every grid point.
beta_draws = lines.beta_post.values
alpha_draws = lines.alpha_post.values
tau_draws = 1 / lines.sigma_post.values ** 2

# mu_grid[i, j] = X[i] * beta_j + alpha_j: regression line j evaluated at X[i].
mu_grid = np.outer(X, beta_draws) + alpha_draws

# Posterior-predictive draws: one simulated observation per line, per grid point.
samples = np.array([pm.rnormal(mu_row, tau_draws, size=len(X)) for mu_row in mu_grid])

# 89% interval of the simulated observations at each grid point.
CI_89 = np.percentile(samples,[5.5,94.5],axis=1)

print (df.corr())
print (np.corrcoef(df['WS_rank_std'],df['US_news_rank_std']))

plt.figure(figsize=(18,12))
plt.scatter(df['WS_rank_std'],df['US_news_rank_std'])
# Each column of mu_grid is one sampled regression line.
plt.plot(X,mu_grid,color='r',alpha=0.01)

plt.fill_between(X,CI_89[0],CI_89[1],color='c',alpha=0.1)

plt.xlabel('WS ranking [STD]')
plt.ylabel('US news ranking [STD]')
plt.savefig('joe_law_correlation.jpg',format='jpg')