In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pymc as pm

%matplotlib inline

# Apply seaborn's default styling to every matplotlib figure drawn below.
sns.set()

# Mapping of country name -> per-country DataFrame; later cells look frames up by name.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
dick = pd.read_pickle('country_data.pkl')

limit = 100 # rows are kept only where 'confirmed' is greater than this

  

In [None]:
# Load area and 2020 population per country and derive a density column.
# After indexing by name, both the population and the density are scaled by 1000.
country_density = (
    pd.read_csv('world_density.csv', sep=';', usecols=['name', 'area', 'pop2020'])
    .assign(density=lambda frame: frame['pop2020'] / frame['area'])
    .set_index('name')
    .assign(
        pop2020=lambda frame: frame['pop2020'] * 1000,
        density=lambda frame: frame['density'] * 1000,
    )
)
country_density

In [None]:
def remove_dollar(s):
    """Drop the first character of ``s`` and remove every comma from the rest."""
    without_prefix = s[1:]
    return ''.join(without_prefix.split(','))

# Read three columns (file positions 1, 2 and 6) from the header-less GDP file.
# Columns 2 and 6 pass through remove_dollar before being cast to int.
country_gdp = (
    pd.read_csv('world_gdp.csv', sep=';', header=None, thousands=',',
                usecols=[1, 2, 6],
                converters={2: remove_dollar, 6: remove_dollar})
    .rename(columns={1: 'name', 2: 'gdp', 6: 'gdp_per_capita'})
    .set_index('name')
    .astype({'gdp': int, 'gdp_per_capita': int})
)

country_gdp.loc['US']

In [None]:
# Rank once by GDP per capita (highest first), then show both ends of the ranking.
gdp_ranking = country_gdp.sort_values('gdp_per_capita', ascending=False)
for ranking_end in (gdp_ranking.head(10), gdp_ranking.tail(10)):
    print (ranking_end)

In [None]:
def strip(x):
    """Return ``x`` with every tab character removed."""
    return ''.join(x.split('\t'))

# Population table, looked up by country name further down.
# `strip` removes tab characters from file column 0 and thousands=',' lets
# pandas parse numbers written with comma separators.
country_populations = pd.read_csv('world_pop.csv',
                                  sep=';',header=None,index_col=0,names=['population'],
                                 thousands=',',converters={0 : strip})


# Names of the countries for which population, density and GDP were all found;
# filled by add_population_data.
good_countries = []

def add_population_data(country_name,limit):
    """Enrich one country's frame with population, density and GDP columns.

    Looks the country up in the module-level tables ``country_populations``,
    ``country_density`` and ``country_gdp``. Each lookup that fails is reported
    on stdout and the corresponding columns are simply not added. When all
    three lookups succeed the name is recorded in ``good_countries`` (once).

    Parameters
    ----------
    country_name : key into ``dick`` and into the three lookup tables.
    limit : only rows with ``confirmed`` strictly greater than this are returned.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows above ``limit``, without the
        ``conf_over_dead`` / ``dead_conf_ratio`` columns and with infinite
        values replaced by NaN.
    """
    all_good = True

    df = dick[country_name]

    # Only failed lookups / unusable values count as "no data"; anything else
    # (for example KeyboardInterrupt) is allowed to propagate.
    lookup_errors = (KeyError, TypeError, ValueError)

    try:
        population = country_populations.loc[country_name,'population']
        df['conf_per_M'] = df['confirmed'] / (population / 1e6)
        df['dead_per_M'] = df['deceased'] / (population / 1e6)
        df['population'] = population
    except lookup_errors:
        all_good = False
        print ('\tcant find population data for {}'.format(country_name))

    try:
        df['density'] = country_density.loc[country_name,'density']
    except lookup_errors:
        all_good = False
        print ('\tcant find density for {}'.format(country_name))

    try:
        df['gdp'] = country_gdp.loc[country_name,'gdp']
        df['gdp_per_capita'] = country_gdp.loc[country_name,'gdp_per_capita']
    except lookup_errors:
        all_good = False
        print ('cant find gdp for {}'.format(country_name))

    # Guard against duplicates so that re-running the cell does not list a country twice.
    if all_good and country_name not in good_countries:
        good_countries.append(country_name)

    # errors='ignore': on a re-run these columns have already been removed.
    df.drop(['conf_over_dead','dead_conf_ratio'],axis=1,inplace=True,errors='ignore')

    # Both signs of infinity are treated as missing.
    df.replace([np.inf,-np.inf],np.nan,inplace=True)

    # Return an independent frame so later column assignments do not act on a slice.
    return df[df['confirmed'] > limit].copy()

# Replace every stored frame with its enriched, limit-filtered version.
# The keys are snapshotted first because the mapping is written to inside the loop.
for c in [*dick.keys()]:
    dick[c] = add_population_data(c, limit)
    

In [None]:
# Spot-check the last rows of two enriched country frames.
swe = dick['Sweden']
print (swe.tail())
us = dick['US']
print (us.tail())

In [None]:
#### PARAM
####
# True: regress on both predictors (x1 and x2); False: regress on x1 only.
multiple_regression = True
# Column names of the response and of the two predictors.
y_param = 'conf_per_M'
x1_param = 'gdp_per_capita'
x2_param = 'density'

# Outlier cut-off, compared against standardized values (i.e. in standard deviations).
outliers = 3


####
####

def standardize(n):
    """Center ``n`` on its mean and scale it by its standard deviation.

    The standard deviation comes from ``n.std()``, so its degrees-of-freedom
    convention is whatever the input type uses by default.
    """
    centre = n.mean()
    spread = n.std()
    return (n - centre) / spread

def process_countries(countries):
    """Collect the latest x1/x2/y values per country, standardize them and drop outliers.

    For every name in ``countries`` the last row of ``dick[name]`` supplies the
    values of the columns named by the module-level ``y_param``, ``x1_param``
    and ``x2_param``. Countries whose data cannot be read are reported, counted
    and removed from ``good_countries``.

    A country is an outlier when any of its three standardized values lies more
    than ``outliers`` standard deviations from zero, in either direction.

    Returns
    -------
    tuple
        ``(x1_std, x2_std, y_std, x1_mu, x1_sigma, x2_mu, x2_sigma, y_mu,
        y_sigma, nr_of_fails)``. The ``*_std`` arrays are standardized using
        all collected countries and then filtered; the ``*_mu`` / ``*_sigma``
        values describe the unstandardized values that survive the filter.
    """
    nr_of_fails = 0

    x1_list = []
    x2_list = []
    y_list = []

    # Iterate over a snapshot: ``countries`` may be the very list
    # ``good_countries`` that failed entries are removed from below.
    for country_name in list(countries):

        try:
            df = dick[country_name]
            # .iloc[-1] is the last row regardless of the index type.
            y_latest = df[y_param].iloc[-1]
            x1_latest = df[x1_param].iloc[-1]
            x2_latest = df[x2_param].iloc[-1]
        except (KeyError, IndexError):
            # Unknown country, missing column, or no rows left after filtering.
            print ('cant get data for {}'.format(country_name) )
            nr_of_fails += 1
            if country_name in good_countries:
                good_countries.remove(country_name)
            continue

        x1_list.append(x1_latest)
        x2_list.append(x2_latest)
        y_list.append(y_latest)

    x1 = np.array(x1_list)
    x2 = np.array(x2_list)
    y = np.array(y_list)

    x1_std = standardize(x1)
    x2_std = standardize(x2)
    y_std = standardize(y)

    def _beyond_cutoff(values_std):
        # Each variable is tested against its OWN standardized values.
        return (values_std > outliers) | (values_std < -1 * outliers)

    outlier_idx = _beyond_cutoff(x1_std) | _beyond_cutoff(x2_std) | _beyond_cutoff(y_std)
    keep = ~outlier_idx

    x1 = x1[keep]
    x1_std = x1_std[keep]
    x2 = x2[keep]
    x2_std = x2_std[keep]
    y = y[keep]
    y_std = y_std[keep]

    x1_mu = x1.mean()
    x1_sigma = x1.std()
    x2_mu = x2.mean()
    x2_sigma = x2.std()
    y_mu = y.mean()
    y_sigma = y.std()

    return x1_std,x2_std,y_std,x1_mu,x1_sigma,x2_mu,x2_sigma,y_mu,y_sigma,nr_of_fails
    
    
# Alternative, hand-picked selection:
#countries = ['Sweden','Finland','Norway','Denmark','Iceland']

countries = good_countries

x1,x2,y,x1_mu,x1_sigma,x2_mu,x2_sigma,y_mu,y_sigma,nr_of_fails = process_countries(countries)
# Count the data points that actually enter the regression: countries that
# failed or were dropped as outliers are not part of `y`.
nr_of_successful_countries = len(y)

for c in countries:
    print (c)

In [None]:

def regression(x1,x2,y,iterations=100000,burn=50000,thin=2):
    """Fit a Bayesian linear regression of ``y`` on ``x1`` (and ``x2``) and return the traces.

    The model is ``y ~ Normal(alpha + beta1 * x1 [+ beta2 * x2], obs_sigma)``.
    Uniform priors are placed on ``alpha``, ``obs_sigma`` and on the mean and
    sigma of each slope. Whether ``x2`` takes part is decided by the
    module-level ``multiple_regression`` flag. The PyMC 2 style API is used,
    so each Normal receives ``1 / sigma ** 2`` as its third argument.

    Parameters
    ----------
    x1, x2, y : array-likes of equal length (the callers pass standardized data).
    iterations, burn, thin : forwarded positionally to ``MCMC.sample``; the
        defaults reproduce the previously hard-coded settings.

    Returns
    -------
    pandas.DataFrame
        One column per traced variable, named ``<variable>_post``. The
        ``beta2*`` columns are present only when ``multiple_regression`` is true.
    """
    beta1_mu = pm.Uniform('beta1_mu',-1,1)
    beta1_sigma = pm.Uniform('beta1_sigma',0,1)

    beta2_mu = pm.Uniform('beta2_mu',-1,1)
    beta2_sigma = pm.Uniform('beta2_sigma',0,1)

    obs_sigma = pm.Uniform('obs_sigma',0,5)

    alpha = pm.Uniform('alpha',-1,1)

    beta1 = pm.Normal('beta1',beta1_mu, 1 / beta1_sigma ** 2)
    beta2 = pm.Normal('beta2',beta2_mu, 1 / beta2_sigma ** 2)

    if multiple_regression:
        @pm.deterministic()
        def linreq(x1=x1,x2=x2,beta1=beta1,beta2=beta2,alpha=alpha):
            return x1 * beta1 + x2*beta2 + alpha

    else:
        @pm.deterministic()
        def linreq(x1=x1,beta1=beta1,alpha=alpha):
            return x1 * beta1 + alpha

    obs = pm.Normal('obs',linreq,1 / obs_sigma ** 2,observed = True, value=y)

    if multiple_regression:
        model = pm.Model([beta1_mu,beta1_sigma,beta2_mu,beta2_sigma,obs_sigma,alpha,beta1,beta2,obs])
    else:
        model = pm.Model([beta1_mu,beta1_sigma,obs_sigma,alpha,beta1,obs])

    # The MAP fit is best-effort: sampling proceeds even if it fails, but the
    # reason is reported and interrupts are no longer swallowed.
    try:
        map_ = pm.MAP(model)
        map_.fit()
    except Exception as exc:
        print ('cant fit', exc)

    mcmc = pm.MCMC(model)
    mcmc.sample(iterations,burn,thin)

    # Variables to extract, in the column order of the returned frame.
    traced = ['beta1_mu','beta1_sigma']
    if multiple_regression:
        traced += ['beta2_mu','beta2_sigma']
    traced += ['obs_sigma','alpha','beta1']
    if multiple_regression:
        traced.append('beta2')

    columns = [name + '_post' for name in traced]
    posterior = {column: mcmc.trace(name)[:] for name,column in zip(traced,columns)}

    return pd.DataFrame(posterior,columns=columns)


# Fit the model on the standardized country data and summarize the posterior samples.
result = regression(x1,x2,y)
print ()
print (result.describe())

    

In [None]:
def plot(result,x1,x2,y,title,n):
    """Scatter the standardized data and overlay posterior regression lines.

    Draws ``y`` against ``x1`` (and against ``x2`` when the module-level
    ``multiple_regression`` flag is set), 1000 regression lines built from
    posterior samples, and an 89% band of simulated observations. The figure
    is saved as ``multi_regression_<title>_<y_param>.jpg``.

    Besides its arguments the function reads these module-level names:
    ``multiple_regression``, ``outliers``, ``limit``, ``x1_param``,
    ``x2_param``, ``y_param`` and the ``*_mu`` / ``*_sigma`` values used in the
    axis labels.

    Parameters
    ----------
    result : DataFrame of posterior traces as returned by ``regression``.
    x1, x2, y : standardized predictor / response values to scatter.
    title : text used in the figure title and in the file name.
    n : number of data points, shown in the figure title.
    """
    x1_param_beta = result.beta1_post.mean()
    x1_param_sigma = result.beta1_post.std()
    x1_param_89 = np.percentile(result.beta1_post,[5.5,94.5])

    if multiple_regression:
        x2_param_beta = result.beta2_post.mean()
        x2_param_sigma = result.beta2_post.std()
        # 89% interval: 5.5th to 94.5th percentile.
        x2_param_89 = np.percentile(result.beta2_post,[5.5,94.5])

    plt.figure(figsize=(18,12))
    plt.ylim([-1 * outliers,outliers])

    plt.title ('Bayesian Multi-Linear Regression for {} {} > {} CONFIRMED'.format(
        n, title,limit))

    plt.scatter(x1,y, color='red',label=x1_param + \
                r' $\beta$: {:.2f} $\sigma$: {:.2f} 89%: [{:.2f} {:.2f}]'.format(
        x1_param_beta,x1_param_sigma,x1_param_89[0],x1_param_89[1]))

    if multiple_regression:
        plt.scatter(x2,y,color = 'g',label=x2_param + \
                    r' $\beta$: {:.2f} $\sigma$: {:.2f} 89%: [{:.2f} {:.2f}]'.format(
            x2_param_beta,x2_param_sigma,x2_param_89[0],x2_param_89[1]))

    X = np.linspace(-1*outliers,outliers,1000)

    # Draw whole posterior rows so that the slope(s), intercept and noise level
    # of each simulated line come from the same MCMC sample.
    rows = np.random.choice(result.index,replace=True,size=len(X))
    alpha_samples = result.alpha_post.loc[rows].values
    slope_samples = result.beta1_post.loc[rows].values
    if multiple_regression:
        # Both predictors are evaluated at the same X value, so their slopes add up.
        slope_samples = slope_samples + result.beta2_post.loc[rows].values
    tau_samples = 1 / result.obs_sigma_post.loc[rows].values ** 2

    # lines[i, j] is the prediction of posterior sample j at X[i].
    lines = np.outer(X,slope_samples) + alpha_samples

    # Simulated observations around every line, then their 89% band per X value.
    samples2 = np.array([pm.rnormal(line_at_x,tau_samples,size=len(X)) for line_at_x in lines])
    low2,high2 = np.percentile(samples2,[5.5,94.5],axis=1)

    plt.fill_between(X,low2,high2,color='orange',alpha=0.05)

    _ = plt.plot(X,lines,alpha=0.01,color='r')

    if multiple_regression:
        plt.xlabel(x1_param + r' $\mu$: {:.2f} $\sigma$: {:.2f} '.format(x1_mu,x1_sigma) + \
                   x2_param + r' $\mu$: {:.2f} $\sigma$: {:.2f} '.format(x2_mu,x2_sigma))

    else:
        plt.xlabel(x1_param + r' $\mu$: {:.2f} $\sigma$: {:.2f} '.format(x1_mu,x1_sigma) )

    plt.ylabel(y_param + r' $\mu$: {:.2f} $\sigma$: {:.2f}'.format(y_mu,y_sigma))
    plt.legend(loc='upper right')
    plt.savefig('multi_regression_{}_{}.jpg'.format(title,y_param),format='jpg')
    
    


In [None]:
def plot_betas(result,title):
    """Plot the posterior histograms of the regression slopes.

    Prints ``result.describe()``, draws a density histogram of ``beta1_post``
    (and of ``beta2_post`` when the module-level ``multiple_regression`` flag
    is set) and saves the figure as
    ``multi_regression_posteriors_<title>_<y_param>.jpg``. Each legend entry
    shows the posterior mean, standard deviation and 89% interval.

    Parameters
    ----------
    result : DataFrame of posterior traces as returned by ``regression``.
    title : text appended to the figure title and used in the file name.
    """
    print (result.describe())

    x1_param_beta = result.beta1_post.mean()
    x1_param_sigma = result.beta1_post.std()
    x1_param_89 = np.percentile(result.beta1_post,[5.5,94.5])

    if multiple_regression:
        x2_param_beta = result.beta2_post.mean()
        x2_param_sigma = result.beta2_post.std()
        # 89% interval: 5.5th to 94.5th percentile.
        x2_param_89 = np.percentile(result.beta2_post,[5.5,94.5])

    plt.figure(figsize=(18,12))
    plt.title('Regression Betas ' + title)
    plt.hist(result.beta1_post,density=True,label=x1_param + \
                r' $\beta$: {:.2f} $\sigma$: {:.2f} 89%: [{:.2f} {:.2f}]'.format(
        x1_param_beta,x1_param_sigma,x1_param_89[0],x1_param_89[1]),
             color='r',alpha=0.7,histtype='stepfilled')

    if multiple_regression:
        plt.hist(result.beta2_post,density=True,label=x2_param + \
                    r' $\beta$: {:.2f} $\sigma$: {:.2f} 89%: [{:.2f} {:.2f}]'.format(
            x2_param_beta,x2_param_sigma,x2_param_89[0],x2_param_89[1]),
                 color='g',alpha=0.7,histtype='stepfilled')

    plt.legend(loc='upper left')
    plt.savefig('multi_regression_posteriors_{}_{}.jpg'.format(title,y_param),format='jpg')

In [None]:
def plot_traces(result,country_name):
    """Plot the sampled trace of every model variable on a 4x2 grid.

    Panels whose ``<name>_post`` column is missing from ``result`` (the beta2
    family when only one predictor was fitted) are hidden instead of raising.
    The figure is saved as
    ``Corona_linear_reg_traces_<country_name>_<y_param>.jpg``.

    Parameters
    ----------
    result : DataFrame of posterior traces as returned by ``regression``.
    country_name : label used in the file name.
    """
    # disable the burn params in mcmc.sample to see the convergence
    fig,axes = plt.subplots(4,2,figsize=(18,12))

    # Row-major panel order of the grid.
    panels = ['beta1_mu','beta1_sigma',
              'beta2_mu','beta2_sigma',
              'alpha','obs_sigma',
              'beta1','beta2']

    for ax,name in zip(axes.flat,panels):
        column = name + '_post'
        if column not in result.columns:
            ax.set_visible(False)
            continue
        ax.plot(result[column])
        ax.set_title(name)

    plt.tight_layout()
    plt.savefig('Corona_linear_reg_traces_{}_{}.jpg'.format(country_name,y_param),format='jpg')
    
def plot_posteriors(result,country_name):
    """Plot a density histogram of every model variable on a 4x2 grid.

    Panels whose ``<name>_post`` column is missing from ``result`` (the beta2
    family when only one predictor was fitted) are hidden instead of raising.
    The figure is saved as
    ``corona_linear_reg_post_hist_<country_name>_<y_param>.jpg``.

    Parameters
    ----------
    result : DataFrame of posterior traces as returned by ``regression``.
    country_name : label used in the file name.
    """
    fig,axes = plt.subplots(4,2,figsize=(18,12))

    # Row-major panel order of the grid.
    panels = ['beta1_mu','beta1_sigma',
              'beta2_mu','beta2_sigma',
              'alpha','obs_sigma',
              'beta1','beta2']

    for ax,name in zip(axes.flat,panels):
        column = name + '_post'
        if column not in result.columns:
            ax.set_visible(False)
            continue
        ax.hist(result[column],density=True)
        ax.set_title(name)

    plt.tight_layout()

    plt.savefig('corona_linear_reg_post_hist_{}_{}.jpg'.format(country_name,y_param),format='jpg')

In [None]:
# Render and save all figures for the country-level fit.
plot(result,x1,x2,y,'countries',nr_of_successful_countries)
plot_betas(result, 'Countries')
plot_traces(result,'World')
plot_posteriors(result,'World')

In [None]:
# Repeat the analysis on US state data.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
us_df = pd.read_pickle('US_states_latest_day.pkl')
# Expose the 'gdp' column under the name 'gdp_per_capita' (the default x1_param).
# NOTE(review): presumably 'gdp' is already a per-capita figure here — confirm.
us_df['gdp_per_capita'] = us_df['gdp'] 


# These assignments rebind the module-level names used by the country-level cells.
x1 = us_df[x1_param]
x2 = us_df[x2_param]
y = us_df[y_param]

# Summary statistics of the raw values; plot() reads them for its axis labels.
x1_mu = x1.mean()
print (x1_mu)
x1_sigma = x1.std()
x2_mu = x2.mean()
x2_sigma = x2.std()
y_mu = y.mean()
y_sigma = y.std()



In [None]:

# Standardize straight from us_df (not from the x1/x2/y names rebound below)
# so that re-running this cell gives the same result.
x1_std = standardize(us_df[x1_param])
x2_std = standardize(us_df[x2_param])
y_std = standardize(us_df[y_param])

# A state is an outlier when any of its three standardized values lies more
# than `outliers` standard deviations from zero, in either direction. Each
# variable is tested against its own standardized values.
x1_outliers = (x1_std > outliers) | (x1_std < -1 * outliers) #std
x2_outliers = (x2_std > outliers) | (x2_std < -1 * outliers) #std
y_outliers = (y_std > outliers) | (y_std < -1 * outliers) #std

outlier_idx = (x1_outliers | x2_outliers | y_outliers)

x1 = us_df[x1_param][~outlier_idx]
x1_std = x1_std[~outlier_idx]
x2 = us_df[x2_param][~outlier_idx]
x2_std = x2_std[~outlier_idx]
y = us_df[y_param][~outlier_idx]
y_std = y_std[~outlier_idx]



result = regression(x1_std,x2_std,y_std)
print (result.describe())
# The title reports the number of states that remain after outlier removal.
plot(result,x1_std,x2_std,y_std,'US States',len(y_std))
plot_betas(result,'US States')
plot_traces(result,'US_States')
plot_posteriors(result,'US_States')
