In [644]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pymc as pm

# Apply seaborn's default styling to the matplotlib figures drawn in this notebook.
sns.set()

# Load the per-country data. The cells below index this object by country name
# and read 'confirmed' and 'deceased' columns from each value.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
dick = pd.read_pickle('country_data.pkl')
  

In [645]:
# Build the density table in a single chain: read the three needed columns,
# index by country name, then derive the density and rescale the population.
# Both derived values are multiplied by 1000 (presumably the file stores the
# population in thousands -- confirm against the data source).
country_density = (
    pd.read_csv('world_density.csv', sep=';', usecols=['name', 'area', 'pop2020'])
    .set_index('name')
    .assign(
        # Evaluated first, so it still sees the unscaled population.
        density=lambda frame: frame['pop2020'] / frame['area'] * 1000,
        pop2020=lambda frame: frame['pop2020'] * 1000,
    )
)
country_density

Unnamed: 0_level_0,pop2020,area,density
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Macau,649335.0,30.00,21644.500000
Monaco,39242.0,2.02,19426.732673
Singapore,5850342.0,710.00,8239.918310
Hong Kong,7496981.0,1104.00,6790.743659
Gibraltar,33691.0,6.00,5615.166667
...,...,...,...
Namibia,2540905.0,825615.00,3.077591
Western Sahara,597339.0,266000.00,2.245635
Mongolia,3278290.0,1564110.00,2.095946
Falkland Islands,3480.0,12173.00,0.285879


In [646]:
def remove_dollar(s):
    """Turn a currency string such as '$1,234' into the digit string '1234'.

    Used as a `converters` callback for `pd.read_csv`, so it receives the raw
    cell text and returns text that can later be cast to an integer.

    Surrounding whitespace, a leading '$' (if present) and all thousands
    separators are removed. A value that has no '$' is left intact apart
    from the separators, instead of losing its first character.
    """
    return s.strip().lstrip('$').replace(',', '')

# Load country name, total GDP and GDP per capita (columns 1, 2 and 6 of a
# header-less, ';'-separated file). The two money columns are cleaned by
# remove_dollar while parsing and converted to integers afterwards.
country_gdp = pd.read_csv('world_gdp.csv',sep=';',
                          usecols=[1,2,6],thousands=',',header=None,
                          converters={2:remove_dollar,6:remove_dollar})

country_gdp.columns = ['name','gdp','gdp_per_capita']
country_gdp = country_gdp.set_index('name')
# Ask for 64-bit integers explicitly: plain `int` is a 32-bit type on some
# platforms (e.g. Windows with NumPy < 2), which cannot hold GDP totals in
# the trillions.
country_gdp = country_gdp.astype({'gdp': 'int64', 'gdp_per_capita': 'int64'})

country_gdp.loc['US']

gdp               19485394000000
gdp_per_capita             59939
Name: US, dtype: int64

In [647]:
def strip(x):
    """Return `x` with every tab character deleted; all other text is kept."""
    # Splitting on tabs and gluing the pieces back together drops each tab.
    return ''.join(x.split('\t'))

# Population per country, indexed by the first column of a header-less,
# ';'-separated file. Tabs are removed from the first column while parsing,
# and ',' is treated as a thousands separator.
country_populations = pd.read_csv(
    'world_pop.csv',
    sep=';',
    header=None,
    index_col=0,
    names=['population'],
    thousands=',',
    converters={0: strip},
)

# Names of the countries for which every lookup in add_population_data succeeded.
good_countries = list()

def add_population_data(country_name, limit=1):
    """Enrich one country's frame with population, density and GDP columns.

    Reads the globals `dick`, `country_populations`, `country_density` and
    `country_gdp`. Each lookup is best effort: when a country is missing from
    one of the tables a message is printed and the corresponding columns are
    simply not added.

    Parameters
    ----------
    country_name : key into `dick` (and, ideally, into the three lookup tables).
    limit : rows are kept only where `confirmed` is strictly greater than this.

    Returns
    -------
    A copy of the country's frame restricted to rows with `confirmed > limit`.

    Side effects
    ------------
    * `dick[country_name]` itself is modified in place (columns added/dropped,
      infinities replaced by NaN).
    * `country_name` is appended to `good_countries` when all three lookups
      succeeded; repeated calls do not add it twice.
    """
    df = dick[country_name]
    all_good = True

    # Only the table lookups are guarded, so unrelated problems (for example
    # a frame without a 'deceased' column) surface instead of being reported
    # as missing population data.
    try:
        population = country_populations.loc[country_name,'population']
    except KeyError:
        all_good = False
        print ('\tcant find population data for {}'.format(country_name))
    else:
        millions = population / 1e6
        df['conf_per_M'] = df['confirmed'] / millions
        df['dead_per_M'] = df['deceased'] / millions
        df['population'] = population

    try:
        df['density'] = country_density.loc[country_name,'density']
    except KeyError:
        all_good = False
        print ('\tcant find density for {}'.format(country_name))

    try:
        df['gdp'] = country_gdp.loc[country_name,'gdp']
        df['gdp_per_capita'] = country_gdp.loc[country_name,'gdp_per_capita']
    except KeyError:
        all_good = False
        print ('cant find gdp for {}'.format(country_name))

    if all_good and country_name not in good_countries:
        good_countries.append(country_name)

    # errors='ignore' keeps the call usable when the helper columns are
    # already gone, e.g. when the cell is run a second time.
    df.drop(['conf_over_dead','dead_conf_ratio'],axis=1,inplace=True,errors='ignore')

    # Ratios with a zero denominator show up as +/-inf; treat both as missing.
    df.replace([np.inf,-np.inf],np.nan,inplace=True)

    # Return an independent copy so later column assignments on the result do
    # not operate on a slice of the original frame.
    return df[df['confirmed'] > limit].copy()

# Enrich every country's frame and store the filtered result back under the
# same key. The keys are snapshotted first because the dict is written to
# while looping.
for name in tuple(dick):
    dick[name] = add_population_data(name)
    

	cant find population data for Bahamas
	cant find density for Cabo Verde
	cant find population data for Congo (Brazzaville)
	cant find density for Congo (Brazzaville)
cant find gdp for Congo (Brazzaville)
	cant find population data for Congo (Kinshasa)
	cant find density for Congo (Kinshasa)
cant find gdp for Congo (Kinshasa)
	cant find population data for Cote d'Ivoire
	cant find density for Cote d'Ivoire
cant find gdp for Cote d'Ivoire
	cant find population data for Diamond Princess
	cant find density for Diamond Princess
cant find gdp for Diamond Princess
cant find gdp for Djibouti
cant find gdp for Eritrea
	cant find density for Eswatini
	cant find density for Holy See
cant find gdp for Holy See
cant find gdp for Liechtenstein
cant find gdp for Monaco
	cant find density for North Macedonia
	cant find population data for Saint Vincent and the Grenadines
cant find gdp for Saint Vincent and the Grenadines
cant find gdp for Somalia
cant find gdp for Taiwan*
cant find gdp for Venezuela


In [648]:
# Spot-check the enriched data for one country by printing its last five rows.
swe = dick['Sweden']
print(swe.tail(5))


            confirmed    factor    inc  deceased  dead_factor  dead_inc  \
2020-03-28       3447  1.123167  378.0       105     1.000000       0.0   
2020-03-29       3700  1.073397  253.0       110     1.047619       5.0   
2020-03-30       4028  1.088649  328.0       146     1.327273      36.0   
2020-03-31       4435  1.101043  407.0       180     1.232877      34.0   
2020-04-01       4947  1.115445  512.0       239     1.327778      59.0   

            pct_dead  conf_per_M  dead_per_M  population    density  \
2020-03-28  3.046127  341.311967   10.396796    10099265  22.428108   
2020-03-29  2.972973  366.363295   10.891882    10099265  22.428108   
2020-03-30  3.624628  398.840906   14.456498    10099265  22.428108   
2020-03-31  4.058625  439.140868   17.823079    10099265  22.428108   
2020-04-01  4.831211  489.837627   23.665088    10099265  22.428108   

                     gdp  gdp_per_capita  
2020-03-28  535607385506           54075  
2020-03-29  535607385506           5

In [649]:
#### Regression parameters (read as globals by the cells below)
####
# Selects which model branch `regression` builds and whether the beta2
# results are extracted and plotted.
multiple_regression = True
# Column of each country's frame used as the response.
y_param = 'dead_per_M'
# Column used as the first predictor.
x1_param = 'gdp_per_capita'
# Column used as the second predictor.
x2_param = 'conf_per_M'
####
####
####

def process_countries(countries, outlier_z=3):
    """Collect the latest x1/x2/y values per country, standardise, drop outliers.

    For every name in `countries` the last row of `dick[name]` is read for the
    columns named by the globals `x1_param`, `x2_param` and `y_param`.
    Countries that are missing from `dick`, lack one of the columns, or have
    no rows are reported with a printed message and skipped.

    Each variable is standardised over all collected countries. A country is
    then removed when any of its three standardised values exceeds
    `outlier_z` (only the high side is checked).

    Parameters
    ----------
    countries : iterable of country names.
    outlier_z : standardised value above which a country counts as an outlier.

    Returns
    -------
    (x1_std, x2_std, y_std, x1_mu, x1_sigma, x2_mu, x2_sigma, y_mu, y_sigma)

    The `*_std` arrays are the standardised values of the retained countries.
    Every `*_mu` / `*_sigma` is the mean / standard deviation of the raw
    values of the retained countries, so all three variables are summarised
    over the same set of countries.
    """

    def standardize(n):
        return (n - n.mean()) / n.std()

    x1_list = []
    x2_list = []
    y_list = []

    for country_name in countries:
        try:
            df = dick[country_name]
            # .iloc[-1] selects the last row by position whatever the index
            # type is, and raises IndexError for an empty frame.
            y = df[y_param].iloc[-1]
            x1 = df[x1_param].iloc[-1]
            x2 = df[x2_param].iloc[-1]
        except (KeyError, IndexError):
            print ('cant get data for {}'.format(country_name) )
            continue

        # Append only after all three reads succeeded so the lists stay aligned.
        x1_list.append(x1)
        x2_list.append(x2)
        y_list.append(y)

    x1 = np.array(x1_list)
    x2 = np.array(x2_list)
    y = np.array(y_list)

    x1_std = standardize(x1)
    x2_std = standardize(x2)
    y_std = standardize(y)

    outlier_idx = (x1_std > outlier_z) | (x2_std > outlier_z) | (y_std > outlier_z)
    keep = ~outlier_idx

    # Filter all three raw arrays BEFORE taking the summary statistics, so
    # y is treated exactly like x1 and x2.
    x1 = x1[keep]
    x2 = x2[keep]
    y = y[keep]

    # NOTE: the standardised arrays below were scaled with the statistics of
    # the unfiltered data; they are only filtered here, not re-standardised.
    return (x1_std[keep], x2_std[keep], y_std[keep],
            x1.mean(), x1.std(),
            x2.mean(), x2.std(),
            y.mean(), y.std())
    
    
# Hand-picked list of countries. It is overridden by the assignment to
# good_countries right below, so it currently has no effect.
# NOTE(review): 'Maroco' looks like a misspelling of 'Morocco' -- confirm
# against the keys of `dick` before re-enabling this list.
countries = ['Sweden','US','Spain','Italy','Germany','Netherlands','United Kingdom','France','Korea, South',
            'Japan','India','Finland','Denmark','Norway','Iceland','Belgium','Poland','Estonia','Russia',
            'Hungary','Luxembourg','Lithuania','Czechia','Argentina','Brazil','Israel','Greece','Turkey',
            'Portugal','China','Austria','Switzerland','Australia','Canada','Maroco','South Africa','Gambia',
            'Nigeria','Tunisia']

# Use every country for which the population, density and GDP lookups all succeeded.
countries = good_countries


# From here on x1, x2 and y hold the standardised values returned by
# process_countries; the *_mu / *_sigma names hold the returned means and
# standard deviations.
x1,x2,y,x1_mu,x1_sigma,x2_mu,x2_sigma,y_mu,y_sigma = process_countries(countries)


cant get data for Papua New Guinea
cant get data for Timor-Leste


In [650]:

def regression(x1,x2,y,iterations=100000,burn=50000,thin=2):
    """Fit a Bayesian linear regression of `y` on `x1` (and `x2`) by MCMC.

    The global `multiple_regression` flag selects the model:

    * True:  mean = alpha + beta1 * x1 + beta2 * x2
    * False: mean = alpha + beta1 * x1   (`x2` is ignored)

    The slopes get Normal priors whose location and scale have Uniform
    hyper-priors; `alpha` and the observation scale have Uniform priors.
    The Normal nodes are given `1 / sigma ** 2` as their second argument,
    i.e. a precision, as the PyMC 2 style API used here (`pm.MCMC`,
    `pm.deterministic`) expects.

    Parameters
    ----------
    x1, x2 : arrays of predictor values (same length as `y`).
    y : array of observed responses.
    iterations, burn, thin : forwarded to `MCMC.sample`.

    Returns
    -------
    pandas.DataFrame with one column per sampled parameter, named
    '<parameter>_post' (the beta2 columns exist only in multiple mode).
    """
    beta1_mu = pm.Uniform('beta1_mu',-1,1)
    beta1_sigma = pm.Uniform('beta1_sigma',0,1)
    beta1 = pm.Normal('beta1',beta1_mu, 1 / beta1_sigma ** 2)

    obs_sigma = pm.Uniform('obs_sigma',0,5)
    alpha = pm.Uniform('alpha',-1,1)

    if multiple_regression:
        beta2_mu = pm.Uniform('beta2_mu',-1,1)
        beta2_sigma = pm.Uniform('beta2_sigma',0,1)
        beta2 = pm.Normal('beta2',beta2_mu, 1 / beta2_sigma ** 2)

        @pm.deterministic()
        def linreq(x1=x1,x2=x2,beta1=beta1,beta2=beta2,alpha=alpha):
            return x1 * beta1 + x2*beta2 + alpha

        # Same order as the columns of the returned frame.
        trace_names = ['beta1_mu','beta1_sigma','beta2_mu','beta2_sigma',
                       'obs_sigma','alpha','beta1','beta2']
        nodes = [beta1_mu,beta1_sigma,beta2_mu,beta2_sigma,obs_sigma,alpha,beta1,beta2]

    else:
        # Single-predictor model: the mean must not depend on x2 / beta2,
        # which are not part of this model.
        @pm.deterministic()
        def linreq(x1=x1,beta1=beta1,alpha=alpha):
            return x1 * beta1 + alpha

        trace_names = ['beta1_mu','beta1_sigma','obs_sigma','alpha','beta1']
        nodes = [beta1_mu,beta1_sigma,obs_sigma,alpha,beta1]

    obs = pm.Normal('obs',linreq,1 / obs_sigma ** 2,observed = True, value=y)

    model = pm.Model(nodes + [obs])

    # The MAP fit is best effort: if it fails, sampling still goes ahead.
    try:
        map_ = pm.MAP(model)
        map_.fit()
    except Exception:
        print ('cant fit')

    mcmc = pm.MCMC(model)
    mcmc.sample(iterations,burn,thin)

    return pd.DataFrame({name + '_post' : mcmc.trace(name)[:] for name in trace_names})


# Run the sampler on the standardised data and print summary statistics of the
# posterior samples. The leading empty string produces the same blank line
# before the table as a bare print() would.
result = regression(x1, x2, y)
print('', result.describe(), sep='\n')
    
    

 [-----------------100%-----------------] 100000 of 100000 complete in 41.0 sec
       beta1_mu_post  beta1_sigma_post  beta2_mu_post  beta2_sigma_post  \
count   25000.000000      25000.000000   25000.000000      25000.000000   
mean       -0.082036          0.464570       0.587685          0.451350   
std         0.392961          0.281619       0.385476          0.294516   
min        -0.999528          0.002176      -0.999000          0.000672   
25%        -0.312324          0.226684       0.418640          0.185375   
50%        -0.103196          0.449338       0.725187          0.430653   
75%         0.135158          0.693586       0.864652          0.702150   
max         0.999007          0.999935       0.999991          0.999987   

       obs_sigma_post    alpha_post    beta1_post    beta2_post  
count    25000.000000  25000.000000  25000.000000  25000.000000  
mean         0.144866      0.019600     -0.109525      0.864056  
std          0.008503      0.013608      0.018

In [None]:
# Posterior summaries shown in the legend: mean, standard deviation and the
# central 89% interval (5.5th to 94.5th percentile) of each slope.
x1_param_beta = result.beta1_post.mean()
alpha_param = result.alpha_post.mean()
x1_param_sigma = result.beta1_post.std()
x1_param_89 = np.percentile(result.beta1_post,[5.5,94.5])

if multiple_regression:

    x2_param_beta = result.beta2_post.mean()
    x2_param_sigma = result.beta2_post.std()
    # 94.5, not "94,5": the comma asked for the 94th and 5th percentiles.
    x2_param_89 = np.percentile(result.beta2_post,[5.5,94.5])

legend_fmt = r' $\beta$: {:.2f} $\sigma$: {:.2f} 89%: [{:.2f} {:.2f}]'

plt.figure(figsize=(18,12))
# len(y) is the number of points actually plotted; `countries` also contains
# the countries that were skipped or removed as outliers.
plt.title ('Bayesian Multi-Linear Regression over {} countries'.format(len(y)))
plt.scatter(x1,y, color='red',label=x1_param + legend_fmt.format(
    x1_param_beta,x1_param_sigma,x1_param_89[0],x1_param_89[1]))

if multiple_regression:
    plt.scatter(x2,y,color = 'g',label=x2_param + legend_fmt.format(
        x2_param_beta,x2_param_sigma,x2_param_89[0],x2_param_89[1]))

X = np.linspace(-3,3,1000)

# Draw one posterior sample per plotted line (with replacement).
beta1_samples = np.random.choice(result.beta1_post,replace=True,size=len(X))

if multiple_regression:
    beta2_samples = np.random.choice(result.beta2_post,replace=True,size=len(X))

alpha_samples = np.random.choice(result.alpha_post,replace=True,size=len(X))

# Row i holds the value of every sampled line at X[i]; plt.plot draws one
# line per column.
# NOTE(review): in multiple mode each line uses the slope beta1 + beta2, i.e.
# it shows both standardised predictors changing together -- confirm this is
# the intended picture.
if multiple_regression:
    lines = X[:, None] * beta1_samples + X[:, None] * beta2_samples + alpha_samples

else:
    lines = X[:, None] * beta1_samples + alpha_samples


_ = plt.plot(X,lines,alpha=0.01,color='r')

if multiple_regression:
    plt.xlabel(x1_param + r' $\mu$: {:.2f} $\sigma$: {:.2f} '.format(x1_mu,x1_sigma) + \
               x2_param + r' $\mu$: {:.2f} $\sigma$: {:.2f} '.format(x2_mu,x2_sigma))

else:
    plt.xlabel(x1_param + r' $\mu$: {:.2f} $\sigma$: {:.2f} '.format(x1_mu,x1_sigma) )

plt.ylabel(y_param + r' $\mu$: {:.2f} $\sigma$: {:.2f}'.format(y_mu,y_sigma))
plt.legend(loc='upper right')
