In [430]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pymc as pm

sns.set()

dick = pd.read_pickle('country_data.pkl')
  

In [431]:
country_density = pd.read_csv('world_density.csv',sep=';',usecols=['name','area','pop2020'])
country_density['density'] = country_density['pop2020'] / country_density['area']
country_density.set_index('name',inplace=True)
country_density['pop2020'] *= 1000
country_density['density'] *= 1000
country_density

Unnamed: 0_level_0,pop2020,area,density
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Macau,649335.0,30.00,21644.500000
Monaco,39242.0,2.02,19426.732673
Singapore,5850342.0,710.00,8239.918310
Hong Kong,7496981.0,1104.00,6790.743659
Gibraltar,33691.0,6.00,5615.166667
...,...,...,...
Namibia,2540905.0,825615.00,3.077591
Western Sahara,597339.0,266000.00,2.245635
Mongolia,3278290.0,1564110.00,2.095946
Falkland Islands,3480.0,12173.00,0.285879


In [432]:
def remove_dollar(s):
    return s[1:].replace(',','')

country_gdp = pd.read_csv('world_gdp.csv',sep=';',
                          usecols=[1,2,6],thousands=',',header=None,
                         converters={2:remove_dollar,6:remove_dollar})

country_gdp.columns = ['name','gdp','gdp_per_capita']
country_gdp.set_index('name',inplace=True)
country_gdp['gdp'] = country_gdp['gdp'].astype(int)
country_gdp['gdp_per_capita'] = country_gdp['gdp_per_capita'].astype(int)

country_gdp.loc['US']

gdp               19485394000000
gdp_per_capita             59939
Name: US, dtype: int64

In [433]:
def strip(x):
    return x.replace('\t','')

country_populations = pd.read_csv('world_pop.csv',
                                  sep=';',header=None,index_col=0,names=['population'],
                                 thousands=',',converters={0 : strip})


def add_population_data(country_name):
    limit = 500
    
    df = dick[country_name]
    try:
        df['conf_per_M'] = df['confirmed'] / (country_populations.loc[country_name,'population'] / 1e6)
        df['dead_per_M'] = df['deceased'] / (country_populations.loc[country_name,'population'] / 1e6)
    except:
        print ('\tcant find population data for {}'.format(country_name))
    try:
        df['density'] = country_density.loc[country_name,'density']
    except:
        print ('\tcant find density for {}'.format(country_name))
        
    try:
        df['gdp'] = country_gdp.loc[country_name,'gdp']
        df['gdp_per_capita'] = country_gdp.loc[country_name,'gdp_per_capita']
    except:
        print ('cant find gdp for {}'.format(country_name))
        
    df.drop(['conf_over_dead','dead_conf_ratio'],axis=1,inplace=True)
    
    df.replace(np.inf,np.nan,inplace=True)

    df = df[df['confirmed'] > limit]
    
    return df

for c in list(dick.keys()):
    dick[c] = add_population_data(c)
    

	cant find population data for Bahamas
	cant find density for Cabo Verde
	cant find population data for Congo (Brazzaville)
	cant find density for Congo (Brazzaville)
cant find gdp for Congo (Brazzaville)
	cant find population data for Congo (Kinshasa)
	cant find density for Congo (Kinshasa)
cant find gdp for Congo (Kinshasa)
	cant find population data for Cote d'Ivoire
	cant find density for Cote d'Ivoire
cant find gdp for Cote d'Ivoire
	cant find population data for Diamond Princess
	cant find density for Diamond Princess
cant find gdp for Diamond Princess
cant find gdp for Czechia
cant find gdp for Djibouti
cant find gdp for Eritrea
	cant find density for Eswatini
	cant find density for Holy See
cant find gdp for Holy See
cant find gdp for Liechtenstein
cant find gdp for Monaco
	cant find density for North Macedonia
	cant find population data for Saint Vincent and the Grenadines
cant find gdp for Saint Vincent and the Grenadines
cant find gdp for Somalia
cant find gdp for Taiwan*
ca

In [434]:
swe = dick['Sweden']
print (swe.tail())


            confirmed    factor    inc  deceased  dead_factor  dead_inc  \
2020-03-28       3447  1.123167  378.0       105     1.000000       0.0   
2020-03-29       3700  1.073397  253.0       110     1.047619       5.0   
2020-03-30       4028  1.088649  328.0       146     1.327273      36.0   
2020-03-31       4435  1.101043  407.0       180     1.232877      34.0   
2020-04-01       4947  1.115445  512.0       239     1.327778      59.0   

            pct_dead  conf_per_M  dead_per_M    density           gdp  \
2020-03-28  3.046127  341.311967   10.396796  22.428108  535607385506   
2020-03-29  2.972973  366.363295   10.891882  22.428108  535607385506   
2020-03-30  3.624628  398.840906   14.456498  22.428108  535607385506   
2020-03-31  4.058625  439.140868   17.823079  22.428108  535607385506   
2020-04-01  4.831211  489.837627   23.665088  22.428108  535607385506   

            gdp_per_capita  
2020-03-28           54075  
2020-03-29           54075  
2020-03-30           54

In [435]:
y_param = 'conf_per_M'
x2_param = 'gdp_per_capita'
x1_param = 'density'

def process_countries(countries):
    
    def standardize(n):
        return (n - n.mean()) / n.std()
    
    x1_list = []
    x2_list = []
    y_list = []
    
    for country_name in countries:
        
        try:
        
            df = dick[country_name]
            y = df[y_param][-1]
            x1 = df[x1_param][-1]
            x2 = df[x2_param][-1] 
        
            x1_list.append(x1)
            x2_list.append(x2)
            y_list.append(y)
        
            x1 = standardize(np.array(x1_list))
            x2 = standardize(np.array(x2_list))
            y = standardize(np.array(y_list))
    
        except:
            print ('cant get data for {}'.format(country_name) )
       
    return x1,x2,y
    
    
countries = ['Sweden','US','Spain','Italy','Germany','Netherlands','United Kingdom','France','Korea, South',
            'Japan','India','Finland','Denmark','Norway','Iceland','Belgium','Poland','Estonia','Russia',
            'Hungary','Luxembourg','Lithuania','Czechia','Argentina','Brazil','Israel','Greece','Turkey',
            'Portugal','China','Austria','Switzerland','Australia','Canada','Maroco','South Africa','Gambia',
            'Nigeria','Tunisia']


x1,x2,y = process_countries(countries)


cant get data for Czechia
cant get data for Maroco
cant get data for Gambia
cant get data for Nigeria
cant get data for Tunisia


  


In [436]:
multiple_regression = False

def regression(x1,x2,y):
    
    beta1_mu = pm.Uniform('beta1_mu',-1,1)
    beta1_sigma = pm.Uniform('beta1_sigma',0,1)
    
    beta2_mu = pm.Uniform('beta2_mu',-1,1)
    beta2_sigma = pm.Uniform('beta2_sigma',0,1)
    
    obs_sigma = pm.Uniform('obs_sigma',0,0.5)
    
    alpha = pm.Uniform('alpha',-1,1)
    
    beta1 = pm.Normal('beta1',beta1_mu, 1 / beta1_sigma ** 2)
    beta2 = pm.Normal('beta2',beta2_mu, 1 / beta2_sigma ** 2)

    if multiple_regression:
        @pm.deterministic()
        def linreq(x1=x1,x2=x2,beta1=beta1,beta2=beta2,alpha=alpha):
            return x1 * beta1 + x2*beta2 + alpha

    else:
        @pm.deterministic()
        def linreq(x1=x1,x2=x2,beta1=beta1,beta2=beta2,alpha=alpha):
            return x1 * beta1 + x2*beta2 + alpha
        
    obs = pm.Normal('obs',linreq,1 / obs_sigma ** 2,observed = True, value=y)
    
    if multiple_regression:
        model = pm.Model([beta1_mu,beta1_sigma,beta2_mu,beta2_sigma,obs_sigma,alpha,beta1,beta2,obs])
    else:
        model = pm.Model([beta1_mu,beta1_sigma,obs_sigma,alpha,beta1,obs])
    
    try:
        map_ = pm.MAP(model)
        map_.fit()
    except:
        print ('cant fit')
    mcmc = pm.MCMC(model)
    
    sample = mcmc.sample(100000,50000,2)
    
    beta1_mu_post = mcmc.trace('beta1_mu')[:]
    beta1_sigma_post = mcmc.trace('beta1_sigma')[:]
    
    if multiple_regression:
        beta2_mu_post = mcmc.trace('beta2_mu')[:]
        beta2_sigma_post = mcmc.trace('beta2_sigma')[:]
    
    obs_sigma_post = mcmc.trace('obs_sigma')[:]
    
    alpha_post = mcmc.trace('alpha')[:]
    
    beta1_post = mcmc.trace('beta1')[:]
    
    if multiple_regression:
        
        beta2_post = mcmc.trace('beta2')[:]
    
        result = pd.DataFrame({'beta1_mu_post' : beta1_mu_post,
                              'beta1_sigma_post' : beta1_sigma_post,
                              'beta2_mu_post' : beta2_mu_post,
                              'beta2_sigma_post' : beta2_sigma_post,
                              'obs_sigma_post' : obs_sigma_post,
                              'alpha_post' : alpha_post,
                              'beta1_post' : beta1_post,
                              'beta2_post' : beta2_post})
    
    else:
        result = pd.DataFrame({'beta1_mu_post' : beta1_mu_post,
                              'beta1_sigma_post' : beta1_sigma_post,
                              'obs_sigma_post' : obs_sigma_post,
                              'alpha_post' : alpha_post,
                              'beta1_post' : beta1_post})
    return result


result = regression(x1,x2,y)
print ()
print (result.describe())
    
    

 [-----------------100%-----------------] 100000 of 100000 complete in 24.4 sec
       beta1_mu_post  beta1_sigma_post  obs_sigma_post    alpha_post  \
count   25000.000000      25000.000000    25000.000000  25000.000000   
mean        0.013045          0.473137        0.489566     -0.000103   
std         0.398493          0.281249        0.009652      0.083884   
min        -0.999194          0.003605        0.429833     -0.308283   
25%        -0.216509          0.231436        0.485030     -0.057053   
50%         0.010133          0.459814        0.492432     -0.000963   
75%         0.248139          0.708776        0.496953      0.055628   
max         0.999446          0.999957        0.500000      0.314428   

         beta1_post  
count  25000.000000  
mean       0.015730  
std        0.083572  
min       -0.325995  
25%       -0.040798  
50%        0.015680  
75%        0.071802  
max        0.325953  


In [None]:
x1_param_beta = result.beta1_post.mean()
alpha_param = result.alpha_post.mean()
x1_param_sigma = result.beta1_post.std()

if multiple_regression:
    
    x2_param_beta = result.beta2_post.mean()
    x2_param_sigma = result.beta2_post.std()

plt.figure(figsize=(18,12))
plt.scatter(x1,y, color='red',label=x1_param + r' $\beta$: {:.2f} $\sigma$: {:.2f}'.format(x1_param_beta,x1_param_sigma))

if multiple_regression:
    plt.scatter(x2,y,color = 'g',label=x2_param + r' $\beta$: {:.2f} $\sigma$: {:.2f}'.format(x2_param_beta,x2_param_sigma))

X = np.linspace(-3,3,1000)

beta1_samples = np.random.choice(result.beta1_post,replace=True,size=len(X))

if multiple_regression:
    beta2_samples = np.random.choice(result.beta2_post,replace=True,size=len(X))
    
alpha_samples = np.random.choice(result.alpha_post,replace=True,size=len(X))

if multiple_regression:
    lines = [X[i] * beta1_samples + X[i] * beta2_samples + alpha_samples for i in range(len (X))]
    
else:
    lines = [X[i] * beta1_samples  + alpha_samples for i in range(len (X))]


_ = plt.plot(X,lines,alpha=0.05,color='r')

if multiple_regression:
    plt.xlabel(x1_param + ' & ' + x2_param  )
    
else:
    plt.xlabel(x1_param)
    
plt.ylabel(y_param)
plt.legend(loc='upper right')
