In [7]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import matplotlib.pyplot as plt

In [17]:
# Read the Gapminder indicators from the CSV next to the notebook and preview the first rows.
data = pd.read_csv(filepath_or_buffer='gapminder.csv')
data.head(n=5)

Unnamed: 0,country,incomeperperson,alcconsumption,armedforcesrate,breastcancerper100th,co2emissions,femaleemployrate,hivrate,internetuserate,lifeexpectancy,oilperperson,polityscore,relectricperperson,suicideper100th,employrate,urbanrate
0,Afghanistan,,0.03,0.5696534,26.8,75944000.0,25.6000003814697,,3.65412162280064,48.673,,0.0,,6.68438529968262,55.7000007629394,24.04
1,Albania,1914.99655094922,7.29,1.0247361,57.4,223747333.333333,42.0999984741211,,44.9899469578783,76.918,,9.0,636.341383366604,7.69932985305786,51.4000015258789,46.72
2,Algeria,2231.99333515006,0.69,2.306817,23.5,2932108666.66667,31.7000007629394,0.1,12.5000733055148,73.131,0.42009452521537,2.0,590.509814347428,4.8487696647644,50.5,65.22
3,Andorra,21943.3398976022,10.17,,,,,,81.0,,,,,5.36217880249023,,88.92
4,Angola,1381.00426770244,5.57,1.4613288,23.1,248358000.0,69.4000015258789,2.0,9.99995388324075,51.093,,-2.0,172.999227388199,14.5546770095825,75.6999969482422,56.7


In [18]:
# Keep only the three variables used in the model and drop incomplete rows.
# Single-space placeholders are turned into NaN first so that dropna() treats
# them as missing values.
sample_data = data[['internetuserate', 'polityscore', 'urbanrate']].replace(' ', np.nan)
print('Number of regions before dropping None:', len(sample_data))
# .copy() makes sample_data an independent frame: columns are assigned to it
# in later cells, which can otherwise raise SettingWithCopyWarning on a frame
# derived from `data`.
sample_data = sample_data.dropna().copy()
print('Number of regions without None:', len(sample_data))

Number of regions before dropping None: 213
Number of regions without None: 155


In [19]:
# Coerce each analysis column to a numeric dtype, one column at a time.
for column in ('internetuserate', 'polityscore', 'urbanrate'):
    sample_data[column] = pd.to_numeric(sample_data[column])
sample_data.head()

Unnamed: 0,internetuserate,polityscore,urbanrate
0,3.654122,0,24.04
1,44.989947,9,46.72
2,12.500073,2,65.22
4,9.999954,-2,56.7
6,36.000335,8,92.0


Since the Gapminder dataset does not contain any binary categorical variables, I have to create them myself. I will divide Internet user rate and urbanization rate into two categories: low (≤ 50%) and high (> 50%). Polity score ranges from -10 to 10, so the low category will be for values ≤ 0 and the high category for values > 0.

In [20]:
def categorize_variable(variable, threshold, data=None):
    """Binarize a numeric column against a threshold.

    Parameters
    ----------
    variable : str
        Name of the column to categorize.
    threshold : float
        Cut-off value; only values strictly greater than it are coded 1.
    data : pandas.DataFrame, optional
        Frame that holds ``variable``. When omitted, the notebook-level
        ``sample_data`` frame is used, so two-argument calls keep working.

    Returns
    -------
    list of int
        One entry per row, in row order: 1 where the value is strictly
        greater than ``threshold``, otherwise 0. Values equal to the
        threshold and NaN both map to 0.
    """
    if data is None:
        # Fall back to the notebook's working frame for backward compatibility.
        data = sample_data
    # Vectorized comparison; NaN > threshold evaluates to False, i.e. 0.
    return (data[variable] > threshold).astype(int).tolist()

In [23]:
# Cut-off per variable: values strictly above it are coded 1, everything else 0.
thresholds = {'internetuserate': 50, 'polityscore': 0, 'urbanrate': 50}
for variable, threshold in thresholds.items():
    # Each binary indicator is stored next to its source column as "<name>_cat".
    sample_data[variable + '_cat'] = categorize_variable(variable, threshold)
sample_data.head()

Unnamed: 0,internetuserate,polityscore,urbanrate,internetuserate_cat,polityscore_cat,urbanrate_cat
0,3.654122,0,24.04,0,0,0
1,44.989947,9,46.72,0,1,0
2,12.500073,2,65.22,0,1,1
4,9.999954,-2,56.7,0,0,1
6,36.000335,8,92.0,0,1,1


In [29]:
# Logistic regression of the binary Internet-use indicator on the binary
# urbanization and polity-score indicators.
logit_model = smf.logit(
    formula='internetuserate_cat ~ urbanrate_cat + polityscore_cat',
    data=sample_data,
)
reg = logit_model.fit()
print(reg.summary())

Optimization terminated successfully.
         Current function value: 0.422817
         Iterations 8
                            Logit Regression Results                           
Dep. Variable:     internetuserate_cat   No. Observations:                  155
Model:                           Logit   Df Residuals:                      152
Method:                            MLE   Df Model:                            2
Date:                 Thu, 25 Mar 2021   Pseudo R-squ.:                  0.2595
Time:                         12:23:05   Log-Likelihood:                -65.537
converged:                        True   LL-Null:                       -88.509
Covariance Type:             nonrobust   LLR p-value:                 1.056e-10
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -5.0738      1.113     -4.557      0.000      -7.256      -2.892
urbanr

In [31]:
# Exponentiate the coefficients and their confidence bounds to report them
# on the odds-ratio scale.
print("Odds Ratios")
params = reg.params
conf = reg.conf_int()
# Label the two confidence-bound columns first, then append the point estimates.
conf.columns = ['Lower CI', 'Upper CI']
conf['OR'] = params
print(np.exp(conf))

Odds Ratios
                 Lower CI    Upper CI         OR
Intercept        0.000706    0.055477   0.006258
urbanrate_cat    5.339769  306.801662  40.475303
polityscore_cat  1.277310   11.070324   3.760350


* The results show that both urbanization rate (P = 0.000 (< 0.05), Beta = 3.7) and polity score (P = 0.016 (< 0.05), Beta = 1.3) are significantly and positively associated with the rate of Internet users.
* Regions with a low urbanization rate have about 40 times higher odds of having a low Internet user rate (OR=40.48, 95% CI = 5.34-306.80). Regions with a low democracy score have about 3.8 times higher odds of having a low Internet user rate (OR=3.76, 95% CI = 1.28-11.07). The confidence intervals overlap, so we can't say that urbanization rate is more strongly associated.
* The results support my hypothesis: regions that are not democratic were expected to have less Internet access, and the same applies to regions with low urbanization.