In [1]:
import requests, bs4
from bs4 import BeautifulSoup as bs
import pandas as pd
import os
from selenium import webdriver

In [2]:
# Location of the ChromeDriver binary that Selenium launches.
chromedriver = '/Applications/chromedriver'
# Also publish the same path through the process environment.
os.environ["webdriver.chrome.driver"] = chromedriver

# Start the Chrome session that the scraping loop drives.
driver = webdriver.Chrome(chromedriver)

Scraping World Bank data

In [3]:
# World Bank indicator codes to scrape, each paired with the short column name
# its values are stored under.
worldBank_indicators = [
    {"indicator_id": code, "indicator_name": label}
    for code, label in (
        ("NY.GDP.MKTP.CD", "gdp"),
        ("EN.ATM.CO2E.KT", "c02"),
        ("SH.MED.BEDS.ZS", "hospital_beds"),
        ("NY.GNP.PCAP.CD", "gni"),
        ("HD.HCI.OVRL", "hdi"),
        ("SP.DYN.LE00.IN", "life_expectancy"),
    )
]

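# gni: gross national income (GNI) per capita (NY.GNP.PCAP.CD)
# c02: CO2 emissions in kilotons (EN.ATM.CO2E.KT), not metric tons per capita
# hospital_beds: hospital beds per 1,000 people (SH.MED.BEDS.ZS)
# hdi: HD.HCI.OVRL is the World Bank Human Capital Index, not the UN Human Development Index
# gdp: gross domestic product (NY.GDP.MKTP.CD)
# life_expectancy: life expectancy at birth (SP.DYN.LE00.IN)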

In [4]:
# Shared accumulator filled by the insert* helpers: country name -> {indicator name: value}.
countries_dict = dict()

In [5]:
def checkKey(dict, key):
    """Return True when ``key`` is one of ``dict``'s keys, otherwise False.

    Note: the first parameter shadows the built-in ``dict`` inside this function.
    """
    return key in dict.keys()

def getWorldBankData(soup):
    """Extract (country, value) records from a parsed World Bank indicator page.

    Every element with class ``item`` is treated as one table row. The first
    match is skipped (presumably the header row -- confirm against the page
    layout). For each remaining row the country name is the text of the first
    ``<a>`` inside the row's first child, and the value is the text of the
    row's third child with thousands separators removed.

    Args:
        soup: a parsed page exposing ``find_all(class_=...)`` (e.g. a
            BeautifulSoup object).

    Returns:
        A list of ``{'country': ..., 'dataValue': ...}`` dicts. ``dataValue``
        is a string without commas, or None when the cell has no single text
        value. The list is empty when the page contains no ``item`` elements.
    """
    tableRows = soup.find_all(class_="item")
    extracted_data = []
    # Slice instead of pop(0): an empty result (page not rendered, layout
    # changed) then yields an empty list rather than an IndexError.
    for div in tableRows[1:]:
        cells = div.contents
        if len(cells) < 3:
            # Not a data row: the value cell this function reads is missing.
            continue
        link = cells[0].find('a')
        if link is None:
            # No country link in the first cell, so there is nothing to key on.
            continue
        value = cells[2].string
        if value:
            value = value.replace(',', '')
        extracted_data.append({'country': link.string, 'dataValue': value})
    return extracted_data

def insertWorldBankIndicator(data, indicator):
    """Store one indicator's scraped values in the global ``countries_dict``.

    Args:
        data: iterable of dicts carrying ``'country'`` and ``'dataValue'`` keys.
        indicator: key under which each value is stored in that country's
            inner dict.

    Countries not yet present in ``countries_dict`` get a new empty inner dict
    first; an existing value for the same indicator is overwritten.
    """
    for record in data:
        name = record['country']
        reading = record['dataValue']
        # Create the per-country mapping on first sight, then set the value.
        countries_dict.setdefault(name, {})[indicator] = reading
        
# Load each indicator page in the shared browser session and merge the scraped
# values into countries_dict under the indicator's short name.
for indicator in worldBank_indicators:
    url = 'https://data.worldbank.org/indicator/'+indicator["indicator_id"]
    driver.get(url)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the parse tree could
    # differ between machines. 'html.parser' ships with Python.
    soup = bs(driver.page_source, 'html.parser')
    insertWorldBankIndicator(getWorldBankData(soup), indicator["indicator_name"])
    

In [6]:
# One row per scraped name, one column per indicator collected so far.
df = pd.DataFrame.from_dict(countries_dict, orient='index')
# The filters below are left disabled here, so this frame keeps every scraped row;
# dropna and the slice up to "Zimbabwe" are applied later, once the bcg/dtp3/hepb
# columns have been added.
# df = df[df['life_expectancy'].notna()]
# df = df.dropna()
# df = df[:"Zimbabwe"]

In [7]:
# The scraper stores values as strings (or None); convert every column to a numeric dtype.
df = df.apply(pd.to_numeric)
# Summarize column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 248 entries, Afghanistan to Upper middle income
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   gdp              244 non-null    float64
 1   c02              233 non-null    float64
 2   hospital_beds    231 non-null    float64
 3   gni              236 non-null    float64
 4   hdi              173 non-null    float64
 5   life_expectancy  239 non-null    float64
dtypes: float64(6)
memory usage: 13.6+ KB


In [9]:
# Preview up to the first 100 rows of the scraped indicators.
df.head(100)

Unnamed: 0,gdp,c02,hospital_beds,gni,hdi,life_expectancy
Afghanistan,19807.07,7440.0,0.4,500.0,0.4,65.0
Albania,14799.62,5560.0,2.9,5210.0,0.6,79.0
Algeria,145163.90,151670.0,1.9,3550.0,0.5,77.0
American Samoa,638.00,,,,,
Andorra,3155.07,460.0,2.5,,,
...,...,...,...,...,...,...
Israel,401953.80,61970.0,3.0,43070.0,0.7,83.0
Italy,1886445.27,324850.0,3.1,32200.0,0.7,83.0
Jamaica,13812.42,8510.0,1.7,4620.0,0.5,74.0
Japan,5064872.88,1106150.0,13.0,41580.0,0.8,84.0


Getting Immunization Data

In [24]:
def _load_country_2019(csv_path):
    """Read ``csv_path`` and return (full frame, list of {'country', '2019'} row dicts)."""
    frame = pd.read_csv(csv_path)
    return frame, frame[["country", "2019"]].to_dict('records')


# Each *_dict is a list of row dicts (one per CSV row), not a mapping.
bcg_df, bcg_dict = _load_country_2019('bcg.csv')
dtp3_df, dtp3_dict = _load_country_2019('dtp3.csv')
hepb_df, hepb_dict = _load_country_2019('hepb.csv')

def insertInmunizationIndicator(inmunization_dict, indicator):
    """Copy each row's ``"2019"`` value into the global ``countries_dict``.

    Args:
        inmunization_dict: iterable of row dicts carrying ``"country"`` and
            ``"2019"`` keys.
        indicator: key under which the value is stored in that country's
            inner dict.

    Rows whose country is not already a key of ``countries_dict`` are ignored;
    no new countries are created here.
    """
    for row in inmunization_dict:
        name = row["country"]
        reading = row["2019"]
        if name not in countries_dict.keys():
            continue
        countries_dict[name][indicator] = reading

# Merge each CSV's 2019 records into countries_dict under its own column name.
for _records, _column in ((bcg_dict, "bcg"), (dtp3_dict, "dtp3"), (hepb_dict, "hepb")):
    insertInmunizationIndicator(_records, _column)

In [None]:
# TODO: unfinished heading ("Getting ...") left in a code cell; complete it or delete this cell.

In [25]:
# Rebuild the frame now that countries_dict also carries the bcg/dtp3/hepb values.
df = pd.DataFrame.from_dict(countries_dict, orient='index')
df = df.apply(pd.to_numeric)
# Keep rows up to and including "Zimbabwe"; the scraped index continues past it
# with entries such as "Upper middle income" (presumably all aggregates -- verify).
# The slice runs BEFORE dropna on purpose: on this unsorted index a label slice
# raises KeyError when the bound label is absent, which is what would happen if
# Zimbabwe itself had a missing value and dropna removed it first. The resulting
# rows are otherwise the same as slicing after dropna.
df = df.loc[:"Zimbabwe"]
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 116 entries, Afghanistan to Zimbabwe
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   gdp              116 non-null    float64
 1   c02              116 non-null    float64
 2   hospital_beds    116 non-null    float64
 3   gni              116 non-null    float64
 4   hdi              116 non-null    float64
 5   life_expectancy  116 non-null    float64
 6   bcg              116 non-null    float64
 7   dtp3             116 non-null    float64
 8   hepb             116 non-null    float64
dtypes: float64(9)
memory usage: 9.1+ KB


In [32]:
import statsmodels.api as sm
# Ordinary least squares: life_expectancy regressed on every remaining column
# except hdi, c02 and gdp, plus an intercept term.
Y = df["life_expectancy"]
x = df.drop(columns = ['life_expectancy', 'hdi', 'c02', 'gdp'])
x = sm.add_constant(x)
# `data=` is an argument of the formula interface (OLS.from_formula), not of the
# array-based constructor, which already receives the response and regressors
# directly -- so it is not passed here.
model = sm.OLS(Y, x)
model.fit().summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,life_expectancy,R-squared:,0.503
Model:,OLS,Adj. R-squared:,0.48
Method:,Least Squares,F-statistic:,22.27
Date:,"Wed, 06 Oct 2021",Prob (F-statistic):,2.31e-15
Time:,23:04:42,Log-Likelihood:,-348.81
No. Observations:,116,AIC:,709.6
Df Residuals:,110,BIC:,726.1
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,49.8513,3.752,13.285,0.000,42.415,57.288
hospital_beds,0.6041,0.220,2.743,0.007,0.168,1.041
gni,0.0003,4.84e-05,5.502,0.000,0.000,0.000
bcg,0.0315,0.052,0.602,0.548,-0.072,0.135
dtp3,0.3358,0.144,2.331,0.022,0.050,0.621
hepb,-0.1715,0.140,-1.222,0.224,-0.450,0.107

0,1,2,3
Omnibus:,1.743,Durbin-Watson:,1.962
Prob(Omnibus):,0.418,Jarque-Bera (JB):,1.794
Skew:,-0.278,Prob(JB):,0.408
Kurtosis:,2.752,Cond. No.,107000.0


In [1]:
import seaborn as sns
# Pairwise relationship grid over the columns of df.
# Depends on the df built in the earlier cells: the NameError recorded below came
# from running this cell in a kernel where df had not been defined yet.
sns.pairplot(df)

NameError: name 'df' is not defined