In [28]:
import pandas as pd
import numpy as np
import statsmodels.api as sm 
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from IPython import display
import os
import re
import seaborn as sns; sns.set()
%matplotlib inline

# Scraping CIA Factbook

In [2]:
# Index page of the Factbook dump: each "div.category" element names a field
# and links to that field's own HTML page.
cat = 'CIA data/docs/notesanddefs.html'
# Read through a context manager so the file handle is closed right away
# instead of being left for the garbage collector.
with open(cat) as fh:
    page = fh.read()
# NOTE(review): no parser is named, so bs4 uses whichever one is installed;
# consider pinning one so results do not vary between environments.
page = BeautifulSoup(page)
# field name -> relative href of the page holding that field's values
data_map = {}
cols = page.select("div.category")
for col in cols:
    links = col.select('a')
    # Categories without a link have no field page and are skipped.
    if len(links) > 0:
        fpath = links[0]['href']
        field = col.text.strip()
        data_map[field] =  fpath
        print(field, fpath)

Administrative divisions ../fields/302.html
Age structure ../fields/341.html
Agriculture - products ../fields/215.html
Airports ../fields/379.html
Airports - with paved runways ../fields/380.html
Airports - with unpaved runways ../fields/381.html
Area ../fields/279.html
Area - comparative ../fields/280.html
Background ../fields/325.html
Birth rate ../fields/345.html
Broadband - fixed subscriptions ../fields/206.html
Broadcast media ../fields/199.html
Budget ../fields/224.html
Budget surplus (+) or deficit (-) ../fields/226.html
Capital ../fields/301.html
Carbon dioxide emissions from consumption of energy ../fields/274.html
Central bank discount rate ../fields/230.html
Children under the age of 5 years underweight ../fields/368.html
Citizenship ../fields/310.html
Civil aircraft registration country code prefix ../fields/378.html
Climate ../fields/284.html
Coastline ../fields/282.html
Commercial bank prime lending rate ../fields/231.html
Communications - note ../fields/205.html
Constitu

In [7]:
def map_data(data_map):
    """Collect the per-country values of every field listed in ``data_map``.

    Parameters
    ----------
    data_map : dict
        Maps a field name to the href of its field page. Only the last path
        component of the href is used; the file is read from
        ``'CIA data/fields/'``.

    Returns
    -------
    dict
        Maps each field name to a list of ``(country, values)`` tuples, where
        ``values`` is the list of texts of the ``div.category_data`` elements
        found in that country's value cell.

    Raises
    ------
    IndexError
        If a cell in a country position (even index) contains no link.
    """
    data = {}
    for field in data_map:
        fname = data_map[field].split('/')[-1]
        # Close each field page as soon as it is read; the notebook opens one
        # file per field and previously never closed any of them.
        with open('CIA data/fields/' + fname) as fh:
            page_field = BeautifulSoup(fh.read())
        cols = page_field.select('td')
        # The <td> cells alternate: even positions carry the country link,
        # odd positions carry that country's values.
        for i, cell in enumerate(cols):
            if i % 2 == 0:
                country = cell.select('a')[0].text
            else:
                values = [x.text for x in cell.select('div.category_data')]
                data.setdefault(field, []).append((country, values))

    return data
# Build the field -> [(country, values), ...] lookup for every page in data_map.
data = map_data(data_map)       

# Demographics and Constructing a DataFrame

In [318]:
# Factbook fields that become columns of the merged per-country frame.
Columns = [
    'Age structure',
    'Area',
    'Budget',
    'Birth rate',
    'Death rate',
    'Debt - external',
    'GDP (official exchange rate)',
    'GDP - per capita (PPP)',
    'Hospital bed density',
    'Physicians density',
    'Life expectancy at birth',
    'Household income or consumption by percentage share',
    'Population',
    'Population below poverty line',
    'Religions',
    'Unemployment rate',
]

In [319]:
from functools import reduce


def _outer_join_on_country(left, right):
    """Outer-join two per-field frames on their shared 'Country' column."""
    return pd.merge(left, right, how='outer', on='Country')


# One two-column frame (Country, <field>) per selected field.
dfList = []
for field in Columns:
    countries = [entry[0] for entry in data[field]]
    values = [entry[1] for entry in data[field]]
    dfList.append(pd.DataFrame({'Country': countries, field: values}))

# Fold the per-field frames into a single wide frame; the outer join keeps
# countries that are missing from some of the field pages.
df = reduce(_outer_join_on_country, dfList)

In [320]:
# Preview the first rows of the merged per-country frame.
df.head()

Unnamed: 0,Country,Age structure,Area,Birth rate,Death rate,Debt - external,GDP (official exchange rate),GDP - per capita (PPP),Hospital bed density,Physicians density,Life expectancy at birth,Household income or consumption by percentage share,Population,Population below poverty line,Religions,Unemployment rate
0,Afghanistan,"[\n0-14 years:\n40.92%\n(male 7,263,716 /femal...","[\ntotal:\n652,230 sq km\n\n, \nland:\n652,230...","[\n37.5 births/1,000 population\n\n(2018 est.)\n]","[\n13.2 deaths/1,000 population\n\n(2018 est.)\n]",[\n$2.84 billion\n\n(FY/)\n],[\n$20.24 billion\n(2017 est.)\n(2017 est.)\n],"[\n$2,000\n\n(2017 est.)\n, \n$2,000\n\n(2016 ...","[\n0.5 beds/1,000 population\n\n(2014)\n]","[\n0.3 physicians/1,000 population\n\n(2016)\n]",[\ntotal population:\n52.1 years\n\n(2018 est....,"[\nlowest 10%:\n3.8%\n\n(2008)\n, \nhighest 10...","[\n34,940,837\n\n(July 2018 est.)\n]",[\n54.5%\n\n(2017 est.)\n],[\n \n Muslim 99.7% (Sunni 84....,"[\n23.9%\n\n(2017 est.)\n, \n22.6%\n\n(2016 es..."
1,Albania,"[\n0-14 years:\n17.84%\n(male 287,750 /female ...","[\ntotal:\n28,748 sq km\n\n, \nland:\n27,398 s...","[\n13.2 births/1,000 population\n\n(2018 est.)\n]","[\n6.9 deaths/1,000 population\n\n(2018 est.)\n]",[\n$9.505 billion\n\n(31 December 2017 est.)\n...,[\n$13.07 billion\n(2017 est.)\n(2017 est.)\n],"[\n$12,500\n\n(2017 est.)\n, \n$12,100\n\n(201...","[\n2.9 beds/1,000 population\n\n(2013)\n]","[\n1.29 physicians/1,000 population\n\n(2013)\n]",[\ntotal population:\n78.6 years\n\n(2018 est....,"[\nlowest 10%:\n19.6%\n\n(2015 est.)\n, \nhigh...","[\n3,057,220\n\n(July 2018 est.)\n]",[\n14.3%\n\n(2012 est.)\n],"[\n \n Muslim 56.7%, Roman Cat...","[\n13.8%\n\n(2017 est.)\n, \n15.2%\n\n(2016 es..."
2,Algeria,"[\n0-14 years:\n29.49%\n(male 6,290,619 /femal...","[\ntotal:\n2,381,740 sq km\n\n, \nland:\n2,381...","[\n21.5 births/1,000 population\n\n(2018 est.)\n]","[\n4.3 deaths/1,000 population\n\n(2018 est.)\n]","[\n$6.26 billion\n\n(31 December 2017 est.)\n,...",[\n$167.6 billion\n(2017 est.)\n(2017 est.)\n],"[\n$15,200\n\n(2017 est.)\n, \n$15,200\n\n(201...","[\n1.9 beds/1,000 population\n\n(2015)\n]",,[\ntotal population:\n77.2 years\n\n(2018 est....,"[\nlowest 10%:\n26.8%\n\n(1995)\n, \nhighest 1...","[\n41,657,488\n\n(July 2018 est.)\n]",[\n23%\n\n(2006 est.)\n],[\n \n Muslim (official; predo...,"[\n11.7%\n\n(2017 est.)\n, \n10.5%\n\n(2016 es..."
3,American Samoa,"[\n0-14 years:\n29.59%\n(male 7,732 /female 7,...","[\ntotal:\n224 sq km\n\n, \nland:\n224 sq km\n...","[\n19 births/1,000 population\n\n(2018 est.)\n]","[\n5.9 deaths/1,000 population\n\n(2018 est.)\n]",[\nNA\n],[\n$658 million\n(2016 est.)\n(2016 est.)\n],"[\n$11,200\n\n(2016 est.)\n, \n$11,300\n\n(201...",,,[\ntotal population:\n73.9 years\n\n(2018 est....,"[\nlowest 10%:\nNA\n, \nhighest 10%:\nNA\n]","[\n50,826\n\n(July 2018 est.)\n]",[\nNA\n],"[\n \n Christian 98.3%, other ...",[\n29.8%\n\n(2005)\n]
4,Andorra,"[\n0-14 years:\n14.06%\n(male 6,197 /female 5,...","[\ntotal:\n468 sq km\n\n, \nland:\n468 sq km\n...","[\n7.3 births/1,000 population\n\n(2018 est.)\n]","[\n7.4 deaths/1,000 population\n\n(2018 est.)\n]",[\n$0\n\n(2016)\n],[\n$2.712 billion\n(2016 est.)\n(2016 est.)\n],"[\n$49,900\n\n(2015 est.)\n, \n$51,300\n\n(201...","[\n2.5 beds/1,000 population\n\n(2009)\n]","[\n3.69 physicians/1,000 population\n\n(2015)\n]",[\ntotal population:\n82.9 years\n\n(2018 est....,"[\nlowest 10%:\nNA\n, \nhighest 10%:\nNA\n]","[\n85,708\n\n(July 2018 est.)\n]",,[\n \n Roman Catholic (predom...,"[\n3.7%\n\n(2016 est.)\n, \n4.1%\n\n(2015 est...."


## Age Structure

In [321]:
# Age brackets, in the order they are indexed inside each 'Age structure' list.
Age_groups = ['0-14', '15-24', '25-54', '55-64', '65+']

# Raw-string patterns, one per sex:
#   * `\b` stops 'male' from matching inside 'female';
#   * `\d[\d,]*` accepts any number of comma-separated digit groups, so counts
#     of a billion or more are no longer cut off after the third group.
_SEX_PATTERNS = {sex: re.compile(r'\b' + sex + r' \d[\d,]*') for sex in ('male', 'female')}


def _search_bracket(entries, position, pattern):
    """Search one age bracket of a scraped 'Age structure' cell.

    Returns the ``re.Match`` (or ``None`` when the pattern is not found) for
    ``entries[position]``, and NaN when the cell is not a list or has no entry
    at ``position``.
    """
    if isinstance(entries, list) and position < len(entries):
        return pattern.search(str(entries[position]))
    return float('nan')


for i, age in enumerate(Age_groups):
    for sex, pattern in _SEX_PATTERNS.items():
        # Stores match objects for now; they are converted to numbers later.
        df[age + '_' + sex] = df['Age structure'].apply(_search_bracket, args=(i, pattern))
    
# cleaning up and turning into some readable numbers
def get_number(reg):
    """Turn a regex match such as ``'male 7,263,716'`` into a float.

    Parameters
    ----------
    reg : re.Match, number, or anything else
        The value stored in an age/sex column.

    Returns
    -------
    float
        * For a match: the last space-separated token of the matched text with
          thousands separators removed, or NaN when that token is not a number
          (e.g. the match is just ``'male '``).
        * For a value that is already numeric: the value itself, so applying
          this function twice to the same column does not wipe it out.
        * NaN for everything else (``None``, NaN, strings, ...).
    """
    if isinstance(reg, re.Match):
        digits = reg.group(0).split(' ')[-1].replace(',', '')
        try:
            return float(digits)
        except ValueError:
            # Nothing numeric followed the label.
            return float('nan')

    if isinstance(reg, (int, float)) and not isinstance(reg, bool):
        return float(reg)

    return float('nan')

In [322]:
# Names of the per-sex columns, collected for the melt that follows.
age_columns = []
for age in Age_groups:
    male_col, female_col = age + '_male', age + '_female'
    # Replace the stored regex matches with plain numbers.
    for sex_col in (male_col, female_col):
        df[sex_col] = df[sex_col].apply(get_number)
    # Bracket total across both sexes.
    df[age] = df[male_col] + df[female_col]
    age_columns.extend([male_col, female_col])

In [323]:
# Long format: one row per (country, age/sex column) pair.
Frame = df.melt(
    id_vars=['Country'],
    value_vars=age_columns,
    var_name='Category',
    value_name='Population',
)

In [324]:
# Preview the long-format frame produced by the melt.
Frame.head()

Unnamed: 0,Country,Category,Population
0,Afghanistan,0-14_male,7263716.0
1,Albania,0-14_male,287750.0
2,Algeria,0-14_male,6290619.0
3,American Samoa,0-14_male,7732.0
4,Andorra,0-14_male,6197.0


df

In [296]:
# Split labels such as '0-14_male' into the age bracket (kept in 'Category')
# and a new 'Sex' column.
# NOTE(review): this looks non-idempotent — once 'Category' no longer contains
# '_', the split presumably yields a single column and the two-column
# assignment should fail; rebuild Frame before re-running. TODO confirm.
Frame[['Category', 'Sex']] = Frame['Category'].str.split('_', expand=True)
Frame.head()

Unnamed: 0,Country,Category,Population,Sex
0,Afghanistan,0-14,7263716.0,male
1,Albania,0-14,287750.0,male
2,Algeria,0-14,6290619.0,male
3,American Samoa,0-14,7732.0,male
4,Andorra,0-14,6197.0,male


In [297]:
# Sanity check: the melt yields one row per country for each age/sex column
# (5 age categories x 2 sexes), so derive the factor from age_columns instead
# of hard-coding it.
len(Frame) == len(df) * len(age_columns)

True

In [327]:
# Persist the long-format age table. With the default index=True the row index
# is written too, as an unnamed first column.
Frame.to_csv('clean_CIA_data/Age_structure.csv')

## More Demographics by Country

In [298]:
# Columns that make up the per-country demographics table.
demo = [
    'Area',
    'Birth rate',
    'Death rate',
    'Hospital bed density',
    'Physicians density',
    'Life expectancy at birth',
    'Population below poverty line',
    'Unemployment rate',
]

In [299]:
def _first_entry(cell):
    """Reduce one scraped cell to its first entry with newlines stripped.

    * list  -> first element without newline characters, or NaN if the list is empty
    * str   -> returned unchanged (the cell was already flattened, so running
               this notebook cell again does not erase the column)
    * other -> NaN
    """
    if isinstance(cell, list):
        return cell[0].replace('\n', '') if cell else float('nan')
    if isinstance(cell, str):
        return cell
    return float('nan')


# Overwrites the list-valued columns of df in place with their first entry.
for c in demo:
    df[c] = df[c].apply(_first_entry)

In [300]:
# Select the demographic columns first and copy only those. Copying the whole
# frame and then slicing the temporary duplicated every column and left
# demo_df as a selection of that temporary, which can trigger pandas'
# SettingWithCopyWarning on the column assignments made to demo_df later.
demo_df = df[demo].copy()
demo_df.head()

Unnamed: 0,Area,Birth rate,Death rate,Hospital bed density,Physicians density,Life expectancy at birth,Population below poverty line,Unemployment rate
0,"total:652,230 sq km","37.5 births/1,000 population(2018 est.)","13.2 deaths/1,000 population(2018 est.)","0.5 beds/1,000 population(2014)","0.3 physicians/1,000 population(2016)",total population:52.1 years(2018 est.),54.5%(2017 est.),23.9%(2017 est.)
1,"total:28,748 sq km","13.2 births/1,000 population(2018 est.)","6.9 deaths/1,000 population(2018 est.)","2.9 beds/1,000 population(2013)","1.29 physicians/1,000 population(2013)",total population:78.6 years(2018 est.),14.3%(2012 est.),13.8%(2017 est.)
2,"total:2,381,740 sq km","21.5 births/1,000 population(2018 est.)","4.3 deaths/1,000 population(2018 est.)","1.9 beds/1,000 population(2015)",,total population:77.2 years(2018 est.),23%(2006 est.),11.7%(2017 est.)
3,total:224 sq km,"19 births/1,000 population(2018 est.)","5.9 deaths/1,000 population(2018 est.)",,,total population:73.9 years(2018 est.),,29.8%(2005)
4,total:468 sq km,"7.3 births/1,000 population(2018 est.)","7.4 deaths/1,000 population(2018 est.)","2.5 beds/1,000 population(2009)","3.69 physicians/1,000 population(2015)",total population:82.9 years(2018 est.),,3.7%(2016 est.)


### Area (km^2) & Life expectancy at birth

In [301]:
def _area_token(text):
    """Return the first token after the first ':' with thousands separators removed."""
    after_label = text.split(':')[1]
    return after_label.split(' ')[0].replace(',', '')


area = demo_df['Area'].apply(_area_token)
# Row 243 is set by hand: French Southern and Antarctic Lands.
area.at[243] = 439781
demo_df['Area'] = area.astype(float)

In [302]:
def get_exp(c):
    """Parse the number that follows the first ':' in a cell.

    For ``'total population:52.1 years(2018 est.)'`` this returns ``52.1``.
    The token is taken up to the first space and thousands separators are
    removed before conversion.

    Parameters
    ----------
    c : str or anything else
        The flattened cell text; non-strings (e.g. NaN) are tolerated.

    Returns
    -------
    float
        The parsed value, or NaN when ``c`` is not a string, has no ':' or the
        token after it is not numeric.
    """
    try:
        return float((c.split(':')[1].split(' ')[0]).replace(',',''))
    # Only the expected parse failures map to NaN; a bare ``except`` would also
    # hide KeyboardInterrupt and genuine programming errors.
    except (AttributeError, IndexError, TypeError, ValueError):
        return float('nan')
        
# Parse the life-expectancy text held in df and store the numbers in demo_df.
life_col = 'Life expectancy at birth'
life_exp = df[life_col].apply(get_exp)
demo_df[life_col] = life_exp.astype(float)

In [303]:
# Inspect the table now that 'Area' and 'Life expectancy at birth' are numeric.
demo_df

Unnamed: 0,Area,Birth rate,Death rate,Hospital bed density,Physicians density,Life expectancy at birth,Population below poverty line,Unemployment rate
0,652230.00,"37.5 births/1,000 population(2018 est.)","13.2 deaths/1,000 population(2018 est.)","0.5 beds/1,000 population(2014)","0.3 physicians/1,000 population(2016)",52.1,54.5%(2017 est.),23.9%(2017 est.)
1,28748.00,"13.2 births/1,000 population(2018 est.)","6.9 deaths/1,000 population(2018 est.)","2.9 beds/1,000 population(2013)","1.29 physicians/1,000 population(2013)",78.6,14.3%(2012 est.),13.8%(2017 est.)
2,2381740.00,"21.5 births/1,000 population(2018 est.)","4.3 deaths/1,000 population(2018 est.)","1.9 beds/1,000 population(2015)",,77.2,23%(2006 est.),11.7%(2017 est.)
3,224.00,"19 births/1,000 population(2018 est.)","5.9 deaths/1,000 population(2018 est.)",,,73.9,,29.8%(2005)
4,468.00,"7.3 births/1,000 population(2018 est.)","7.4 deaths/1,000 population(2018 est.)","2.5 beds/1,000 population(2009)","3.69 physicians/1,000 population(2015)",82.9,,3.7%(2016 est.)
...,...,...,...,...,...,...,...,...
262,5.00,,,,,,,
263,62045.00,,,,,,,
264,12.00,,,,"2.72 physicians/1,000 population(2010)",,,
265,6959.41,,,,,,,


### Birth Rate, Death rate, Physicians density, Hospital bed density, Population below poverty line and Unemployment rate

In [304]:
# Columns whose cell text starts with the number of interest.
section1 = [
    'Birth rate',
    'Death rate',
    'Hospital bed density',
    'Physicians density',
]

In [305]:
def _leading_number(x):
    """Parse the number that starts a cell such as '37.5 births/1,000 population(2018 est.)'.

    * str    -> float of the first space-separated token, or NaN when that
                token is not numeric (previously this aborted the whole column)
    * number -> returned as float, so re-running this cell keeps parsed values
                instead of turning them into NaN
    * other  -> NaN
    """
    if isinstance(x, str):
        try:
            return float(x.split(' ')[0])
        except ValueError:
            return float('nan')
    if isinstance(x, (int, float)) and not isinstance(x, bool):
        return float(x)
    return float('nan')


for s in section1:
    demo_df[s] = demo_df[s].map(_leading_number)

In [306]:
# Columns whose cell text starts with a percentage.
section2 = [
    'Population below poverty line',
    'Unemployment rate',
]
def clean_perc(c):
    """Parse the percentage at the start of a cell such as ``'54.5%(2017 est.)'``.

    Parameters
    ----------
    c : str, number, or anything else
        The flattened cell text, or an already-parsed value.

    Returns
    -------
    float
        The number in front of the first '%'. A value that is already numeric
        is returned as-is, so applying the function twice to a column does not
        wipe it out. NaN is returned when ``c`` is neither a string nor a
        number, or the text before '%' is not numeric.
    """
    if isinstance(c, (int, float)) and not isinstance(c, bool):
        return float(c)
    try:
        return float(c.split('%')[0])
    # Only the expected parse failures map to NaN; a bare ``except`` would also
    # hide KeyboardInterrupt and genuine programming errors.
    except (AttributeError, IndexError, TypeError, ValueError):
        return float('nan')
        
# Convert each percentage column from text to float via clean_perc.
for s in section2:
    demo_df[s] = demo_df[s].apply(clean_perc)

In [312]:
# Label the rows by country name; this relies on demo_df and df still having
# the same row order.
demo_df.index = df['Country']

In [328]:
# Save the cleaned per-country table; the 'Country' index is written as the
# first column.
demo_df.to_csv('clean_CIA_data/country_data.csv')

In [331]:
# Spot-check ten random rows of the age table.
Frame.sample(n=10)

Unnamed: 0,Country,Category,Population
465,Switzerland,0-14_female,612479.0
975,Saint Martin,15-24_female,1685.0
2267,Malta,65+_male,41900.0
732,Switzerland,15-24_male,453003.0
1966,Iran,55-64_female,3113443.0
1852,Johnston Atoll,55-64_male,
2606,Thailand,65+_female,4239992.0
547,Azerbaijan,15-24_male,743142.0
542,Argentina,15-24_male,3476344.0
974,Saint Lucia,15-24_female,12060.0


In [332]:
# Spot-check ten random rows of the cleaned demographics table.
demo_df.sample(n=10)

Unnamed: 0_level_0,Area,Birth rate,Death rate,Hospital bed density,Physicians density,Life expectancy at birth,Population below poverty line,Unemployment rate
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
El Salvador,21041.0,16.1,5.8,1.3,1.92,75.1,32.7,7.0
Palmyra Atoll,11.9,,,,,,,
Akrotiri,123.0,,,,,,,
Lebanon,10400.0,14.1,5.1,2.9,2.38,77.9,28.6,9.7
Mozambique,799380.0,37.8,11.4,0.7,0.06,54.1,46.1,24.5
Comoros,2235.0,25.3,7.1,2.2,,64.9,44.8,6.5
Armenia,29743.0,12.6,9.5,4.2,2.8,75.1,32.0,18.9
Brunei,5765.0,16.9,3.7,2.7,1.75,77.5,,6.9
Togo,56785.0,32.8,6.8,0.7,0.06,65.8,55.1,6.9
Samoa,2831.0,20.2,5.4,,0.34,74.2,,5.2
