This notebook dynamically scrapes data from the CIA World Factbook. Use the table of contents to skip directly to visualizations. 
If you would like to know more about how this is implemented, you can read the lines starting with # in each code block.

This notebook updates with the latest CIA data every time you refresh your browser. The CIA's numbers shown here are updated every Monday.

# South Central Asia: CIA World Factbook Data 
[Countries in This Notebook](#SCA-Countries)

### Table of Contents
1. [Age Demographics](#Age)
2. [GDP Per Capita/Purchasing Power Parity](#GDP)
3. [Internet Usage %](#Internet-Use)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

In [2]:
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Read the HTML from the URL and pass on to BeautifulSoup
url = 'https://www.cia.gov/library/publications/the-world-factbook/'
print("Opening the file connection...")
uh= urllib.request.urlopen(url, context=ctx)
print("HTTP status",uh.getcode())
html =uh.read().decode()
print(f"Reading done. Total {len(html)} characters read.")

Opening the file connection...
HTTP status 200
Reading done. Total 75081 characters read.


In [3]:
soup = BeautifulSoup(html, 'html.parser')
#print(soup.prettify())

In [4]:
country_codes=['af', 'bg','bt', 'in', 'kz', 'kg', 'mv', 'np', 'pk', 'ce', 'ti', 'tx', 'uz' ]
country_names=['Afghanistan', 'Bangladesh', 'Bhutan', 'India', 'Kazakhstan', 'Kyrgyzstan', 'Maldives', 'Nepal', 'Pakistan', 'Sri Lanka', 'Tajikistan', 'Turkmenistan', 'Uzbekistan']

"\nfor tag in soup.find_all('option'):\n    country_codes.append(tag.get('value')[5:7])\n    country_names.append(tag.text)\n"

# SCA-Countries

In [36]:
print('COUNTRY NAMES\n'+'-'*30)
for country in country_names[1:]:
    print(country,end='\n')

COUNTRY NAMES
------------------------------
Bangladesh
Bhutan
India
Kazakhstan
Kyrgyzstan
Maldives
Nepal
Pakistan
Sri Lanka
Tajikistan
Turkmenistan
Uzbekistan


In [15]:
# Base URL
urlbase = 'https://www.cia.gov/library/publications/the-world-factbook/geos/'
demographics1=[]
demographics2=[]
demographics3=[]
demographics4=[]
demographics5=[]

offset = len('65 years and over: ')

# Iterate over every country
for i in range(1,len(country_names)-1):
    country_html=country_codes[i]+'.html'
    url_to_get=urlbase+country_html
    # Read the HTML from the URL and pass on to BeautifulSoup
    html = urllib.request.urlopen(url_to_get, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    
    txt=soup.get_text()
    pos1=txt.find('0-14 years: ')
    pos2=txt.find('15-24 years: ')
    pos3=txt.find('25-54 years: ')
    pos4=txt.find('55-64 years: ')
    pos5=txt.find('65 years and over: ')
    
    if pos1==-1:
        print(f"**0-14 years % data not found for {country_names[i]}!**")
        demographics1.append(np.nan)
    else:
        text=txt[pos1+12:pos1+18]
        end=re.search('%',text).start()
        a=float((txt[pos1+12:pos1+12+end]))
        demographics1.append(a)
        #print(f"0-14 years % data extraction complete for {country_names[i]}!")
    
    if pos2==-1:
        print(f"**15-24 years % data not found for {country_names[i]}!**")
        demographics2.append(np.nan)
    else:
        text=txt[pos2+13:pos2+19]
        end=re.search('%',text).start()
        a=float((txt[pos2+13:pos2+13+end]))
        demographics2.append(a)
        #print(f"15-24 years % data extraction complete for {country_names[i]}!")
        
    if pos3==-1:
        print(f"**25-54 years % data not found for {country_names[i]}!**")
        demographics3.append(np.nan)
    else:
        text=txt[pos3+13:pos3+19]
        end=re.search('%',text).start()
        a=float((txt[pos3+13:pos3+13+end]))
        demographics3.append(a)
        #print(f"25-54 years % data extraction complete for {country_names[i]}!")
    
    if pos4==-1:
        print(f"**55-64 years % data not found for {country_names[i]}!**")
        demographics4.append(np.nan)
    else:
        text=txt[pos4+13:pos4+19]
        end=re.search('%',text).start()
        a=float((txt[pos4+13:pos4+13+end]))
        demographics4.append(a)
        #print(f"55-64 years % data extraction complete for {country_names[i]}!")
    
    
    if pos5==-1:
        print(f"**65 years and beyond % data not found for {country_names[i]}!**")
        demographics5.append(np.nan)
    else:
        text=txt[pos5+offset:pos5+offset+6]
        end=re.search('%',text).start()
        a=float((txt[pos5+offset:pos5+offset+end]))
        demographics5.append(a)
        #print(f"65 years and beyond % data extraction complete for {country_names[i]}!")

# Age

In [8]:
data={
    '0-14 years old %':demographics1,
    '15-24 years %':demographics2,
    '25-54 years %':demographics3,
    '55-64 years %':demographics4,
    '65 years and above %':demographics5
}
#df1=pd.DataFrame(data=data,index=country_names[1:5])
df_demo=pd.DataFrame(data=data,index=country_names[1:len(country_codes)-1])
df_demo.index.name='COUNTRY'
df_demo.dropna(inplace=True)
df_demo

Unnamed: 0_level_0,0-14 years old %,15-24 years %,25-54 years %,55-64 years %,65 years and above %
COUNTRY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bangladesh,27.76,19.36,39.73,6.93,6.23
Bhutan,25.8,18.81,43.07,6.03,6.29
India,27.34,17.9,41.08,7.45,6.24
Kazakhstan,25.91,14.05,42.42,9.97,7.65
Kyrgyzstan,30.3,16.79,39.84,7.8,5.27
Maldives,21.4,20.21,48.1,5.85,4.45
Nepal,30.2,21.73,36.58,6.32,5.17
Pakistan,31.36,21.14,37.45,5.57,4.48
Sri Lanka,24.06,14.63,41.58,10.06,9.67
Tajikistan,32.33,18.61,40.12,5.62,3.32


In [19]:
# Base URL
urlbase = 'https://www.cia.gov/library/publications/the-world-factbook/geos/'
# Empty data dictionary
text_data=dict()

# Iterate over every country
for i in range(1,len(country_names)-1):
    country_html=country_codes[i]+'.html'
    url_to_get=urlbase+country_html
    # Read the HTML from the URL and pass on to BeautifulSoup
    html = urllib.request.urlopen(url_to_get, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    txt=soup.get_text()
    text_data[country_names[i]]=txt
    print(f"Finished loading data for {country_names[i]}")
    
print ("\n**Finished downloading all text data!**")

Finished loading data for Bangladesh
Finished loading data for Bhutan
Finished loading data for India
Finished loading data for Kazakhstan
Finished loading data for Kyrgyzstan
Finished loading data for Maldives
Finished loading data for Nepal
Finished loading data for Pakistan
Finished loading data for Sri Lanka
Finished loading data for Tajikistan
Finished loading data for Turkmenistan

**Finished downloading all text data!**


In [21]:
import pickle
pickle.dump(text_data,open("text_data_CIA_Factobook.p", "wb"))
text_data = pickle.load(open("text_data_CIA_Factobook.p", "rb"))
avg = 0
total=0
for k,v in text_data.items():
    total+=len(v)
print("Total length",total)
avg=total/len(text_data)
print("Average length",avg)
text_data.keys()

def convert_float(string):
    if string.isnumeric():
        return float(string)
    if string[0].isdigit():
        if ',' not in string and '.' in string:
            return float(string)
        if ',' not in string and '.' not in string:
            idx=string.find(' ')
            result = string[:idx]
            return float(result)
        idx1=string.find(',')
        idx2=string.find(' ')
        result = string[:idx1]+string[idx1+1:idx2]
        return float(result)
    else:
        return (-1)

Total length 574014
Average length 52183.09090909091


In [22]:
# Initialize dictionary for holding the data
Total_GDP_PPP = {}
# Iterate over every country
for i in range(1,len(country_names)-1):
    country= country_names[i]
    txt=text_data[country]       
    pos = txt.find('GDP (purchasing power parity):')
    if pos!=-1: 
        pos= pos+len('GDP (purchasing power parity):')
        string = txt[pos+1:pos+15]
        start = re.search('\$',string)
        end = re.search('[b,t]',string)
        if (start!=None and end!=None):
            start=start.start()
            end=end.start()
            a=string[start+1:start+end-1]
            print(a)
            a = convert_float(a)
            if (string[end]=='t'):
                # If the GDP was in trillions, multiply it by 1000
                a=1000*a
            #print(f"Total GDP (PPP) of {country}: {a} billion")
            # Insert the data in the dictionary
            Total_GDP_PPP[country]=a
        else:
            print("**Could not find GDP data!**")
    else:
        print("**Could not find GDP data!**")

print ("\nFinished finding all GDP (Purchasing Power Parity) (in billion $) data")

686.5
7.011
9.447
474.3
22.64
6.896
78.55
1.056
278.2
27.67
103.5

Finished finding all GDP (Purchasing Power Parity) (in billion $) data


In [29]:
# Initialize dictionary for holding the data
GDP_PPP = {}
# Iterate over every country
for i in range(1,len(country_names)-1):
    country= country_names[i]
    txt=text_data[country]       
    pos = txt.find('GDP - per capita (PPP):')
    if pos!=-1:
        pos= pos+len('GDP - per capita (PPP):')
        string = txt[pos+1:pos+11]
        start = re.search('\$',string)
        end = re.search('\S',string)
        if (start!=None and end!=None):
            start=start.start()
            end=end.start()
            a=string[start+1:start+end-1]
            #print(a)
            a = convert_float(a)
            if (a!=-1.0):
                print(f"GDP/capita (PPP) of {country}: {a} dollars")
                # Insert the data in the dictionary
                GDP_PPP[country]=a
            else:
                print("**Could not find GDP/capita data!**")
        else:
            print("**Could not find GDP/capita data!**")
    else:
        print("**Could not find GDP/capita data!**")

print ("\nFinished finding all GDP/capita data")

GDP/capita (PPP) of Bangladesh: 4200.0 dollars
GDP/capita (PPP) of Bhutan: 8700.0 dollars
GDP/capita (PPP) of India: 7200.0 dollars
GDP/capita (PPP) of Kazakhstan: 26100.0 dollars
GDP/capita (PPP) of Kyrgyzstan: 3700.0 dollars
GDP/capita (PPP) of Maldives: 19200.0 dollars
GDP/capita (PPP) of Nepal: 2700.0 dollars
GDP/capita (PPP) of Pakistan: 5400.0 dollars
GDP/capita (PPP) of Sri Lanka: 13000.0 dollars
GDP/capita (PPP) of Tajikistan: 3100.0 dollars
GDP/capita (PPP) of Turkmenistan: 18700.0 dollars

Finished finding all GDP/capita data


# GDP

In [30]:
df_GDP=pd.Series(GDP_PPP).to_frame()
df_GDP.columns=['GDP (PPP)']
df_GDP.index.name='COUNTRY'
df_GDP

Unnamed: 0_level_0,GDP (PPP)
COUNTRY,Unnamed: 1_level_1
Bangladesh,4200.0
Bhutan,8700.0
India,7200.0
Kazakhstan,26100.0
Kyrgyzstan,3700.0
Maldives,19200.0
Nepal,2700.0
Pakistan,5400.0
Sri Lanka,13000.0
Tajikistan,3100.0


In [31]:
# Initialize dictionary for holding the data
GDP_growth = {}
# Iterate over every country
for i in range(1,len(country_names)-1):
    country= country_names[i]
    txt=text_data[country]       
    pos = txt.find('GDP - real growth rate:')
    if pos!=-1: 
        pos= pos+len('GDP - real growth rate: ')
        string = txt[pos:pos+12]
        end = re.search('%',string)
        if (end!=None):
            end=end.start()
            a=string[:end]
            if a[-1].isdigit():
                a = float(a)
                print(f"GDP growth rate of {country}: {a}")
                # Insert the data in the dictionary
                GDP_growth[country]=a
            else:
                print("**Could not find GDP growth rate data!**")
        else:
            print("**Could not find GDP growth rate data!**")
    else:
        print("**Could not find GDP growth rate data!**")

print ("\nFinished finding all GDP growth rate data")

GDP growth rate of Bangladesh: 7.1
GDP growth rate of Bhutan: 5.9
GDP growth rate of India: 6.7
GDP growth rate of Kazakhstan: 3.3
GDP growth rate of Kyrgyzstan: 3.5
GDP growth rate of Maldives: 4.6
GDP growth rate of Nepal: 7.5
GDP growth rate of Pakistan: 5.3
GDP growth rate of Sri Lanka: 4.7
GDP growth rate of Tajikistan: 4.5
GDP growth rate of Turkmenistan: 6.5

Finished finding all GDP growth rate data


In [32]:
df_GDP_growth=pd.Series(GDP_growth).to_frame()
df_GDP_growth.columns=['GDP growth rate (%)']
df_GDP_growth.index.name='COUNTRY'
df_GDP_growth

Unnamed: 0_level_0,GDP growth rate (%)
COUNTRY,Unnamed: 1_level_1
Bangladesh,7.1
Bhutan,5.9
India,6.7
Kazakhstan,3.3
Kyrgyzstan,3.5
Maldives,4.6
Nepal,7.5
Pakistan,5.3
Sri Lanka,4.7
Tajikistan,4.5


In [34]:
# Initialize dictionary for holding the data
Internet_user = {}
# Iterate over every country
for i in range(1,len(country_names)-1):
    country= country_names[i]
    txt=text_data[country]       
    pos = txt.find('Internet users:')
    if pos!=-1: 
        pos= pos+len('Internet users: ')
        string = txt[pos:pos+50]
        #print(string)
        start=re.search('percent of population: ',string)
        end = re.search('%',string)
        if (start!=None and end!=None):
            start=start.end()
            end=end.start()
            a=string[start:end]
            if a[-1].isdigit():
                a = float(a)
                print(f"Internet users % of {country}: {a}")
                # Insert the data in the dictionary
                Internet_user[country]=a
            else:
                print("**Could not find Internet users data!**")
        else:
            print("**Could not find Internet users data!**")
    else:
        print("**Could not find Internet users data!**")

print ("\nFinished finding all Internet users data")

Internet users % of Bangladesh: 18.2
Internet users % of Bhutan: 41.8
Internet users % of India: 29.5
Internet users % of Kazakhstan: 76.8
Internet users % of Kyrgyzstan: 34.5
Internet users % of Maldives: 59.1
Internet users % of Nepal: 19.7
Internet users % of Pakistan: 15.5
Internet users % of Sri Lanka: 32.1
Internet users % of Tajikistan: 20.5
Internet users % of Turkmenistan: 18.0

Finished finding all Internet users data


# Internet-Use

In [35]:
df_Internet_user=pd.Series(Internet_user).to_frame()
df_Internet_user.columns=['Internet users (%)']
df_Internet_user.index.name='COUNTRY'
df_Internet_user

Unnamed: 0_level_0,Internet users (%)
COUNTRY,Unnamed: 1_level_1
Bangladesh,18.2
Bhutan,41.8
India,29.5
Kazakhstan,76.8
Kyrgyzstan,34.5
Maldives,59.1
Nepal,19.7
Pakistan,15.5
Sri Lanka,32.1
Tajikistan,20.5
