This notebook dynamically scrapes data from the CIA World Factbook. Use the table of contents to skip directly to visualizations. 
If you would like to know more about how this is implemented, you can read the lines starting with # in each code block.

This notebook updates with the latest CIA data every time you refresh your browser. The CIA's numbers shown here are updated every Monday.

# Near East Asia Division: CIA World Factbook Data 
[Countries in This Notebook](#NEA-Countries)

### Table of Contents
1. [Age Demographics](#Age)
2. [GDP Per Capita/Purchasing Power Parity](#GDP)
3. [Internet Usage %](#Internet-Use)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

In [2]:
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Read the HTML from the URL and pass on to BeautifulSoup
url = 'https://www.cia.gov/library/publications/the-world-factbook/'
print("Opening the file connection...")
uh= urllib.request.urlopen(url, context=ctx)
print("HTTP status",uh.getcode())
html =uh.read().decode()
print(f"Reading done. Total {len(html)} characters read.")

Opening the file connection...
HTTP status 200
Reading done. Total 75081 characters read.


In [3]:
soup = BeautifulSoup(html, 'html.parser')
#print(soup.prettify())

In [4]:
country_codes=['ag', 'ba','eg', 'ir', 'iz', 'is', 'jo', 'ku', 'le', 'ly', 'mo', 'mu', 'qa', 'sa', 'sy', 'ts', 'ae', 'ym' ]
country_names=['Algeria', 'Bahrain', 'Egypt', 'Iran', 'Iraq', 'Israel', 'Jordan', 'Kuwait', 'Lebanon', 'Libya', 'Morocco', 'Oman', 'Qatar', 'Saudi Arabia', 'Syria', 'Tunisia', 'United Arab Emirates', 'Yemen']

# NEA-Countries

In [19]:
print('COUNTRY NAMES\n'+'-'*30)
for country in country_names[1:]:
    print(country,end='\n')

COUNTRY NAMES
------------------------------
Bahrain
Egypt
Iran
Iraq
Israel
Jordan
Kuwait
Lebanon
Libya
Morocco
Oman
Qatar
Saudi Arabia
Syria
Tunisia
United Arab Emirates
Yemen


In [6]:
# Base URL
urlbase = 'https://www.cia.gov/library/publications/the-world-factbook/geos/'
demographics1=[]
demographics2=[]
demographics3=[]
demographics4=[]
demographics5=[]

offset = len('65 years and over: ')

# Iterate over every country
for i in range(1,len(country_names)-1):
    country_html=country_codes[i]+'.html'
    url_to_get=urlbase+country_html
    # Read the HTML from the URL and pass on to BeautifulSoup
    html = urllib.request.urlopen(url_to_get, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    
    txt=soup.get_text()
    pos1=txt.find('0-14 years: ')
    pos2=txt.find('15-24 years: ')
    pos3=txt.find('25-54 years: ')
    pos4=txt.find('55-64 years: ')
    pos5=txt.find('65 years and over: ')
    
    if pos1==-1:
        print(f"**0-14 years % data not found for {country_names[i]}!**")
        demographics1.append(np.nan)
    else:
        text=txt[pos1+12:pos1+18]
        end=re.search('%',text).start()
        a=float((txt[pos1+12:pos1+12+end]))
        demographics1.append(a)
        #print(f"0-14 years % data extraction complete for {country_names[i]}!")
    
    if pos2==-1:
        print(f"**15-24 years % data not found for {country_names[i]}!**")
        demographics2.append(np.nan)
    else:
        text=txt[pos2+13:pos2+19]
        end=re.search('%',text).start()
        a=float((txt[pos2+13:pos2+13+end]))
        demographics2.append(a)
        #print(f"15-24 years % data extraction complete for {country_names[i]}!")
        
    if pos3==-1:
        print(f"**25-54 years % data not found for {country_names[i]}!**")
        demographics3.append(np.nan)
    else:
        text=txt[pos3+13:pos3+19]
        end=re.search('%',text).start()
        a=float((txt[pos3+13:pos3+13+end]))
        demographics3.append(a)
        #print(f"25-54 years % data extraction complete for {country_names[i]}!")
    
    if pos4==-1:
        print(f"**55-64 years % data not found for {country_names[i]}!**")
        demographics4.append(np.nan)
    else:
        text=txt[pos4+13:pos4+19]
        end=re.search('%',text).start()
        a=float((txt[pos4+13:pos4+13+end]))
        demographics4.append(a)
        #print(f"55-64 years % data extraction complete for {country_names[i]}!")
    
    
    if pos5==-1:
        print(f"**65 years and beyond % data not found for {country_names[i]}!**")
        demographics5.append(np.nan)
    else:
        text=txt[pos5+offset:pos5+offset+6]
        end=re.search('%',text).start()
        a=float((txt[pos5+offset:pos5+offset+end]))
        demographics5.append(a)
        #print(f"65 years and beyond % data extraction complete for {country_names[i]}!")

# Age

In [7]:
data={
    '0-14 years old %':demographics1,
    '15-24 years %':demographics2,
    '25-54 years %':demographics3,
    '55-64 years %':demographics4,
    '65 years and above %':demographics5
}
#df1=pd.DataFrame(data=data,index=country_names[1:5])
df_demo=pd.DataFrame(data=data,index=country_names[1:len(country_codes)-1])
df_demo.index.name='COUNTRY'
df_demo.dropna(inplace=True)
df_demo

Unnamed: 0_level_0,0-14 years old %,15-24 years %,25-54 years %,55-64 years %,65 years and above %
COUNTRY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bahrain,19.08,15.65,56.04,6.28,2.95
Egypt,33.29,18.94,37.6,5.95,4.22
Iran,24.19,14.69,48.57,7.22,5.32
Iraq,39.46,19.25,33.84,3.99,3.46
Israel,27.51,15.53,37.17,8.46,11.33
Jordan,34.68,20.07,37.36,4.44,3.45
Kuwait,25.02,15.1,52.27,5.07,2.54
Lebanon,24.09,16.42,44.79,7.91,6.78
Libya,25.84,17.09,47.28,5.48,4.31
Morocco,25.77,17.04,42.32,8.13,6.74


In [8]:
# Base URL
urlbase = 'https://www.cia.gov/library/publications/the-world-factbook/geos/'
# Empty data dictionary
text_data=dict()

# Iterate over every country
for i in range(1,len(country_names)-1):
    country_html=country_codes[i]+'.html'
    url_to_get=urlbase+country_html
    # Read the HTML from the URL and pass on to BeautifulSoup
    html = urllib.request.urlopen(url_to_get, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    txt=soup.get_text()
    text_data[country_names[i]]=txt
    print(f"Finished loading data for {country_names[i]}")
    
print ("\n**Finished downloading all text data!**")

Finished loading data for Bahrain
Finished loading data for Egypt
Finished loading data for Iran
Finished loading data for Iraq
Finished loading data for Israel
Finished loading data for Jordan
Finished loading data for Kuwait
Finished loading data for Lebanon
Finished loading data for Libya
Finished loading data for Morocco
Finished loading data for Oman
Finished loading data for Qatar
Finished loading data for Saudi Arabia
Finished loading data for Syria
Finished loading data for Tunisia
Finished loading data for United Arab Emirates

**Finished downloading all text data!**


In [9]:
import pickle
pickle.dump(text_data,open("text_data_CIA_Factobook.p", "wb"))
text_data = pickle.load(open("text_data_CIA_Factobook.p", "rb"))
avg = 0
total=0
for k,v in text_data.items():
    total+=len(v)
print("Total length",total)
avg=total/len(text_data)
print("Average length",avg)
text_data.keys()

def convert_float(string):
    if string.isnumeric():
        return float(string)
    if string[0].isdigit():
        if ',' not in string and '.' in string:
            return float(string)
        if ',' not in string and '.' not in string:
            idx=string.find(' ')
            result = string[:idx]
            return float(result)
        idx1=string.find(',')
        idx2=string.find(' ')
        result = string[:idx1]+string[idx1+1:idx2]
        return float(result)
    else:
        return (-1)

Total length 910950
Average length 56934.375


In [10]:
# Initialize dictionary for holding the data
Total_GDP_PPP = {}
# Iterate over every country
for i in range(1,len(country_names)-1):
    country= country_names[i]
    txt=text_data[country]       
    pos = txt.find('GDP (purchasing power parity):')
    if pos!=-1: 
        pos= pos+len('GDP (purchasing power parity):')
        string = txt[pos+1:pos+15]
        start = re.search('\$',string)
        end = re.search('[b,t]',string)
        if (start!=None and end!=None):
            start=start.start()
            end=end.start()
            a=string[start+1:start+end-1]
            print(a)
            a = convert_float(a)
            if (string[end]=='t'):
                # If the GDP was in trillions, multiply it by 1000
                a=1000*a
            #print(f"Total GDP (PPP) of {country}: {a} billion")
            # Insert the data in the dictionary
            Total_GDP_PPP[country]=a
        else:
            print("**Could not find GDP data!**")
    else:
        print("**Could not find GDP data!**")

print ("\nFinished finding all GDP (Purchasing Power Parity) (in billion $) data")

69.77
1.199
1.631
660.7
315.6
89.05
302.5
87.89
63.14
300.1
187.9
341.7
1.789
50.28
135.9
691.9

Finished finding all GDP (Purchasing Power Parity) (in billion $) data


In [11]:
# Initialize dictionary for holding the data
GDP_PPP = {}
# Iterate over every country
for i in range(1,len(country_names)-1):
    country= country_names[i]
    txt=text_data[country]       
    pos = txt.find('GDP - per capita (PPP):')
    if pos!=-1:
        pos= pos+len('GDP - per capita (PPP):')
        string = txt[pos+1:pos+11]
        start = re.search('\$',string)
        end = re.search('\S',string)
        if (start!=None and end!=None):
            start=start.start()
            end=end.start()
            a=string[start+1:start+end-1]
            #print(a)
            a = convert_float(a)
            if (a!=-1.0):
                print(f"GDP/capita (PPP) of {country}: {a} dollars")
                # Insert the data in the dictionary
                GDP_PPP[country]=a
            else:
                print("**Could not find GDP/capita data!**")
        else:
            print("**Could not find GDP/capita data!**")
    else:
        print("**Could not find GDP/capita data!**")

print ("\nFinished finding all GDP/capita data")

GDP/capita (PPP) of Bahrain: 51800.0 dollars
GDP/capita (PPP) of Egypt: 13000.0 dollars
GDP/capita (PPP) of Iran: 20000.0 dollars
GDP/capita (PPP) of Iraq: 17000.0 dollars
GDP/capita (PPP) of Israel: 36200.0 dollars
GDP/capita (PPP) of Jordan: 12500.0 dollars
GDP/capita (PPP) of Kuwait: 69700.0 dollars
GDP/capita (PPP) of Lebanon: 19500.0 dollars
GDP/capita (PPP) of Libya: 9800.0 dollars
GDP/capita (PPP) of Morocco: 8600.0 dollars
GDP/capita (PPP) of Oman: 45500.0 dollars
GDP/capita (PPP) of Qatar: 124900.0 dollars
GDP/capita (PPP) of Saudi Arabia: 55300.0 dollars
GDP/capita (PPP) of Syria: 2900.0 dollars
GDP/capita (PPP) of Tunisia: 12000.0 dollars
GDP/capita (PPP) of United Arab Emirates: 68200.0 dollars

Finished finding all GDP/capita data


# GDP

In [12]:
df_GDP=pd.Series(GDP_PPP).to_frame()
df_GDP.columns=['GDP (PPP)']
df_GDP.index.name='COUNTRY'
df_GDP

Unnamed: 0_level_0,GDP (PPP)
COUNTRY,Unnamed: 1_level_1
Bahrain,51800.0
Egypt,13000.0
Iran,20000.0
Iraq,17000.0
Israel,36200.0
Jordan,12500.0
Kuwait,69700.0
Lebanon,19500.0
Libya,9800.0
Morocco,8600.0


In [13]:
# Initialize dictionary for holding the data
GDP_growth = {}
# Iterate over every country
for i in range(1,len(country_names)-1):
    country= country_names[i]
    txt=text_data[country]       
    pos = txt.find('GDP - real growth rate:')
    if pos!=-1: 
        pos= pos+len('GDP - real growth rate: ')
        string = txt[pos:pos+12]
        end = re.search('%',string)
        if (end!=None):
            end=end.start()
            a=string[:end]
            if a[-1].isdigit():
                a = float(a)
                print(f"GDP growth rate of {country}: {a}")
                # Insert the data in the dictionary
                GDP_growth[country]=a
            else:
                print("**Could not find GDP growth rate data!**")
        else:
            print("**Could not find GDP growth rate data!**")
    else:
        print("**Could not find GDP growth rate data!**")

print ("\nFinished finding all GDP growth rate data")

GDP growth rate of Bahrain: 2.5
GDP growth rate of Egypt: 4.1
GDP growth rate of Iran: 3.5
GDP growth rate of Iraq: -0.4
GDP growth rate of Israel: 3.1
GDP growth rate of Jordan: 2.3
GDP growth rate of Kuwait: -2.1
GDP growth rate of Lebanon: 1.5
GDP growth rate of Libya: 55.1
GDP growth rate of Morocco: 4.8
GDP growth rate of Oman: 0.0
GDP growth rate of Qatar: 2.5
GDP growth rate of Saudi Arabia: 0.1
**Could not find GDP growth rate data!**
GDP growth rate of Tunisia: 2.3
GDP growth rate of United Arab Emirates: 1.3

Finished finding all GDP growth rate data


In [14]:
df_GDP_growth=pd.Series(GDP_growth).to_frame()
df_GDP_growth.columns=['GDP growth rate (%)']
df_GDP_growth.index.name='COUNTRY'
df_GDP_growth

Unnamed: 0_level_0,GDP growth rate (%)
COUNTRY,Unnamed: 1_level_1
Bahrain,2.5
Egypt,4.1
Iran,3.5
Iraq,-0.4
Israel,3.1
Jordan,2.3
Kuwait,-2.1
Lebanon,1.5
Libya,55.1
Morocco,4.8


In [15]:
# Initialize dictionary for holding the data
Internet_user = {}
# Iterate over every country
for i in range(1,len(country_names)-1):
    country= country_names[i]
    txt=text_data[country]       
    pos = txt.find('Internet users:')
    if pos!=-1: 
        pos= pos+len('Internet users: ')
        string = txt[pos:pos+50]
        #print(string)
        start=re.search('percent of population: ',string)
        end = re.search('%',string)
        if (start!=None and end!=None):
            start=start.end()
            end=end.start()
            a=string[start:end]
            if a[-1].isdigit():
                a = float(a)
                print(f"Internet users % of {country}: {a}")
                # Insert the data in the dictionary
                Internet_user[country]=a
            else:
                print("**Could not find Internet users data!**")
        else:
            print("**Could not find Internet users data!**")
    else:
        print("**Could not find Internet users data!**")

print ("\nFinished finding all Internet users data")

Internet users % of Bahrain: 98.0
Internet users % of Egypt: 39.2
Internet users % of Iran: 44.1
Internet users % of Iraq: 21.2
Internet users % of Israel: 79.8
Internet users % of Jordan: 62.3
Internet users % of Kuwait: 78.4
Internet users % of Lebanon: 76.1
Internet users % of Libya: 20.3
Internet users % of Morocco: 58.3
Internet users % of Oman: 69.8
Internet users % of Qatar: 94.3
Internet users % of Saudi Arabia: 73.8
Internet users % of Syria: 31.9
Internet users % of Tunisia: 50.9
Internet users % of United Arab Emirates: 90.6

Finished finding all Internet users data


# Internet-Use

In [16]:
df_Internet_user=pd.Series(Internet_user).to_frame()
df_Internet_user.columns=['Internet users (%)']
df_Internet_user.index.name='COUNTRY'
df_Internet_user

Unnamed: 0_level_0,Internet users (%)
COUNTRY,Unnamed: 1_level_1
Bahrain,98.0
Egypt,39.2
Iran,44.1
Iraq,21.2
Israel,79.8
Jordan,62.3
Kuwait,78.4
Lebanon,76.1
Libya,20.3
Morocco,58.3
