In [1]:
import pandas as pd
import numpy as np

### Get all the pillar names from the excel

In [2]:
names = pd.read_excel('../../UNDP Digital Assessment Data Framework Filename Matching V7.xlsx')

In [3]:
col_names = ['Indicator','check', 'Data Source','Data Link','Raw/Index','Filename','Sub-Pillar']

In [4]:
names = names[col_names]

In [5]:
names.head()

Unnamed: 0,Indicator,check,Data Source,Data Link,Raw/Index,Filename,Sub-Pillar
0,Countries,,UN Statistics Division: List of Countries,https://unstats.un.org,Raw,Countries,
1,"Database of Global Administrative Areas (GADM,...",,,https://gadm.org,Raw,,
2,High Resolution Population Density Maps + Demo...,,,,,,
3,population density vs openstreetmap object den...,,,,,,
4,Population Density,Infrastructure,World Bank: World Development Indicators,https://datacatalog.worldbank.org,Raw,population_density,Connectivity Technology


In [6]:
# get all the files per pillar
data_stats = names.groupby('check').agg({'Filename':'count','Indicator':'count'})

In [7]:
data_stats

Unnamed: 0_level_0,Filename,Indicator
check,Unnamed: 1_level_1,Unnamed: 2_level_1
Business,19,28
Foundations,13,22
Government,11,16
Infrastructure,47,58
People,35,49
Regulation,5,8
Strategy,1,1


### Government

In [8]:
bnames = names[(names.check=='Government')&(~names.Filename.isna())]#&(names.Index==False)]

In [9]:
bnames.head(25)

Unnamed: 0,Indicator,check,Data Source,Data Link,Raw/Index,Filename,Sub-Pillar
63,Online-Service-Index (OSI),Government,UN: E-Government Survey,https://publicadministration.un.org/egovkb,Index,e_government_index,Digital Public Services
64,E-Participation index,Government,UN: E-Government Survey,https://publicadministration.un.org/egovkb,Index,e_government_index,Digital Public Services
65,Use of public services online (% of services o...,Government,Boston Consulting Group/Salesforce: The Global...,https://www.salesforce.com,Index,digital_public_service_use,Digital Public Services
66,Security incidents (# of relevant issues),Government,SPECOPS,https://specopssoft.com,Raw,cyber_attacks,Digital Public Services
67,Inherent Cyber Risk,Government,FM Global: FM Global Resilience Index,https://www.fmglobal.com/,Index,inherent_cyber_risk,Digital Public Services
68,What is the % change of government digitizing ...,Government,World Bank: GovTech Dataset,https://datacatalog.worldbank.org,Index,Egov_strategy,Digital Public Services
69,R&D spending (% of GDP),Government,World Bank: World Development Indicators,https://datacatalog.worldbank.org,Index,RD_Percentage_GDP,Funding and procurement
70,ICT investment as a percentage of GDP,Government,OECD: Going Digital Toolkit,https://data.oecd.org,Index,ICT_Investment,Funding and procurement
72,Evidence of digital strategies in/across Minis...,Government,World Bank: GovTech Dataset,https://datacatalog.worldbank.org,,Egov_strategy,Funding and procurement
73,Evidence of focus on vulnerable groups,Government,World Bank: GovTech Dataset,https://datacatalog.worldbank.org,,Egov_strategy,Leadership and coordination


In [10]:
# get list of names for all indicators
indicators = bnames.Indicator.unique()
subpillars = bnames['Sub-Pillar'].unique()

In [11]:
# get all file names
bfiles = bnames.Filename.unique()

In [12]:
bfiles

array(['e_government_index', 'digital_public_service_use',
       'cyber_attacks', 'inherent_cyber_risk', 'Egov_strategy',
       'RD_Percentage_GDP', 'ICT_Investment', 'digital_skill_level'],
      dtype=object)

In [13]:
subpillars

array(['Digital Public Services', 'Funding and procurement',
       'Leadership and coordination', 'Capabilities'], dtype=object)

In [14]:
# formula for converting scale 0-100
def convert_rank(old_value, old_min=0, old_max=max, new_min=1, new_max=5.99 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value

In [15]:
# formula for converting scale 0-1
def convert_rank_b(old_value, old_min=0, old_max=1, new_min=1, new_max=5.99 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value

In [16]:
# formula for converting scale quintile
def convert_rank_I(old_value, old_min,old_max,new_min=1, new_max=1.99 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value
def convert_rank_II(old_value, old_min, old_max, new_min=2, new_max=2.99 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value
def convert_rank_III(old_value, old_min, old_max, new_min=3, new_max=3.99 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value
def convert_rank_IV(old_value, old_min, old_max, new_min=4, new_max=4.99 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value
def convert_rank_V(old_value, old_min, old_max, new_min=5, new_max=5.99 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value

In [17]:
### Add population dataset
df = pd.read_csv('../../processed/Population.csv')

# Normalize country names as much as possible
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Bahamas.*$)', 'Bahamas (the)')
df['Country Name'] = df['Country Name'].replace('Bahrain (Kingdom of)','Bahrain')
df['Country Name'] = df['Country Name'].replace('Bolivia','Bolivia (Plurinational State of)')
df['Country Name'] = df['Country Name'].replace('Bolivia, Plurinational State of','Bolivia (Plurinational State of)')
df['Country Name'] = df['Country Name'].replace('Brunei','Brunei Darussalam')
df['Country Name'] = df['Country Name'].replace('Bulgaria (Rep.)','Bulgaria')
df['Country Name'] = df['Country Name'].replace('Central African Republic','Central African Republic (the)')
df['Country Name'] = df['Country Name'].replace('Central African Rep.','Central African Republic (the)')
df['Country Name'] = df['Country Name'].replace("China (People's Rep.)",'China')
df['Country Name'] = df['Country Name'].replace("Comoros",'Comoros (the)')
df['Country Name'] = df['Country Name'].replace("Congo",'Congo (the)')
df['Country Name'] = df['Country Name'].replace("Congo (Brazzaville)",'Congo (the)')
df['Country Name'] = df['Country Name'].replace("Congo (Rep. of the)",'Congo (the)')
df['Country Name'] = df['Country Name'].replace("Congo (Democratic Republic of the)",'Democratic Republic of the Congo (the)')
df['Country Name'] = df['Country Name'].replace("Congo, Dem. Rep.",'Democratic Republic of the Congo (the)')
df['Country Name'] = df['Country Name'].replace("Congo, The Democratic Republic of the",'Democratic Republic of the Congo (the)')
df['Country Name'] = df['Country Name'].replace("DR Congo",'Democratic Republic of the Congo (the)')
df['Country Name'] = df['Country Name'].replace("Democratic Republic of Congo",'Democratic Republic of the Congo (the)')
df['Country Name'] = df['Country Name'].replace("Democratic Republic of the Congo",'Democratic Republic of the Congo (the)')
df['Country Name'] = df['Country Name'].replace("Dem. Rep. of the Congo",'Democratic Republic of the Congo (the)')
df['Country Name'] = df['Country Name'].replace("Cote d'Ivoire","Côte d'Ivoire")
df['Country Name'] = df['Country Name'].replace("Côte d’Ivoire","Côte d'Ivoire")
df['Country Name'] = df['Country Name'].replace("Cote D'Ivoire","Côte d'Ivoire")
df['Country Name'] = df['Country Name'].replace("Cote dIvoire","Côte d'Ivoire")
df['Country Name'] = df['Country Name'].str.replace(r"(^.*Côte d'Ivoire.*$)", "Côte d'Ivoire")
df['Country Name'] = df['Country Name'].replace("Cōte d'Ivoire","Côte d'Ivoire")
df['Country Name'] = df['Country Name'].replace("Ivory Coast","Côte d'Ivoire")
df['Country Name'] = df['Country Name'].replace("Dem. People's Rep. of Korea","Democratic People's Republic of Korea (the)")
df['Country Name'] = df['Country Name'].replace("Democratic People's Republic of Korea","Democratic People's Republic of Korea (the)")
df['Country Name'] = df['Country Name'].replace("Korea, Dem. People's Rep.","Democratic People's Republic of Korea (the)")
df['Country Name'] = df['Country Name'].replace("North Korea","Democratic People's Republic of Korea (the)")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Czech.*$)', 'Czechia')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Dominican Re.*$)', 'Dominican Republic (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Hong Kong.*$)', 'China, Hong Kong Special Administrative Region')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Hongkong.*$)', 'China, Hong Kong Special Administrative Region')
df['Country Name'] = df['Country Name'].replace("Eswatini (Kingdom of)",'Eswatini')
df['Country Name'] = df['Country Name'].replace("Swaziland",'Eswatini')
df['Country Name'] = df['Country Name'].replace("Faröe Islands",'Faroe Islands')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Gambia.*$)', 'Gambia (the)')
df['Country Name'] = df['Country Name'].replace("Georgia (Country)",'Georgia')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Iran.*$)', 'Iran (Islamic Republic of)')
df['Country Name'] = df['Country Name'].replace("Korea",'Republic of Korea (the)')
df['Country Name'] = df['Country Name'].replace("Korea (Rep. of)",'Republic of Korea (the)')
df['Country Name'] = df['Country Name'].replace("Korea (Rep.)",'Republic of Korea (the)')
df['Country Name'] = df['Country Name'].replace("Korea (Republic of)",'Republic of Korea (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Korea, Rep.*$)', 'Republic of Korea (the)')
df['Country Name'] = df['Country Name'].replace("Korea, South",'Republic of Korea (the)')
df['Country Name'] = df['Country Name'].replace("South Korea",'Republic of Korea (the)')
df['Country Name'] = df['Country Name'].replace("Republic of Korea",'Republic of Korea (the)')
df['Country Name'] = df['Country Name'].replace('Republic of Congo','Congo (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Kyrgyz.*$)', 'Kyrgyzstan')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Lao.*$)', "Lao People's Democratic Republic (the)")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Macao.*$)', "China, Macao Special Administrative Region")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Macau.*$)', "China, Macao Special Administrative Region")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Micronesia.*$)', "Micronesia (Federated States of)")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Moldova.*$)', "Republic of Moldova (the)")
df['Country Name'] = df['Country Name'].replace("Morroco",'Morocco')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Nepal.*$)', "Nepal")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*New Ze.*$)', "New Zealand")
df['Country Name'] = df['Country Name'].replace("Niger",'Niger (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Macedonia.*$)', "North Macedonia")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*New Ze.*$)', "New Zealand")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Palestin.*$)', "State of Palestine (the)")
df['Country Name'] = df['Country Name'].replace("West Bank and Gaza",'State of Palestine (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Panama.*$)', "Panama")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Philippines.*$)', "Philippines (the)")
df['Country Name'] = df['Country Name'].replace("Republic of the Congo",'Congo (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Myanmar.*$)', "Myanmar")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Puerto Rico.*$)', "Puerto Rico")
df['Country Name'] = df['Country Name'].replace("Russia",'Russian Federation (the)')
df['Country Name'] = df['Country Name'].replace("Russian Federation",'Russian Federation (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Slovak.*$)', "Slovakia")
df['Country Name'] = df['Country Name'].str.replace(r'\bSudan\b', 'Sudan (the)')
df['Country Name'] = df['Country Name'].str.replace(r'\bSudan (the)\b', 'Sudan (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*South Sudan.*$)', "South Sudan")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Syria.*$)', "Syrian Arab Republic (the)")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*São Tomé.*$)', "Sao Tome and Principe")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Taiwan.*$)', "Taiwan")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Taipei.*$)', "Taiwan")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Tanzania.*$)', "United Republic of Tanzania (the)")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Netherlands.*$)', "Netherlands (the)")
df['Country Name'] = df['Country Name'].replace("UAE",'United Arab Emirates (the)')
df['Country Name'] = df['Country Name'].replace("U.A.E",'United Arab Emirates (the)')
df['Country Name'] = df['Country Name'].replace("United Arab Emirates",'United Arab Emirates (the)')
df['Country Name'] = df['Country Name'].replace('United Kingdom','United Kingdom of Great Britain and Northern Ireland (the)')
df['Country Name'] = df['Country Name'].replace('UK','United Kingdom of Great Britain and Northern Ireland (the)')
df['Country Name'] = df['Country Name'].replace("Great Britain",'United Kingdom of Great Britain and Northern Ireland (the)')
df['Country Name'] = df['Country Name'].replace("United Kingdom of Great Britain and Northern Ireland",'United Kingdom of Great Britain and Northern Ireland (the)')
df['Country Name'] = df['Country Name'].replace('Vietnam','Viet Nam')
df['Country Name'] = df['Country Name'].replace('United States','United States of America (the)')
df['Country Name'] = df['Country Name'].replace('USA','United States of America (the)')
df['Country Name'] = df['Country Name'].replace('United States of America','United States of America (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Virgin Islands.*$)', "United States Virgin Islands")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Vatican.*$)', "Vatican")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Venezuela.*$)', "Venezuela (Bolivarian Republic of)")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Yemen.*$)', "Yemen")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Arab world.*$)', "Arab World")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*World.*$)', "World")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Kitts and Nevis.*$)', "Saint Kitts and Nevis")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Lucia.*$)', "Saint Lucia")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Martin (French Part).*$)', "Saint Martin (French Part)")
df['Country Name'] = df['Country Name'].replace('Sint Maarten','Saint Martin')
df['Country Name'] = df['Country Name'].replace('St. Martin (French part)','Saint Martin (French Part)')
df['Country Name'] = df['Country Name'].replace('Sint Maarten (Dutch part)','Saint Martin (Dutch Part)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Vincent and the Grenadines.*$)', "Saint Vincent and the Grenadines")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Verde.*$)', "Cabo Verde")
df['Country Name'] = df['Country Name'].replace('Congo, Democratic Republic','Democratic Republic of the Congo (the)')
df['Country Name'] = df['Country Name'].replace('Congo, Rep.','Congo (the)')
df['Country Name'] = df['Country Name'].replace('Republic of Congo','Congo (the)')
df['Country Name'] = df['Country Name'].replace('Congo (Rep.)','Congo (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Egypt.*$)', "Egypt")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Korea, D.*$)', "Democratic People's Republic of Korea (the)")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Tobago.*$)', "Trinidad and Tobago")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Timor-Leste.*$)', "Timor-Leste")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Emirates.*$)', "United Arab Emirates (the)")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Papua.*$)', "Papua New Guinea")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Bissau.*$)', "Guinea-Bissau")
df['Country Name'] = df['Country Name'].replace('Eq. Guinea','Equatorial Guinea')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Burma.*$)', "Myanmar")
df['Country Name'] = df['Country Name'].replace('C.A. Republic','Central African Republic (the)')
df['Country Name'] = df['Country Name'].replace('Ant.& Barb.','Antigua and Barbuda')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Bosnia.*$)', "Bosnia and Herzegovina")
df['Country Name'] = df['Country Name'].replace('Domin. Rep.','Dominican Republic (the)')
df['Country Name'] = df['Country Name'].replace('Dominica (Commonwealth of)','Dominica')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*European Union.*$)', "European Union")
df['Country Name'] = df['Country Name'].replace('R. of Congo','Congo (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Principe.*$)', "Sao Tome and Principe")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Solomon.*$)', "Solomon Islands")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Vincent.*$)', "Saint Vincent and the Grenadines")
df['Country Name'] = df['Country Name'].replace('Curacao','Curaçao')
df['Country Name'] = df['Country Name'].replace('Reunion','Réunion')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Kosovo.*$)', "Kosovo (UNSCR 1244)")

pop = df
pop

  df['Country Name'] = df['Country Name'].str.replace(r'(^.*Bahamas.*$)', 'Bahamas (the)')
  df['Country Name'] = df['Country Name'].str.replace(r"(^.*Côte d'Ivoire.*$)", "Côte d'Ivoire")
  df['Country Name'] = df['Country Name'].str.replace(r'(^.*Czech.*$)', 'Czechia')
  df['Country Name'] = df['Country Name'].str.replace(r'(^.*Dominican Re.*$)', 'Dominican Republic (the)')
  df['Country Name'] = df['Country Name'].str.replace(r'(^.*Hong Kong.*$)', 'China, Hong Kong Special Administrative Region')
  df['Country Name'] = df['Country Name'].str.replace(r'(^.*Hongkong.*$)', 'China, Hong Kong Special Administrative Region')
  df['Country Name'] = df['Country Name'].str.replace(r'(^.*Gambia.*$)', 'Gambia (the)')
  df['Country Name'] = df['Country Name'].str.replace(r'(^.*Iran.*$)', 'Iran (Islamic Republic of)')
  df['Country Name'] = df['Country Name'].str.replace(r'(^.*Korea, Rep.*$)', 'Republic of Korea (the)')
  df['Country Name'] = df['Country Name'].str.replace(r'(^.*Kyrgyz.*$)', 'Kyr

Unnamed: 0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2015,2016,2017,2018,2019,2020,Country Name,Country Code,Indicator Name,Indicator Code
0,54208.0,55434.0,56234.0,56699.0,57029.0,57357.0,57702.0,58044.0,58377.0,58734.0,...,104339.0,104865.0,105361.0,105846.0,106310.0,106766.0,Aruba,ABW,"Population, total",SP.POP.TOTL
1,130836765.0,134159786.0,137614644.0,141202036.0,144920186.0,148769974.0,152752671.0,156876454.0,161156430.0,165611760.0,...,593871847.0,609978946.0,626392880.0,643090131.0,660046272.0,677243299.0,Africa Eastern and Southern,AFE,"Population, total",SP.POP.TOTL
2,8996967.0,9169406.0,9351442.0,9543200.0,9744772.0,9956318.0,10174840.0,10399936.0,10637064.0,10893772.0,...,34413603.0,35383028.0,36296111.0,37171922.0,38041757.0,38928341.0,Afghanistan,AFG,"Population, total",SP.POP.TOTL
3,96396419.0,98407221.0,100506960.0,102691339.0,104953470.0,107289875.0,109701811.0,112195950.0,114781116.0,117468741.0,...,401586651.0,412551299.0,423769930.0,435229381.0,446911598.0,458803476.0,Africa Western and Central,AFW,"Population, total",SP.POP.TOTL
4,5454938.0,5531451.0,5608499.0,5679409.0,5734995.0,5770573.0,5781305.0,5774440.0,5771973.0,5803677.0,...,27884380.0,28842482.0,29816769.0,30809787.0,31825299.0,32866268.0,Angola,AGO,"Population, total",SP.POP.TOTL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,947000.0,966000.0,994000.0,1022000.0,1050000.0,1078000.0,1106000.0,1135000.0,1163000.0,1191000.0,...,1788196.0,1777557.0,1791003.0,1797085.0,1788878.0,1775378.0,Kosovo (UNSCR 1244),XKX,"Population, total",SP.POP.TOTL
262,5315351.0,5393034.0,5473671.0,5556767.0,5641598.0,5727745.0,5816241.0,5907873.0,6001858.0,6097042.0,...,26497881.0,27168210.0,27834811.0,28498683.0,29161922.0,29825968.0,Yemen,YEM,"Population, total",SP.POP.TOTL
263,17099836.0,17524533.0,17965733.0,18423157.0,18896303.0,19384838.0,19888259.0,20406863.0,20942147.0,21496075.0,...,55386369.0,56207649.0,57009751.0,57792520.0,58558267.0,59308690.0,South Africa,ZAF,"Population, total",SP.POP.TOTL
264,3070780.0,3164330.0,3260645.0,3360099.0,3463211.0,3570466.0,3681953.0,3797877.0,3918872.0,4045740.0,...,15879370.0,16363449.0,16853608.0,17351714.0,17861034.0,18383956.0,Zambia,ZMB,"Population, total",SP.POP.TOTL


### 1. Online-Service-Index (OSI)

In [18]:
indicators[0]

'Online-Service-Index (OSI)'

In [19]:
# load data
indicator = indicators[0]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Online-Service-Index (OSI)
e_government_index


In [20]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Digital Public Services


In [21]:
df.head()

Unnamed: 0,Survey Year,Country Name,E-Government Rank,E-Government Index,E-Participation Index,Online Service Index,Human Capital Index,Telecommunication Infrastructure Index
0,2020,Iraq,143,0.436,0.3095,0.3353,0.4358,0.537
1,2020,Ireland,27,0.8433,0.8571,0.7706,0.9494,0.81
2,2020,Israel,30,0.8361,0.7143,0.7471,0.8924,0.8689
3,2020,Italy,37,0.8231,0.8214,0.8294,0.8466,0.7932
4,2020,Jamaica,114,0.5392,0.369,0.3882,0.7142,0.5151


In [22]:
# all data from 2020
df['Survey Year'].value_counts()

2020    193
Name: Survey Year, dtype: int64

In [23]:
# score looks like the one to use
df.describe()

Unnamed: 0,Survey Year,E-Government Rank,E-Government Index,E-Participation Index,Online Service Index,Human Capital Index,Telecommunication Infrastructure Index
count,193.0,193.0,193.0,193.0,193.0,193.0,193.0
mean,2020.0,97.0,0.598767,0.567723,0.561961,0.687992,0.546354
std,0.0,55.858452,0.214869,0.259592,0.249874,0.19444,0.259358
min,2020.0,1.0,0.0875,0.0,0.0,0.0,0.0
25%,2020.0,49.0,0.432,0.3571,0.3529,0.5599,0.3496
50%,2020.0,97.0,0.6129,0.5714,0.5765,0.7395,0.5669
75%,2020.0,145.0,0.7798,0.7976,0.7647,0.8414,0.7723
max,2020.0,193.0,0.9758,1.0,1.0,1.0,1.0


In [24]:
# df.Indicator.unique()

In [25]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Online Service Index'] 
df['Year'] = df['Survey Year']
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank_b(row,old_min=0,old_max=1))



In [26]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Iraq,2020,Online-Service-Index (OSI),0.3353,2.673147,True,Digital Public Services
1,Ireland,2020,Online-Service-Index (OSI),0.7706,4.845294,True,Digital Public Services
2,Israel,2020,Online-Service-Index (OSI),0.7471,4.728029,True,Digital Public Services
3,Italy,2020,Online-Service-Index (OSI),0.8294,5.138706,True,Digital Public Services
4,Jamaica,2020,Online-Service-Index (OSI),0.3882,2.937118,True,Digital Public Services
...,...,...,...,...,...,...,...
188,Senegal,2020,Online-Service-Index (OSI),0.4941,3.465559,True,Digital Public Services
189,Serbia,2020,Online-Service-Index (OSI),0.7941,4.962559,True,Digital Public Services
190,Seychelles,2020,Online-Service-Index (OSI),0.6176,4.081824,True,Digital Public Services
191,Singapore,2020,Online-Service-Index (OSI),0.9647,5.813853,True,Digital Public Services


In [27]:
# output scores
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator))

## 2. E-Participation index


In [28]:
indicator = indicators[1]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

E-Participation index
e_government_index


In [29]:
df.head()

Unnamed: 0,Survey Year,Country Name,E-Government Rank,E-Government Index,E-Participation Index,Online Service Index,Human Capital Index,Telecommunication Infrastructure Index
0,2020,Iraq,143,0.436,0.3095,0.3353,0.4358,0.537
1,2020,Ireland,27,0.8433,0.8571,0.7706,0.9494,0.81
2,2020,Israel,30,0.8361,0.7143,0.7471,0.8924,0.8689
3,2020,Italy,37,0.8231,0.8214,0.8294,0.8466,0.7932
4,2020,Jamaica,114,0.5392,0.369,0.3882,0.7142,0.5151


In [30]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Digital Public Services


In [31]:
df['Survey Year'].unique()

array([2020], dtype=int64)

In [32]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df.iloc[:,4]
df['Year'] = df['Survey Year']
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank_b(row,old_min=0,old_max=1))


In [33]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Iraq,2020,E-Participation index,0.3095,2.544405,True,Digital Public Services
1,Ireland,2020,E-Participation index,0.8571,5.276929,True,Digital Public Services
2,Israel,2020,E-Participation index,0.7143,4.564357,True,Digital Public Services
3,Italy,2020,E-Participation index,0.8214,5.098786,True,Digital Public Services
4,Jamaica,2020,E-Participation index,0.3690,2.841310,True,Digital Public Services
...,...,...,...,...,...,...,...
188,Senegal,2020,E-Participation index,0.4405,3.198095,True,Digital Public Services
189,Serbia,2020,E-Participation index,0.8214,5.098786,True,Digital Public Services
190,Seychelles,2020,E-Participation index,0.5714,3.851286,True,Digital Public Services
191,Singapore,2020,E-Participation index,0.9762,5.871238,True,Digital Public Services


In [34]:
# output scores
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator))

## 3. Use of public services online (% of services online, penetration, frequency of use)


In [35]:
indicator = indicators[2]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Use of public services online (% of services online, penetration, frequency of use)
digital_public_service_use


In [36]:
# drop first row
df = df.iloc[1: , :]

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 1 to 35
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Country Name           35 non-null     object 
 1   User Satisfaction (%)  35 non-null     float64
dtypes: float64(1), object(1)
memory usage: 692.0+ bytes


In [38]:
df.head(15)

Unnamed: 0,Country Name,User Satisfaction (%)
1,India,86.1
2,Saudi Arabia,84.2
3,China,81.8
4,Singapore,80.9
5,France,79.1
6,Chile,78.3
7,Hong Kong,76.6
8,Kenya,76.1
9,Poland,75.2
10,Australia,74.5


In [39]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Digital Public Services


In [40]:
# create the standard columns
df['higher_is_better'] = True
df['Indicator'] =  indicator
# df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df['User Satisfaction (%)']
df['Year'] = 2020
df['Sub-Pillar'] = subpillar

In [41]:
# convert 1-100 %  into 1-6
min_rank = df['data_col'].min()
max_rank = df['data_col'].max()
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=0,old_max=100))

df.sort_values(by='new_rank_score', ascending=False)

# prepare output
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]

# output scores
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator), index=False)

In [42]:
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
1,India,2020,Use of public services online (% of services o...,86.1,5.29639,True,Digital Public Services
2,Saudi Arabia,2020,Use of public services online (% of services o...,84.2,5.20158,True,Digital Public Services
3,China,2020,Use of public services online (% of services o...,81.8,5.08182,True,Digital Public Services
4,Singapore,2020,Use of public services online (% of services o...,80.9,5.03691,True,Digital Public Services
5,France,2020,Use of public services online (% of services o...,79.1,4.94709,True,Digital Public Services
6,Chile,2020,Use of public services online (% of services o...,78.3,4.90717,True,Digital Public Services
7,Hong Kong,2020,Use of public services online (% of services o...,76.6,4.82234,True,Digital Public Services
8,Kenya,2020,Use of public services online (% of services o...,76.1,4.79739,True,Digital Public Services
9,Poland,2020,Use of public services online (% of services o...,75.2,4.75248,True,Digital Public Services
10,Australia,2020,Use of public services online (% of services o...,74.5,4.71755,True,Digital Public Services


## 4. Security incidents (# of relevant issues)



In [43]:
indicator = indicators[3]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Security incidents (# of relevant issues)
cyber_attacks


In [44]:
df.head()

Unnamed: 0,Country,Number of Significant Cyberattacks (2006-2020)
0,United States,156
1,United Kingdom,47
2,India,23
3,Germany,21
4,South Korea,18


In [45]:
# create a rank from the number of attacks fields
df['data_rank'] = df['Number of Significant Cyberattacks (2006-2020)'].rank(method='max')

In [46]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Digital Public Services


In [47]:
# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = False
df['Indicator'] = indicator
df['data_col'] = df['data_rank']
df['Year'] = 2020
df['Sub-Pillar'] = subpillar

In [48]:
df = df[['Country Name', 'Year','Indicator','data_col','higher_is_better','Sub-Pillar']]

# Normalize country names as much as possible
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Bahamas.*$)', 'Bahamas (the)')
df['Country Name'] = df['Country Name'].replace('Bahrain (Kingdom of)','Bahrain')
df['Country Name'] = df['Country Name'].replace('Bolivia','Bolivia (Plurinational State of)')
df['Country Name'] = df['Country Name'].replace('Bolivia, Plurinational State of','Bolivia (Plurinational State of)')
df['Country Name'] = df['Country Name'].replace('Brunei','Brunei Darussalam')
df['Country Name'] = df['Country Name'].replace('Bulgaria (Rep.)','Bulgaria')
df['Country Name'] = df['Country Name'].replace('Central African Republic','Central African Republic (the)')
df['Country Name'] = df['Country Name'].replace('Central African Rep.','Central African Republic (the)')
df['Country Name'] = df['Country Name'].replace("China (People's Rep.)",'China')
df['Country Name'] = df['Country Name'].replace("Comoros",'Comoros (the)')
df['Country Name'] = df['Country Name'].replace("Congo",'Congo (the)')
df['Country Name'] = df['Country Name'].replace("Congo (Brazzaville)",'Congo (the)')
df['Country Name'] = df['Country Name'].replace("Congo (Rep. of the)",'Congo (the)')
df['Country Name'] = df['Country Name'].replace("Congo (Democratic Republic of the)",'Democratic Republic of the Congo (the)')
df['Country Name'] = df['Country Name'].replace("Congo, Dem. Rep.",'Democratic Republic of the Congo (the)')
df['Country Name'] = df['Country Name'].replace("Congo, The Democratic Republic of the",'Democratic Republic of the Congo (the)')
df['Country Name'] = df['Country Name'].replace("DR Congo",'Democratic Republic of the Congo (the)')
df['Country Name'] = df['Country Name'].replace("Democratic Republic of Congo",'Democratic Republic of the Congo (the)')
df['Country Name'] = df['Country Name'].replace("Democratic Republic of the Congo",'Democratic Republic of the Congo (the)')
df['Country Name'] = df['Country Name'].replace("Dem. Rep. of the Congo",'Democratic Republic of the Congo (the)')
df['Country Name'] = df['Country Name'].replace("Cote d'Ivoire","Côte d'Ivoire")
df['Country Name'] = df['Country Name'].replace("Côte d’Ivoire","Côte d'Ivoire")
df['Country Name'] = df['Country Name'].replace("Cote D'Ivoire","Côte d'Ivoire")
df['Country Name'] = df['Country Name'].replace("Cote dIvoire","Côte d'Ivoire")
df['Country Name'] = df['Country Name'].str.replace(r"(^.*Côte d'Ivoire.*$)", "Côte d'Ivoire")
df['Country Name'] = df['Country Name'].replace("Cōte d'Ivoire","Côte d'Ivoire")
df['Country Name'] = df['Country Name'].replace("Ivory Coast","Côte d'Ivoire")
df['Country Name'] = df['Country Name'].replace("Dem. People's Rep. of Korea","Democratic People's Republic of Korea (the)")
df['Country Name'] = df['Country Name'].replace("Democratic People's Republic of Korea","Democratic People's Republic of Korea (the)")
df['Country Name'] = df['Country Name'].replace("Korea, Dem. People's Rep.","Democratic People's Republic of Korea (the)")
df['Country Name'] = df['Country Name'].replace("North Korea","Democratic People's Republic of Korea (the)")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Czech.*$)', 'Czechia')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Dominican Re.*$)', 'Dominican Republic (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Hong Kong.*$)', 'China, Hong Kong Special Administrative Region')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Hongkong.*$)', 'China, Hong Kong Special Administrative Region')
df['Country Name'] = df['Country Name'].replace("Eswatini (Kingdom of)",'Eswatini')
df['Country Name'] = df['Country Name'].replace("Swaziland",'Eswatini')
df['Country Name'] = df['Country Name'].replace("Faröe Islands",'Faroe Islands')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Gambia.*$)', 'Gambia (the)')
df['Country Name'] = df['Country Name'].replace("Georgia (Country)",'Georgia')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Iran.*$)', 'Iran (Islamic Republic of)')
df['Country Name'] = df['Country Name'].replace("Korea",'Republic of Korea (the)')
df['Country Name'] = df['Country Name'].replace("Korea (Rep. of)",'Republic of Korea (the)')
df['Country Name'] = df['Country Name'].replace("Korea (Rep.)",'Republic of Korea (the)')
df['Country Name'] = df['Country Name'].replace("Korea (Republic of)",'Republic of Korea (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Korea, Rep.*$)', 'Republic of Korea (the)')
df['Country Name'] = df['Country Name'].replace("Korea, South",'Republic of Korea (the)')
df['Country Name'] = df['Country Name'].replace("South Korea",'Republic of Korea (the)')
df['Country Name'] = df['Country Name'].replace("Republic of Korea",'Republic of Korea (the)')
df['Country Name'] = df['Country Name'].replace('Republic of Congo','Congo (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Kyrgyz.*$)', 'Kyrgyzstan')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Lao.*$)', "Lao People's Democratic Republic (the)")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Macao.*$)', "China, Macao Special Administrative Region")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Macau.*$)', "China, Macao Special Administrative Region")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Micronesia.*$)', "Micronesia (Federated States of)")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Moldova.*$)', "Republic of Moldova (the)")
df['Country Name'] = df['Country Name'].replace("Morroco",'Morocco')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Nepal.*$)', "Nepal")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*New Ze.*$)', "New Zealand")
df['Country Name'] = df['Country Name'].replace("Niger",'Niger (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Macedonia.*$)', "North Macedonia")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*New Ze.*$)', "New Zealand")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Palestin.*$)', "State of Palestine (the)")
df['Country Name'] = df['Country Name'].replace("West Bank and Gaza",'State of Palestine (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Panama.*$)', "Panama")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Philippines.*$)', "Philippines (the)")
df['Country Name'] = df['Country Name'].replace("Republic of the Congo",'Congo (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Myanmar.*$)', "Myanmar")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Puerto Rico.*$)', "Puerto Rico")
df['Country Name'] = df['Country Name'].replace("Russia",'Russian Federation (the)')
df['Country Name'] = df['Country Name'].replace("Russian Federation",'Russian Federation (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Slovak.*$)', "Slovakia")
df['Country Name'] = df['Country Name'].str.replace(r'\bSudan\b', 'Sudan (the)')
df['Country Name'] = df['Country Name'].str.replace(r'\bSudan (the)\b', 'Sudan (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*South Sudan.*$)', "South Sudan")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Syria.*$)', "Syrian Arab Republic (the)")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*São Tomé.*$)', "Sao Tome and Principe")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Taiwan.*$)', "Taiwan")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Taipei.*$)', "Taiwan")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Tanzania.*$)', "United Republic of Tanzania (the)")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Netherlands.*$)', "Netherlands (the)")
df['Country Name'] = df['Country Name'].replace("UAE",'United Arab Emirates (the)')
df['Country Name'] = df['Country Name'].replace("U.A.E",'United Arab Emirates (the)')
df['Country Name'] = df['Country Name'].replace("United Arab Emirates",'United Arab Emirates (the)')
df['Country Name'] = df['Country Name'].replace('United Kingdom','United Kingdom of Great Britain and Northern Ireland (the)')
df['Country Name'] = df['Country Name'].replace('UK','United Kingdom of Great Britain and Northern Ireland (the)')
df['Country Name'] = df['Country Name'].replace("Great Britain",'United Kingdom of Great Britain and Northern Ireland (the)')
df['Country Name'] = df['Country Name'].replace("United Kingdom of Great Britain and Northern Ireland",'United Kingdom of Great Britain and Northern Ireland (the)')
df['Country Name'] = df['Country Name'].replace('Vietnam','Viet Nam')
df['Country Name'] = df['Country Name'].replace('United States','United States of America (the)')
df['Country Name'] = df['Country Name'].replace('USA','United States of America (the)')
df['Country Name'] = df['Country Name'].replace('United States of America','United States of America (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Virgin Islands.*$)', "United States Virgin Islands")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Vatican.*$)', "Vatican")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Venezuela.*$)', "Venezuela (Bolivarian Republic of)")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Yemen.*$)', "Yemen")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Arab world.*$)', "Arab World")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*World.*$)', "World")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Kitts and Nevis.*$)', "Saint Kitts and Nevis")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Lucia.*$)', "Saint Lucia")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Martin (French Part).*$)', "Saint Martin (French Part)")
df['Country Name'] = df['Country Name'].replace('Sint Maarten','Saint Martin')
df['Country Name'] = df['Country Name'].replace('St. Martin (French part)','Saint Martin (French Part)')
df['Country Name'] = df['Country Name'].replace('Sint Maarten (Dutch part)','Saint Martin (Dutch Part)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Vincent and the Grenadines.*$)', "Saint Vincent and the Grenadines")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Verde.*$)', "Cabo Verde")
df['Country Name'] = df['Country Name'].replace('Congo, Democratic Republic','Democratic Republic of the Congo (the)')
df['Country Name'] = df['Country Name'].replace('Congo, Rep.','Congo (the)')
df['Country Name'] = df['Country Name'].replace('Republic of Congo','Congo (the)')
df['Country Name'] = df['Country Name'].replace('Congo (Rep.)','Congo (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Egypt.*$)', "Egypt")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Korea, D.*$)', "Democratic People's Republic of Korea (the)")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Tobago.*$)', "Trinidad and Tobago")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Timor-Leste.*$)', "Timor-Leste")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Emirates.*$)', "United Arab Emirates (the)")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Papua.*$)', "Papua New Guinea")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Bissau.*$)', "Guinea-Bissau")
df['Country Name'] = df['Country Name'].replace('Eq. Guinea','Equatorial Guinea')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Burma.*$)', "Myanmar")
df['Country Name'] = df['Country Name'].replace('C.A. Republic','Central African Republic (the)')
df['Country Name'] = df['Country Name'].replace('Ant.& Barb.','Antigua and Barbuda')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Bosnia.*$)', "Bosnia and Herzegovina")
df['Country Name'] = df['Country Name'].replace('Domin. Rep.','Dominican Republic (the)')
df['Country Name'] = df['Country Name'].replace('Dominica (Commonwealth of)','Dominica')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*European Union.*$)', "European Union")
df['Country Name'] = df['Country Name'].replace('R. of Congo','Congo (the)')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Principe.*$)', "Sao Tome and Principe")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Solomon.*$)', "Solomon Islands")
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Vincent.*$)', "Saint Vincent and the Grenadines")
df['Country Name'] = df['Country Name'].replace('Curacao','Curaçao')
df['Country Name'] = df['Country Name'].replace('Reunion','Réunion')
df['Country Name'] = df['Country Name'].str.replace(r'(^.*Kosovo.*$)', "Kosovo (UNSCR 1244)")

df

  df['Country Name'] = df['Country Name'].str.replace(r'(^.*Bahamas.*$)', 'Bahamas (the)')
  df['Country Name'] = df['Country Name'].str.replace(r"(^.*Côte d'Ivoire.*$)", "Côte d'Ivoire")
  df['Country Name'] = df['Country Name'].str.replace(r'(^.*Czech.*$)', 'Czechia')
  df['Country Name'] = df['Country Name'].str.replace(r'(^.*Dominican Re.*$)', 'Dominican Republic (the)')
  df['Country Name'] = df['Country Name'].str.replace(r'(^.*Hong Kong.*$)', 'China, Hong Kong Special Administrative Region')
  df['Country Name'] = df['Country Name'].str.replace(r'(^.*Hongkong.*$)', 'China, Hong Kong Special Administrative Region')
  df['Country Name'] = df['Country Name'].str.replace(r'(^.*Gambia.*$)', 'Gambia (the)')
  df['Country Name'] = df['Country Name'].str.replace(r'(^.*Iran.*$)', 'Iran (Islamic Republic of)')
  df['Country Name'] = df['Country Name'].str.replace(r'(^.*Korea, Rep.*$)', 'Republic of Korea (the)')
  df['Country Name'] = df['Country Name'].str.replace(r'(^.*Kyrgyz.*$)', 'Kyr

Unnamed: 0,Country Name,Year,Indicator,data_col,higher_is_better,Sub-Pillar
0,United States of America (the),2020,Security incidents (# of relevant issues),20.0,False,Digital Public Services
1,United Kingdom of Great Britain and Northern I...,2020,Security incidents (# of relevant issues),19.0,False,Digital Public Services
2,India,2020,Security incidents (# of relevant issues),18.0,False,Digital Public Services
3,Germany,2020,Security incidents (# of relevant issues),17.0,False,Digital Public Services
4,Republic of Korea (the),2020,Security incidents (# of relevant issues),16.0,False,Digital Public Services
5,Australia,2020,Security incidents (# of relevant issues),15.0,False,Digital Public Services
6,Ukraine,2020,Security incidents (# of relevant issues),15.0,False,Digital Public Services
7,China,2020,Security incidents (# of relevant issues),13.0,False,Digital Public Services
8,Iran (Islamic Republic of),2020,Security incidents (# of relevant issues),13.0,False,Digital Public Services
9,Saudi Arabia,2020,Security incidents (# of relevant issues),13.0,False,Digital Public Services


In [49]:
df=df.merge(pop,how='outer',on='Country Name')
# Calculate the number of incidents per 1,000,000 people
df['new_data_col'] = df['data_col']/df['2020']*1000000
df=df[df['Indicator'].notna()]
df=df[['Country Name','Year','Indicator','new_data_col','higher_is_better','Sub-Pillar']]

min_rank = df['new_data_col'].min()
max_rank = df['new_data_col'].max()

In [50]:
first = df['new_data_col'].quantile(1/5)
second = df['new_data_col'].quantile(2/5)
third = df['new_data_col'].quantile(3/5)
fourth = df['new_data_col'].quantile(4/5)
fifth = df['new_data_col'].quantile(5/5)
data = {'Min':min_rank,'Max':max_rank,'1':first,'2':second,'3':third,'4':fourth,'5':fifth}
sextile = pd.DataFrame(data, index=[0])
sextile

Unnamed: 0,Min,Max,1,2,3,4,5
0,0.009272,0.867971,0.033922,0.071961,0.21726,0.346578,0.867971


In [51]:
df.rename(columns={'new_data_col':'data_col'}, inplace=True)
min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

df.loc[df['data_col'] <= first, 'new_rank_score'] = df['data_col'].apply(lambda row: convert_rank_I(row, old_min=min_rank,old_max=first))
df.loc[(df['data_col'] > first) & (df['data_col'] <= second), 'new_rank_score'] = df['data_col'].apply(lambda row: convert_rank_II(row, old_min=first,old_max=second))
df.loc[(df['data_col'] > second) & (df['data_col'] <= third), 'new_rank_score'] = df['data_col'].apply(lambda row: convert_rank_III(row, old_min=second,old_max=third))
df.loc[(df['data_col'] > third) & (df['data_col'] <= fourth), 'new_rank_score'] = df['data_col'].apply(lambda row: convert_rank_IV(row, old_min=third,old_max=fourth))
df.loc[df['data_col'] > fourth, 'new_rank_score'] = df['data_col'].apply(lambda row: convert_rank_V(row, old_min=fourth,old_max=max_rank))

df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (5.99-row)+1)
df=df[df['new_rank_score'].notna()]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,higher_is_better,Sub-Pillar,new_rank_score
0,United States of America (the),2020.0,Security incidents (# of relevant issues),0.060701,False,Digital Public Services,4.29306
1,United Kingdom of Great Britain and Northern I...,2020.0,Security incidents (# of relevant issues),0.282674,False,Digital Public Services,2.489222
2,India,2020.0,Security incidents (# of relevant issues),0.013043,False,Digital Public Services,5.83852
3,Germany,2020.0,Security incidents (# of relevant issues),0.204227,False,Digital Public Services,3.0888
4,Republic of Korea (the),2020.0,Security incidents (# of relevant issues),0.308996,False,Digital Public Services,2.287709
5,Australia,2020.0,Security incidents (# of relevant issues),0.583952,False,Digital Public Services,1.539283
6,Ukraine,2020.0,Security incidents (# of relevant issues),0.339869,False,Digital Public Services,2.051362
7,China,2020.0,Security incidents (# of relevant issues),0.009272,False,Digital Public Services,5.99
8,Iran (Islamic Republic of),2020.0,Security incidents (# of relevant issues),0.154775,False,Digital Public Services,3.425747
9,Saudi Arabia,2020.0,Security incidents (# of relevant issues),0.373414,False,Digital Public Services,1.939044


In [52]:
# # output scores
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator), index=False)
df.to_csv('../non-index/government_{}_scores.csv'.format(indicator), index=False)

In [53]:
## 5. Inherent Cyber Risk


In [54]:
indicator = indicators[4]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Inherent Cyber Risk
inherent_cyber_risk


In [55]:
df.head()

Unnamed: 0,Country,Inherent Cyber Risk
0,Albania,55.0
1,Algeria,48.9
2,Argentina,61.7
3,Armenia,46.3
4,Australia,63.5


In [56]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Digital Public Services


In [57]:
# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df.iloc[:,1]
df['Year'] = 2021
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=0,old_max=100))


In [58]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Albania,2021,Inherent Cyber Risk,55.0,3.74450,True,Digital Public Services
1,Algeria,2021,Inherent Cyber Risk,48.9,3.44011,True,Digital Public Services
2,Argentina,2021,Inherent Cyber Risk,61.7,4.07883,True,Digital Public Services
3,Armenia,2021,Inherent Cyber Risk,46.3,3.31037,True,Digital Public Services
4,Australia,2021,Inherent Cyber Risk,63.5,4.16865,True,Digital Public Services
...,...,...,...,...,...,...,...
121,Uruguay,2021,Inherent Cyber Risk,69.6,4.47304,True,Digital Public Services
122,Venezuela,2021,Inherent Cyber Risk,29.6,2.47704,True,Digital Public Services
123,Vietnam,2021,Inherent Cyber Risk,36.4,2.81636,True,Digital Public Services
124,Zambia,2021,Inherent Cyber Risk,80.7,5.02693,True,Digital Public Services


## 6. What is the % change of government digitizing public services?



In [59]:
indicator = indicators[5]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

What is the % change of government digitizing public services?
Egov_strategy


In [60]:
df.head()

Unnamed: 0,#,Flag,Code,Cnum,Economy,Level,Population,GNI,GNIPC,e-Government,...,NGTI-1,NGTI-2,NGTI-3,NGTI-4,GTI-1,GTI-2,GTI-3,GTI-4,data_country,data_year
0,1,,AFG,4.0,Afghanistan,LIC,38928,20726,540,https://mcit.gov.af/node/6938,...,0.69,0.52,0.31,0.62,-0.09,-0.34,0.02,-0.12,,
1,2,,ALB,8.0,Albania,UMIC,2878,14949,5240,https://e-albania.al/,...,0.81,0.78,0.6,0.77,0.29,0.62,0.74,0.26,,
2,3,,DZA,12.0,Algeria,LMIC,43851,170722,3970,https://www.mpttn.gov.dz/ar/content/%D8%A7%D9%...,...,0.73,0.46,0.02,0.6,0.06,-0.56,-0.68,-0.15,,
3,4,,ADO,20.0,Andorra,HIC,77,3154,40886,http://www.govern.ad,...,0.63,0.6,0.05,0.13,-0.28,-0.06,-0.62,-1.32,,
4,5,,AGO,24.0,Angola,LMIC,32866,97005,3050,http://www.governo.gov.ao,...,0.68,0.69,0.21,0.61,-0.12,0.27,-0.23,-0.13,,


In [61]:
# Must limit the database to the first 206 rows, the remaining rows do not contain any useful information
df = df.iloc[0:205,:]

# Drop the superfluous rows by dropping na
df = df[pd.to_numeric(df['#'], errors='coerce').notnull()]
df['#'] = df[df['#'].notna()]

# Must convert data in the DPL column into float
df['CGSI'] = df['CGSI'].replace('-',np.nan)
df['CGSI'] = df['CGSI'].astype(float, errors = 'ignore')

In [62]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Digital Public Services


In [63]:
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['CGSI']
df['Country Name'] = df['Economy']
df['Year'] = 2020
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [64]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Afghanistan,2020,What is the % change of government digitizing ...,0.69,4.513367,True,Digital Public Services
1,Albania,2020,What is the % change of government digitizing ...,0.79,5.022551,True,Digital Public Services
2,Algeria,2020,What is the % change of government digitizing ...,0.71,4.615204,True,Digital Public Services
3,Andorra,2020,What is the % change of government digitizing ...,0.58,3.953265,True,Digital Public Services
4,Angola,2020,What is the % change of government digitizing ...,0.67,4.411531,True,Digital Public Services
5,Antigua and Barbuda,2020,What is the % change of government digitizing ...,0.68,4.462449,True,Digital Public Services
6,Argentina,2020,What is the % change of government digitizing ...,0.81,5.124388,True,Digital Public Services
7,Armenia,2020,What is the % change of government digitizing ...,0.73,4.717041,True,Digital Public Services
8,Australia,2020,What is the % change of government digitizing ...,0.89,5.531735,True,Digital Public Services
9,Austria,2020,What is the % change of government digitizing ...,,,True,Digital Public Services


In [65]:
# output scores


In [66]:
## 7. R&D spending (% of GDP)

In [67]:
indicator = indicators[6]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

R&D spending (% of GDP)
RD_Percentage_GDP


In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 67 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   1960            0 non-null      float64
 1   1961            0 non-null      float64
 2   1962            0 non-null      float64
 3   1963            0 non-null      float64
 4   1964            0 non-null      float64
 5   1965            0 non-null      float64
 6   1966            0 non-null      float64
 7   1967            0 non-null      float64
 8   1968            0 non-null      float64
 9   1969            0 non-null      float64
 10  1970            0 non-null      float64
 11  1971            0 non-null      float64
 12  1972            0 non-null      float64
 13  1973            0 non-null      float64
 14  1974            0 non-null      float64
 15  1975            0 non-null      float64
 16  1976            0 non-null      float64
 17  1977            0 non-null      flo

In [69]:
subpillars[1]
subpillar = subpillars[1]
print(subpillar)

Funding and procurement


In [70]:
df.head()

Unnamed: 0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2017,2018,2019,2020,Country Name,Country Code,Indicator Name,Indicator Code,data_country,data_year
0,,,,,,,,,,,...,,,,,Aruba,ABW,Research and development expenditure (% of GDP),GB.XPD.RSDV.GD.ZS,,
1,,,,,,,,,,,...,,,,,Africa Eastern and Southern,AFE,Research and development expenditure (% of GDP),GB.XPD.RSDV.GD.ZS,,
2,,,,,,,,,,,...,,,,,Afghanistan,AFG,Research and development expenditure (% of GDP),GB.XPD.RSDV.GD.ZS,,
3,,,,,,,,,,,...,,,,,Africa Western and Central,AFW,Research and development expenditure (% of GDP),GB.XPD.RSDV.GD.ZS,,
4,,,,,,,,,,,...,,,,,Angola,AGO,Research and development expenditure (% of GDP),GB.XPD.RSDV.GD.ZS,,


In [71]:
# create standard columns
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2018'] 
df['Sub-Pillar'] = subpillar
df['Year'] = 2018

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=0,old_max=100))

In [72]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Aruba,2018,R&D spending (% of GDP),,,True,Funding and procurement
1,Africa Eastern and Southern,2018,R&D spending (% of GDP),,,True,Funding and procurement
2,Afghanistan,2018,R&D spending (% of GDP),,,True,Funding and procurement
3,Africa Western and Central,2018,R&D spending (% of GDP),,,True,Funding and procurement
4,Angola,2018,R&D spending (% of GDP),,,True,Funding and procurement
...,...,...,...,...,...,...,...
261,Kosovo,2018,R&D spending (% of GDP),,,True,Funding and procurement
262,"Yemen, Rep.",2018,R&D spending (% of GDP),,,True,Funding and procurement
263,South Africa,2018,R&D spending (% of GDP),,,True,Funding and procurement
264,Zambia,2018,R&D spending (% of GDP),,,True,Funding and procurement


In [73]:
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator), index=False)

## 8. ICT investment as a percentage of GDP


In [74]:
indicator = indicators[7]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

ICT investment as a percentage of GDP
ICT_Investment


In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450 entries, 0 to 449
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Country Name  450 non-null    object 
 1   INDICATOR     450 non-null    object 
 2   SUBJECT       450 non-null    object 
 3   MEASURE       450 non-null    object 
 4   FREQUENCY     450 non-null    object 
 5   TIME          450 non-null    int64  
 6   Value         450 non-null    float64
dtypes: float64(1), int64(1), object(5)
memory usage: 24.7+ KB


In [76]:
df.head()

Unnamed: 0,Country Name,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value
0,Australia,ICTINVST,TOT,PC,A,1985,12.465454
1,Australia,ICTINVST,TOT,PC,A,1986,13.939533
2,Australia,ICTINVST,TOT,PC,A,1987,14.142429
3,Australia,ICTINVST,TOT,PC,A,1988,14.289993
4,Australia,ICTINVST,TOT,PC,A,1989,15.348707


In [77]:
df.SUBJECT.unique()

array(['TOT'], dtype=object)

In [78]:
df.TIME.max()

2010

In [79]:
df.Value.describe()

count    450.000000
mean      16.074272
std        5.636379
min        2.798417
25%       12.410442
50%       14.840726
75%       19.419433
max       32.601105
Name: Value, dtype: float64

In [80]:
subpillars[1]
subpillar = subpillars[1]
print(subpillar)

Funding and procurement


In [81]:
dcol = 'Value'
indicol = indicator
cname = 'Country Name'

# filter most recent year
df = df[(df.TIME==2010)]

# create standard columns
df['higher_is_better'] = True
df['Indicator'] =  indicator
df['Country Name'] = df[cname]
# # df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df[dcol]
df['Sub-Pillar'] = subpillar
df['Year'] = 2010

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 1-20 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=0,old_max=100))

# df.sort_values(by='new_rank_score', ascending=False)

# # prepare output
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]

# # output scores
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator), index=False)

In [82]:
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
72,Canada,2010,ICT investment as a percentage of GDP,17.018367,1.849217,True,Funding and procurement
121,Finland,2010,ICT investment as a percentage of GDP,15.519828,1.774439,True,Funding and procurement
172,Germany,2010,ICT investment as a percentage of GDP,12.690394,1.633251,True,Funding and procurement
198,Ireland,2010,ICT investment as a percentage of GDP,12.412805,1.619399,True,Funding and procurement
224,Italy,2010,ICT investment as a percentage of GDP,11.026056,1.5502,True,Funding and procurement
274,Republic of Korea (the),2010,ICT investment as a percentage of GDP,10.716026,1.53473,True,Funding and procurement
323,New Zealand,2010,ICT investment as a percentage of GDP,21.238183,2.059785,True,Funding and procurement
349,Spain,2010,ICT investment as a percentage of GDP,13.763415,1.686794,True,Funding and procurement
400,Switzerland,2010,ICT investment as a percentage of GDP,18.506691,1.923484,True,Funding and procurement
449,United States of America (the),2010,ICT investment as a percentage of GDP,32.137836,2.603678,True,Funding and procurement


## 9. Evidence of digital strategies in/across Ministries



In [83]:
indicator = indicators[8]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Evidence of digital strategies in/across Ministries
Egov_strategy


In [84]:
df.head()

Unnamed: 0,#,Flag,Code,Cnum,Economy,Level,Population,GNI,GNIPC,e-Government,...,NGTI-1,NGTI-2,NGTI-3,NGTI-4,GTI-1,GTI-2,GTI-3,GTI-4,data_country,data_year
0,1,,AFG,4.0,Afghanistan,LIC,38928,20726,540,https://mcit.gov.af/node/6938,...,0.69,0.52,0.31,0.62,-0.09,-0.34,0.02,-0.12,,
1,2,,ALB,8.0,Albania,UMIC,2878,14949,5240,https://e-albania.al/,...,0.81,0.78,0.6,0.77,0.29,0.62,0.74,0.26,,
2,3,,DZA,12.0,Algeria,LMIC,43851,170722,3970,https://www.mpttn.gov.dz/ar/content/%D8%A7%D9%...,...,0.73,0.46,0.02,0.6,0.06,-0.56,-0.68,-0.15,,
3,4,,ADO,20.0,Andorra,HIC,77,3154,40886,http://www.govern.ad,...,0.63,0.6,0.05,0.13,-0.28,-0.06,-0.62,-1.32,,
4,5,,AGO,24.0,Angola,LMIC,32866,97005,3050,http://www.governo.gov.ao,...,0.68,0.69,0.21,0.61,-0.12,0.27,-0.23,-0.13,,


In [85]:
# Must limit the database to the first 206 rows, the remaining rows do not contain any useful information
df = df.iloc[0:205,:]

# Drop the superfluous rows by dropping na
df = df[pd.to_numeric(df['#'], errors='coerce').notnull()]
df['#'] = df[df['#'].notna()]

# Must convert data in the DPL column into float
df['DG St'] = df['DG St'].replace('-',np.nan)
df['DG St'] = df['DG St'].astype(float, errors = 'ignore')

In [86]:
subpillars[1]
subpillar = subpillars[1]
print(subpillar)

Funding and procurement


In [87]:
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['DG St']
df['Country Name'] = df['Economy']
df['Year'] = 2020
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

max_rank

3.0

In [88]:
# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [89]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Afghanistan,2020,Evidence of digital strategies in/across Minis...,2.0,4.326667,True,Funding and procurement
1,Albania,2020,Evidence of digital strategies in/across Minis...,3.0,5.99,True,Funding and procurement
2,Algeria,2020,Evidence of digital strategies in/across Minis...,2.0,4.326667,True,Funding and procurement
3,Andorra,2020,Evidence of digital strategies in/across Minis...,3.0,5.99,True,Funding and procurement
4,Angola,2020,Evidence of digital strategies in/across Minis...,2.0,4.326667,True,Funding and procurement
5,Antigua and Barbuda,2020,Evidence of digital strategies in/across Minis...,1.0,2.663333,True,Funding and procurement
6,Argentina,2020,Evidence of digital strategies in/across Minis...,3.0,5.99,True,Funding and procurement
7,Armenia,2020,Evidence of digital strategies in/across Minis...,1.0,2.663333,True,Funding and procurement
8,Australia,2020,Evidence of digital strategies in/across Minis...,3.0,5.99,True,Funding and procurement
9,Austria,2020,Evidence of digital strategies in/across Minis...,3.0,5.99,True,Funding and procurement


In [90]:
# # output scores


## 10. Evidence of focus on vulnerable groups


In [91]:
indicator = indicators[9]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

df = df.iloc[1:,:]

Evidence of focus on vulnerable groups
Egov_strategy


In [92]:
df.head()

Unnamed: 0,#,Flag,Code,Cnum,Economy,Level,Population,GNI,GNIPC,e-Government,...,NGTI-1,NGTI-2,NGTI-3,NGTI-4,GTI-1,GTI-2,GTI-3,GTI-4,data_country,data_year
1,2,,ALB,8.0,Albania,UMIC,2878,14949,5240,https://e-albania.al/,...,0.81,0.78,0.6,0.77,0.29,0.62,0.74,0.26,,
2,3,,DZA,12.0,Algeria,LMIC,43851,170722,3970,https://www.mpttn.gov.dz/ar/content/%D8%A7%D9%...,...,0.73,0.46,0.02,0.6,0.06,-0.56,-0.68,-0.15,,
3,4,,ADO,20.0,Andorra,HIC,77,3154,40886,http://www.govern.ad,...,0.63,0.6,0.05,0.13,-0.28,-0.06,-0.62,-1.32,,
4,5,,AGO,24.0,Angola,LMIC,32866,97005,3050,http://www.governo.gov.ao,...,0.68,0.69,0.21,0.61,-0.12,0.27,-0.23,-0.13,,
5,6,,ATG,28.0,Antigua and Barbuda,HIC,98,1618,16660,http://www.ab.gov.ag,...,0.72,0.55,0.05,0.42,0.0,-0.23,-0.62,-0.59,,


In [93]:
# Must limit the database to the first 206 rows, the remaining rows do not contain any useful information
df = df.iloc[0:205,:]

# Drop the superfluous rows by dropping na
df = df[pd.to_numeric(df['#'], errors='coerce').notnull()]
df['#'] = df[df['#'].notna()]

# Must convert data in the DPL column into float
df['WoG'] = df['WoG'].replace('-',np.nan)
df['WoG'] = df['WoG'].astype(float, errors = 'ignore')

In [94]:
df['WoG'].describe

<bound method NDFrame.describe of 1      0.03
2     -0.03
3     -0.03
4      0.03
5     -0.03
       ... 
200    0.03
201   -0.03
202   -0.03
203   -0.03
204   -0.03
Name: WoG, Length: 197, dtype: float64>

In [95]:
subpillars[2]
subpillar = subpillars[2]
print(subpillar)

Leadership and coordination


In [96]:
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['WoG']
df['Country Name'] = df['Economy']
df['Year'] = 2020
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

max_rank

0.09

In [97]:
# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min= -0.03,old_max=0.09))

In [98]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
1,Albania,2020,Evidence of focus on vulnerable groups,0.03,3.495,True,Leadership and coordination
2,Algeria,2020,Evidence of focus on vulnerable groups,-0.03,1.000,True,Leadership and coordination
3,Andorra,2020,Evidence of focus on vulnerable groups,-0.03,1.000,True,Leadership and coordination
4,Angola,2020,Evidence of focus on vulnerable groups,0.03,3.495,True,Leadership and coordination
5,Antigua and Barbuda,2020,Evidence of focus on vulnerable groups,-0.03,1.000,True,Leadership and coordination
...,...,...,...,...,...,...,...
200,Vietnam,2020,Evidence of focus on vulnerable groups,0.03,3.495,True,Leadership and coordination
201,Palestine,2020,Evidence of focus on vulnerable groups,-0.03,1.000,True,Leadership and coordination
202,Yemen,2020,Evidence of focus on vulnerable groups,-0.03,1.000,True,Leadership and coordination
203,Zambia,2020,Evidence of focus on vulnerable groups,-0.03,1.000,True,Leadership and coordination


## 11. % of digital skills certifications / training courses completed


In [99]:
indicator = indicators[10]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))
df['Average'] = (df['Technology']+df['Data Science'])/2

% of digital skills certifications / training courses completed
digital_skill_level


In [100]:
df

Unnamed: 0,Global Rank,Region,Country and Region,Year,Technology,Data Science,Average
0,1,Europe,Switzerland,2020,0.84,0.96,0.900
1,2,Europe,Luxembourg,2020,0.62,0.85,0.735
2,3,Europe,Austria,2020,0.88,0.95,0.915
3,4,Asia Pacific,Japan,2020,1.00,0.88,0.940
4,5,Europe,Germany,2020,0.89,0.94,0.915
...,...,...,...,...,...,...,...
103,104,Asia Pacific,Uzbekistan,2020,0.06,0.09,0.075
104,105,Sub-Saharan Africa,Sierra Leone,2020,0.02,0.04,0.030
105,106,Latin America and the Caribbean,Paraguay,2020,0.07,0.11,0.090
106,107,Latin America and the Caribbean,Guyana,2020,0.08,0.02,0.050


In [101]:
subpillars[3]
subpillar = subpillars[3]
print(subpillar)

Capabilities


In [102]:
dcol = 'Global Rank'
indicol = indicator
cname = 'Country and Region'

# filter most recent year
# df = df[(df.TIME==2010)]

# create standard columns
df['higher_is_better'] = True
df['Indicator'] =  indicator
df['Country Name'] = df[cname]
# # df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df['Average']
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=0,old_max=1))


# # prepare output
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]

# # output scores
df.to_csv('../indicator_scores/government_ percentage digital skills certifications_scores.csv'.format(indicator), index=False)

In [103]:
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Switzerland,2020,% of digital skills certifications / training ...,0.900,5.49100,True,Capabilities
1,Luxembourg,2020,% of digital skills certifications / training ...,0.735,4.66765,True,Capabilities
2,Austria,2020,% of digital skills certifications / training ...,0.915,5.56585,True,Capabilities
3,Japan,2020,% of digital skills certifications / training ...,0.940,5.69060,True,Capabilities
4,Germany,2020,% of digital skills certifications / training ...,0.915,5.56585,True,Capabilities
...,...,...,...,...,...,...,...
103,Uzbekistan,2020,% of digital skills certifications / training ...,0.075,1.37425,True,Capabilities
104,Sierra Leone,2020,% of digital skills certifications / training ...,0.030,1.14970,True,Capabilities
105,Paraguay,2020,% of digital skills certifications / training ...,0.090,1.44910,True,Capabilities
106,Guyana,2020,% of digital skills certifications / training ...,0.050,1.24950,True,Capabilities


### Score Aggregating

In [104]:
import os


In [105]:
# get list of files in scores folder
scores = os.listdir('../indicator_scores/')
scores = [s for s in scores if s.startswith('government')]

In [106]:
scores

['government_ percentage digital skills certifications_scores.csv',
 'government_E-Participation index_scores.csv',
 'government_ICT investment as a percentage of GDP_scores.csv',
 'government_Online-Service-Index (OSI)_scores.csv',
 'government_R&D spending (% of GDP)_scores.csv',
 'government_Security incidents (# of relevant issues)_scores.csv',
 'government_Use of public services online (% of services online, penetration, frequency of use)_scores.csv']

In [107]:
# create a dataframe that concatenates all these file into one table
df = pd.concat([pd.read_csv('../indicator_scores/{}'.format(s)) for s in scores])    

In [108]:
df

Unnamed: 0.1,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar,Unnamed: 0
0,Switzerland,2020.0,% of digital skills certifications / training ...,0.900,5.49100,True,Capabilities,
1,Luxembourg,2020.0,% of digital skills certifications / training ...,0.735,4.66765,True,Capabilities,
2,Austria,2020.0,% of digital skills certifications / training ...,0.915,5.56585,True,Capabilities,
3,Japan,2020.0,% of digital skills certifications / training ...,0.940,5.69060,True,Capabilities,
4,Germany,2020.0,% of digital skills certifications / training ...,0.915,5.56585,True,Capabilities,
...,...,...,...,...,...,...,...,...
30,Germany,2020.0,Use of public services online (% of services o...,62.700,4.12873,True,Digital Public Services,
31,Netherlands,2020.0,Use of public services online (% of services o...,62.700,4.12873,True,Digital Public Services,
32,Morocco,2020.0,Use of public services online (% of services o...,62.300,4.10877,True,Digital Public Services,
33,South Korea,2020.0,Use of public services online (% of services o...,59.700,3.97903,True,Digital Public Services,


In [109]:
# Data cleaning
df['new_rank_score'] = df['new_rank_score'].fillna(0)
df.sort_values(by=['Country Name'], ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)

In [110]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country Name      825 non-null    object 
 1   Year              825 non-null    float64
 2   Indicator         825 non-null    object 
 3   data_col          667 non-null    float64
 4   new_rank_score    825 non-null    float64
 5   higher_is_better  825 non-null    bool   
 6   Sub-Pillar        825 non-null    object 
 7   Unnamed: 0        386 non-null    float64
dtypes: bool(1), float64(4), object(3)
memory usage: 46.0+ KB


In [111]:
df.head(15)

Unnamed: 0.1,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar,Unnamed: 0
0,Afghanistan,2018.0,R&D spending (% of GDP),,0.0,True,Funding and procurement,
1,Afghanistan,2020.0,E-Participation index,0.4643,3.316857,True,Digital Public Services,29.0
2,Afghanistan,2020.0,Online-Service-Index (OSI),0.4118,3.054882,True,Digital Public Services,29.0
3,Africa Eastern and Southern,2018.0,R&D spending (% of GDP),,0.0,True,Funding and procurement,
4,Africa Western and Central,2018.0,R&D spending (% of GDP),,0.0,True,Funding and procurement,
5,Albania,2018.0,R&D spending (% of GDP),,0.0,True,Funding and procurement,
6,Albania,2020.0,Online-Service-Index (OSI),0.8412,5.197588,True,Digital Public Services,36.0
7,Albania,2020.0,E-Participation index,0.8452,5.217548,True,Digital Public Services,36.0
8,Algeria,2020.0,% of digital skills certifications / training ...,0.26,2.2974,True,Capabilities,
9,Algeria,2018.0,R&D spending (% of GDP),,0.0,True,Funding and procurement,


In [112]:
df.describe()

Unnamed: 0.1,Year,data_col,new_rank_score,Unnamed: 0
count,825.0,667.0,825.0,386.0
mean,2019.233939,4.552073,2.682094,96.0
std,1.384664,15.774896,1.909363,55.785861
min,2010.0,0.0,0.0,0.0
25%,2018.0,0.337584,1.033457,48.0
50%,2020.0,0.61,2.732029,96.0
75%,2020.0,0.857797,4.434118,144.0
max,2020.0,86.1,5.99,192.0


In [113]:
# checking country names
# sorted(df['Country Name'].unique().tolist())

In [114]:
# remove trailing whitespaces from country name
df['Country Name'] = df['Country Name'].str.strip()
df['Country Name'] = df['Country Name'].str.strip('**')
df['Country Name'] = df['Country Name'].str.strip('*')

In [115]:
df.head()

Unnamed: 0.1,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar,Unnamed: 0
0,Afghanistan,2018.0,R&D spending (% of GDP),,0.0,True,Funding and procurement,
1,Afghanistan,2020.0,E-Participation index,0.4643,3.316857,True,Digital Public Services,29.0
2,Afghanistan,2020.0,Online-Service-Index (OSI),0.4118,3.054882,True,Digital Public Services,29.0
3,Africa Eastern and Southern,2018.0,R&D spending (% of GDP),,0.0,True,Funding and procurement,
4,Africa Western and Central,2018.0,R&D spending (% of GDP),,0.0,True,Funding and procurement,


In [116]:
# checking country names
# sorted(df['Country Name'].unique().tolist())

In [117]:
# average indicator scores per country
agg_df = df.groupby(['Country Name']).agg({'new_rank_score':'mean','data_col':'count'})

In [118]:
agg_df.columns = ['agg_score', 'count_source' ]

In [119]:
max_number_sources = agg_df.describe()['count_source']['max']

In [120]:
agg_df['agg_score_wt'] = agg_df['agg_score']*(agg_df['count_source']/max_number_sources)

In [121]:
agg_df.sort_values(by='agg_score', ascending=False, inplace=True)

In [122]:
agg_df.head(25)

Unnamed: 0_level_0,agg_score,count_source,agg_score_wt
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
United Kingdom of Great Britain and Northern Ireland,5.827825,2,1.665093
United States of America,5.414653,3,2.320566
Republic of Korea,5.11675,3,2.192893
Russian Federation (the),4.96982,1,0.709974
Democratic People's Republic of Korea (the),4.863262,1,0.694752
Taiwan,4.8423,1,0.691757
Republic of Moldova,4.779426,2,1.36555
Hong Kong,4.695095,2,1.341456
Viet Nam,4.62917,3,1.98393
Denmark,4.555105,5,3.253646


In [123]:
agg_df.to_csv('../pillar_scores/government_scores_v0.csv')

In [124]:
### Score Aggregating by Subpillars

In [125]:
df.insert(0,'Pillar','Government')
df

Unnamed: 0.1,Pillar,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar,Unnamed: 0
0,Government,Afghanistan,2018.0,R&D spending (% of GDP),,0.000000,True,Funding and procurement,
1,Government,Afghanistan,2020.0,E-Participation index,0.4643,3.316857,True,Digital Public Services,29.0
2,Government,Afghanistan,2020.0,Online-Service-Index (OSI),0.4118,3.054882,True,Digital Public Services,29.0
3,Government,Africa Eastern and Southern,2018.0,R&D spending (% of GDP),,0.000000,True,Funding and procurement,
4,Government,Africa Western and Central,2018.0,R&D spending (% of GDP),,0.000000,True,Funding and procurement,
...,...,...,...,...,...,...,...,...,...
820,Government,Zambia,2020.0,E-Participation index,0.3095,2.544405,True,Digital Public Services,177.0
821,Government,Zambia,2020.0,Online-Service-Index (OSI),0.2588,2.291412,True,Digital Public Services,177.0
822,Government,Zimbabwe,2018.0,R&D spending (% of GDP),,0.000000,True,Funding and procurement,
823,Government,Zimbabwe,2020.0,Online-Service-Index (OSI),0.5235,3.612265,True,Digital Public Services,178.0


In [126]:
sub_df = df.groupby(['Pillar','Sub-Pillar','Country Name']).agg({'new_rank_score':'mean','data_col':'count'})

In [127]:
sub_df.columns = ['agg_score', 'count_source' ]

In [128]:
max_number_sources = sub_df.describe()['count_source']['max']

In [129]:
sub_df['agg_score_wt'] = sub_df['agg_score']*(sub_df['count_source']/max_number_sources)

In [130]:
sub_df.to_csv('../subpillar_score/government_scores_subpillar_v0.csv')

### Sources Generation

In [131]:
#Get all countries from Countries.xlsx
countries = pd.read_excel('../../data/Countries.xlsx')
col_names = ['Country or Area']
countries = countries[col_names]
countries.rename(columns = {'Country or Area': 'Country Name'}, inplace = True)

In [132]:
#Get all indicators from names dataframe retrieve at the begining of the script
bnames=bnames[['check','Sub-Pillar','Indicator','Data Source','Data Link']]
bnames.rename(columns = {'check': 'Pillar'}, inplace = True)
bnames = bnames.replace('\n','', regex=True)

In [133]:
#Do a nice cross join so that we have combination of all countries vs all indicators
sources = countries.merge(bnames, how='cross')
sources

Unnamed: 0,Country Name,Pillar,Sub-Pillar,Indicator,Data Source,Data Link
0,Algeria,Government,Digital Public Services,Online-Service-Index (OSI),UN: E-Government Survey,https://publicadministration.un.org/egovkb
1,Algeria,Government,Digital Public Services,E-Participation index,UN: E-Government Survey,https://publicadministration.un.org/egovkb
2,Algeria,Government,Digital Public Services,Use of public services online (% of services o...,Boston Consulting Group/Salesforce: The Global...,https://www.salesforce.com
3,Algeria,Government,Digital Public Services,Security incidents (# of relevant issues),SPECOPS,https://specopssoft.com
4,Algeria,Government,Digital Public Services,Inherent Cyber Risk,FM Global: FM Global Resilience Index,https://www.fmglobal.com/
...,...,...,...,...,...,...
2734,Wallis and Futuna Islands,Government,Funding and procurement,R&D spending (% of GDP),World Bank: World Development Indicators,https://datacatalog.worldbank.org
2735,Wallis and Futuna Islands,Government,Funding and procurement,ICT investment as a percentage of GDP,OECD: Going Digital Toolkit,https://data.oecd.org
2736,Wallis and Futuna Islands,Government,Funding and procurement,Evidence of digital strategies in/across Minis...,World Bank: GovTech Dataset,https://datacatalog.worldbank.org
2737,Wallis and Futuna Islands,Government,Leadership and coordination,Evidence of focus on vulnerable groups,World Bank: GovTech Dataset,https://datacatalog.worldbank.org


In [134]:
#Make copy of a scores dataframe and add the column available, with value of 1 (string)
#denoting all the country/indicator combinations that have value
dfsources = df[['Country Name','Pillar','Sub-Pillar','Indicator']].copy()
dfsources['Available'] = '1'

In [135]:
#If sources.csv exists, get the contents, remove everyhing from this pillar, append prepared sources, save csv.
#if sources.csv does not exist, create new file from sources.
from os.path import exists

if exists('../../dashboard/Sources.csv') :
    CurrentSources = pd.read_csv('../../dashboard/Sources.csv', dtype=str)
    CurrentSources = CurrentSources[['Country Name','Pillar','Sub-Pillar','Indicator','Data Source','Data Link','Available']]
    CurrentSources = CurrentSources.loc[CurrentSources['Pillar'] != 'Government']
    CurrentSources = CurrentSources.append(sources)
else :
    CurrentSources = sources
CurrentSources

Unnamed: 0,Country Name,Pillar,Sub-Pillar,Indicator,Data Source,Data Link,Available
0,Algeria,Foundations,Digital Payments,Digital payments penetration,Portulans Institute: Network Readiness Index,https://networkreadinessindex.org,0
1,Algeria,Foundations,Digital Payments,% of population with digital finance account,World Bank: Global Findex database,https://datacatalog.worldbank.org,1
2,Algeria,Foundations,Digital Payments,Made or received digital payments in the past ...,World Bank: Global Findex database,https://datacatalog.worldbank.org,1
3,Algeria,Foundations,Digital Payments,Made or received digital payments in the past ...,World Bank: Global Findex database,https://datacatalog.worldbank.org,1
4,Algeria,Foundations,Digital Payments,Used a mobile phone or the internet to check a...,World Bank: Global Findex database,https://datacatalog.worldbank.org,1
...,...,...,...,...,...,...,...
2734,Wallis and Futuna Islands,Government,Funding and procurement,R&D spending (% of GDP),World Bank: World Development Indicators,https://datacatalog.worldbank.org,
2735,Wallis and Futuna Islands,Government,Funding and procurement,ICT investment as a percentage of GDP,OECD: Going Digital Toolkit,https://data.oecd.org,
2736,Wallis and Futuna Islands,Government,Funding and procurement,Evidence of digital strategies in/across Minis...,World Bank: GovTech Dataset,https://datacatalog.worldbank.org,
2737,Wallis and Futuna Islands,Government,Leadership and coordination,Evidence of focus on vulnerable groups,World Bank: GovTech Dataset,https://datacatalog.worldbank.org,


In [136]:
CurrentSources.to_csv('../../dashboard/Sources.csv', index=False)