In [1]:
import pandas as pd
import numpy as np



### Get all the pillar names from the excel

In [2]:
names = pd.read_excel('../../UNDP Digital Assessment Data Framework Filename Matching V7.xlsx')

In [3]:
col_names = ['Indicator','check', 'Data Source','Index','Filename']

In [4]:
names = names[col_names]

In [5]:
names.head()

Unnamed: 0,Indicator,check,Data Source,Index,Filename
0,Countries,,United Nations,False,Countries
1,"Database of Global Administrative Areas (GADM,...",,GADM maps and data,False,
2,High Resolution Population Density Maps + Demo...,,Facebook,False,
3,population density vs openstreetmap object den...,,Kontur,False,
4,Population Density,Infrastructure,World Bank,False,population_density


In [6]:
# get all the files per pillar
data_stats = names.groupby('check').agg({'Filename':'count','Indicator':'count'})

In [7]:
data_stats

Unnamed: 0_level_0,Filename,Indicator
check,Unnamed: 1_level_1,Unnamed: 2_level_1
Business,20,25
Foundations,8,12
Government,10,15
Infrastructure,38,48
People,35,47
Regulation,6,7
Strategy,1,1


### Foundations

In [8]:
bnames = names[(names.check=='Foundations')&(~names.Filename.isna())]#&(names.Index==False)]

In [9]:
bnames.head(25)

Unnamed: 0,Indicator,check,Data Source,Index,Filename
148,Digital payments penetration,Foundations,Portulans Institute,True,digital_payments_penetration
149,% of population with digital finance account -...,Foundations,World Bank,False,population_digital_financial_services
150,% of population with digital finance account -...,Foundations,World Bank,False,population_digital_financial_services
154,% of population with ID,Foundations,World Bank,False,id4d_nid
155,% of services that can be accessed,Foundations,World Bank,False,id4d_services
156,can ID be used for transactions,Foundations,World Bank,False,id4d_services
157,Is personal data siloed,Foundations,World Bank,False,Egov_strategy
158,Open data index,Foundations,Open Knowledge Foundation,True,open_data_idx


In [10]:
# get list of names for all indicators
indicators = bnames.Indicator.unique()

In [11]:
# get all file names
bfiles = bnames.Filename.unique()

In [12]:
bfiles

array(['digital_payments_penetration',
       'population_digital_financial_services', 'id4d_nid',
       'id4d_services', 'Egov_strategy', 'open_data_idx'], dtype=object)

In [13]:
# ls digital-readiness-assessment-main/processed/

In [14]:
##ict_goods and services not in process data

In [15]:
# formula for converting scale
def convert_rank(old_value, old_min=1, old_max=7, new_min=1, new_max=6 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value

### 1. Digital payments penetration

In [16]:
indicators[0]

'Digital payments penetration'

In [17]:
# load data
indicator = indicators[0]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Digital payments penetration
digital_payments_penetration


In [18]:
df.head()

Unnamed: 0,RANK,COUNTRY/ECONOMY,VALUE,SCORE
0,1.0,Norway,0.85,100.0
1,2.0,Denmark,0.83,97.24
2,3.0,Finland,0.8,93.95
3,4.0,Sweden,0.8,93.08
4,5.0,Netherlands,0.76,89.01


In [19]:
# score looks like the one to use
df.describe()

Unnamed: 0,RANK,VALUE,SCORE
count,122.0,122.0,122.0
mean,61.5,0.315738,36.076967
std,35.362409,0.209012,24.867406
min,1.0,0.01,0.0
25%,31.25,0.15,16.235
50%,61.5,0.27,30.87
75%,91.75,0.44,51.11
max,122.0,0.85,100.0


In [20]:
# df.Indicator.unique()

In [21]:
# create standard columns
df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['SCORE'] 


min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 1-100 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))



In [22]:
df

Unnamed: 0,RANK,Country Name,VALUE,SCORE,higher_is_better,Indicator,data_col,new_rank_score
0,1.0,Norway,0.85,100.00,True,Digital payments penetration,100.00,6.0000
1,2.0,Denmark,0.83,97.24,True,Digital payments penetration,97.24,5.8620
2,3.0,Finland,0.80,93.95,True,Digital payments penetration,93.95,5.6975
3,4.0,Sweden,0.80,93.08,True,Digital payments penetration,93.08,5.6540
4,5.0,Netherlands,0.76,89.01,True,Digital payments penetration,89.01,5.4505
...,...,...,...,...,...,...,...,...
129,,Jamaica,,,True,Digital payments penetration,,
130,,Madagascar,,,True,Digital payments penetration,,
131,,Oman,,,True,Digital payments penetration,,
132,,Qatar,,,True,Digital payments penetration,,


In [23]:

# output scores to csv
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/foundations_{}_scores.csv'.format(indicator), index=False)

In [103]:
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
0,Afghanistan,% of population with ID,67,3.857143,True
1,Albania,% of population with ID,100,6.0,True
2,Algeria,% of population with ID,89,5.285714,True
3,Andorra,% of population with ID,47,2.558442,True
4,Angola,% of population with ID,44,2.363636,True
5,Antigua and Barbuda,% of population with ID,75,4.376623,True
6,Argentina,% of population with ID,100,6.0,True
7,Armenia,% of population with ID,100,6.0,True
8,Australia,% of population with ID,88,5.220779,True
9,Austria,% of population with ID,88,5.220779,True


## 2. % of population with digital finance account - registered


In [25]:
indicator = indicators[1]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population with digital finance account - registered
population_digital_financial_services


In [26]:
df.head(15)

Unnamed: 0,Year,ISO,Country Name,Region,Income Group,Account (% age 15+),"Account, male (% age 15+)","Account, in labor force (% age 15+)","Account, out of labor force (% age 15+)","Account, female (% age 15+)",...,"Mobile money account, female (% age 15+)","Mobile money account, young adults (% age 15-24)","Mobile money account, older adults (% age 25+)","Mobile money account, primary education or less (% age 15+)","Mobile money account, secondary education or less (% age 15+)","Mobile money account, income, poorest 40% (% age 15+)","Mobile money account, income, richest 60% (% age 15+)","Mobile money account, rural (% age 15+)",data_country,data_year
0,2011,AFG,Afghanistan,South Asia,Low income,9%,15%,15%,2%,3%,...,,,,,,,,,,
1,2014,AFG,Afghanistan,South Asia,Low income,10%,16%,15%,4%,4%,...,0%,0%,0%,0%,0%,0%,1%,0%,,
2,2017,AFG,Afghanistan,South Asia,Low income,15%,23%,25%,4%,7%,...,1%,0%,1%,0%,2%,0%,1%,1%,,
3,2011,AGO,Angola,Sub-Saharan Africa (excluding high income),Lower middle income,39%,39%,46%,31%,39%,...,,,,,,,,,,
4,2014,AGO,Angola,Sub-Saharan Africa (excluding high income),Lower middle income,29%,36%,36%,12%,22%,...,,,,,,,,,,
5,2011,ALB,Albania,Europe & Central Asia (excluding high income),Upper middle income,28%,34%,36%,15%,23%,...,,,,,,,,,,
6,2014,ALB,Albania,Europe & Central Asia (excluding high income),Upper middle income,38%,43%,44%,29%,34%,...,,,,,,,,,,
7,2017,ALB,Albania,Europe & Central Asia (excluding high income),Upper middle income,40%,42%,53%,27%,38%,...,2%,6%,1%,1%,4%,0%,4%,2%,,
8,2011,ARB,Arab world,,,22%,30%,33%,11%,14%,...,,,,,,,,,,
9,2014,ARB,Arab world,,,30%,38%,42%,18%,22%,...,,,,,,,,,,


In [27]:
df = df[(df.Year == 2017)]

# Must convert the data to float by removing the % sign
df['Account (% age 15+)'] = df['Account (% age 15+)'].str.replace('%','')
df['Account (% age 15+)'] = df['Account (% age 15+)'].astype(float)

In [28]:
# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Account (% age 15+)'] 


min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 1-20 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [29]:
df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]
df

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
2,Afghanistan,% of population with digital finance account -...,15.0,1.329670,True
7,Albania,% of population with digital finance account -...,40.0,2.703297,True
10,Arab world,% of population with digital finance account -...,37.0,2.538462,True
13,United Arab Emirates,% of population with digital finance account -...,88.0,5.340659,True
16,Argentina,% of population with digital finance account -...,49.0,3.197802,True
...,...,...,...,...,...
479,World,% of population with digital finance account -...,69.0,4.296703,True
482,Kosovo,% of population with digital finance account -...,52.0,3.362637,True
487,South Africa,% of population with digital finance account -...,69.0,4.296703,True
490,Zambia,% of population with digital finance account -...,46.0,3.032967,True


In [30]:
# output scores
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator), index=False)

## 3. % of population with digital finance account - active (90 days)

In [31]:
indicator = indicators[2]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population with digital finance account - active (90 days)
population_digital_financial_services


In [32]:
df.head()

Unnamed: 0,Year,ISO,Country Name,Region,Income Group,Account (% age 15+),"Account, male (% age 15+)","Account, in labor force (% age 15+)","Account, out of labor force (% age 15+)","Account, female (% age 15+)",...,"Mobile money account, female (% age 15+)","Mobile money account, young adults (% age 15-24)","Mobile money account, older adults (% age 25+)","Mobile money account, primary education or less (% age 15+)","Mobile money account, secondary education or less (% age 15+)","Mobile money account, income, poorest 40% (% age 15+)","Mobile money account, income, richest 60% (% age 15+)","Mobile money account, rural (% age 15+)",data_country,data_year
0,2011,AFG,Afghanistan,South Asia,Low income,9%,15%,15%,2%,3%,...,,,,,,,,,,
1,2014,AFG,Afghanistan,South Asia,Low income,10%,16%,15%,4%,4%,...,0%,0%,0%,0%,0%,0%,1%,0%,,
2,2017,AFG,Afghanistan,South Asia,Low income,15%,23%,25%,4%,7%,...,1%,0%,1%,0%,2%,0%,1%,1%,,
3,2011,AGO,Angola,Sub-Saharan Africa (excluding high income),Lower middle income,39%,39%,46%,31%,39%,...,,,,,,,,,,
4,2014,AGO,Angola,Sub-Saharan Africa (excluding high income),Lower middle income,29%,36%,36%,12%,22%,...,,,,,,,,,,


In [33]:
df = df[(df.Year == 2017)]

# Must convert the data to float by removing the % sign
df['Made or received digital payments in the past year (% age 15+)'] = df['Made or received digital payments in the past year (% age 15+)'].str.replace('%','')
df['Made or received digital payments in the past year (% age 15+)'] = df['Made or received digital payments in the past year (% age 15+)'].astype(float)

In [34]:
# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Made or received digital payments in the past year (% age 15+)'] 


min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 1-20 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [35]:
df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]
df

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
2,Afghanistan,% of population with digital finance account -...,11.0,1.217391,True
7,Albania,% of population with digital finance account -...,29.0,2.195652,True
10,Arab world,% of population with digital finance account -...,26.0,2.032609,True
13,United Arab Emirates,% of population with digital finance account -...,84.0,5.184783,True
16,Argentina,% of population with digital finance account -...,40.0,2.793478,True
...,...,...,...,...,...
479,World,% of population with digital finance account -...,52.0,3.445652,True
482,Kosovo,% of population with digital finance account -...,39.0,2.739130,True
487,South Africa,% of population with digital finance account -...,60.0,3.880435,True
490,Zambia,% of population with digital finance account -...,39.0,2.739130,True


In [36]:
# output scores
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(bf))

## 4. % of population with ID


In [98]:
indicator = indicators[3]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population with ID
id4d_nid


In [99]:
df.head(15)

Unnamed: 0,id,Economy,Country Code,Region,Income,OECD,Lending category,Other,GCC adjusted,Inclusion Criteria,...,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40
0,1,Afghanistan,AFG,SAS,LIC,,IDA,HIPC,,INCLUDE,...,Voter,13549892.0,7296096.0,36373176,18191591,9336665,8854905,18181585,9398623,8782976
1,2,Albania,ALB,ECS,UMC,,IBRD,,,INCLUDE,...,Direct,2267673.0,2188099.0,2934363,540349,280793,259561,2394014,1199481,1194510
2,3,Algeria,DZA,MEA,UMC,,IBRD,,,INCLUDE,...,Voter,,,42008054,14075724,7181719,6894010,27932330,14033462,13898865
3,4,Andorra,AND,ECS,HIC,,..,,,EXCLUDE,...,Voter,,,80209,13580,6971,6609,66629,34060,32569
4,5,Angola,AGO,SSF,LMC,,IBRD,,,INCLUDE,...,Voter,,,30774205,16389360,8157462,8231869,14384845,6937053,7447820
5,6,Antigua and Barbuda,ATG,LCN,HIC,,IBRD,,,INCLUDE,...,Voter,,,103050,29388,14745,14644,73662,34732,38932
6,7,Argentina,ARG,LCN,UMC,,IBRD,,,INCLUDE,...,Voter,,,44688864,11751904,5977086,5774773,32936960,15898099,17038900
7,8,Armenia,ARM,ECS,LMC,,IBRD,,,INCLUDE,...,Voter,,,2934152,689894,367840,322065,2244258,1012449,1231792
8,9,Australia,AUS,EAS,HIC,OECD,..,,,EXCLUDE,...,Voter,7847109.0,8269334.0,24772247,5624607,2884647,2739917,19147640,9456278,9691403
9,10,Austria,AUT,ECS,HIC,OECD,..,EMU,,EXCLUDE,...,Voter,3093348.0,3307645.0,8751820,1318229,676854,641377,7433591,3617222,3816362


In [100]:
# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
# Subtract the unregister proportion column by 100 and absolute the value
df['data_col'] = df['UP in % of Country Population'].sub(100).abs()
df ['Year'] = 2018
df['Country Name'] = df.iloc[:,1]


min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 1-20 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [102]:
df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]
df

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
0,Afghanistan,% of population with ID,67,3.857143,True
1,Albania,% of population with ID,100,6.000000,True
2,Algeria,% of population with ID,89,5.285714,True
3,Andorra,% of population with ID,47,2.558442,True
4,Angola,% of population with ID,44,2.363636,True
...,...,...,...,...,...
193,"Venezuela, RB",% of population with ID,87,5.155844,True
194,Vietnam,% of population with ID,96,5.740260,True
195,"Yemen, Rep.",% of population with ID,50,2.753247,True
196,Zambia,% of population with ID,44,2.363636,True


In [None]:
# output scores
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator), index=False)

## 5. % of services that can be accessed

In [134]:
indicator = indicators[4]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of services that can be accessed
id4d_services


In [135]:
df.head(15)

Unnamed: 0,ID,Economy,Region,Income,Code,Inclusion Criteria,National ID,NID Issued at Birth,Mandatory NID age,Digitized ID system,...,URL,NID card/number name,NID cost,Primary CR entity,CR Gov Entity URL,CR entity category,Mandatory Birth Registration Period,Birth Registration cost,data_country,data_year
0,1.0,Afghanistan,SAS,LIC,AFG,INCLUDE,1,1.0,0,1,...,http://mcit.gov.af/en/page/7081,Tazkira / eNID,Afs 30-35 (US$ 0.5),Ministry of Interior Affairs,"http://moi.gov.af/en/page/7180 , http://moi.go...",2.0,6 m,free,,
1,2.0,Albania,ECS,UMC,ALB,INCLUDE,1,0.0,16,1,...,http://www.punetebrendshme.gov.al,Letërnjoftimi / Albanian Identity Card,10 euros,"Vital Statistics Offices , Ministry of Interio...",http://www.punetebrendshme.gov.al/,5.0,60 d,100 Leke,,
2,3.0,Algeria,MEA,UMC,DZA,INCLUDE,1,0.0,18,1,...,http://www.interieur.gov.dz/,Carte Nationale d'Identité Biométrique Electro...,free,Ministry of the Interior and Local Governments,http://www.interieur.gov.dz,2.0,5 - 60 d,free,,
3,4.0,Andorra,ECS,HIC,AND,EXCLUDE,0,,-,-,...,,-,-,Civil Registry Office,http://www.registrecivil.ad,1.0,15 d,free,,
4,5.0,Angola,SSF,LMC,AGO,INCLUDE,1,0.0,10,1,...,http://www.minjusdh.gov.ao/VerPrestadorServico...,Bilhete de Identidade (National ID card),15 kwanza,Direcção Nacional dos Registos e do Notariado ...,http://www.minjusdh.gov.ao,1.0,5 d,free,,
5,6.0,Antigua and Barbuda,LCN,HIC,ATG,INCLUDE,0,,-,-,...,,-,-,"Civil Registry, Ministry of Justice and Legal ...",http://www.legalaffairs.gov.ag/,1.0,30 d,EC 10,,
6,7.0,Argentina,LCN,UMC,ARG,INCLUDE,1,1.0,0,1,...,http://www.nuevodni.gov.ar/inicio/index.php,DNI (Documento Nacional de Identidad) / SIBIOS...,free,"Regional Civil Registries, Provincial Authorit...",http://www.mininterior.gov.ar/renaper/renaper.php,6.0,40 d,free,,
7,8.0,Armenia,ECS,LMC,ARM,INCLUDE,1,1.0,16,1,...,https://www.ekeng.am/hy/,National ID Card / National Passport,free,"Civil Registry Office, RA Ministry of Justice",http://www.moj.am/services/civil_registry/item...,1.0,1 y,free,,
8,9.0,Australia,EAS,HIC,AUS,EXCLUDE,0,,-,-,...,,-,-,"Registrar-General, Departments of Justice",http://www.australia.gov.au/topics/law-and-jus...,1.0,6 m,free,,
9,10.0,Austria,ECS,HIC,AUT,EXCLUDE,1,0.0,0,1,...,https://www.help.gv.at/Portal.Node/hlpd/public...,Identitätsausweis / Personalausweis (Austrian ...,61.5 EUR,"Register Office, Baby-Point",https://www.help.gv.at/Portal.Node/hlpd/public...,2.0,1 m,free,,


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 1 to 198
Data columns (total 23 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   1.0                                                     197 non-null    float64
 1   Afghanistan                                             198 non-null    object 
 2   SAS                                                     198 non-null    object 
 3   LIC                                                     198 non-null    object 
 4   AFG                                                     198 non-null    object 
 5   INCLUDE                                                 198 non-null    object 
 6   1                                                       198 non-null    object 
 7   1                                                       197 non-null    object 
 8   0                                       

In [44]:
df.head()

Unnamed: 0,1.0,Afghanistan,SAS,LIC,AFG,INCLUDE,1,1.1,0,1.2,...,http://mcit.gov.af/en/page/7081,Tazkira / eNID,Afs 30-35 (US$ 0.5),Ministry of Interior Affairs,"http://moi.gov.af/en/page/7180 , http://moi.gov.af/fa",2.0,6 m,free,NaN,NaN.1
1,2.0,Albania,ECS,UMC,ALB,INCLUDE,1,0.0,16,1,...,http://www.punetebrendshme.gov.al,Letërnjoftimi / Albanian Identity Card,10 euros,"Vital Statistics Offices , Ministry of Interio...",http://www.punetebrendshme.gov.al/,5.0,60 d,100 Leke,,
2,3.0,Algeria,MEA,UMC,DZA,INCLUDE,1,0.0,18,1,...,http://www.interieur.gov.dz/,Carte Nationale d'Identité Biométrique Electro...,free,Ministry of the Interior and Local Governments,http://www.interieur.gov.dz,2.0,5 - 60 d,free,,
3,4.0,Andorra,ECS,HIC,AND,EXCLUDE,0,,-,-,...,,-,-,Civil Registry Office,http://www.registrecivil.ad,1.0,15 d,free,,
4,5.0,Angola,SSF,LMC,AGO,INCLUDE,1,0.0,10,1,...,http://www.minjusdh.gov.ao/VerPrestadorServico...,Bilhete de Identidade (National ID card),15 kwanza,Direcção Nacional dos Registos e do Notariado ...,http://www.minjusdh.gov.ao,1.0,5 d,free,,
5,6.0,Antigua and Barbuda,LCN,HIC,ATG,INCLUDE,0,,-,-,...,,-,-,"Civil Registry, Ministry of Justice and Legal ...",http://www.legalaffairs.gov.ag/,1.0,30 d,EC 10,,


In [45]:
# dcol = 'Percentage of total merchandise trade'
# indicol = 'IctProductCategory Label'
# cname = 'Economy Label'

# # filter most recent year and exports
# df = df[(df.Year==2019)&(df['Flow Label']=='Imports')]

# # create standard columns
# df['higher_is_better'] = True
# df['Indicator'] =  df[indicol]+' '+ df['Flow Label']
# df['Country Name'] = df[cname]
# # df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
# df['data_col'] = df[dcol]


# # convert 1-100 %  into 1-6
# df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=0,old_max=100))

# df.sort_values(by='new_rank_score', ascending=False)

# # prepare output
# df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# # output scores
# df.to_csv('../indicator_scores/{}_scores.csv'.format(bf+'_imports'), index=False)

## 6. can ID be used for transactions



In [124]:
indicator = indicators[5]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

can ID be used for transactions
id4d_services


In [125]:
df.head(20)

Unnamed: 0,ID,Economy,Region,Income,Code,Inclusion Criteria,National ID,NID Issued at Birth,Mandatory NID age,Digitized ID system,...,URL,NID card/number name,NID cost,Primary CR entity,CR Gov Entity URL,CR entity category,Mandatory Birth Registration Period,Birth Registration cost,data_country,data_year
0,1.0,Afghanistan,SAS,LIC,AFG,INCLUDE,1,1.0,0,1,...,http://mcit.gov.af/en/page/7081,Tazkira / eNID,Afs 30-35 (US$ 0.5),Ministry of Interior Affairs,"http://moi.gov.af/en/page/7180 , http://moi.go...",2.0,6 m,free,,
1,2.0,Albania,ECS,UMC,ALB,INCLUDE,1,0.0,16,1,...,http://www.punetebrendshme.gov.al,Letërnjoftimi / Albanian Identity Card,10 euros,"Vital Statistics Offices , Ministry of Interio...",http://www.punetebrendshme.gov.al/,5.0,60 d,100 Leke,,
2,3.0,Algeria,MEA,UMC,DZA,INCLUDE,1,0.0,18,1,...,http://www.interieur.gov.dz/,Carte Nationale d'Identité Biométrique Electro...,free,Ministry of the Interior and Local Governments,http://www.interieur.gov.dz,2.0,5 - 60 d,free,,
3,4.0,Andorra,ECS,HIC,AND,EXCLUDE,0,,-,-,...,,-,-,Civil Registry Office,http://www.registrecivil.ad,1.0,15 d,free,,
4,5.0,Angola,SSF,LMC,AGO,INCLUDE,1,0.0,10,1,...,http://www.minjusdh.gov.ao/VerPrestadorServico...,Bilhete de Identidade (National ID card),15 kwanza,Direcção Nacional dos Registos e do Notariado ...,http://www.minjusdh.gov.ao,1.0,5 d,free,,
5,6.0,Antigua and Barbuda,LCN,HIC,ATG,INCLUDE,0,,-,-,...,,-,-,"Civil Registry, Ministry of Justice and Legal ...",http://www.legalaffairs.gov.ag/,1.0,30 d,EC 10,,
6,7.0,Argentina,LCN,UMC,ARG,INCLUDE,1,1.0,0,1,...,http://www.nuevodni.gov.ar/inicio/index.php,DNI (Documento Nacional de Identidad) / SIBIOS...,free,"Regional Civil Registries, Provincial Authorit...",http://www.mininterior.gov.ar/renaper/renaper.php,6.0,40 d,free,,
7,8.0,Armenia,ECS,LMC,ARM,INCLUDE,1,1.0,16,1,...,https://www.ekeng.am/hy/,National ID Card / National Passport,free,"Civil Registry Office, RA Ministry of Justice",http://www.moj.am/services/civil_registry/item...,1.0,1 y,free,,
8,9.0,Australia,EAS,HIC,AUS,EXCLUDE,0,,-,-,...,,-,-,"Registrar-General, Departments of Justice",http://www.australia.gov.au/topics/law-and-jus...,1.0,6 m,free,,
9,10.0,Austria,ECS,HIC,AUT,EXCLUDE,1,0.0,0,1,...,https://www.help.gv.at/Portal.Node/hlpd/public...,Identitätsausweis / Personalausweis (Austrian ...,61.5 EUR,"Register Office, Baby-Point",https://www.help.gv.at/Portal.Node/hlpd/public...,2.0,1 m,free,,


In [126]:
df['Digitized ID system']= df['Digitized ID system'].replace('-',np.nan)
df['Digitized ID system']= df['Digitized ID system'].astype(float)

In [127]:
df.head(15)

Unnamed: 0,ID,Economy,Region,Income,Code,Inclusion Criteria,National ID,NID Issued at Birth,Mandatory NID age,Digitized ID system,...,URL,NID card/number name,NID cost,Primary CR entity,CR Gov Entity URL,CR entity category,Mandatory Birth Registration Period,Birth Registration cost,data_country,data_year
0,1.0,Afghanistan,SAS,LIC,AFG,INCLUDE,1,1.0,0,1.0,...,http://mcit.gov.af/en/page/7081,Tazkira / eNID,Afs 30-35 (US$ 0.5),Ministry of Interior Affairs,"http://moi.gov.af/en/page/7180 , http://moi.go...",2.0,6 m,free,,
1,2.0,Albania,ECS,UMC,ALB,INCLUDE,1,0.0,16,1.0,...,http://www.punetebrendshme.gov.al,Letërnjoftimi / Albanian Identity Card,10 euros,"Vital Statistics Offices , Ministry of Interio...",http://www.punetebrendshme.gov.al/,5.0,60 d,100 Leke,,
2,3.0,Algeria,MEA,UMC,DZA,INCLUDE,1,0.0,18,1.0,...,http://www.interieur.gov.dz/,Carte Nationale d'Identité Biométrique Electro...,free,Ministry of the Interior and Local Governments,http://www.interieur.gov.dz,2.0,5 - 60 d,free,,
3,4.0,Andorra,ECS,HIC,AND,EXCLUDE,0,,-,,...,,-,-,Civil Registry Office,http://www.registrecivil.ad,1.0,15 d,free,,
4,5.0,Angola,SSF,LMC,AGO,INCLUDE,1,0.0,10,1.0,...,http://www.minjusdh.gov.ao/VerPrestadorServico...,Bilhete de Identidade (National ID card),15 kwanza,Direcção Nacional dos Registos e do Notariado ...,http://www.minjusdh.gov.ao,1.0,5 d,free,,
5,6.0,Antigua and Barbuda,LCN,HIC,ATG,INCLUDE,0,,-,,...,,-,-,"Civil Registry, Ministry of Justice and Legal ...",http://www.legalaffairs.gov.ag/,1.0,30 d,EC 10,,
6,7.0,Argentina,LCN,UMC,ARG,INCLUDE,1,1.0,0,1.0,...,http://www.nuevodni.gov.ar/inicio/index.php,DNI (Documento Nacional de Identidad) / SIBIOS...,free,"Regional Civil Registries, Provincial Authorit...",http://www.mininterior.gov.ar/renaper/renaper.php,6.0,40 d,free,,
7,8.0,Armenia,ECS,LMC,ARM,INCLUDE,1,1.0,16,1.0,...,https://www.ekeng.am/hy/,National ID Card / National Passport,free,"Civil Registry Office, RA Ministry of Justice",http://www.moj.am/services/civil_registry/item...,1.0,1 y,free,,
8,9.0,Australia,EAS,HIC,AUS,EXCLUDE,0,,-,,...,,-,-,"Registrar-General, Departments of Justice",http://www.australia.gov.au/topics/law-and-jus...,1.0,6 m,free,,
9,10.0,Austria,ECS,HIC,AUT,EXCLUDE,1,0.0,0,1.0,...,https://www.help.gv.at/Portal.Node/hlpd/public...,Identitätsausweis / Personalausweis (Austrian ...,61.5 EUR,"Register Office, Baby-Point",https://www.help.gv.at/Portal.Node/hlpd/public...,2.0,1 m,free,,


In [129]:
# create standard columns
df.rename(columns={'Economy':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Year'] = 2018
df['Indicator'] = indicator
df['data_col'] = df['Digitized ID system']
df.rename(columns={'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [132]:
df = df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Afghanistan,2018,can ID be used for transactions,1.0,6.0,True
1,Albania,2018,can ID be used for transactions,1.0,6.0,True
2,Algeria,2018,can ID be used for transactions,1.0,6.0,True
3,Andorra,2018,can ID be used for transactions,,,True
4,Angola,2018,can ID be used for transactions,1.0,6.0,True
5,Antigua and Barbuda,2018,can ID be used for transactions,,,True
6,Argentina,2018,can ID be used for transactions,1.0,6.0,True
7,Armenia,2018,can ID be used for transactions,1.0,6.0,True
8,Australia,2018,can ID be used for transactions,,,True
9,Austria,2018,can ID be used for transactions,1.0,6.0,True


In [133]:
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator), index=False)

## 7. Is personal data siloed


In [50]:
indicator = indicators[6]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

# Must limit the database to the first 206 rows, the remaining rows do not contain any useful information
df = df.iloc[0:205,:]

# Drop the superfluous rows by dropping na
df = df[df['#'].notna()]

# Must convert data in the DPL column into float
df['DPL'] = df['DPL'].astype(float, errors = 'ignore')


Is personal data siloed
Egov_strategy


In [51]:
df.head(104)

Unnamed: 0,#,Flag,Code,Cnum,Economy,Level,Population,GNI,GNIPC,e-Government,...,NGTI-1,NGTI-2,NGTI-3,NGTI-4,GTI-1,GTI-2,GTI-3,GTI-4,data_country,data_year
0,1,,AFG,4.0,Afghanistan,LIC,38928,20726,540,https://mcit.gov.af/node/6938,...,0.69,0.52,0.31,0.62,-0.09,-0.34,0.02,-0.12,,
1,2,,ALB,8.0,Albania,UMIC,2878,14949,5240,https://e-albania.al/,...,0.81,0.78,0.60,0.77,0.29,0.62,0.74,0.26,,
2,3,,DZA,12.0,Algeria,LMIC,43851,170722,3970,https://www.mpttn.gov.dz/ar/content/%D8%A7%D9%...,...,0.73,0.46,0.02,0.60,0.06,-0.56,-0.68,-0.15,,
3,4,,ADO,20.0,Andorra,HIC,77,3154,40886,http://www.govern.ad,...,0.63,0.60,0.05,0.13,-0.28,-0.06,-0.62,-1.32,,
4,5,,AGO,24.0,Angola,LMIC,32866,97005,3050,http://www.governo.gov.ao,...,0.68,0.69,0.21,0.61,-0.12,0.27,-0.23,-0.13,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,98,,LSO,426.0,Lesotho,LMIC,2142,2901,1360,http://www.gov.ls,...,,,,,,,,,,
104,99,,LBR,430.0,Liberia,LIC,5058,2852,580,http://www.emansion.gov.lr,...,0.69,0.42,0.13,0.50,-0.08,-0.69,-0.42,-0.39,,
105,100,,LBY,434.0,Libya,UMIC,6871,51757,7640,http://www.pm.gov.ly,...,0.26,0.31,0.01,0.50,-1.49,-1.09,-0.71,-0.42,,
106,101,,LIE,438.0,Liechtenstein,HIC,38,4160,116430,http://regierung.li,...,0.30,0.48,0.07,0.87,-1.35,-0.49,-0.57,0.51,,


In [52]:
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df.iloc[:,205]
df['Country Name'] = df['Economy']
df['Year'] = 2020

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [53]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(107)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Afghanistan,2020,Is personal data siloed,-0.04,1.0,True
1,Albania,2020,Is personal data siloed,0.02,6.0,True
2,Algeria,2020,Is personal data siloed,-0.04,1.0,True
3,Andorra,2020,Is personal data siloed,0.02,6.0,True
4,Angola,2020,Is personal data siloed,0.02,6.0,True
...,...,...,...,...,...,...
106,Liechtenstein,2020,Is personal data siloed,0.02,6.0,True
107,Lithuania,2020,Is personal data siloed,0.02,6.0,True
108,Luxembourg,2020,Is personal data siloed,0.02,6.0,True
109,"Macao SAR, China",2020,Is personal data siloed,0.02,6.0,True


## 8. Open data index

In [54]:
bnames

Unnamed: 0,Indicator,check,Data Source,Index,Filename
148,Digital payments penetration,Foundations,Portulans Institute,True,digital_payments_penetration
149,% of population with digital finance account -...,Foundations,World Bank,False,population_digital_financial_services
150,% of population with digital finance account -...,Foundations,World Bank,False,population_digital_financial_services
154,% of population with ID,Foundations,World Bank,False,id4d_nid
155,% of services that can be accessed,Foundations,World Bank,False,id4d_services
156,can ID be used for transactions,Foundations,World Bank,False,id4d_services
157,Is personal data siloed,Foundations,World Bank,False,Egov_strategy
158,Open data index,Foundations,Open Knowledge Foundation,True,open_data_idx


In [55]:
indicator = indicators[7]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Open data index
open_data_idx


In [56]:
df.head(15)

Unnamed: 0,id,site,name,slug,region,continent,rank,score
0,tw,global,Taiwan,taiwan,,,1,90
1,au,global,Australia,australia,,,2,79
2,gb,global,Great Britain,united_kingdom,,,2,79
3,fr,global,France,france,,,4,70
4,fi,global,Finland,finland,,,5,69
5,ca,global,Canada,canada,,,5,69
6,no,global,Norway,norway,,,5,69
7,br,global,Brazil,brazil,,,8,68
8,nz,global,New Zealand,new_zealand,,,8,68
9,nir,global,Northern Ireland,northern_ireland,,,10,67


In [57]:
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['score'] 
df['Country Name'] = df['name']
df['Year'] = 2016

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [58]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Taiwan,2016,Open data index,90,6.0,True
1,Australia,2016,Open data index,79,5.382022,True
2,Great Britain,2016,Open data index,79,5.382022,True
3,France,2016,Open data index,70,4.876404,True
4,Finland,2016,Open data index,69,4.820225,True
5,Canada,2016,Open data index,69,4.820225,True
6,Norway,2016,Open data index,69,4.820225,True
7,Brazil,2016,Open data index,68,4.764045,True
8,New Zealand,2016,Open data index,68,4.764045,True
9,Northern Ireland,2016,Open data index,67,4.707865,True


### Score Aggregating

In [59]:
import os


In [60]:
# get list of files in scores folder
scores = os.listdir('../indicator_scores/')
scores = [s for s in scores if s.startswith('foundations')]

In [61]:
scores

['foundations_digital_payments_penetration_scores.csv']

In [62]:
# create a dataframe that concatenates all these file into one table
df = pd.concat([pd.read_csv('../indicator_scores/{}'.format(s)) for s in scores])    

In [63]:
df

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
0,Norway,Digital payments penetration,100.00,6.0000,True
1,Denmark,Digital payments penetration,97.24,5.8620,True
2,Finland,Digital payments penetration,93.95,5.6975,True
3,Sweden,Digital payments penetration,93.08,5.6540,True
4,Netherlands,Digital payments penetration,89.01,5.4505,True
...,...,...,...,...,...
129,Jamaica,Digital payments penetration,,,True
130,Madagascar,Digital payments penetration,,,True
131,Oman,Digital payments penetration,,,True
132,Qatar,Digital payments penetration,,,True


In [64]:
# Data cleaning
df['new_rank_score'] = df['new_rank_score'].fillna(0)
df.sort_values(by=['Country Name'], ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)

In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country Name      134 non-null    object 
 1   Indicator         134 non-null    object 
 2   data_col          122 non-null    float64
 3   new_rank_score    134 non-null    float64
 4   higher_is_better  134 non-null    bool   
dtypes: bool(1), float64(2), object(2)
memory usage: 4.4+ KB


In [66]:
df.head(15)

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
0,Albania,Digital payments penetration,8.62,1.431,True
1,Algeria,Digital payments penetration,4.16,1.208,True
2,Angola,Digital payments penetration,,0.0,True
3,Argentina,Digital payments penetration,22.93,2.1465,True
4,Armenia,Digital payments penetration,17.52,1.876,True
5,Australia,Digital payments penetration,79.37,4.9685,True
6,Austria,Digital payments penetration,59.89,3.9945,True
7,Azerbaijan,Digital payments penetration,6.93,1.3465,True
8,Bahrain,Digital payments penetration,40.0,3.0,True
9,Bangladesh,Digital payments penetration,14.08,1.704,True


In [67]:
df.describe()

Unnamed: 0,data_col,new_rank_score
count,122.0,134.0
mean,36.076967,2.552757
std,24.867406,1.432577
min,0.0,0.0
25%,16.235,1.627375
50%,30.87,2.40025
75%,51.11,3.507125
max,100.0,6.0


In [68]:
# checking country names
sorted(df['Country Name'].unique().tolist())

['Albania',
 'Algeria',
 'Angola',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahrain',
 'Bangladesh',
 'Belarus',
 'Belgium',
 'Benin',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Congo, Dem. Rep.',
 'Costa Rica',
 'Croatia',
 'Cyprus',
 'Czech Republic',
 "Côte d'Ivoire",
 'Denmark',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Finland',
 'France',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guatemala',
 'Guinea',
 'Honduras',
 'Hong Kong (China)',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran, Islamic Rep.',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Korea, Rep.',
 'Kuwait',
 'Kyrgyzstan',
 'Lao PDR',
 'Latvia',
 'Lebanon',
 'Lesotho',
 'Lithuania',
 'Luxembourg',
 'Madagascar

In [69]:
# remove trailing whitespaces from country name
df['Country Name'] = df['Country Name'].str.strip()


In [70]:
# checking country names
sorted(df['Country Name'].unique().tolist())

['Albania',
 'Algeria',
 'Angola',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahrain',
 'Bangladesh',
 'Belarus',
 'Belgium',
 'Benin',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Congo, Dem. Rep.',
 'Costa Rica',
 'Croatia',
 'Cyprus',
 'Czech Republic',
 "Côte d'Ivoire",
 'Denmark',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Finland',
 'France',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guatemala',
 'Guinea',
 'Honduras',
 'Hong Kong (China)',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran, Islamic Rep.',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Korea, Rep.',
 'Kuwait',
 'Kyrgyzstan',
 'Lao PDR',
 'Latvia',
 'Lebanon',
 'Lesotho',
 'Lithuania',
 'Luxembourg',
 'Madagascar

In [71]:
# average indicator scores per country
agg_df = df.groupby(['Country Name']).agg({'new_rank_score':'mean','data_col':'count'})

In [72]:
agg_df.columns = ['agg_score', 'count_source' ]

In [73]:
max_number_sources = agg_df.describe()['count_source']['max']

In [74]:
agg_df['agg_score_wt'] = agg_df['agg_score']*(agg_df['count_source']/max_number_sources)

In [75]:
agg_df.sort_values(by='agg_score', ascending=False, inplace=True)

In [76]:
agg_df.head(25)

Unnamed: 0_level_0,agg_score,count_source,agg_score_wt
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Norway,6.0,1,6.0
Denmark,5.862,1,5.862
Finland,5.6975,1,5.6975
Sweden,5.654,1,5.654
Netherlands,5.4505,1,5.4505
New Zealand,5.3365,1,5.3365
United States,5.22,1,5.22
Estonia,5.141,1,5.141
"Korea, Rep.",5.132,1,5.132
Canada,5.0765,1,5.0765


In [77]:
agg_df.to_csv('../pillar_scores/foundation_scores_v0.csv')