In [172]:
import pandas as pd
import numpy as np

### Get all the pillar names from the excel

In [173]:
names = pd.read_excel('../../UNDP Digital Assessment Data Framework Filename Matching V7.xlsx')

In [174]:
col_names = ['Indicator','check', 'Data Source','Data Link','Index','Filename','Sub-Pillar']

In [175]:
names = names[col_names]

In [176]:
names.head()

Unnamed: 0,Indicator,check,Data Source,Data Link,Index,Filename,Sub-Pillar
0,Countries,,UN Statistics Division: List of Countries,https://unstats.un.org,False,Countries,
1,"Database of Global Administrative Areas (GADM,...",,,https://gadm.org,False,,
2,High Resolution Population Density Maps + Demo...,,,,False,,
3,population density vs openstreetmap object den...,,,,False,,
4,Population Density,Infrastructure,World Bank: World Development Indicators,https://datacatalog.worldbank.org,False,population_density,Connectivity Technology


In [177]:
# get all the files per pillar
data_stats = names.groupby('check').agg({'Filename':'count','Indicator':'count'})

In [178]:
data_stats

Unnamed: 0_level_0,Filename,Indicator
check,Unnamed: 1_level_1,Unnamed: 2_level_1
Business,18,27
Foundations,14,22
Government,10,15
Infrastructure,47,58
People,35,49
Regulation,5,8
Strategy,1,1


### Government

In [179]:
bnames = names[(names.check=='Government')&(~names.Filename.isna())]#&(names.Index==False)]

In [180]:
bnames.head(25)

Unnamed: 0,Indicator,check,Data Source,Data Link,Index,Filename,Sub-Pillar
63,Online-Service-Index (OSI),Government,UN: E-Government Survey,https://publicadministration.un.org/egovkb,True,e_government_index,Digital Public Services
64,E-Participation index,Government,UN: E-Government Survey,https://publicadministration.un.org/egovkb,True,e_government_index,Digital Public Services
65,Use of public services online (% of services o...,Government,Boston Consulting Group/Salesforce: The Global...,https://www.salesforce.com,False,digital_public_service_use,Digital Public Services
66,Security incidents (# of relevant issues),Government,SPECOPS,https://specopssoft.com,False,cyber_attacks,Digital Public Services
67,What is the % change of government digitizing ...,Government,World Bank: GovTech Dataset,https://datacatalog.worldbank.org,True,Egov_strategy,Digital Public Services
68,R&D spending (% of GDP),Government,World Bank: World Development Indicators,https://datacatalog.worldbank.org,False,RD_Percentage_GDP,Funding and procurement
69,ICT investment as a percentage of GDP,Government,OECD: Going Digital Toolkit,https://data.oecd.org,False,ICT_Investment,Funding and procurement
71,Evidence of digital strategies in/across Minst...,Government,World Bank: GovTech Dataset,https://datacatalog.worldbank.org,False,Egov_strategy,Funding and procurement
72,Evidence of focus on vulnerable groups,Government,World Bank: GovTech Dataset,https://datacatalog.worldbank.org,False,Egov_strategy,Leadership and coordination
75,% of digital skills certifications / training ...,Government,Coursera: Global Skills Reports,https://www.coursera.org/skills-reports/global,False,digital_skill_level,Capabilities


In [181]:
# get list of names for all indicators
indicators = bnames.Indicator.unique()
subpillars = bnames['Sub-Pillar'].unique()

In [182]:
# get all file names
bfiles = bnames.Filename.unique()

In [183]:
bfiles

array(['e_government_index', 'digital_public_service_use',
       'cyber_attacks', 'Egov_strategy', 'RD_Percentage_GDP',
       'ICT_Investment', 'digital_skill_level'], dtype=object)

In [184]:
subpillars

array(['Digital Public Services', 'Funding and procurement',
       'Leadership and coordination', 'Capabilities'], dtype=object)

In [185]:
# formula for converting scale
def convert_rank(old_value, old_min=1, old_max=7, new_min=1, new_max=6 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value

### 1. Online-Service-Index (OSI)

In [186]:
indicators[0]

'Online-Service-Index (OSI)'

In [187]:
# load data
indicator = indicators[0]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Online-Service-Index (OSI)
e_government_index


In [188]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Digital Public Services


In [189]:
df.head()

Unnamed: 0,Survey Year,Country Name,E-Government Rank,E-Government Index,E-Participation Index,Online Service Index,Human Capital Index,Telecommunication Infrastructure Index
0,2020,Iraq,143,0.436,0.3095,0.3353,0.4358,0.537
1,2020,Ireland,27,0.8433,0.8571,0.7706,0.9494,0.81
2,2020,Israel,30,0.8361,0.7143,0.7471,0.8924,0.8689
3,2020,Italy,37,0.8231,0.8214,0.8294,0.8466,0.7932
4,2020,Jamaica,114,0.5392,0.369,0.3882,0.7142,0.5151


In [190]:
# all data from 2020
df['Survey Year'].value_counts()

2020    193
Name: Survey Year, dtype: int64

In [191]:
# score looks like the one to use
df.describe()

Unnamed: 0,Survey Year,E-Government Rank,E-Government Index,E-Participation Index,Online Service Index,Human Capital Index,Telecommunication Infrastructure Index
count,193.0,193.0,193.0,193.0,193.0,193.0,193.0
mean,2020.0,97.0,0.598767,0.567723,0.561961,0.687992,0.546354
std,0.0,55.858452,0.214869,0.259592,0.249874,0.19444,0.259358
min,2020.0,1.0,0.0875,0.0,0.0,0.0,0.0
25%,2020.0,49.0,0.432,0.3571,0.3529,0.5599,0.3496
50%,2020.0,97.0,0.6129,0.5714,0.5765,0.7395,0.5669
75%,2020.0,145.0,0.7798,0.7976,0.7647,0.8414,0.7723
max,2020.0,193.0,0.9758,1.0,1.0,1.0,1.0


In [192]:
# df.Indicator.unique()

In [193]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Online Service Index'] 
df['Year'] = df['Survey Year']
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))



In [194]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Iraq,2020,Online-Service-Index (OSI),0.3353,2.6765,True,Digital Public Services
1,Ireland,2020,Online-Service-Index (OSI),0.7706,4.8530,True,Digital Public Services
2,Israel,2020,Online-Service-Index (OSI),0.7471,4.7355,True,Digital Public Services
3,Italy,2020,Online-Service-Index (OSI),0.8294,5.1470,True,Digital Public Services
4,Jamaica,2020,Online-Service-Index (OSI),0.3882,2.9410,True,Digital Public Services
...,...,...,...,...,...,...,...
188,Senegal,2020,Online-Service-Index (OSI),0.4941,3.4705,True,Digital Public Services
189,Serbia,2020,Online-Service-Index (OSI),0.7941,4.9705,True,Digital Public Services
190,Seychelles,2020,Online-Service-Index (OSI),0.6176,4.0880,True,Digital Public Services
191,Singapore,2020,Online-Service-Index (OSI),0.9647,5.8235,True,Digital Public Services


In [195]:
# output scores
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator))

## 2. E-Participation index


In [196]:
indicator = indicators[1]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

E-Participation index
e_government_index


In [197]:
df.head()

Unnamed: 0,Survey Year,Country Name,E-Government Rank,E-Government Index,E-Participation Index,Online Service Index,Human Capital Index,Telecommunication Infrastructure Index
0,2020,Iraq,143,0.436,0.3095,0.3353,0.4358,0.537
1,2020,Ireland,27,0.8433,0.8571,0.7706,0.9494,0.81
2,2020,Israel,30,0.8361,0.7143,0.7471,0.8924,0.8689
3,2020,Italy,37,0.8231,0.8214,0.8294,0.8466,0.7932
4,2020,Jamaica,114,0.5392,0.369,0.3882,0.7142,0.5151


In [198]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Digital Public Services


In [199]:
df['Survey Year'].unique()

array([2020], dtype=int64)

In [200]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df.iloc[:,4]
df['Year'] = df['Survey Year']
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))


In [201]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Iraq,2020,E-Participation index,0.3095,2.5475,True,Digital Public Services
1,Ireland,2020,E-Participation index,0.8571,5.2855,True,Digital Public Services
2,Israel,2020,E-Participation index,0.7143,4.5715,True,Digital Public Services
3,Italy,2020,E-Participation index,0.8214,5.1070,True,Digital Public Services
4,Jamaica,2020,E-Participation index,0.3690,2.8450,True,Digital Public Services
...,...,...,...,...,...,...,...
188,Senegal,2020,E-Participation index,0.4405,3.2025,True,Digital Public Services
189,Serbia,2020,E-Participation index,0.8214,5.1070,True,Digital Public Services
190,Seychelles,2020,E-Participation index,0.5714,3.8570,True,Digital Public Services
191,Singapore,2020,E-Participation index,0.9762,5.8810,True,Digital Public Services


In [202]:
# output scores
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator))

## 3. Use of public services online (% of services online, penetration, frequency of use)


In [203]:
indicator = indicators[2]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Use of public services online (% of services online, penetration, frequency of use)
digital_public_service_use


In [204]:
# drop first row
df = df.iloc[1: , :]

In [205]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 1 to 31
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Country             31 non-null     object
 1   Net Perception (%)  31 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 628.0+ bytes


In [206]:
df.head(15)

Unnamed: 0,Country,Net Perception (%)
1,UAE,61
2,Saudi Arabia,59
3,Singapore,54
4,China,53
5,New Zealand,52
6,Netherlands,51
7,Qatar,51
8,Canada,40
9,Denmark,48
10,India,45


In [207]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Digital Public Services


In [208]:
# create the standard columns
df['higher_is_better'] = True
df['Indicator'] =  indicator
df['Country Name'] = df['Country']
# df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df['Net Perception (%)']
df['Year'] = 2020
df['Sub-Pillar'] = subpillar

In [209]:
# convert 1-100 %  into 1-6
min_rank = df['data_col'].min()
max_rank = df['data_col'].max()
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=min_rank,old_max=max_rank))

df.sort_values(by='new_rank_score', ascending=False)

# prepare output
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]

# output scores
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator), index=False)

In [210]:
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
1,UAE,2020,Use of public services online (% of services o...,61,6.0,True,Digital Public Services
2,Saudi Arabia,2020,Use of public services online (% of services o...,59,5.876543,True,Digital Public Services
3,Singapore,2020,Use of public services online (% of services o...,54,5.567901,True,Digital Public Services
4,China,2020,Use of public services online (% of services o...,53,5.506173,True,Digital Public Services
5,New Zealand,2020,Use of public services online (% of services o...,52,5.444444,True,Digital Public Services
6,Netherlands,2020,Use of public services online (% of services o...,51,5.382716,True,Digital Public Services
7,Qatar,2020,Use of public services online (% of services o...,51,5.382716,True,Digital Public Services
8,Canada,2020,Use of public services online (% of services o...,40,4.703704,True,Digital Public Services
9,Denmark,2020,Use of public services online (% of services o...,48,5.197531,True,Digital Public Services
10,India,2020,Use of public services online (% of services o...,45,5.012346,True,Digital Public Services


## 4. Security incidents (# of relevant issues)



In [211]:
indicator = indicators[3]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Security incidents (# of relevant issues)
cyber_attacks


In [212]:
df.head()

Unnamed: 0,Country,Number of Significant Cyberattacks (2006-2020)
0,United States,156
1,United Kingdom,47
2,India,23
3,Germany,21
4,South Korea,18


In [213]:
# create a rank from the number of attacks fields
df['data_rank'] = df['Number of Significant Cyberattacks (2006-2020)'].rank(method='max')

In [214]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Digital Public Services


In [215]:
# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['data_rank']
df['Year'] = 2020
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 1-20 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# need to invert since higher rank is not better
df['new_rank_score'] = (6-df['new_rank_score'])+1

# # prepare output
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]

df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,United States,2020,Security incidents (# of relevant issues),20.0,1.0,True,Digital Public Services
1,United Kingdom,2020,Security incidents (# of relevant issues),19.0,1.263158,True,Digital Public Services
2,India,2020,Security incidents (# of relevant issues),18.0,1.526316,True,Digital Public Services
3,Germany,2020,Security incidents (# of relevant issues),17.0,1.789474,True,Digital Public Services
4,South Korea,2020,Security incidents (# of relevant issues),16.0,2.052632,True,Digital Public Services
5,Australia,2020,Security incidents (# of relevant issues),15.0,2.315789,True,Digital Public Services
6,Ukraine,2020,Security incidents (# of relevant issues),15.0,2.315789,True,Digital Public Services
7,China,2020,Security incidents (# of relevant issues),13.0,2.842105,True,Digital Public Services
8,Iran,2020,Security incidents (# of relevant issues),13.0,2.842105,True,Digital Public Services
9,Saudi Arabia,2020,Security incidents (# of relevant issues),13.0,2.842105,True,Digital Public Services


In [216]:
# # output scores
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator), index=False)

## 5. What is the % change of government digitizing public services?



In [217]:
indicator = indicators[4]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

What is the % change of government digitizing public services?
Egov_strategy


In [218]:
df.head()

Unnamed: 0,#,Flag,Code,Cnum,Economy,Level,Population,GNI,GNIPC,e-Government,...,NGTI-1,NGTI-2,NGTI-3,NGTI-4,GTI-1,GTI-2,GTI-3,GTI-4,data_country,data_year
0,1,,AFG,4.0,Afghanistan,LIC,38928,20726,540,https://mcit.gov.af/node/6938,...,0.69,0.52,0.31,0.62,-0.09,-0.34,0.02,-0.12,,
1,2,,ALB,8.0,Albania,UMIC,2878,14949,5240,https://e-albania.al/,...,0.81,0.78,0.6,0.77,0.29,0.62,0.74,0.26,,
2,3,,DZA,12.0,Algeria,LMIC,43851,170722,3970,https://www.mpttn.gov.dz/ar/content/%D8%A7%D9%...,...,0.73,0.46,0.02,0.6,0.06,-0.56,-0.68,-0.15,,
3,4,,ADO,20.0,Andorra,HIC,77,3154,40886,http://www.govern.ad,...,0.63,0.6,0.05,0.13,-0.28,-0.06,-0.62,-1.32,,
4,5,,AGO,24.0,Angola,LMIC,32866,97005,3050,http://www.governo.gov.ao,...,0.68,0.69,0.21,0.61,-0.12,0.27,-0.23,-0.13,,


In [219]:
# Must limit the database to the first 206 rows, the remaining rows do not contain any useful information
df = df.iloc[0:205,:]

# Drop the superfluous rows by dropping na
df = df[pd.to_numeric(df['#'], errors='coerce').notnull()]
df['#'] = df[df['#'].notna()]

# Must convert data in the DPL column into float
df['CGSI'] = df['CGSI'].replace('-',np.nan)
df['CGSI'] = df['CGSI'].astype(float, errors = 'ignore')

ValueError: Columns must be same length as key

In [220]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Digital Public Services


In [221]:
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['CGSI']
df['Country Name'] = df['Economy']
df['Year'] = 2020
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [222]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Afghanistan,2020,What is the % change of government digitizing ...,0.69,4.520408,True,Digital Public Services
1,Albania,2020,What is the % change of government digitizing ...,0.79,5.030612,True,Digital Public Services
2,Algeria,2020,What is the % change of government digitizing ...,0.71,4.622449,True,Digital Public Services
3,Andorra,2020,What is the % change of government digitizing ...,0.58,3.959184,True,Digital Public Services
4,Angola,2020,What is the % change of government digitizing ...,0.67,4.418367,True,Digital Public Services
5,Antigua and Barbuda,2020,What is the % change of government digitizing ...,0.68,4.469388,True,Digital Public Services
6,Argentina,2020,What is the % change of government digitizing ...,0.81,5.132653,True,Digital Public Services
7,Armenia,2020,What is the % change of government digitizing ...,0.73,4.72449,True,Digital Public Services
8,Australia,2020,What is the % change of government digitizing ...,0.89,5.540816,True,Digital Public Services
9,Austria,2020,What is the % change of government digitizing ...,,,True,Digital Public Services


In [223]:
# output scores
df.to_csv('../indicator_scores/government_percent change of government digitalizing_scores.csv'.format(indicator), index=False)

In [224]:
## 6. R&D spending (% of GDP)

In [225]:
indicator = indicators[5]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

R&D spending (% of GDP)
RD_Percentage_GDP


In [226]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 67 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   1960            0 non-null      float64
 1   1961            0 non-null      float64
 2   1962            0 non-null      float64
 3   1963            0 non-null      float64
 4   1964            0 non-null      float64
 5   1965            0 non-null      float64
 6   1966            0 non-null      float64
 7   1967            0 non-null      float64
 8   1968            0 non-null      float64
 9   1969            0 non-null      float64
 10  1970            0 non-null      float64
 11  1971            0 non-null      float64
 12  1972            0 non-null      float64
 13  1973            0 non-null      float64
 14  1974            0 non-null      float64
 15  1975            0 non-null      float64
 16  1976            0 non-null      float64
 17  1977            0 non-null      flo

In [227]:
df.head()

Unnamed: 0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2017,2018,2019,2020,Country Name,Country Code,Indicator Name,Indicator Code,data_country,data_year
0,,,,,,,,,,,...,,,,,Aruba,ABW,Research and development expenditure (% of GDP),GB.XPD.RSDV.GD.ZS,,
1,,,,,,,,,,,...,,,,,Africa Eastern and Southern,AFE,Research and development expenditure (% of GDP),GB.XPD.RSDV.GD.ZS,,
2,,,,,,,,,,,...,,,,,Afghanistan,AFG,Research and development expenditure (% of GDP),GB.XPD.RSDV.GD.ZS,,
3,,,,,,,,,,,...,,,,,Africa Western and Central,AFW,Research and development expenditure (% of GDP),GB.XPD.RSDV.GD.ZS,,
4,,,,,,,,,,,...,,,,,Angola,AGO,Research and development expenditure (% of GDP),GB.XPD.RSDV.GD.ZS,,


In [228]:
# create standard columns
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2018'] 
df['Sub-Pillar'] = subpillar
df['Year'] = 2018

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [229]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Aruba,2018,R&D spending (% of GDP),,,True,Digital Public Services
1,Africa Eastern and Southern,2018,R&D spending (% of GDP),,,True,Digital Public Services
2,Afghanistan,2018,R&D spending (% of GDP),,,True,Digital Public Services
3,Africa Western and Central,2018,R&D spending (% of GDP),,,True,Digital Public Services
4,Angola,2018,R&D spending (% of GDP),,,True,Digital Public Services
...,...,...,...,...,...,...,...
261,Kosovo,2018,R&D spending (% of GDP),,,True,Digital Public Services
262,"Yemen, Rep.",2018,R&D spending (% of GDP),,,True,Digital Public Services
263,South Africa,2018,R&D spending (% of GDP),,,True,Digital Public Services
264,Zambia,2018,R&D spending (% of GDP),,,True,Digital Public Services


In [230]:
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator), index=False)

## 7. ICT investment as a percentage of GDP


In [231]:
indicator = indicators[6]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

ICT investment as a percentage of GDP
ICT_Investment


In [232]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450 entries, 0 to 449
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   LOCATION    450 non-null    object 
 1   INDICATOR   450 non-null    object 
 2   SUBJECT     450 non-null    object 
 3   MEASURE     450 non-null    object 
 4   FREQUENCY   450 non-null    object 
 5   TIME        450 non-null    int64  
 6   Value       450 non-null    float64
 7   Flag Codes  0 non-null      float64
dtypes: float64(2), int64(1), object(5)
memory usage: 28.2+ KB


In [233]:
df.head()

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,AUS,ICTINVST,TOT,PC,A,1985,12.465454,
1,AUS,ICTINVST,TOT,PC,A,1986,13.939533,
2,AUS,ICTINVST,TOT,PC,A,1987,14.142429,
3,AUS,ICTINVST,TOT,PC,A,1988,14.289993,
4,AUS,ICTINVST,TOT,PC,A,1989,15.348707,


In [234]:
df.SUBJECT.unique()

array(['TOT'], dtype=object)

In [235]:
df.TIME.max()

2010

In [236]:
df.Value.describe()

count    450.000000
mean      16.074272
std        5.636379
min        2.798417
25%       12.410442
50%       14.840726
75%       19.419433
max       32.601105
Name: Value, dtype: float64

In [237]:
subpillars[1]
subpillar = subpillars[1]
print(subpillar)

Funding and procurement


In [238]:
dcol = 'Value'
indicol = indicator
cname = 'LOCATION'

# filter most recent year
df = df[(df.TIME==2010)]

# create standard columns
df['higher_is_better'] = True
df['Indicator'] =  indicator
df['Country Name'] = df[cname]
# # df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df[dcol]
df['Sub-Pillar'] = subpillar
df['Year'] = 2010

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 1-20 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# df.sort_values(by='new_rank_score', ascending=False)

# # prepare output
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]

# # output scores
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator), index=False)

In [239]:
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
72,CAN,2010,ICT investment as a percentage of GDP,17.018367,2.47101,True,Funding and procurement
121,FIN,2010,ICT investment as a percentage of GDP,15.519828,2.121241,True,Funding and procurement
172,DEU,2010,ICT investment as a percentage of GDP,12.690394,1.460831,True,Funding and procurement
198,IRL,2010,ICT investment as a percentage of GDP,12.412805,1.39604,True,Funding and procurement
224,ITA,2010,ICT investment as a percentage of GDP,11.026056,1.072363,True,Funding and procurement
274,KOR,2010,ICT investment as a percentage of GDP,10.716026,1.0,True,Funding and procurement
323,NZL,2010,ICT investment as a percentage of GDP,21.238183,3.455945,True,Funding and procurement
349,ESP,2010,ICT investment as a percentage of GDP,13.763415,1.711282,True,Funding and procurement
400,CHE,2010,ICT investment as a percentage of GDP,18.506691,2.818396,True,Funding and procurement
449,USA,2010,ICT investment as a percentage of GDP,32.137836,6.0,True,Funding and procurement


## 8. Evidence of digital strategies in/across Ministries



In [240]:
indicator = indicators[7]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Evidence of digital strategies in/across Minstries
Egov_strategy


In [241]:
df.head()

Unnamed: 0,#,Flag,Code,Cnum,Economy,Level,Population,GNI,GNIPC,e-Government,...,NGTI-1,NGTI-2,NGTI-3,NGTI-4,GTI-1,GTI-2,GTI-3,GTI-4,data_country,data_year
0,1,,AFG,4.0,Afghanistan,LIC,38928,20726,540,https://mcit.gov.af/node/6938,...,0.69,0.52,0.31,0.62,-0.09,-0.34,0.02,-0.12,,
1,2,,ALB,8.0,Albania,UMIC,2878,14949,5240,https://e-albania.al/,...,0.81,0.78,0.6,0.77,0.29,0.62,0.74,0.26,,
2,3,,DZA,12.0,Algeria,LMIC,43851,170722,3970,https://www.mpttn.gov.dz/ar/content/%D8%A7%D9%...,...,0.73,0.46,0.02,0.6,0.06,-0.56,-0.68,-0.15,,
3,4,,ADO,20.0,Andorra,HIC,77,3154,40886,http://www.govern.ad,...,0.63,0.6,0.05,0.13,-0.28,-0.06,-0.62,-1.32,,
4,5,,AGO,24.0,Angola,LMIC,32866,97005,3050,http://www.governo.gov.ao,...,0.68,0.69,0.21,0.61,-0.12,0.27,-0.23,-0.13,,


In [242]:
# Must limit the database to the first 206 rows, the remaining rows do not contain any useful information
df = df.iloc[0:205,:]

# Drop the superfluous rows by dropping na
df = df[pd.to_numeric(df['#'], errors='coerce').notnull()]
df['#'] = df[df['#'].notna()]

# Must convert data in the DPL column into float
df['DG St'] = df['DG St'].replace('-',np.nan)
df['DG St'] = df['DG St'].astype(float, errors = 'ignore')

ValueError: Columns must be same length as key

In [243]:
subpillars[1]
subpillar = subpillars[1]
print(subpillar)

Funding and procurement


In [244]:
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['DG St']
df['Country Name'] = df['Economy']
df['Year'] = 2020
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

max_rank

'3'

In [245]:
# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [246]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df.head(15)

KeyError: "['new_rank_score'] not in index"

In [247]:
# # output scores
df.to_csv('../indicator_scores/government_Evidence of digital strategies_scores.csv'.format(indicator), index=False)

## 9. Evidence of focus on vulnerable groups


In [248]:
indicator = indicators[8]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

df = df.iloc[1:,:]

Evidence of focus on vulnerable groups
Egov_strategy


In [249]:
df.head()

Unnamed: 0,#,Flag,Code,Cnum,Economy,Level,Population,GNI,GNIPC,e-Government,...,NGTI-1,NGTI-2,NGTI-3,NGTI-4,GTI-1,GTI-2,GTI-3,GTI-4,data_country,data_year
1,2,,ALB,8.0,Albania,UMIC,2878,14949,5240,https://e-albania.al/,...,0.81,0.78,0.6,0.77,0.29,0.62,0.74,0.26,,
2,3,,DZA,12.0,Algeria,LMIC,43851,170722,3970,https://www.mpttn.gov.dz/ar/content/%D8%A7%D9%...,...,0.73,0.46,0.02,0.6,0.06,-0.56,-0.68,-0.15,,
3,4,,ADO,20.0,Andorra,HIC,77,3154,40886,http://www.govern.ad,...,0.63,0.6,0.05,0.13,-0.28,-0.06,-0.62,-1.32,,
4,5,,AGO,24.0,Angola,LMIC,32866,97005,3050,http://www.governo.gov.ao,...,0.68,0.69,0.21,0.61,-0.12,0.27,-0.23,-0.13,,
5,6,,ATG,28.0,Antigua and Barbuda,HIC,98,1618,16660,http://www.ab.gov.ag,...,0.72,0.55,0.05,0.42,0.0,-0.23,-0.62,-0.59,,


In [250]:
# Must limit the database to the first 206 rows, the remaining rows do not contain any useful information
df = df.iloc[0:205,:]

# Drop the superfluous rows by dropping na
df = df[pd.to_numeric(df['#'], errors='coerce').notnull()]
df['#'] = df[df['#'].notna()]

# Must convert data in the DPL column into float
df['WoG'] = df['WoG'].replace('-',np.nan)
df['WoG'] = df['WoG'].astype(float, errors = 'ignore')

ValueError: Columns must be same length as key

In [251]:
df['WoG'].describe

<bound method NDFrame.describe of 1       0.03
2      -0.03
3      -0.03
4       0.03
5      -0.03
       ...  
200     0.03
201    -0.03
202    -0.03
203    -0.03
204    -0.03
Name: WoG, Length: 197, dtype: object>

In [252]:
subpillars[2]
subpillar = subpillars[2]
print(subpillar)

Leadership and coordination


In [253]:
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['WoG']
df['Country Name'] = df['Economy']
df['Year'] = 2020
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

max_rank

'0.09'

In [254]:
# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [255]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

KeyError: "['new_rank_score'] not in index"

In [256]:
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator), index=False)

## 10. % of digital skills certifications / training courses completed


In [257]:
indicator = indicators[9]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of digital skills certifications / training courses completed
digital_skill_level


In [258]:
df

Unnamed: 0,Global Rank,Region,Country and Region,Year,Technology,Data Science
0,1,Europe,Switzerland,2020,84%,96%
1,2,Europe,Luxembourg,2020,62%,85%
2,3,Europe,Austria,2020,88%,95%
3,4,Asia Pacific,Japan,2020,100%,88%
4,5,Europe,Germany,2020,89%,94%
...,...,...,...,...,...,...
103,104,Asia Pacific,Uzbekistan,2020,6%,9%
104,105,Sub-Saharan Africa,Sierra Leone,2020,2%,4%
105,106,Latin America and the Caribbean,Paraguay,2020,7%,11%
106,107,Latin America and the Caribbean,Guyana,2020,8%,2%


In [259]:
subpillars[3]
subpillar = subpillars[3]
print(subpillar)

Capabilities


In [260]:
dcol = 'Global Rank'
indicol = indicator
cname = 'Country and Region'

# filter most recent year
# df = df[(df.TIME==2010)]

# create standard columns
df['higher_is_better'] = True
df['Indicator'] =  indicator
df['Country Name'] = df[cname]
# # df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df[dcol]
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 1-20 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# # need to invert score since higher rank is not better 
df['new_rank_score'] = (6-df['new_rank_score'])+1

df.sort_values(by='new_rank_score', ascending=False)

# # prepare output
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]

# # output scores
df.to_csv('../indicator_scores/government_ percentage digital skills certifications_scores.csv'.format(indicator), index=False)

In [261]:
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Switzerland,2020,% of digital skills certifications / training ...,1,6.000000,True,Capabilities
1,Luxembourg,2020,% of digital skills certifications / training ...,2,5.953271,True,Capabilities
2,Austria,2020,% of digital skills certifications / training ...,3,5.906542,True,Capabilities
3,Japan,2020,% of digital skills certifications / training ...,4,5.859813,True,Capabilities
4,Germany,2020,% of digital skills certifications / training ...,5,5.813084,True,Capabilities
...,...,...,...,...,...,...,...
103,Uzbekistan,2020,% of digital skills certifications / training ...,104,1.186916,True,Capabilities
104,Sierra Leone,2020,% of digital skills certifications / training ...,105,1.140187,True,Capabilities
105,Paraguay,2020,% of digital skills certifications / training ...,106,1.093458,True,Capabilities
106,Guyana,2020,% of digital skills certifications / training ...,107,1.046729,True,Capabilities


### Score Aggregating

In [262]:
import os


In [263]:
# get list of files in scores folder
scores = os.listdir('../indicator_scores/')
scores = [s for s in scores if s.startswith('government')]

In [264]:
scores

['government_ percentage digital skills certifications_scores.csv',
 'government_E-Participation index_scores.csv',
 'government_Evidence of digital strategies_scores.csv',
 'government_Evidence of focus on vulnerable groups_scores.csv',
 'government_ICT investment as a percentage of GDP_scores.csv',
 'government_Online-Service-Index (OSI)_scores.csv',
 'government_percent change of government digitalizing_scores.csv',
 'government_R&D spending (% of GDP)_scores.csv',
 'government_Security incidents (# of relevant issues)_scores.csv',
 'government_Use of public services online (% of services online, penetration, frequency of use)_scores.csv']

In [265]:
# create a dataframe that concatenates all these file into one table
df = pd.concat([pd.read_csv('../indicator_scores/{}'.format(s)) for s in scores])    

In [266]:
df

Unnamed: 0.1,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar,Unnamed: 0,#,Flag,...,NGTI-1,NGTI-2,NGTI-3,NGTI-4,GTI-1,GTI-2,GTI-3,GTI-4,data_country,data_year
0,Switzerland,2020,% of digital skills certifications / training ...,1.0,6.000000,True,Capabilities,,,,...,,,,,,,,,,
1,Luxembourg,2020,% of digital skills certifications / training ...,2.0,5.953271,True,Capabilities,,,,...,,,,,,,,,,
2,Austria,2020,% of digital skills certifications / training ...,3.0,5.906542,True,Capabilities,,,,...,,,,,,,,,,
3,Japan,2020,% of digital skills certifications / training ...,4.0,5.859813,True,Capabilities,,,,...,,,,,,,,,,
4,Germany,2020,% of digital skills certifications / training ...,5.0,5.813084,True,Capabilities,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,Germany,2020,Use of public services online (% of services o...,17.0,3.283951,True,Digital Public Services,,,,...,,,,,,,,,,
27,Argentina,2020,Use of public services online (% of services o...,15.0,3.160494,True,Digital Public Services,,,,...,,,,,,,,,,
28,Morocco,2020,Use of public services online (% of services o...,-2.0,2.111111,True,Digital Public Services,,,,...,,,,,,,,,,
29,Switzerland,2020,Use of public services online (% of services o...,-18.0,1.123457,True,Digital Public Services,,,,...,,,,,,,,,,


In [267]:
# Data cleaning
df['new_rank_score'] = df['new_rank_score'].fillna(0)
df.sort_values(by=['Country Name'], ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)

In [268]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1414 entries, 0 to 1413
Columns: 340 entries, Country Name to data_year
dtypes: bool(1), float64(119), int64(1), object(219)
memory usage: 3.7+ MB


In [269]:
df.head(15)

Unnamed: 0.1,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar,Unnamed: 0,#,Flag,...,NGTI-1,NGTI-2,NGTI-3,NGTI-4,GTI-1,GTI-2,GTI-3,GTI-4,data_country,data_year
0,Afghanistan,2020,What is the % change of government digitizing ...,0.69,4.520408,True,Digital Public Services,,,,...,,,,,,,,,,
1,Afghanistan,2018,R&D spending (% of GDP),,0.0,True,Digital Public Services,,,,...,,,,,,,,,,
2,Afghanistan,2020,E-Participation index,0.4643,3.3215,True,Digital Public Services,29.0,,,...,,,,,,,,,,
3,Afghanistan,2020,Evidence of digital strategies in/across Minst...,2.0,0.0,True,Funding and procurement,,1.0,,...,0.69,0.52,0.31,0.62,-0.09,-0.34,0.02,-0.12,,
4,Afghanistan,2020,Online-Service-Index (OSI),0.4118,3.059,True,Digital Public Services,29.0,,,...,,,,,,,,,,
5,Africa Eastern and Southern,2018,R&D spending (% of GDP),,0.0,True,Digital Public Services,,,,...,,,,,,,,,,
6,Africa Western and Central,2018,R&D spending (% of GDP),,0.0,True,Digital Public Services,,,,...,,,,,,,,,,
7,Albania,2018,R&D spending (% of GDP),,0.0,True,Digital Public Services,,,,...,,,,,,,,,,
8,Albania,2020,Evidence of digital strategies in/across Minst...,3.0,0.0,True,Funding and procurement,,2.0,,...,0.81,0.78,0.6,0.77,0.29,0.62,0.74,0.26,,
9,Albania,2020,What is the % change of government digitizing ...,0.79,5.030612,True,Digital Public Services,,,,...,,,,,,,,,,


In [270]:
df.describe()

Unnamed: 0.1,Year,data_col,new_rank_score,Unnamed: 0,#,Flag,Cnum,eSrv,ePart'20,DG St,...,NGTI-1,NGTI-2,NGTI-3,NGTI-4,GTI-1,GTI-2,GTI-3,GTI-4,data_country,data_year
count,1414.0,1249.0,1414.0,386.0,395.0,0.0,395.0,395.0,395.0,395.0,...,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0,0.0,0.0
mean,2019.553041,6.573519,2.249773,96.0,99.749367,,434.313924,0.003241,0.000304,2.318987,...,0.717822,0.612021,0.303491,0.661207,0.000446,0.002835,0.004462,-0.003885,,
std,1.122857,18.441544,2.128344,55.785861,57.086011,,253.830808,0.053952,0.059944,1.019863,...,0.200766,0.215494,0.293886,0.252401,0.653818,0.779548,0.726303,0.626705,,
min,2010.0,-20.0,0.0,0.0,1.0,,4.0,-0.08,-0.13,0.0,...,0.0,0.0,0.0,0.0,-2.34,-2.21,-0.75,-1.65,,
25%,2020.0,0.2738,0.0,48.0,50.5,,213.0,-0.02,-0.05,2.0,...,0.66,0.47,0.05,0.51,-0.2,-0.5,-0.62,-0.38,,
50%,2020.0,0.7235,2.126724,96.0,100.0,,430.0,0.05,0.0,3.0,...,0.76,0.61,0.18,0.7,0.13,0.01,-0.31,0.08,,
75%,2020.0,2.0,4.329439,144.0,149.0,,659.0,0.05,0.05,3.0,...,0.85,0.76,0.55,0.88,0.43,0.54,0.6,0.53,,
max,2020.0,108.0,6.0,192.0,198.0,,926.0,0.05,0.1,3.0,...,1.0,1.0,1.0,1.0,0.92,1.4,1.72,0.84,,


In [271]:
# checking country names
# sorted(df['Country Name'].unique().tolist())

In [272]:
# remove trailing whitespaces from country name
df['Country Name'] = df['Country Name'].str.strip()
df['Country Name'] = df['Country Name'].str.strip('**')
df['Country Name'] = df['Country Name'].str.strip('*')

In [273]:
df.head()

Unnamed: 0.1,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar,Unnamed: 0,#,Flag,...,NGTI-1,NGTI-2,NGTI-3,NGTI-4,GTI-1,GTI-2,GTI-3,GTI-4,data_country,data_year
0,Afghanistan,2020,What is the % change of government digitizing ...,0.69,4.520408,True,Digital Public Services,,,,...,,,,,,,,,,
1,Afghanistan,2018,R&D spending (% of GDP),,0.0,True,Digital Public Services,,,,...,,,,,,,,,,
2,Afghanistan,2020,E-Participation index,0.4643,3.3215,True,Digital Public Services,29.0,,,...,,,,,,,,,,
3,Afghanistan,2020,Evidence of digital strategies in/across Minst...,2.0,0.0,True,Funding and procurement,,1.0,,...,0.69,0.52,0.31,0.62,-0.09,-0.34,0.02,-0.12,,
4,Afghanistan,2020,Online-Service-Index (OSI),0.4118,3.059,True,Digital Public Services,29.0,,,...,,,,,,,,,,


In [274]:
# checking country names
# sorted(df['Country Name'].unique().tolist())

In [275]:
# average indicator scores per country
agg_df = df.groupby(['Country Name']).agg({'new_rank_score':'mean','data_col':'count'})

In [276]:
agg_df.columns = ['agg_score', 'count_source' ]

In [277]:
max_number_sources = agg_df.describe()['count_source']['max']

In [278]:
agg_df['agg_score_wt'] = agg_df['agg_score']*(agg_df['count_source']/max_number_sources)

In [279]:
agg_df.sort_values(by='agg_score', ascending=False, inplace=True)

In [280]:
agg_df.head(25)

Unnamed: 0_level_0,agg_score,count_source,agg_score_wt
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
UAE,6.0,1,0.666667
USA,6.0,1,0.666667
North Korea,6.0,1,0.666667
United Kingdom of Great Britain and Northern Ireland,5.8375,2,1.297222
Hong Kong,5.091244,2,1.131388
Republic of Korea,5.034268,3,1.678089
Taiwan,4.878505,1,0.542056
UK,4.82716,1,0.536351
Russia,4.794672,2,1.065483
Republic of Moldova,4.787,2,1.063778


In [281]:
agg_df.to_csv('../pillar_scores/government_scores_v0.csv')

In [282]:
### Score Aggregating by Subpillars

In [283]:
df.insert(0,'Pillar','Government')
df

In [284]:
sub_df = df.groupby(['Pillar','Sub-Pillar','Country Name']).agg({'new_rank_score':'mean','data_col':'count'})

In [285]:
sub_df.columns = ['agg_score', 'count_source' ]

In [286]:
max_number_sources = sub_df.describe()['count_source']['max']

In [287]:
sub_df['agg_score_wt'] = sub_df['agg_score']*(sub_df['count_source']/max_number_sources)

In [288]:
sub_df.to_csv('../subpillar_score/government_scores_subpillar_v0.csv')

### Sources Generation

In [289]:
#Get all countries from Countries.xlsx
countries = pd.read_excel('../../data/Countries.xlsx')
col_names = ['Country or Area']
countries = countries[col_names]
countries.rename(columns = {'Country or Area': 'Country Name'}, inplace = True)

In [290]:
#Get all indicators from names dataframe retrieve at the begining of the script
bnames=bnames[['check','Sub-Pillar','Indicator','Data Source','Data Link']]
bnames.rename(columns = {'check': 'Pillar'}, inplace = True)

In [291]:
#Do a nice cross join so that we have combination of all countries vs all indicators
sources = countries.merge(bnames, how='cross')
sources

Unnamed: 0,Country Name,Pillar,Sub-Pillar,Indicator,Data Source,Data Link
0,Algeria,Government,Digital Public Services,Online-Service-Index (OSI),UN: E-Government Survey,https://publicadministration.un.org/egovkb
1,Algeria,Government,Digital Public Services,E-Participation index,UN: E-Government Survey,https://publicadministration.un.org/egovkb
2,Algeria,Government,Digital Public Services,Use of public services online (% of services o...,Boston Consulting Group/Salesforce: The Global...,https://www.salesforce.com
3,Algeria,Government,Digital Public Services,Security incidents (# of relevant issues),SPECOPS,https://specopssoft.com
4,Algeria,Government,Digital Public Services,What is the % change of government digitizing ...,World Bank: GovTech Dataset,https://datacatalog.worldbank.org
...,...,...,...,...,...,...
2485,Wallis and Futuna Islands,Government,Funding and procurement,R&D spending (% of GDP),World Bank: World Development Indicators,https://datacatalog.worldbank.org
2486,Wallis and Futuna Islands,Government,Funding and procurement,ICT investment as a percentage of GDP,OECD: Going Digital Toolkit,https://data.oecd.org
2487,Wallis and Futuna Islands,Government,Funding and procurement,Evidence of digital strategies in/across Minst...,World Bank: GovTech Dataset,https://datacatalog.worldbank.org
2488,Wallis and Futuna Islands,Government,Leadership and coordination,Evidence of focus on vulnerable groups,World Bank: GovTech Dataset,https://datacatalog.worldbank.org


In [292]:
#Make copy of a scores dataframe and add the column available, with value of 1 (string)
#denoting all the country/indicator combinations that have value
dfsources = df[['Country Name','Pillar','Sub-Pillar','Indicator']].copy()
dfsources['Available'] = '1'

In [293]:
#If sources.csv exists, get the contents, remove everyhing from this pillar, append prepared sources, save csv.
#if sources.csv does not exist, create new file from sources.
from os.path import exists

if exists('../../dashboard/Sources.csv') :
    CurrentSources = pd.read_csv('../../dashboard/Sources.csv', dtype=str)
    CurrentSources = CurrentSources[['Country Name','Pillar','Sub-Pillar','Indicator','Data Source','Data Link','Available']]
    CurrentSources.loc[CurrentSources['Pillar'] != 'Government']
    CurrentSources = CurrentSources.append(sources)
else :
    CurrentSources = sources
CurrentSources

Unnamed: 0,Country Name,Pillar,Sub-Pillar,Indicator,Data Source,Data Link,Available
0,Algeria,Strategy,Ambition,SDG Index,Sustainable Development Report,https://www.sdgindex.org,1
1,Egypt,Strategy,Ambition,SDG Index,Sustainable Development Report,https://www.sdgindex.org,0
2,Libya,Strategy,Ambition,SDG Index,Sustainable Development Report,https://www.sdgindex.org,1
3,Morocco,Strategy,Ambition,SDG Index,Sustainable Development Report,https://www.sdgindex.org,1
4,Sudan,Strategy,Ambition,SDG Index,Sustainable Development Report,https://www.sdgindex.org,1
...,...,...,...,...,...,...,...
2485,Wallis and Futuna Islands,Government,Funding and procurement,R&D spending (% of GDP),World Bank: World Development Indicators,https://datacatalog.worldbank.org,
2486,Wallis and Futuna Islands,Government,Funding and procurement,ICT investment as a percentage of GDP,OECD: Going Digital Toolkit,https://data.oecd.org,
2487,Wallis and Futuna Islands,Government,Funding and procurement,Evidence of digital strategies in/across Minst...,World Bank: GovTech Dataset,https://datacatalog.worldbank.org,
2488,Wallis and Futuna Islands,Government,Leadership and coordination,Evidence of focus on vulnerable groups,World Bank: GovTech Dataset,https://datacatalog.worldbank.org,


In [294]:
CurrentSources.to_csv('../../dashboard/Sources.csv', index=False)