In [1]:
import pandas as pd
import numpy as np

In [2]:
### Get all the pillar names from the excel

In [3]:
names = pd.read_excel('../../UNDP Digital Assessment Data Framework Filename Matching V7.xlsx')

In [4]:
col_names = ['Indicator','check', 'Data Source','Index','Filename']

In [5]:
names = names[col_names]

In [6]:
names.head()

Unnamed: 0,Indicator,check,Data Source,Index,Filename
0,Countries,,United Nations,False,Countries
1,"Database of Global Administrative Areas (GADM,...",,GADM maps and data,False,
2,High Resolution Population Density Maps + Demo...,,Facebook,False,
3,population density vs openstreetmap object den...,,Kontur,False,
4,Population Density,Infrastructure,World Bank,False,population_density


In [7]:
# get all the files per pillar
data_stats = names.groupby('check').agg({'Filename':'count','Indicator':'count'})

In [8]:
data_stats

Unnamed: 0_level_0,Filename,Indicator
check,Unnamed: 1_level_1,Unnamed: 2_level_1
Business,20,25
Foundations,9,12
Government,11,15
Infrastructure,45,48
People,38,46
Regulation,6,7
Strategy,1,1


In [9]:
### People

In [12]:
bnames = names[(names.check=='People')&(~names.Filename.isna())]#&(names.Index==False)]
bnames

Unnamed: 0,Indicator,check,Data Source,Index,Filename
99,Human Capital Index (HCI),People,DESA,True,e_government_index
100,% of population using internet (all),People,World Bank,False,population_using_internet
101,% of population using internet (female),People,ITU,False,gender_gap_internet_usage
102,% of population using internet (male),People,ITU,False,gender_gap_internet_usage
103,SDG 4.4 Digital literacy data,People,UNESCO,False,SDG 4.4_Digital_literacy_data
104,UNDP Human Development Index (HDI),People,UNDP,True,undp_human_developmnt
105,Facebook Social Connectedness Index,People,Facebook,True,fb_social_connectedness
106,Share of individuals using the Internet to int...,People,OECD,False,population_interacting_public_officials
107,Level of satisfaction for online public servic...,People,Boston Consulting Group/SalesForce,False,digital_public_service_use
108,Number of mobile apps available in national la...,People,GSMA Mobile Connectivity Index,False,apps_in_national_language


In [14]:
# get list of names for all indicators
indicators = bnames.Indicator.unique()

In [15]:
# get all file names
bfiles = bnames.Filename.unique()

In [16]:
bfiles

array(['e_government_index', 'population_using_internet',
       'gender_gap_internet_usage', 'SDG 4.4_Digital_literacy_data',
       'undp_human_developmnt', 'fb_social_connectedness',
       'population_interacting_public_officials',
       'digital_public_service_use', 'apps_in_national_language',
       'time_spent_online', ' happiness_score', 'cryptocurrency_adoption',
       'not_buying_online_concern_about_returning',
       'not_buying_online_concern_about_security',
       'ewaste_per_inhabitant', 'automation_led_unemployment',
       'cyberbullying_rate', 'global_wellbeing_initiative ',
       'financial_inclusiveness ',
       'individuals buying online and frequency', 'e-commerce_activity',
       'top_sites', 'youtube_searches', 'google_trends', 'intenet_usage',
       'household_internet_access', 'FB_users', 'gender_gaps',
       'population_digital_financial_services',
       'mobile_broadband_pricing', 'tax_percent_mobile_ownership',
       'percent_mobile_subscription'

In [17]:
# formula for converting scale
def convert_rank(old_value, old_min=1, old_max=7, new_min=1, new_max=6 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value

In [18]:
### 1. Human Capital Index (HCI)

In [21]:
indicators[0]

# load data
indicator = indicators[0]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Human Capital Index (HCI)
e_government_index


In [23]:
df.head(10)

Unnamed: 0,Survey Year,Country Name,E-Government Rank,E-Government Index,E-Participation Index,Online Service Index,Human Capital Index,Telecommunication Infrastructure Index
0,2020,Iraq,143,0.436,0.3095,0.3353,0.4358,0.537
1,2020,Ireland,27,0.8433,0.8571,0.7706,0.9494,0.81
2,2020,Israel,30,0.8361,0.7143,0.7471,0.8924,0.8689
3,2020,Italy,37,0.8231,0.8214,0.8294,0.8466,0.7932
4,2020,Jamaica,114,0.5392,0.369,0.3882,0.7142,0.5151
5,2020,Japan,14,0.8989,0.9881,0.9059,0.8684,0.9223
6,2020,Jordan,117,0.5309,0.3333,0.3588,0.68,0.554
7,2020,Kazakhstan,29,0.8375,0.881,0.9235,0.8866,0.7024
8,2020,Kenya,116,0.5326,0.5952,0.6765,0.5812,0.3402
9,2020,Kiribati,145,0.432,0.5595,0.4941,0.6778,0.1241


In [25]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Human Capital Index'] 
df['Year'] = df['Survey Year']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [27]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Iraq,2020,Human Capital Index (HCI),0.4358,3.179,True
1,Ireland,2020,Human Capital Index (HCI),0.9494,5.747,True
2,Israel,2020,Human Capital Index (HCI),0.8924,5.462,True
3,Italy,2020,Human Capital Index (HCI),0.8466,5.233,True
4,Jamaica,2020,Human Capital Index (HCI),0.7142,4.571,True
5,Japan,2020,Human Capital Index (HCI),0.8684,5.342,True
6,Jordan,2020,Human Capital Index (HCI),0.68,4.4,True
7,Kazakhstan,2020,Human Capital Index (HCI),0.8866,5.433,True
8,Kenya,2020,Human Capital Index (HCI),0.5812,3.906,True
9,Kiribati,2020,Human Capital Index (HCI),0.6778,4.389,True


In [28]:
### 2. % of population using internet (all)

In [30]:
indicators[1]

# load data
indicator = indicators[1]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population using internet (all)
population_using_internet


FileNotFoundError: [Errno 2] No such file or directory: '../../processed/population_using_internet.csv'

In [31]:
### 3. % of population using internet (female)

In [32]:
indicators[2]

# load data
indicator = indicators[2]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population using internet (female)
gender_gap_internet_usage


In [34]:
df.head(10)
# Need to move the top row down one

Unnamed: 0.1,Unnamed: 0,Latest,All,Gender,Urban,Rural,data_country,data_year
0,,year,Individuals,Male,Total,Total,,
1,,2020,72.2,73.2,...,...,,
2,,2018,49.0,55.1,56.2,34.1,,
3,,2017,91.6,92.9,...,...,,
4,,2017,74.3,75.2,74.3,...,,
5,,2019,66.5,65.8,70.8,59.9,,
6,,2017,86.5,87.0,87.0,82.8,,
7,,2020,87.5,89.2,...,...,,
8,,2019,81.1,84.2,92.8,67.7,,
9,,2020,99.5,99.4,...,...,,


In [35]:
### 4. % of population using internet (male)

In [37]:
indicators[3]

# load data
indicator = indicators[3]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population using internet (male)
gender_gap_internet_usage


In [39]:
df.head(10)
# Need to move the top row down one

Unnamed: 0.1,Unnamed: 0,Latest,All,Gender,Urban,Rural,data_country,data_year
0,,year,Individuals,Male,Total,Total,,
1,,2020,72.2,73.2,...,...,,
2,,2018,49.0,55.1,56.2,34.1,,
3,,2017,91.6,92.9,...,...,,
4,,2017,74.3,75.2,74.3,...,,
5,,2019,66.5,65.8,70.8,59.9,,
6,,2017,86.5,87.0,87.0,82.8,,
7,,2020,87.5,89.2,...,...,,
8,,2019,81.1,84.2,92.8,67.7,,
9,,2020,99.5,99.4,...,...,,


In [40]:
### 5. SDG 4.4 Digital literacy data

In [42]:
indicators[4]

# load data
indicator = indicators[4]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

SDG 4.4 Digital literacy data
SDG 4.4_Digital_literacy_data


FileNotFoundError: [Errno 2] No such file or directory: '../../processed/SDG 4.4_Digital_literacy_data.csv'

In [43]:
### 6. UNDP Human Development Index (HDI)

In [45]:
indicators[5]

# load data
indicator = indicators[5]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

UNDP Human Development Index (HDI) 
undp_human_developmnt


In [47]:
df.head(15)
# Need to move the top row down

Unnamed: 0,Table 3. Inequality-adjusted Human Development Index,Unnamed: 1,data_country,data_year
0,Table 4. Gender Development Index,,,
1,Table 5. Gender Inequality Index,,,
2,Table 6. Multidimensional Poverty Index: devel...,,,
3,,,,
4,Human development indicators,,,
5,Table 7. Population trends,,,
6,Table 8. Health outcomes,,,
7,Table 9. Education achievements,,,
8,Table 10. National income and composition of r...,,,
9,Table 11. Work and employment,,,


In [48]:
### 7. Facebook Social Connectedness Index

In [51]:
indicators[6]

# load data
indicator = indicators[6]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Facebook Social Connectedness Index
fb_social_connectedness


In [53]:
df.head(25)

Unnamed: 0,user_loc,fr_loc,scaled_sci
0,1001,AE,8729
1,1001,AG,95256
2,1001,AL,3122
3,1001,AM,3470
4,1001,AO,2839
5,1001,AR,3729
6,1001,AT,6977
7,1001,AU,21136
8,1001,AW,27607
9,1001,AZ,1108


In [55]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['scaled_sci'] 
df['Country Name'] = df['fr_loc']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [58]:
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(30)

# Need to find a way to convert ISO codes to full country names

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
0,AE,Facebook Social Connectedness Index,8729,1.000044,True
1,AG,Facebook Social Connectedness Index,95256,1.000476,True
2,AL,Facebook Social Connectedness Index,3122,1.000016,True
3,AM,Facebook Social Connectedness Index,3470,1.000017,True
4,AO,Facebook Social Connectedness Index,2839,1.000014,True
5,AR,Facebook Social Connectedness Index,3729,1.000019,True
6,AT,Facebook Social Connectedness Index,6977,1.000035,True
7,AU,Facebook Social Connectedness Index,21136,1.000106,True
8,AW,Facebook Social Connectedness Index,27607,1.000138,True
9,AZ,Facebook Social Connectedness Index,1108,1.000006,True


In [59]:
### 8. Share of individuals using the Internet to interact with officials

In [96]:
indicators[7]

# load data
indicator = indicators[7]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Share of individuals using the Internet to interact with public authorities
population_interacting_public_officials


In [97]:
df.head(15)

Unnamed: 0,Country,Indicator,Breakdowns,Time,Unit,PowerCode,Reference Period,Value,Flags
0,Australia,Individuals using the Internet for downloading...,All (individuals aged 16-74),2010,Percentage,Units,,38.11,Difference in methodology
1,Australia,Individuals using the Internet for downloading...,All (individuals aged 16-74),2012,Percentage,Units,,49.96,Difference in methodology
2,Austria,Individuals using the Internet for visiting or...,All (individuals aged 16-74),2005,Percentage,Units,,29.194,
3,Austria,Individuals using the Internet for visiting or...,All (individuals aged 16-74),2006,Percentage,Units,,32.9733,
4,Austria,Individuals using the Internet for visiting or...,All (individuals aged 16-74),2007,Percentage,Units,,27.4741,
5,Austria,Individuals using the Internet for visiting or...,All (individuals aged 16-74),2008,Percentage,Units,,51.2252,Break
6,Austria,Individuals using the Internet for visiting or...,All (individuals aged 16-74),2009,Percentage,Units,,48.8515,
7,Austria,Individuals using the Internet for visiting or...,All (individuals aged 16-74),2010,Percentage,Units,,51.0458,
8,Austria,Individuals using the Internet for visiting or...,All (individuals aged 16-74),2011,Percentage,Units,,51.2893,
9,Austria,Individuals using the Internet for visiting or...,All (individuals aged 16-74),2012,Percentage,Units,,52.8274,


In [98]:
# filter most recent year
df = df[(df.Time==2019)]
df = df[(df.Indicator=='Individuals using the Internet for visiting or interacting with public authorities websites - last 12 m (%)')]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df['Year'] = df['Time']
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [101]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

# Need to find a way to convert ISO codes to full country names

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
16,Austria,2019,Share of individuals using the Internet to int...,69.7355,4.715671,True
64,Belgium,2019,Share of individuals using the Internet to int...,58.605,4.063974,True
109,Brazil,2019,Share of individuals using the Internet to int...,34.194129,2.634704,True
148,Colombia,2019,Share of individuals using the Internet to int...,6.274581,1.0,True
169,Czech Republic,2019,Share of individuals using the Internet to int...,53.7766,3.781269,True
216,Denmark,2019,Share of individuals using the Internet to int...,91.6709,6.0,True
262,Estonia,2019,Share of individuals using the Internet to int...,80.0156,5.317576,True
310,Finland,2019,Share of individuals using the Internet to int...,87.2954,5.743812,True
357,France,2019,Share of individuals using the Internet to int...,74.7242,5.007762,True
397,Germany,2019,Share of individuals using the Internet to int...,59.0977,4.092822,True


In [118]:
### 9. Level of satisfaction for online public service

In [119]:
indicators[8]

# load data
indicator = indicators[8]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Level of satisfaction for online public services (% of users, by type of interaction and service)
digital_public_service_use


In [120]:
df.head(15)

Unnamed: 0,Country,Net Perception
0,Estonia,67%
1,UAE,61%
2,Saudi Arabia,59%
3,Singapore,54%
4,China,53%
5,New Zealand,52%
6,Netherlands,51%
7,Qatar,51%
8,Canada,49%
9,Denmark,48%


In [129]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Net Perception'] 
df['Year'] = 2020
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# Need to replace the % mark 

KeyError: 'Net Perception'

In [None]:
### 10. Number of mobile apps available in national language

In [124]:
indicators[9]

# load data
indicator = indicators[9]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Number of mobile apps available in national language(s)
apps_in_national_language


In [125]:
df.head(15)

Unnamed: 0.1,Unnamed: 0,ISO Code,Country,Region,Year,Number of apps in national language
0,1,AFG,Afghanistan,South Asia,2014,2.444741
1,2,AFG,Afghanistan,South Asia,2015,2.793221
2,3,AFG,Afghanistan,South Asia,2016,2.849881
3,4,AFG,Afghanistan,South Asia,2017,2.913741
4,5,AFG,Afghanistan,South Asia,2018,2.961247
5,6,AFG,Afghanistan,South Asia,2019,3.0
6,7,AGO,Angola,Sub-Saharan Africa,2014,53.333237
7,8,AGO,Angola,Sub-Saharan Africa,2015,55.08091
8,9,AGO,Angola,Sub-Saharan Africa,2016,56.516411
9,10,AGO,Angola,Sub-Saharan Africa,2017,57.061077


In [128]:
# filter most recent year and global value
df = df[(df.Year==2019)]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Number of apps in national language'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [131]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
5,Afghanistan,2019,Number of mobile apps available in national la...,3.0,1.0,True
11,Angola,2019,Number of mobile apps available in national la...,57.9785,3.833943,True
17,Albania,2019,Number of mobile apps available in national la...,67.286446,4.313734,True
23,United Arab Emirates,2019,Number of mobile apps available in national la...,78.453438,4.889352,True
29,Argentina,2019,Number of mobile apps available in national la...,89.724289,5.470324,True
35,Armenia,2019,Number of mobile apps available in national la...,12.541476,1.491829,True
41,Australia,2019,Number of mobile apps available in national la...,100.0,6.0,True
47,Austria,2019,Number of mobile apps available in national la...,89.197319,5.443161,True
53,Azerbaijan,2019,Number of mobile apps available in national la...,25.400829,2.154682,True
59,Burundi,2019,Number of mobile apps available in national la...,6.738045,1.192683,True


In [132]:
### 11. Device Addiction (time of use on internet)

In [134]:
indicators[10]

# load data
indicator = indicators[10]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Device Addiction (time of use on internet / on devices)
time_spent_online


In [135]:
df.head(15)

# Need to move the row further down

Unnamed: 0,"Daily time spent online by users worldwide 2020, by region",Unnamed: 1
0,Average daily time spent using the internet by...,
1,Philippines,10.56
2,Brazil,10.08
3,Colombia,10.07
4,South Africa,10.06
5,Argentina,9.39
6,Malaysia,9.17
7,Mexico,9.01
8,Indonesia,8.52
9,Thailand,8.44


In [136]:
# filter most recent year and global value
df = df[(df.Year==2019)]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Number of apps in national language'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

AttributeError: 'DataFrame' object has no attribute 'Year'

In [137]:
### 12. Gross National Wellbeing