In [1]:
import pandas as pd
import numpy as np

In [2]:
### Get all the pillar names from the excel

In [3]:
names = pd.read_excel('../../UNDP Digital Assessment Data Framework Filename Matching V7.xlsx')

In [4]:
col_names = ['Indicator','check', 'Data Source','Index','Filename']

In [5]:
names = names[col_names]

In [6]:
names.head()

Unnamed: 0,Indicator,check,Data Source,Index,Filename
0,Countries,,United Nations,False,Countries
1,"Database of Global Administrative Areas (GADM,...",,GADM maps and data,False,
2,High Resolution Population Density Maps + Demo...,,Facebook,False,
3,population density vs openstreetmap object den...,,Kontur,False,
4,Population Density,Infrastructure,World Bank,False,population_density


In [7]:
# get all the files per pillar
data_stats = names.groupby('check').agg({'Filename':'count','Indicator':'count'})

In [8]:
data_stats

Unnamed: 0_level_0,Filename,Indicator
check,Unnamed: 1_level_1,Unnamed: 2_level_1
Business,20,25
Foundations,9,12
Government,10,15
Infrastructure,39,48
People,39,47
Regulation,6,7
Strategy,1,1


In [9]:
### Infrastructure

In [10]:
bnames = names[(names.check=='Infrastructure')&(~names.Filename.isna())]#&(names.Index==False)]
bnames.head(25)

Unnamed: 0,Indicator,check,Data Source,Index,Filename
4,Population Density,Infrastructure,World Bank,False,population_density
5,Broadband Density,Infrastructure,ITU,False,ITU_database
6,% of population covered by internet connectivity,Infrastructure,ITU,False,ITU_database
7,% of population covered by mobile 2G+ data con...,Infrastructure,GSMA Mobile Connectivity Index,False,countries_mobile_connectivity
8,% of population covered by mobile 3G+ data con...,Infrastructure,GSMA Mobile Connectivity Index,False,countries_mobile_connectivity
9,% of population covered by mobile 4G+ data con...,Infrastructure,GSMA Mobile Connectivity Index,False,countries_mobile_connectivity
10,% of population covered by mobile 5G+ data con...,Infrastructure,GSMA Mobile Connectivity Index,False,countries_mobile_connectivity
11,Mobile Coverage Maps,Infrastructure,ITU,False,ITU_database
12,Electricity Density,Infrastructure,Energy Data,False,electricity_yearbook
13,% of population covered by electricity,Infrastructure,World Bank,False,population_electricity_coverage


In [11]:
# get list of names for all indicators
indicators = bnames.Indicator.unique()

In [12]:
# get all file names
bfiles = bnames.Filename.unique()

In [13]:
bfiles

array(['population_density', 'ITU_database',
       'countries_mobile_connectivity', 'electricity_yearbook',
       'population_electricity_coverage', 'elect_supply_quality',
       'sustainability_index', 'mobile_density', 'e_government_index',
       'spectrum_allocated_mobile_providers', 'countries_ixp',
       'internet_speed', 'mobile_latency', 'mobile_speed',
       'fixed_bdbd_spd_dl_ul', 'postal_coverage',
       'logistics_performance_index', 'national_cybersecurity_index',
       'dice_export_global_cybersecurity_index',
       'software_developer_ecosystem_size',
       'digital_platform_economy_index', 'migration_skill',
       'migration_industry', 'migration_country',
       'global_fintech_ranking', 'tech_hubs', 'banking_sector_size',
       'angel_investment', 'startup_eco_size',
       'international_co_inventions'], dtype=object)

In [14]:
# formula for converting scale
def convert_rank(old_value, old_min=1, old_max=7, new_min=1, new_max=6 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value

In [15]:
### 2. Broadband Density

In [16]:
indicators[1]

# load data
indicator = indicators[1]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Broadband Density
ITU_database


In [17]:
df.head(15)

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
0,Angola,Africa,AGO,Female mobile phone ownership as a % of total ...,2010.0,,,
1,Benin,Africa,BEN,Female mobile phone ownership as a % of total ...,2010.0,,,
2,Botswana,Africa,BWA,Female mobile phone ownership as a % of total ...,2010.0,,,
3,Burkina Faso,Africa,BFA,Female mobile phone ownership as a % of total ...,2010.0,,,
4,Burundi,Africa,BDI,Female mobile phone ownership as a % of total ...,2010.0,,,
5,Cabo Verde,Africa,CPV,Female mobile phone ownership as a % of total ...,2010.0,,,
6,Cameroon,Africa,CMR,Female mobile phone ownership as a % of total ...,2010.0,,,
7,Central African Rep.,Africa,CAF,Female mobile phone ownership as a % of total ...,2010.0,,,
8,Chad,Africa,TCD,Female mobile phone ownership as a % of total ...,2010.0,,,
9,Congo (Rep. of the),Africa,COG,Female mobile phone ownership as a % of total ...,2010.0,,,


In [18]:
df = df[(df.Year == 2020)]
df = df[(df['Indicator name']== 'Fixed broadband subscriptions per 100 inhabitants')]
df


Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
36787,Angola,Africa,AGO,Fixed broadband subscriptions per 100 inhabitants,2020.0,0.701662,,
36788,Benin,Africa,BEN,Fixed broadband subscriptions per 100 inhabitants,2020.0,0.247303,,
36789,Botswana,Africa,BWA,Fixed broadband subscriptions per 100 inhabitants,2020.0,3.057373,,
36790,Burkina Faso,Africa,BFA,Fixed broadband subscriptions per 100 inhabitants,2020.0,0.066875,,
36791,Burundi,Africa,BDI,Fixed broadband subscriptions per 100 inhabitants,2020.0,0.035574,,
...,...,...,...,...,...,...,...,...
36978,Suriname,The Americas,SUR,Fixed broadband subscriptions per 100 inhabitants,2020.0,15.728770,,
36979,Trinidad and Tobago,The Americas,TTO,Fixed broadband subscriptions per 100 inhabitants,2020.0,26.866254,,
36980,United States,The Americas,USA,Fixed broadband subscriptions per 100 inhabitants,2020.0,36.413908,,
36981,Uruguay,The Americas,URY,Fixed broadband subscriptions per 100 inhabitants,2020.0,,,


In [19]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value']
df.rename(columns={'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [20]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
36787,Angola,2020.0,Broadband Density,0.701662,1.065945,True
36788,Benin,2020.0,Broadband Density,0.247303,1.023242,True
36789,Botswana,2020.0,Broadband Density,3.057373,1.287344,True
36790,Burkina Faso,2020.0,Broadband Density,0.066875,1.006285,True
36791,Burundi,2020.0,Broadband Density,0.035574,1.003343,True
36792,Cabo Verde,2020.0,Broadband Density,4.46755,1.419877,True
36793,Cameroon,2020.0,Broadband Density,2.689101,1.252732,True
36794,Central African Rep.,2020.0,Broadband Density,,,True
36795,Chad,2020.0,Broadband Density,0.0,1.0,True
36796,Congo (Rep. of the),2020.0,Broadband Density,,,True


In [21]:
### 3. % of population covered by internet connectivity

In [22]:
indicators[2]

# load data
indicator = indicators[2]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population covered by internet connectivity
ITU_database


In [23]:
df = df[(df.Year == 2020)]
df = df[(df['Indicator name']== 'Individuals using the Internet, total (%)')]
df

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
47744,Mauritius,Africa,MUS,"Individuals using the Internet, total (%)",2020.0,64.884904,,
47745,Uganda,Africa,UGA,"Individuals using the Internet, total (%)",2020.0,,,
47746,Bahrain,Arab States,BHR,"Individuals using the Internet, total (%)",2020.0,99.539512,,
47747,Egypt,Arab States,EGY,"Individuals using the Internet, total (%)",2020.0,71.914200,,
47748,Iraq,Arab States,IRQ,"Individuals using the Internet, total (%)",2020.0,,,
...,...,...,...,...,...,...,...,...
47805,Bolivia (Plurinational State of),The Americas,BOL,"Individuals using the Internet, total (%)",2020.0,55.139051,,
47806,Costa Rica,The Americas,CRI,"Individuals using the Internet, total (%)",2020.0,80.530186,,
47807,Mexico,The Americas,MEX,"Individuals using the Internet, total (%)",2020.0,71.970000,,
47808,Paraguay,The Americas,PRY,"Individuals using the Internet, total (%)",2020.0,74.515240,,


In [24]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value']
df.rename(columns={'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [25]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
47744,Mauritius,2020.0,% of population covered by internet connectivity,64.884904,3.298839,True
47745,Uganda,2020.0,% of population covered by internet connectivity,,,True
47746,Bahrain,2020.0,% of population covered by internet connectivity,99.539512,5.964578,True
47747,Egypt,2020.0,% of population covered by internet connectivity,71.9142,3.839554,True
47748,Iraq,2020.0,% of population covered by internet connectivity,,,True
47749,Kuwait,2020.0,% of population covered by internet connectivity,98.599995,5.892307,True
47750,Morocco,2020.0,% of population covered by internet connectivity,84.120363,4.778489,True
47751,Oman,2020.0,% of population covered by internet connectivity,95.232293,5.633253,True
47752,Qatar,2020.0,% of population covered by internet connectivity,99.652794,5.973292,True
47753,Saudi Arabia,2020.0,% of population covered by internet connectivity,97.862332,5.835564,True


In [26]:
### 4. % of population covered by mobile 2G

In [27]:
indicators[3]

'% of population covered by mobile 2G+ data connectivity'

In [28]:
# load data
indicator = indicators[3]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population covered by mobile 2G+ data connectivity
countries_mobile_connectivity


In [29]:
df.head(10)

Unnamed: 0,ISO Code,Country,Region,Year,Cluster,Index,Infrastructure,Affordability,Consumer Readiness,Content and Services,...,Gender gap in mobile ownership,TLDs per capita,E-Government Score,Mobile Social Media Penetration,Apps developed per person,Number of apps in national language,Accessibility of top ranked apps,Cybersecurity Index,data_country,data_year
0,AFG,Afghanistan,South Asia,2014,Discoverer,22.12,21.74,31.79,24.4,14.19,...,0.0,39.55,18.11,3.28,20.98,2.44,4.37,26.5,,
1,AFG,Afghanistan,South Asia,2015,Discoverer,22.99,22.82,30.81,25.28,15.71,...,0.0,39.57,24.27,4.36,22.93,2.79,8.03,25.83,,
2,AFG,Afghanistan,South Asia,2016,Discoverer,23.71,26.92,26.75,26.07,16.83,...,0.0,39.58,30.43,6.73,30.31,2.85,5.9,25.17,,
3,AFG,Afghanistan,South Asia,2017,Discoverer,25.82,33.54,27.22,28.56,17.04,...,0.0,39.47,30.5,7.78,31.62,2.91,6.15,24.5,,
4,AFG,Afghanistan,South Asia,2018,Discoverer,28.39,30.91,42.64,29.24,16.87,...,0.0,39.39,30.56,8.54,36.54,2.96,8.66,17.7,,
5,AFG,Afghanistan,South Asia,2019,Discoverer,28.94,32.34,41.53,29.72,17.58,...,0.0,39.41,41.18,9.39,39.36,3.0,5.72,17.7,,
6,AGO,Angola,Sub-Saharan Africa,2014,Discoverer,32.78,25.99,35.99,44.33,27.85,...,51.65,0.0,29.92,3.99,22.74,53.33,49.09,8.8,,
7,AGO,Angola,Sub-Saharan Africa,2015,Emerging,37.18,33.09,42.01,45.41,30.27,...,57.0,0.31,32.35,5.26,22.12,55.08,58.33,8.47,,
8,AGO,Angola,Sub-Saharan Africa,2016,Emerging,39.85,37.8,44.74,46.47,32.11,...,63.73,0.0,34.78,6.27,27.46,56.52,61.73,8.13,,
9,AGO,Angola,Sub-Saharan Africa,2017,Emerging,42.89,48.6,47.94,46.32,31.36,...,53.97,0.0,37.88,4.26,31.94,57.06,55.09,7.8,,


In [30]:
# filter most recent year
df = df[(df.Year==2019)]


In [31]:
# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2G Coverage'] 

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [32]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
5,Afghanistan,2019,% of population covered by mobile 2G+ data con...,90.0,4.921251,True
11,Angola,2019,% of population covered by mobile 2G+ data con...,90.0,4.921251,True
17,Albania,2019,% of population covered by mobile 2G+ data con...,99.86,5.984898,True
23,United Arab Emirates,2019,% of population covered by mobile 2G+ data con...,100.0,6.0,True
29,Argentina,2019,% of population covered by mobile 2G+ data con...,98.0,5.78425,True
35,Armenia,2019,% of population covered by mobile 2G+ data con...,100.0,6.0,True
41,Australia,2019,% of population covered by mobile 2G+ data con...,99.4,5.935275,True
47,Austria,2019,% of population covered by mobile 2G+ data con...,99.0,5.892125,True
53,Azerbaijan,2019,% of population covered by mobile 2G+ data con...,100.0,6.0,True
59,Burundi,2019,% of population covered by mobile 2G+ data con...,53.65,1.0,True


In [33]:
### 5. % of population covered by mobile 3G

In [34]:
indicators[4]

# load data
indicator = indicators[4]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population covered by mobile 3G+ data connectivity
countries_mobile_connectivity


In [35]:
df.head(10)

Unnamed: 0,ISO Code,Country,Region,Year,Cluster,Index,Infrastructure,Affordability,Consumer Readiness,Content and Services,...,Gender gap in mobile ownership,TLDs per capita,E-Government Score,Mobile Social Media Penetration,Apps developed per person,Number of apps in national language,Accessibility of top ranked apps,Cybersecurity Index,data_country,data_year
0,AFG,Afghanistan,South Asia,2014,Discoverer,22.12,21.74,31.79,24.4,14.19,...,0.0,39.55,18.11,3.28,20.98,2.44,4.37,26.5,,
1,AFG,Afghanistan,South Asia,2015,Discoverer,22.99,22.82,30.81,25.28,15.71,...,0.0,39.57,24.27,4.36,22.93,2.79,8.03,25.83,,
2,AFG,Afghanistan,South Asia,2016,Discoverer,23.71,26.92,26.75,26.07,16.83,...,0.0,39.58,30.43,6.73,30.31,2.85,5.9,25.17,,
3,AFG,Afghanistan,South Asia,2017,Discoverer,25.82,33.54,27.22,28.56,17.04,...,0.0,39.47,30.5,7.78,31.62,2.91,6.15,24.5,,
4,AFG,Afghanistan,South Asia,2018,Discoverer,28.39,30.91,42.64,29.24,16.87,...,0.0,39.39,30.56,8.54,36.54,2.96,8.66,17.7,,
5,AFG,Afghanistan,South Asia,2019,Discoverer,28.94,32.34,41.53,29.72,17.58,...,0.0,39.41,41.18,9.39,39.36,3.0,5.72,17.7,,
6,AGO,Angola,Sub-Saharan Africa,2014,Discoverer,32.78,25.99,35.99,44.33,27.85,...,51.65,0.0,29.92,3.99,22.74,53.33,49.09,8.8,,
7,AGO,Angola,Sub-Saharan Africa,2015,Emerging,37.18,33.09,42.01,45.41,30.27,...,57.0,0.31,32.35,5.26,22.12,55.08,58.33,8.47,,
8,AGO,Angola,Sub-Saharan Africa,2016,Emerging,39.85,37.8,44.74,46.47,32.11,...,63.73,0.0,34.78,6.27,27.46,56.52,61.73,8.13,,
9,AGO,Angola,Sub-Saharan Africa,2017,Emerging,42.89,48.6,47.94,46.32,31.36,...,53.97,0.0,37.88,4.26,31.94,57.06,55.09,7.8,,


In [36]:
# filter most recent year
df = df[(df.Year==2019)]

# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['3G Coverage'] 

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [37]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
5,Afghanistan,2019,% of population covered by mobile 3G+ data con...,58.7,3.282895,True
11,Angola,2019,% of population covered by mobile 3G+ data con...,71.0,4.092105,True
17,Albania,2019,% of population covered by mobile 3G+ data con...,97.0,5.802632,True
23,United Arab Emirates,2019,% of population covered by mobile 3G+ data con...,100.0,6.0,True
29,Argentina,2019,% of population covered by mobile 3G+ data con...,95.0,5.671053,True
35,Armenia,2019,% of population covered by mobile 3G+ data con...,99.0,5.934211,True
41,Australia,2019,% of population covered by mobile 3G+ data con...,99.5,5.967105,True
47,Austria,2019,% of population covered by mobile 3G+ data con...,99.0,5.934211,True
53,Azerbaijan,2019,% of population covered by mobile 3G+ data con...,95.0,5.671053,True
59,Burundi,2019,% of population covered by mobile 3G+ data con...,40.0,2.052632,True


In [38]:
### 6. % of population covered by mobile 4G

In [39]:
indicators[5]

# load data
indicator = indicators[5]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population covered by mobile 4G+ data connectivity
countries_mobile_connectivity


In [40]:
df.head(10)

Unnamed: 0,ISO Code,Country,Region,Year,Cluster,Index,Infrastructure,Affordability,Consumer Readiness,Content and Services,...,Gender gap in mobile ownership,TLDs per capita,E-Government Score,Mobile Social Media Penetration,Apps developed per person,Number of apps in national language,Accessibility of top ranked apps,Cybersecurity Index,data_country,data_year
0,AFG,Afghanistan,South Asia,2014,Discoverer,22.12,21.74,31.79,24.4,14.19,...,0.0,39.55,18.11,3.28,20.98,2.44,4.37,26.5,,
1,AFG,Afghanistan,South Asia,2015,Discoverer,22.99,22.82,30.81,25.28,15.71,...,0.0,39.57,24.27,4.36,22.93,2.79,8.03,25.83,,
2,AFG,Afghanistan,South Asia,2016,Discoverer,23.71,26.92,26.75,26.07,16.83,...,0.0,39.58,30.43,6.73,30.31,2.85,5.9,25.17,,
3,AFG,Afghanistan,South Asia,2017,Discoverer,25.82,33.54,27.22,28.56,17.04,...,0.0,39.47,30.5,7.78,31.62,2.91,6.15,24.5,,
4,AFG,Afghanistan,South Asia,2018,Discoverer,28.39,30.91,42.64,29.24,16.87,...,0.0,39.39,30.56,8.54,36.54,2.96,8.66,17.7,,
5,AFG,Afghanistan,South Asia,2019,Discoverer,28.94,32.34,41.53,29.72,17.58,...,0.0,39.41,41.18,9.39,39.36,3.0,5.72,17.7,,
6,AGO,Angola,Sub-Saharan Africa,2014,Discoverer,32.78,25.99,35.99,44.33,27.85,...,51.65,0.0,29.92,3.99,22.74,53.33,49.09,8.8,,
7,AGO,Angola,Sub-Saharan Africa,2015,Emerging,37.18,33.09,42.01,45.41,30.27,...,57.0,0.31,32.35,5.26,22.12,55.08,58.33,8.47,,
8,AGO,Angola,Sub-Saharan Africa,2016,Emerging,39.85,37.8,44.74,46.47,32.11,...,63.73,0.0,34.78,6.27,27.46,56.52,61.73,8.13,,
9,AGO,Angola,Sub-Saharan Africa,2017,Emerging,42.89,48.6,47.94,46.32,31.36,...,53.97,0.0,37.88,4.26,31.94,57.06,55.09,7.8,,


In [41]:
# filter most recent year
df = df[(df.Year==2019)]

# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['4G Coverage'] 

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [42]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
5,Afghanistan,2019,% of population covered by mobile 4G+ data con...,15.0,1.75,True
11,Angola,2019,% of population covered by mobile 4G+ data con...,50.0,3.5,True
17,Albania,2019,% of population covered by mobile 4G+ data con...,96.0,5.8,True
23,United Arab Emirates,2019,% of population covered by mobile 4G+ data con...,99.0,5.95,True
29,Argentina,2019,% of population covered by mobile 4G+ data con...,89.77,5.4885,True
35,Armenia,2019,% of population covered by mobile 4G+ data con...,95.0,5.75,True
41,Australia,2019,% of population covered by mobile 4G+ data con...,99.2,5.96,True
47,Austria,2019,% of population covered by mobile 4G+ data con...,99.0,5.95,True
53,Azerbaijan,2019,% of population covered by mobile 4G+ data con...,90.0,5.5,True
59,Burundi,2019,% of population covered by mobile 4G+ data con...,25.0,2.25,True


In [43]:
### 7. % of population covered by mobile 5G

In [44]:
indicators[6]

# load data
indicator = indicators[6]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population covered by mobile 5G+ data connectivity
countries_mobile_connectivity


In [45]:
df.head(10)

Unnamed: 0,ISO Code,Country,Region,Year,Cluster,Index,Infrastructure,Affordability,Consumer Readiness,Content and Services,...,Gender gap in mobile ownership,TLDs per capita,E-Government Score,Mobile Social Media Penetration,Apps developed per person,Number of apps in national language,Accessibility of top ranked apps,Cybersecurity Index,data_country,data_year
0,AFG,Afghanistan,South Asia,2014,Discoverer,22.12,21.74,31.79,24.4,14.19,...,0.0,39.55,18.11,3.28,20.98,2.44,4.37,26.5,,
1,AFG,Afghanistan,South Asia,2015,Discoverer,22.99,22.82,30.81,25.28,15.71,...,0.0,39.57,24.27,4.36,22.93,2.79,8.03,25.83,,
2,AFG,Afghanistan,South Asia,2016,Discoverer,23.71,26.92,26.75,26.07,16.83,...,0.0,39.58,30.43,6.73,30.31,2.85,5.9,25.17,,
3,AFG,Afghanistan,South Asia,2017,Discoverer,25.82,33.54,27.22,28.56,17.04,...,0.0,39.47,30.5,7.78,31.62,2.91,6.15,24.5,,
4,AFG,Afghanistan,South Asia,2018,Discoverer,28.39,30.91,42.64,29.24,16.87,...,0.0,39.39,30.56,8.54,36.54,2.96,8.66,17.7,,
5,AFG,Afghanistan,South Asia,2019,Discoverer,28.94,32.34,41.53,29.72,17.58,...,0.0,39.41,41.18,9.39,39.36,3.0,5.72,17.7,,
6,AGO,Angola,Sub-Saharan Africa,2014,Discoverer,32.78,25.99,35.99,44.33,27.85,...,51.65,0.0,29.92,3.99,22.74,53.33,49.09,8.8,,
7,AGO,Angola,Sub-Saharan Africa,2015,Emerging,37.18,33.09,42.01,45.41,30.27,...,57.0,0.31,32.35,5.26,22.12,55.08,58.33,8.47,,
8,AGO,Angola,Sub-Saharan Africa,2016,Emerging,39.85,37.8,44.74,46.47,32.11,...,63.73,0.0,34.78,6.27,27.46,56.52,61.73,8.13,,
9,AGO,Angola,Sub-Saharan Africa,2017,Emerging,42.89,48.6,47.94,46.32,31.36,...,53.97,0.0,37.88,4.26,31.94,57.06,55.09,7.8,,


In [46]:
# filter most recent year
df = df[(df.Year==2019)]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['5G Coverage'] 

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [47]:
df[['Country','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country,Year,Indicator,data_col,new_rank_score,higher_is_better
5,Afghanistan,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True
11,Angola,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True
17,Albania,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True
23,United Arab Emirates,2019,% of population covered by mobile 5G+ data con...,100.0,6.0,True
29,Argentina,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True
35,Armenia,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True
41,Australia,2019,% of population covered by mobile 5G+ data con...,100.0,6.0,True
47,Austria,2019,% of population covered by mobile 5G+ data con...,100.0,6.0,True
53,Azerbaijan,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True
59,Burundi,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True


In [48]:
### 8. Mobile Coverage Maps

In [49]:
indicators[7]

# load data
indicator = indicators[7]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Mobile Coverage Maps
ITU_database


In [50]:
df.head(15)

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
0,Angola,Africa,AGO,Female mobile phone ownership as a % of total ...,2010.0,,,
1,Benin,Africa,BEN,Female mobile phone ownership as a % of total ...,2010.0,,,
2,Botswana,Africa,BWA,Female mobile phone ownership as a % of total ...,2010.0,,,
3,Burkina Faso,Africa,BFA,Female mobile phone ownership as a % of total ...,2010.0,,,
4,Burundi,Africa,BDI,Female mobile phone ownership as a % of total ...,2010.0,,,
5,Cabo Verde,Africa,CPV,Female mobile phone ownership as a % of total ...,2010.0,,,
6,Cameroon,Africa,CMR,Female mobile phone ownership as a % of total ...,2010.0,,,
7,Central African Rep.,Africa,CAF,Female mobile phone ownership as a % of total ...,2010.0,,,
8,Chad,Africa,TCD,Female mobile phone ownership as a % of total ...,2010.0,,,
9,Congo (Rep. of the),Africa,COG,Female mobile phone ownership as a % of total ...,2010.0,,,


In [51]:
df = df[(df.Year == 2020)]
df = df[(df['Indicator name']== 'Mobile-cellular subscriptions per 100 inhabitants')]
df

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
32475,Angola,Africa,AGO,Mobile-cellular subscriptions per 100 inhabitants,2020.0,44.559511,,
32476,Benin,Africa,BEN,Mobile-cellular subscriptions per 100 inhabitants,2020.0,91.897280,,
32477,Botswana,Africa,BWA,Mobile-cellular subscriptions per 100 inhabitants,2020.0,162.399011,,
32478,Burkina Faso,Africa,BFA,Mobile-cellular subscriptions per 100 inhabitants,2020.0,105.807440,,
32479,Burundi,Africa,BDI,Mobile-cellular subscriptions per 100 inhabitants,2020.0,55.767172,,
...,...,...,...,...,...,...,...,...
32666,Suriname,The Americas,SUR,Mobile-cellular subscriptions per 100 inhabitants,2020.0,153.305479,,
32667,Trinidad and Tobago,The Americas,TTO,Mobile-cellular subscriptions per 100 inhabitants,2020.0,142.051665,,
32668,United States,The Americas,USA,Mobile-cellular subscriptions per 100 inhabitants,2020.0,,,
32669,Uruguay,The Americas,URY,Mobile-cellular subscriptions per 100 inhabitants,2020.0,,,


In [52]:
# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [53]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
32475,Angola,2020.0,Mobile Coverage Maps,44.559511,1.012692,True
32476,Benin,2020.0,Mobile Coverage Maps,91.89728,1.96815,True
32477,Botswana,2020.0,Mobile Coverage Maps,162.399011,3.391146,True
32478,Burkina Faso,2020.0,Mobile Coverage Maps,105.80744,2.248911,True
32479,Burundi,2020.0,Mobile Coverage Maps,55.767172,1.238906,True
32480,Cabo Verde,2020.0,Mobile Coverage Maps,97.975133,2.090825,True
32481,Cameroon,2020.0,Mobile Coverage Maps,95.100069,2.032795,True
32482,Central African Rep.,2020.0,Mobile Coverage Maps,,,True
32483,Chad,2020.0,Mobile Coverage Maps,52.887026,1.180774,True
32484,Congo (Rep. of the),2020.0,Mobile Coverage Maps,,,True


In [54]:
### 9. Electricity Density

In [55]:
indicators[8]

# load data
indicator = indicators[8]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Electricity Density
electricity_yearbook


In [56]:
df.head(20)

Unnamed: 0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2013,2014,2015,2016,2017,Unnamed: 28,2016 - 2017 (%),2000 - 2017 (%/year),data_country,data_year
0,11894.0,12173.0,12284.0,12574.0,12881.0,13327.0,13754.0,14045.0,14414.0,14823.0,...,23424.0,23910.0,24317.0,24918.0,25592.0,World,2.7,3.0,World,World
1,7712.0,7900.0,7975.0,8160.0,8389.0,8646.0,8889.0,9014.0,9244.0,9451.0,...,10902.0,10875.0,10914.0,10967.0,11069.0,OECD,0.9,0.7,OECD,OECD
2,6089.0,6236.0,6277.0,6414.0,6578.0,6760.0,6944.0,6985.0,7142.0,7272.0,...,7897.0,7876.0,7867.0,7848.0,7898.0,G7,0.6,0.3,G7,G7
3,2386.0,2468.0,2510.0,2583.0,2637.0,2754.0,2863.0,2961.0,3026.0,3170.0,...,8524.0,8880.0,9142.0,9550.0,10000.0,BRICS,4.7,6.6,BRICS,BRICS
4,2900.0,2937.0,2926.0,2931.0,2976.0,3071.0,3164.0,3199.0,3280.0,3327.0,...,3813.0,3744.0,3802.0,3839.0,3886.0,Europe,1.2,0.7,Europe,Europe
5,2595.0,2640.0,2624.0,2626.0,2667.0,2744.0,2846.0,2857.0,2923.0,2955.0,...,3269.0,3191.0,3234.0,3253.0,3275.0,European Union,0.7,0.4,European Union,European Union
6,71.0,72.0,72.0,71.0,72.0,74.0,76.0,79.0,83.0,85.0,...,83.0,73.0,71.0,86.0,87.0,Belgium,0.5,0.2,Belgium,Belgium
7,63.0,61.0,59.0,59.0,59.0,61.0,64.0,65.0,65.0,65.0,...,87.0,86.0,84.0,83.0,87.0,Czech Rep.,4.5,1.0,Czech Rep.,Czech Rep.
8,421.0,456.0,464.0,473.0,477.0,494.0,513.0,505.0,511.0,526.0,...,572.0,564.0,568.0,553.0,551.0,France,-0.4,0.1,France,France
9,550.0,540.0,537.0,526.0,529.0,537.0,555.0,552.0,556.0,556.0,...,639.0,628.0,647.0,648.0,653.0,Germany,0.6,0.7,Germany,Germany


In [57]:
# create standard columns
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2017'] 
df['Country Name'] = df['Unnamed: 28']
df['Year'] = 2017

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [58]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,World,2017,Electricity Density,25592.0,6.0,True
1,OECD,2017,Electricity Density,11069.0,3.159482,True
2,G7,2017,Electricity Density,7898.0,2.539274,True
3,BRICS,2017,Electricity Density,10000.0,2.950399,True
4,Europe,2017,Electricity Density,3886.0,1.754577,True
5,European Union,2017,Electricity Density,3275.0,1.635073,True
6,Belgium,2017,Electricity Density,87.0,1.01154,True
7,Czech Rep.,2017,Electricity Density,87.0,1.01154,True
8,France,2017,Electricity Density,551.0,1.102292,True
9,Germany,2017,Electricity Density,653.0,1.122242,True


In [59]:
### 10. % of population covered by electricity

In [60]:
indicators[9]

# load data
indicator = indicators[9]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population covered by electricity
population_electricity_coverage


In [61]:
df.head(15)

Unnamed: 0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2017,2018,2019,2020,Country Name,Country Code,Indicator Name,Indicator Code,data_country,data_year
0,,,,,,,,,,,...,100.0,100.0,100.0,,Aruba,ABW,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
1,,,,,,,,,,,...,39.754201,42.168241,43.640661,,Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
2,,,,,,,,,,,...,97.7,98.715622,97.7,,Afghanistan,AFG,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
3,,,,,,,,,,,...,48.848205,51.253253,51.341421,,Africa Western and Central,AFW,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
4,,,,,,,,,,,...,43.00161,45.29,45.670315,,Angola,AGO,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
5,,,,,,,,,,,...,99.89,100.0,100.0,,Albania,ALB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
6,,,,,,,,,,,...,100.0,100.0,100.0,,Andorra,AND,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
7,,,,,,,,,,,...,89.623427,88.306324,89.512282,,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
8,,,,,,,,,,,...,100.0,100.0,100.0,,United Arab Emirates,ARE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
9,,,,,,,,,,,...,100.0,100.0,100.0,,Argentina,ARG,Access to electricity (% of population),EG.ELC.ACCS.ZS,,


In [62]:
# create standard columns
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2019'] 
df['Year'] = 2019

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [63]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Aruba,2019,% of population covered by electricity,100.0,6.0,True
1,Africa Eastern and Southern,2019,% of population covered by electricity,43.640661,2.979006,True
2,Afghanistan,2019,% of population covered by electricity,97.7,5.876715,True
3,Africa Western and Central,2019,% of population covered by electricity,51.341421,3.391785,True
4,Angola,2019,% of population covered by electricity,45.670315,3.0878,True
5,Albania,2019,% of population covered by electricity,100.0,6.0,True
6,Andorra,2019,% of population covered by electricity,100.0,6.0,True
7,Arab World,2019,% of population covered by electricity,89.512282,5.437834,True
8,United Arab Emirates,2019,% of population covered by electricity,100.0,6.0,True
9,Argentina,2019,% of population covered by electricity,100.0,6.0,True


In [64]:
### 11. Electricity Supply Quality

In [None]:
indicators[10]

# load data
indicator = indicators[10]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Electricity supply quality
elect_supply_quality


In [None]:
df.head(10)

In [None]:
# The data needs to be transposed as countries are now in columns, not rows, which could present problems for later.

In [None]:
### 12. Electricity Environmental Sustainability

In [None]:
indicators[11]

# load data
indicator = indicators[11]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2021 SDG Index Score'] 
df['Year'] = 2021

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 13. Cellphone Signal Density

In [None]:
indicators[12]

# load data
indicator = indicators[12]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(10)

In [None]:
df = df[(df.Year == 2019)]

# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Network coverage'] 

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 14. Telecommunication Infrastructure Index (TII)

In [None]:
indicators[13]

# load data
indicator = indicators[13]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(10)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Telecommunication Infrastructure Index'] 
df['Year'] = df['Survey Year']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))


In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 15. GSMA Mobile Connectivity Index

In [None]:
indicators[14]

# load data
indicator = indicators[14]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(10)

In [None]:
# filter most recent year
df = df[(df.Year==2019)]

# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Index'] 

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 16. Spectrum Allocated to Mobile Providers

In [None]:
indicators[15]

# load data
indicator = indicators[15]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# filter most recent year
df = df[(df.Year==2019)]

# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Spectrum'] 

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 17. Internet Exchange Points (IXPs) 

In [None]:
indicators[16]

# load data
indicator = indicators[16]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))



In [None]:
df['Country Name'] = df['Country']
df.head(15)

In [None]:
agg_df = df.groupby('Country Name').agg({'Country':'count'})
agg_df

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

agg_df['higher_is_better'] = True
agg_df['Indicator'] = indicator
agg_df['data_col'] = agg_df['Country'] 
agg_df['Year'] = 2019

min_rank = agg_df['data_col'].min()
max_rank = agg_df['data_col'].max()

# transform 0-1 rank into 1-6
agg_df['new_rank_score'] = agg_df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
agg_df[['Year','Indicator','data_col','new_rank_score','higher_is_better']].head(150)

In [None]:
### 18. ISP internet download speed averag

In [None]:
indicators[17]

# load data
indicator = indicators[17]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Download speed (Mbps)'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 19. Mobile dowload speed at the slowest hour

In [None]:
indicators[18]

# load data
indicator = indicators[18]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

# Waiting for internet_speed to be added to the processed folder

In [None]:
df.head(15)

In [None]:
# filter most recent year
df = df[(df.Year==2019)]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Mobile latencies'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))


# invert since to lower rank is better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 20. Mobile download speed average

In [None]:
indicators[19]

# load data
indicator = indicators[19]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Mbps'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 21. Individuals using the Internet (% of population)

In [None]:
indicators[20]

# load data
indicator = indicators[20]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
df = df[(df['Indicator name'] == 'Individuals using the Internet, total (%)')]
df = df[(df.Year==2019)]
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df.rename(columns = {'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(150)

In [None]:
### 22. Mobile-cellular subscriptions per 100 inhabitants

In [None]:
indicators[21]

# load data
indicator = indicators[21]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df = df[(df['Indicator name'] == 'Individuals using the Internet, total (%)')]
df = df[(df.Year==2020)]
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df.rename(columns = {'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(150)

In [None]:
### 23. Average fixed broadband download speeds	

In [None]:
indicators[22]

# load data
indicator = indicators[22]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))


In [None]:
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2021
df['Indicator'] = indicator
df['data_col'] = df['broadband'] 
df.rename(columns = {'country':'Country'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 24. Postal Coverage

In [None]:
indicators[23]

# load data
indicator = indicators[23]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
#Choose 'Percent of Population Having Mail Delivered at Home' as the data column
df = df[(df.Indicator=='Percent of Population Having Mail Delivered at Home')]
df.head(15)

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2015
df['Indicator'] = indicator
df['data_col'] = df['2015'] 

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 25. Logistics Performance Index (LPI) 

In [None]:
indicators[24]

# load data
indicator = indicators[24]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2018
df['Indicator'] = indicator
df['data_col'] = df['score']
df.rename(columns={'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 26. National cyber security index

In [None]:
indicators[25]

# load data
indicator = indicators[25]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2021
df['Indicator'] = indicator
df['data_col'] = df['National Cyber Security Index']
df.rename(columns={'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 27. Global Cybersecurity Index (GCI)

In [None]:
indicators[26]

# load data
indicator = indicators[26]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2020
df['Indicator'] = indicator
df['data_col'] = df['Score (2020)'] 
df.rename(columns={'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 28. Software Developer Ecosystem size

In [None]:
indicators[27]

# load data
indicator = indicators[27]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
df['higher_is_better'] = True
df['Year'] = 2018
df['Indicator'] = indicator
df['data_col'] = df.iloc[:,[1]]
df.rename(columns={'Country':'Country Name'}, inplace=True)
    
min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 29. Digital Work Ecosystem size

In [None]:
indicators[28]

# load data
indicator = indicators[28]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = False
df['Year'] = 2020
df['Indicator'] = indicator
df['data_col'] = df.iloc[:,[0]].astype(float)
df.rename(columns={'Country':'Country Name'}, inplace=True)
    
min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# invert since to lower rank is better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)
# Two problems: 1. cannot use the name of the column, have to use iloc, 2. cannot use the index column
# because the number format is not standard, have to use the rank column

In [None]:
### 30. Country, Industry, Skill Migration Data (Skill) 

In [None]:
indicators[29]

# load data
indicator = indicators[29]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
df['Country Name'] = df.iloc[:,[1]]

df_sum = df.groupby('Country Name')[['net_per_10K_2019']].sum()
df_sum

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df_sum['higher_is_better'] = True
df_sum['Year'] = 2019
df_sum['Indicator'] = indicator
df_sum['data_col'] = df_sum['net_per_10K_2019']


min_rank = df_sum['data_col'].min()
max_rank = df_sum['data_col'].max()

# transform 0-1 rank into 1-6
df_sum['new_rank_score'] = df_sum['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df_sum[['Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 31. Country, Industry, Skill Migration Data (Industry)

In [None]:
indicators[30]

# load data
indicator = indicators[30]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))


In [None]:
df.head(15)

In [None]:
df['Country Name'] = df.iloc[:,[1]]

df_sum = df.groupby('Country Name')[['net_per_10K_2019']].sum()
df_sum

In [None]:
## create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df_sum['higher_is_better'] = True
df_sum['Year'] = 2019
df_sum['Indicator'] = indicator
df_sum['data_col'] = df_sum['net_per_10K_2019']


min_rank = df_sum['data_col'].min()
max_rank = df_sum['data_col'].max()

# transform 0-1 rank into 1-6
df_sum['new_rank_score'] = df_sum['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df_sum[['Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 32. Country, Industry, Skill Migration Data (Nation)

In [None]:
indicators[31]

# load data
indicator = indicators[31]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
df['Country Name'] = df.iloc[:,[1]]

df_sum = df.groupby('Country Name')[['net_per_10K_2019']].sum()
df_sum

In [None]:
## create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df_sum['higher_is_better'] = True
df_sum['Year'] = 2019
df_sum['Indicator'] = indicator
df_sum['data_col'] = df_sum['net_per_10K_2019']


min_rank = df_sum['data_col'].min()
max_rank = df_sum['data_col'].max()

# transform 0-1 rank into 1-6
df_sum['new_rank_score'] = df_sum['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df_sum[['Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 33. Digital Finance Ecosystem size

In [None]:
indicators[32]

# load data
indicator = indicators[32]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2021
df['Indicator'] = indicator
df['data_col'] = df.iloc[:,[2]]
df['Country Name'] = df.iloc[:,[1]]

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# Have to shorten the column names to see if this works

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 34. Tech hubs & spaces size

In [None]:
indicators[33]

# load data
indicator = indicators[33]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

#There are two problems:
#1. The column names need some overhaul (moving the second column to the top and make it the column names)
#2. This is a list of cities, not countries, may present probkems later. 

In [None]:
### 35. Banking Ecosystem size

In [None]:
indicators[34]

# load data
indicator = indicators[34]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2017
df['Indicator'] = indicator
df['data_col'] = df['Bank assets, percent of GDP, 2017']
df.rename(columns={'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

# Okay but will have to remove number from the Country column, leaving only characters. 

In [None]:
### 36. Angel Ecosystem size

In [None]:
indicators[35]

# load data
indicator = indicators[35]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2019
df['Indicator'] = indicator
df['data_col'] = df['Business angel investments (in million Euros)']
df.rename(columns={'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

# Okay but there are only EU countries

In [None]:
### 37. Startup Ecosystem size

In [None]:
indicators[36]

# load data
indicator = indicators[36]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

# This is a list of cities, not countries, wonder if it will work 

In [None]:
### 38. Venture Ecosystem size

indicators[37]

# load data
indicator = indicators[37]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

# Similar to the previous one

In [None]:
### 39. International Co-Inventions

In [None]:
indicators[38]

# load data
indicator = indicators[38]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# filter most recent year and global value
df = df[(df.Time==2017)]
df = df[(df.Country!='World')]
df ['Country Name'] = df ['Country']
df

In [None]:
# Create summarization
df_sum = df.groupby('Country Name')[['Value']].sum()
df_sum

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df_sum['higher_is_better'] = True
df_sum['Year'] = 2017
df_sum['Indicator'] = indicator
df_sum['data_col'] = df_sum.iloc[:,[0]]

min_rank = df_sum['data_col'].min()
max_rank = df_sum['data_col'].max()

# transform 0-1 rank into 1-6
df_sum['new_rank_score'] = df_sum['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df_sum[['Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

# Okay but the data is too general with many missing countries