In [1]:
import pandas as pd
import numpy as np

In [2]:
### Get all the pillar names from the excel

In [3]:
names = pd.read_excel('../../UNDP Digital Assessment Data Framework Filename Matching V7.xlsx')

In [4]:
col_names = ['Indicator','check', 'Data Source','Index','Filename']

In [5]:
names = names[col_names]

In [6]:
names.head()

Unnamed: 0,Indicator,check,Data Source,Index,Filename
0,Countries,,United Nations,False,Countries
1,"Database of Global Administrative Areas (GADM,...",,GADM maps and data,False,
2,High Resolution Population Density Maps + Demo...,,Facebook,False,
3,population density vs openstreetmap object den...,,Kontur,False,
4,Population Density,Infrastructure,World Bank,False,population_density


In [7]:
# get all the files per pillar
data_stats = names.groupby('check').agg({'Filename':'count','Indicator':'count'})

In [8]:
data_stats

Unnamed: 0_level_0,Filename,Indicator
check,Unnamed: 1_level_1,Unnamed: 2_level_1
Business,20,25
Foundations,9,12
Government,10,15
Infrastructure,39,48
People,39,47
Regulation,6,7
Strategy,1,1


In [9]:
### Infrastructure

In [10]:
bnames = names[(names.check=='Infrastructure')&(~names.Filename.isna())]#&(names.Index==False)]
bnames.head(25)

Unnamed: 0,Indicator,check,Data Source,Index,Filename
4,Population Density,Infrastructure,World Bank,False,population_density
5,Broadband Density,Infrastructure,ITU,False,ITU_database
6,% of population covered by internet connectivity,Infrastructure,ITU,False,ITU_database
7,% of population covered by mobile 2G+ data con...,Infrastructure,GSMA Mobile Connectivity Index,False,countries_mobile_connectivity
8,% of population covered by mobile 3G+ data con...,Infrastructure,GSMA Mobile Connectivity Index,False,countries_mobile_connectivity
9,% of population covered by mobile 4G+ data con...,Infrastructure,GSMA Mobile Connectivity Index,False,countries_mobile_connectivity
10,% of population covered by mobile 5G+ data con...,Infrastructure,GSMA Mobile Connectivity Index,False,countries_mobile_connectivity
11,Mobile Coverage Maps,Infrastructure,ITU,False,ITU_database
12,Electricity Density,Infrastructure,Energy Data,False,electricity_yearbook
13,% of population covered by electricity,Infrastructure,World Bank,False,population_electricity_coverage


In [11]:
# get list of names for all indicators
indicators = bnames.Indicator.unique()

In [12]:
# get all file names
bfiles = bnames.Filename.unique()

In [13]:
bfiles

array(['population_density', 'ITU_database',
       'countries_mobile_connectivity', 'electricity_yearbook',
       'population_electricity_coverage', 'elect_supply_quality',
       'sustainability_index', 'mobile_density', 'e_government_index',
       'spectrum_allocated_mobile_providers', 'countries_ixp',
       'internet_speed', 'mobile_latency', 'mobile_speed',
       'fixed_bdbd_spd_dl_ul', 'postal_coverage',
       'logistics_performance_index', 'national_cybersecurity_index',
       'dice_export_global_cybersecurity_index',
       'software_developer_ecosystem_size',
       'digital_platform_economy_index', 'migration_skill',
       'migration_industry', 'migration_country',
       'global_fintech_ranking', 'tech_hubs', 'banking_sector_size',
       'angel_investment', 'startup_eco_size',
       'international_co_inventions'], dtype=object)

In [45]:
# formula for converting scale
def convert_rank(old_value, old_min=1, old_max=7, new_min=1, new_max=6 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value

In [46]:
### 2. Broadband Density

In [89]:
indicators[1]

# load data
indicator = indicators[1]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Broadband Density
ITU_database


In [90]:
df.head(15)

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
0,Angola,Africa,AGO,Female mobile phone ownership as a % of total ...,2010.0,,,
1,Benin,Africa,BEN,Female mobile phone ownership as a % of total ...,2010.0,,,
2,Botswana,Africa,BWA,Female mobile phone ownership as a % of total ...,2010.0,,,
3,Burkina Faso,Africa,BFA,Female mobile phone ownership as a % of total ...,2010.0,,,
4,Burundi,Africa,BDI,Female mobile phone ownership as a % of total ...,2010.0,,,
5,Cabo Verde,Africa,CPV,Female mobile phone ownership as a % of total ...,2010.0,,,
6,Cameroon,Africa,CMR,Female mobile phone ownership as a % of total ...,2010.0,,,
7,Central African Rep.,Africa,CAF,Female mobile phone ownership as a % of total ...,2010.0,,,
8,Chad,Africa,TCD,Female mobile phone ownership as a % of total ...,2010.0,,,
9,Congo (Rep. of the),Africa,COG,Female mobile phone ownership as a % of total ...,2010.0,,,


In [91]:
df = df[(df.Year == 2020)]
df = df[(df['Indicator name']== 'Fixed broadband subscriptions per 100 inhabitants')]
df


Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
36787,Angola,Africa,AGO,Fixed broadband subscriptions per 100 inhabitants,2020.0,0.701662,,
36788,Benin,Africa,BEN,Fixed broadband subscriptions per 100 inhabitants,2020.0,0.247303,,
36789,Botswana,Africa,BWA,Fixed broadband subscriptions per 100 inhabitants,2020.0,3.057373,,
36790,Burkina Faso,Africa,BFA,Fixed broadband subscriptions per 100 inhabitants,2020.0,0.066875,,
36791,Burundi,Africa,BDI,Fixed broadband subscriptions per 100 inhabitants,2020.0,0.035574,,
...,...,...,...,...,...,...,...,...
36978,Suriname,The Americas,SUR,Fixed broadband subscriptions per 100 inhabitants,2020.0,15.728770,,
36979,Trinidad and Tobago,The Americas,TTO,Fixed broadband subscriptions per 100 inhabitants,2020.0,26.866254,,
36980,United States,The Americas,USA,Fixed broadband subscriptions per 100 inhabitants,2020.0,36.413908,,
36981,Uruguay,The Americas,URY,Fixed broadband subscriptions per 100 inhabitants,2020.0,,,


In [92]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value']
df.rename(columns={'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [93]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
36787,Angola,2020.0,Broadband Density,0.701662,1.065945,True
36788,Benin,2020.0,Broadband Density,0.247303,1.023242,True
36789,Botswana,2020.0,Broadband Density,3.057373,1.287344,True
36790,Burkina Faso,2020.0,Broadband Density,0.066875,1.006285,True
36791,Burundi,2020.0,Broadband Density,0.035574,1.003343,True
36792,Cabo Verde,2020.0,Broadband Density,4.46755,1.419877,True
36793,Cameroon,2020.0,Broadband Density,2.689101,1.252732,True
36794,Central African Rep.,2020.0,Broadband Density,,,True
36795,Chad,2020.0,Broadband Density,0.0,1.0,True
36796,Congo (Rep. of the),2020.0,Broadband Density,,,True


In [17]:
### 3. % of population covered by internet connectivity

In [95]:
indicators[2]

# load data
indicator = indicators[2]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population covered by internet connectivity
ITU_database


In [96]:
df = df[(df.Year == 2020)]
df = df[(df['Indicator name']== 'Individuals using the Internet, total (%)')]
df

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
47744,Mauritius,Africa,MUS,"Individuals using the Internet, total (%)",2020.0,64.884904,,
47745,Uganda,Africa,UGA,"Individuals using the Internet, total (%)",2020.0,,,
47746,Bahrain,Arab States,BHR,"Individuals using the Internet, total (%)",2020.0,99.539512,,
47747,Egypt,Arab States,EGY,"Individuals using the Internet, total (%)",2020.0,71.914200,,
47748,Iraq,Arab States,IRQ,"Individuals using the Internet, total (%)",2020.0,,,
...,...,...,...,...,...,...,...,...
47805,Bolivia (Plurinational State of),The Americas,BOL,"Individuals using the Internet, total (%)",2020.0,55.139051,,
47806,Costa Rica,The Americas,CRI,"Individuals using the Internet, total (%)",2020.0,80.530186,,
47807,Mexico,The Americas,MEX,"Individuals using the Internet, total (%)",2020.0,71.970000,,
47808,Paraguay,The Americas,PRY,"Individuals using the Internet, total (%)",2020.0,74.515240,,


In [97]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value']
df.rename(columns={'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [98]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
47744,Mauritius,2020.0,% of population covered by internet connectivity,64.884904,3.298839,True
47745,Uganda,2020.0,% of population covered by internet connectivity,,,True
47746,Bahrain,2020.0,% of population covered by internet connectivity,99.539512,5.964578,True
47747,Egypt,2020.0,% of population covered by internet connectivity,71.9142,3.839554,True
47748,Iraq,2020.0,% of population covered by internet connectivity,,,True
47749,Kuwait,2020.0,% of population covered by internet connectivity,98.599995,5.892307,True
47750,Morocco,2020.0,% of population covered by internet connectivity,84.120363,4.778489,True
47751,Oman,2020.0,% of population covered by internet connectivity,95.232293,5.633253,True
47752,Qatar,2020.0,% of population covered by internet connectivity,99.652794,5.973292,True
47753,Saudi Arabia,2020.0,% of population covered by internet connectivity,97.862332,5.835564,True


In [19]:
### 4. % of population covered by mobile 2G

In [20]:
indicators[3]

'% of population covered by mobile 2G+ data connectivity'

In [99]:
# load data
indicator = indicators[3]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population covered by mobile 2G+ data connectivity
countries_mobile_connectivity


In [100]:
df.head(10)

Unnamed: 0,ISO Code,Country,Region,Year,Cluster,Index,Infrastructure,Affordability,Consumer Readiness,Content and Services,...,Gender gap in mobile ownership,TLDs per capita,E-Government Score,Mobile Social Media Penetration,Apps developed per person,Number of apps in national language,Accessibility of top ranked apps,Cybersecurity Index,data_country,data_year
0,AFG,Afghanistan,South Asia,2014,Discoverer,22.12,21.74,31.79,24.4,14.19,...,0.0,39.55,18.11,3.28,20.98,2.44,4.37,26.5,,
1,AFG,Afghanistan,South Asia,2015,Discoverer,22.99,22.82,30.81,25.28,15.71,...,0.0,39.57,24.27,4.36,22.93,2.79,8.03,25.83,,
2,AFG,Afghanistan,South Asia,2016,Discoverer,23.71,26.92,26.75,26.07,16.83,...,0.0,39.58,30.43,6.73,30.31,2.85,5.9,25.17,,
3,AFG,Afghanistan,South Asia,2017,Discoverer,25.82,33.54,27.22,28.56,17.04,...,0.0,39.47,30.5,7.78,31.62,2.91,6.15,24.5,,
4,AFG,Afghanistan,South Asia,2018,Discoverer,28.39,30.91,42.64,29.24,16.87,...,0.0,39.39,30.56,8.54,36.54,2.96,8.66,17.7,,
5,AFG,Afghanistan,South Asia,2019,Discoverer,28.94,32.34,41.53,29.72,17.58,...,0.0,39.41,41.18,9.39,39.36,3.0,5.72,17.7,,
6,AGO,Angola,Sub-Saharan Africa,2014,Discoverer,32.78,25.99,35.99,44.33,27.85,...,51.65,0.0,29.92,3.99,22.74,53.33,49.09,8.8,,
7,AGO,Angola,Sub-Saharan Africa,2015,Emerging,37.18,33.09,42.01,45.41,30.27,...,57.0,0.31,32.35,5.26,22.12,55.08,58.33,8.47,,
8,AGO,Angola,Sub-Saharan Africa,2016,Emerging,39.85,37.8,44.74,46.47,32.11,...,63.73,0.0,34.78,6.27,27.46,56.52,61.73,8.13,,
9,AGO,Angola,Sub-Saharan Africa,2017,Emerging,42.89,48.6,47.94,46.32,31.36,...,53.97,0.0,37.88,4.26,31.94,57.06,55.09,7.8,,


In [101]:
# filter most recent year
df = df[(df.Year==2019)]


In [104]:
# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2G Coverage'] 

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [105]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
5,Afghanistan,2019,% of population covered by mobile 2G+ data con...,90.0,4.921251,True
11,Angola,2019,% of population covered by mobile 2G+ data con...,90.0,4.921251,True
17,Albania,2019,% of population covered by mobile 2G+ data con...,99.86,5.984898,True
23,United Arab Emirates,2019,% of population covered by mobile 2G+ data con...,100.0,6.0,True
29,Argentina,2019,% of population covered by mobile 2G+ data con...,98.0,5.78425,True
35,Armenia,2019,% of population covered by mobile 2G+ data con...,100.0,6.0,True
41,Australia,2019,% of population covered by mobile 2G+ data con...,99.4,5.935275,True
47,Austria,2019,% of population covered by mobile 2G+ data con...,99.0,5.892125,True
53,Azerbaijan,2019,% of population covered by mobile 2G+ data con...,100.0,6.0,True
59,Burundi,2019,% of population covered by mobile 2G+ data con...,53.65,1.0,True


In [26]:
### 5. % of population covered by mobile 3G

In [27]:
indicators[4]

# load data
indicator = indicators[4]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population covered by mobile 3G+ data connectivity
countries_mobile_connectivity


In [28]:
df.head(10)

Unnamed: 0,ISO Code,Country,Region,Year,Cluster,Index,Infrastructure,Affordability,Consumer Readiness,Content and Services,...,Gender gap in mobile ownership,TLDs per capita,E-Government Score,Mobile Social Media Penetration,Apps developed per person,Number of apps in national language,Accessibility of top ranked apps,Cybersecurity Index,data_country,data_year
0,AFG,Afghanistan,South Asia,2014,Discoverer,22.12,21.74,31.79,24.4,14.19,...,0.0,39.55,18.11,3.28,20.98,2.44,4.37,26.5,,
1,AFG,Afghanistan,South Asia,2015,Discoverer,22.99,22.82,30.81,25.28,15.71,...,0.0,39.57,24.27,4.36,22.93,2.79,8.03,25.83,,
2,AFG,Afghanistan,South Asia,2016,Discoverer,23.71,26.92,26.75,26.07,16.83,...,0.0,39.58,30.43,6.73,30.31,2.85,5.9,25.17,,
3,AFG,Afghanistan,South Asia,2017,Discoverer,25.82,33.54,27.22,28.56,17.04,...,0.0,39.47,30.5,7.78,31.62,2.91,6.15,24.5,,
4,AFG,Afghanistan,South Asia,2018,Discoverer,28.39,30.91,42.64,29.24,16.87,...,0.0,39.39,30.56,8.54,36.54,2.96,8.66,17.7,,
5,AFG,Afghanistan,South Asia,2019,Discoverer,28.94,32.34,41.53,29.72,17.58,...,0.0,39.41,41.18,9.39,39.36,3.0,5.72,17.7,,
6,AGO,Angola,Sub-Saharan Africa,2014,Discoverer,32.78,25.99,35.99,44.33,27.85,...,51.65,0.0,29.92,3.99,22.74,53.33,49.09,8.8,,
7,AGO,Angola,Sub-Saharan Africa,2015,Emerging,37.18,33.09,42.01,45.41,30.27,...,57.0,0.31,32.35,5.26,22.12,55.08,58.33,8.47,,
8,AGO,Angola,Sub-Saharan Africa,2016,Emerging,39.85,37.8,44.74,46.47,32.11,...,63.73,0.0,34.78,6.27,27.46,56.52,61.73,8.13,,
9,AGO,Angola,Sub-Saharan Africa,2017,Emerging,42.89,48.6,47.94,46.32,31.36,...,53.97,0.0,37.88,4.26,31.94,57.06,55.09,7.8,,


In [106]:
# filter most recent year
df = df[(df.Year==2019)]

# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['3G Coverage'] 

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [107]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
5,Afghanistan,2019,% of population covered by mobile 2G+ data con...,58.7,3.282895,True
11,Angola,2019,% of population covered by mobile 2G+ data con...,71.0,4.092105,True
17,Albania,2019,% of population covered by mobile 2G+ data con...,97.0,5.802632,True
23,United Arab Emirates,2019,% of population covered by mobile 2G+ data con...,100.0,6.0,True
29,Argentina,2019,% of population covered by mobile 2G+ data con...,95.0,5.671053,True
35,Armenia,2019,% of population covered by mobile 2G+ data con...,99.0,5.934211,True
41,Australia,2019,% of population covered by mobile 2G+ data con...,99.5,5.967105,True
47,Austria,2019,% of population covered by mobile 2G+ data con...,99.0,5.934211,True
53,Azerbaijan,2019,% of population covered by mobile 2G+ data con...,95.0,5.671053,True
59,Burundi,2019,% of population covered by mobile 2G+ data con...,40.0,2.052632,True


In [31]:
### 6. % of population covered by mobile 4G

In [108]:
indicators[5]

# load data
indicator = indicators[5]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population covered by mobile 4G+ data connectivity
countries_mobile_connectivity


In [109]:
df.head(10)

Unnamed: 0,ISO Code,Country,Region,Year,Cluster,Index,Infrastructure,Affordability,Consumer Readiness,Content and Services,...,Gender gap in mobile ownership,TLDs per capita,E-Government Score,Mobile Social Media Penetration,Apps developed per person,Number of apps in national language,Accessibility of top ranked apps,Cybersecurity Index,data_country,data_year
0,AFG,Afghanistan,South Asia,2014,Discoverer,22.12,21.74,31.79,24.4,14.19,...,0.0,39.55,18.11,3.28,20.98,2.44,4.37,26.5,,
1,AFG,Afghanistan,South Asia,2015,Discoverer,22.99,22.82,30.81,25.28,15.71,...,0.0,39.57,24.27,4.36,22.93,2.79,8.03,25.83,,
2,AFG,Afghanistan,South Asia,2016,Discoverer,23.71,26.92,26.75,26.07,16.83,...,0.0,39.58,30.43,6.73,30.31,2.85,5.9,25.17,,
3,AFG,Afghanistan,South Asia,2017,Discoverer,25.82,33.54,27.22,28.56,17.04,...,0.0,39.47,30.5,7.78,31.62,2.91,6.15,24.5,,
4,AFG,Afghanistan,South Asia,2018,Discoverer,28.39,30.91,42.64,29.24,16.87,...,0.0,39.39,30.56,8.54,36.54,2.96,8.66,17.7,,
5,AFG,Afghanistan,South Asia,2019,Discoverer,28.94,32.34,41.53,29.72,17.58,...,0.0,39.41,41.18,9.39,39.36,3.0,5.72,17.7,,
6,AGO,Angola,Sub-Saharan Africa,2014,Discoverer,32.78,25.99,35.99,44.33,27.85,...,51.65,0.0,29.92,3.99,22.74,53.33,49.09,8.8,,
7,AGO,Angola,Sub-Saharan Africa,2015,Emerging,37.18,33.09,42.01,45.41,30.27,...,57.0,0.31,32.35,5.26,22.12,55.08,58.33,8.47,,
8,AGO,Angola,Sub-Saharan Africa,2016,Emerging,39.85,37.8,44.74,46.47,32.11,...,63.73,0.0,34.78,6.27,27.46,56.52,61.73,8.13,,
9,AGO,Angola,Sub-Saharan Africa,2017,Emerging,42.89,48.6,47.94,46.32,31.36,...,53.97,0.0,37.88,4.26,31.94,57.06,55.09,7.8,,


In [110]:
# filter most recent year
df = df[(df.Year==2019)]

# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['4G Coverage'] 

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [112]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
5,Afghanistan,2019,% of population covered by mobile 4G+ data con...,15.0,1.75,True
11,Angola,2019,% of population covered by mobile 4G+ data con...,50.0,3.5,True
17,Albania,2019,% of population covered by mobile 4G+ data con...,96.0,5.8,True
23,United Arab Emirates,2019,% of population covered by mobile 4G+ data con...,99.0,5.95,True
29,Argentina,2019,% of population covered by mobile 4G+ data con...,89.77,5.4885,True
35,Armenia,2019,% of population covered by mobile 4G+ data con...,95.0,5.75,True
41,Australia,2019,% of population covered by mobile 4G+ data con...,99.2,5.96,True
47,Austria,2019,% of population covered by mobile 4G+ data con...,99.0,5.95,True
53,Azerbaijan,2019,% of population covered by mobile 4G+ data con...,90.0,5.5,True
59,Burundi,2019,% of population covered by mobile 4G+ data con...,25.0,2.25,True


In [36]:
### 7. % of population covered by mobile 5G

In [113]:
indicators[6]

# load data
indicator = indicators[6]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population covered by mobile 5G+ data connectivity
countries_mobile_connectivity


In [114]:
df.head(10)

Unnamed: 0,ISO Code,Country,Region,Year,Cluster,Index,Infrastructure,Affordability,Consumer Readiness,Content and Services,...,Gender gap in mobile ownership,TLDs per capita,E-Government Score,Mobile Social Media Penetration,Apps developed per person,Number of apps in national language,Accessibility of top ranked apps,Cybersecurity Index,data_country,data_year
0,AFG,Afghanistan,South Asia,2014,Discoverer,22.12,21.74,31.79,24.4,14.19,...,0.0,39.55,18.11,3.28,20.98,2.44,4.37,26.5,,
1,AFG,Afghanistan,South Asia,2015,Discoverer,22.99,22.82,30.81,25.28,15.71,...,0.0,39.57,24.27,4.36,22.93,2.79,8.03,25.83,,
2,AFG,Afghanistan,South Asia,2016,Discoverer,23.71,26.92,26.75,26.07,16.83,...,0.0,39.58,30.43,6.73,30.31,2.85,5.9,25.17,,
3,AFG,Afghanistan,South Asia,2017,Discoverer,25.82,33.54,27.22,28.56,17.04,...,0.0,39.47,30.5,7.78,31.62,2.91,6.15,24.5,,
4,AFG,Afghanistan,South Asia,2018,Discoverer,28.39,30.91,42.64,29.24,16.87,...,0.0,39.39,30.56,8.54,36.54,2.96,8.66,17.7,,
5,AFG,Afghanistan,South Asia,2019,Discoverer,28.94,32.34,41.53,29.72,17.58,...,0.0,39.41,41.18,9.39,39.36,3.0,5.72,17.7,,
6,AGO,Angola,Sub-Saharan Africa,2014,Discoverer,32.78,25.99,35.99,44.33,27.85,...,51.65,0.0,29.92,3.99,22.74,53.33,49.09,8.8,,
7,AGO,Angola,Sub-Saharan Africa,2015,Emerging,37.18,33.09,42.01,45.41,30.27,...,57.0,0.31,32.35,5.26,22.12,55.08,58.33,8.47,,
8,AGO,Angola,Sub-Saharan Africa,2016,Emerging,39.85,37.8,44.74,46.47,32.11,...,63.73,0.0,34.78,6.27,27.46,56.52,61.73,8.13,,
9,AGO,Angola,Sub-Saharan Africa,2017,Emerging,42.89,48.6,47.94,46.32,31.36,...,53.97,0.0,37.88,4.26,31.94,57.06,55.09,7.8,,


In [115]:
# filter most recent year
df = df[(df.Year==2019)]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['5G Coverage'] 

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [116]:
df[['Country','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country,Year,Indicator,data_col,new_rank_score,higher_is_better
5,Afghanistan,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True
11,Angola,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True
17,Albania,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True
23,United Arab Emirates,2019,% of population covered by mobile 5G+ data con...,100.0,6.0,True
29,Argentina,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True
35,Armenia,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True
41,Australia,2019,% of population covered by mobile 5G+ data con...,100.0,6.0,True
47,Austria,2019,% of population covered by mobile 5G+ data con...,100.0,6.0,True
53,Azerbaijan,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True
59,Burundi,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True


In [41]:
### 8. Mobile Coverage Maps

In [117]:
indicators[7]

# load data
indicator = indicators[7]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Mobile Coverage Maps
ITU_database


In [119]:
df.head(15)

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
0,Angola,Africa,AGO,Female mobile phone ownership as a % of total ...,2010.0,,,
1,Benin,Africa,BEN,Female mobile phone ownership as a % of total ...,2010.0,,,
2,Botswana,Africa,BWA,Female mobile phone ownership as a % of total ...,2010.0,,,
3,Burkina Faso,Africa,BFA,Female mobile phone ownership as a % of total ...,2010.0,,,
4,Burundi,Africa,BDI,Female mobile phone ownership as a % of total ...,2010.0,,,
5,Cabo Verde,Africa,CPV,Female mobile phone ownership as a % of total ...,2010.0,,,
6,Cameroon,Africa,CMR,Female mobile phone ownership as a % of total ...,2010.0,,,
7,Central African Rep.,Africa,CAF,Female mobile phone ownership as a % of total ...,2010.0,,,
8,Chad,Africa,TCD,Female mobile phone ownership as a % of total ...,2010.0,,,
9,Congo (Rep. of the),Africa,COG,Female mobile phone ownership as a % of total ...,2010.0,,,


In [120]:
df = df[(df.Year == 2020)]
df = df[(df['Indicator name']== 'Mobile-cellular subscriptions per 100 inhabitants')]
df

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
32475,Angola,Africa,AGO,Mobile-cellular subscriptions per 100 inhabitants,2020.0,44.559511,,
32476,Benin,Africa,BEN,Mobile-cellular subscriptions per 100 inhabitants,2020.0,91.897280,,
32477,Botswana,Africa,BWA,Mobile-cellular subscriptions per 100 inhabitants,2020.0,162.399011,,
32478,Burkina Faso,Africa,BFA,Mobile-cellular subscriptions per 100 inhabitants,2020.0,105.807440,,
32479,Burundi,Africa,BDI,Mobile-cellular subscriptions per 100 inhabitants,2020.0,55.767172,,
...,...,...,...,...,...,...,...,...
32666,Suriname,The Americas,SUR,Mobile-cellular subscriptions per 100 inhabitants,2020.0,153.305479,,
32667,Trinidad and Tobago,The Americas,TTO,Mobile-cellular subscriptions per 100 inhabitants,2020.0,142.051665,,
32668,United States,The Americas,USA,Mobile-cellular subscriptions per 100 inhabitants,2020.0,,,
32669,Uruguay,The Americas,URY,Mobile-cellular subscriptions per 100 inhabitants,2020.0,,,


In [124]:
# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [125]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
32475,Angola,2020.0,Mobile Coverage Maps,44.559511,1.012692,True
32476,Benin,2020.0,Mobile Coverage Maps,91.89728,1.96815,True
32477,Botswana,2020.0,Mobile Coverage Maps,162.399011,3.391146,True
32478,Burkina Faso,2020.0,Mobile Coverage Maps,105.80744,2.248911,True
32479,Burundi,2020.0,Mobile Coverage Maps,55.767172,1.238906,True
32480,Cabo Verde,2020.0,Mobile Coverage Maps,97.975133,2.090825,True
32481,Cameroon,2020.0,Mobile Coverage Maps,95.100069,2.032795,True
32482,Central African Rep.,2020.0,Mobile Coverage Maps,,,True
32483,Chad,2020.0,Mobile Coverage Maps,52.887026,1.180774,True
32484,Congo (Rep. of the),2020.0,Mobile Coverage Maps,,,True


In [126]:
### 9. Electricity Density

In [127]:
indicators[8]

# load data
indicator = indicators[8]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Electricity Density
electricity_yearbook


In [131]:
df.head(20)

Unnamed: 0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2016 - 2017 (%),2000 - 2017 (%/year),data_country,data_year,higher_is_better,Indicator,data_col,Country,Year,new_rank_score
0,11894.0,12173.0,12284.0,12574.0,12881.0,13327.0,13754.0,14045.0,14414.0,14823.0,...,2.7,3.0,World,World,True,Electricity Density,25592.0,World,2017,6.0
1,7712.0,7900.0,7975.0,8160.0,8389.0,8646.0,8889.0,9014.0,9244.0,9451.0,...,0.9,0.7,OECD,OECD,True,Electricity Density,11069.0,OECD,2017,3.159482
2,6089.0,6236.0,6277.0,6414.0,6578.0,6760.0,6944.0,6985.0,7142.0,7272.0,...,0.6,0.3,G7,G7,True,Electricity Density,7898.0,G7,2017,2.539274
3,2386.0,2468.0,2510.0,2583.0,2637.0,2754.0,2863.0,2961.0,3026.0,3170.0,...,4.7,6.6,BRICS,BRICS,True,Electricity Density,10000.0,BRICS,2017,2.950399
4,2900.0,2937.0,2926.0,2931.0,2976.0,3071.0,3164.0,3199.0,3280.0,3327.0,...,1.2,0.7,Europe,Europe,True,Electricity Density,3886.0,Europe,2017,1.754577
5,2595.0,2640.0,2624.0,2626.0,2667.0,2744.0,2846.0,2857.0,2923.0,2955.0,...,0.7,0.4,European Union,European Union,True,Electricity Density,3275.0,European Union,2017,1.635073
6,71.0,72.0,72.0,71.0,72.0,74.0,76.0,79.0,83.0,85.0,...,0.5,0.2,Belgium,Belgium,True,Electricity Density,87.0,Belgium,2017,1.01154
7,63.0,61.0,59.0,59.0,59.0,61.0,64.0,65.0,65.0,65.0,...,4.5,1.0,Czech Rep.,Czech Rep.,True,Electricity Density,87.0,Czech Rep.,2017,1.01154
8,421.0,456.0,464.0,473.0,477.0,494.0,513.0,505.0,511.0,526.0,...,-0.4,0.1,France,France,True,Electricity Density,551.0,France,2017,1.102292
9,550.0,540.0,537.0,526.0,529.0,537.0,555.0,552.0,556.0,556.0,...,0.6,0.7,Germany,Germany,True,Electricity Density,653.0,Germany,2017,1.122242


In [132]:
# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2017'] 
df['Country'] = df['Unnamed: 28']
df['Year'] = 2017

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [133]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,World,2017,Electricity Density,25592.0,6.0,True
1,OECD,2017,Electricity Density,11069.0,3.159482,True
2,G7,2017,Electricity Density,7898.0,2.539274,True
3,BRICS,2017,Electricity Density,10000.0,2.950399,True
4,Europe,2017,Electricity Density,3886.0,1.754577,True
5,European Union,2017,Electricity Density,3275.0,1.635073,True
6,Belgium,2017,Electricity Density,87.0,1.01154,True
7,Czech Rep.,2017,Electricity Density,87.0,1.01154,True
8,France,2017,Electricity Density,551.0,1.102292,True
9,Germany,2017,Electricity Density,653.0,1.122242,True


In [None]:
### 10. % of population covered by electricity

In [135]:
indicators[9]

# load data
indicator = indicators[9]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population covered by electricity
population_electricity_coverage


In [136]:
df.head(15)

Unnamed: 0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2017,2018,2019,2020,Country Name,Country Code,Indicator Name,Indicator Code,data_country,data_year
0,,,,,,,,,,,...,100.0,100.0,100.0,,Aruba,ABW,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
1,,,,,,,,,,,...,39.754201,42.168241,43.640661,,Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
2,,,,,,,,,,,...,97.7,98.715622,97.7,,Afghanistan,AFG,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
3,,,,,,,,,,,...,48.848205,51.253253,51.341421,,Africa Western and Central,AFW,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
4,,,,,,,,,,,...,43.00161,45.29,45.670315,,Angola,AGO,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
5,,,,,,,,,,,...,99.89,100.0,100.0,,Albania,ALB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
6,,,,,,,,,,,...,100.0,100.0,100.0,,Andorra,AND,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
7,,,,,,,,,,,...,89.623427,88.306324,89.512282,,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
8,,,,,,,,,,,...,100.0,100.0,100.0,,United Arab Emirates,ARE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
9,,,,,,,,,,,...,100.0,100.0,100.0,,Argentina,ARG,Access to electricity (% of population),EG.ELC.ACCS.ZS,,


In [137]:
# create standard columns
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2019'] 
df['Year'] = 2019

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [138]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Aruba,2019,% of population covered by electricity,100.0,6.0,True
1,Africa Eastern and Southern,2019,% of population covered by electricity,43.640661,2.979006,True
2,Afghanistan,2019,% of population covered by electricity,97.7,5.876715,True
3,Africa Western and Central,2019,% of population covered by electricity,51.341421,3.391785,True
4,Angola,2019,% of population covered by electricity,45.670315,3.0878,True
5,Albania,2019,% of population covered by electricity,100.0,6.0,True
6,Andorra,2019,% of population covered by electricity,100.0,6.0,True
7,Arab World,2019,% of population covered by electricity,89.512282,5.437834,True
8,United Arab Emirates,2019,% of population covered by electricity,100.0,6.0,True
9,Argentina,2019,% of population covered by electricity,100.0,6.0,True


In [None]:
### 11. Electricity Supply Quality

In [139]:
indicators[10]

# load data
indicator = indicators[10]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Electricity supply quality
elect_supply_quality


In [140]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,AGO,ALB,ARE,ARG,ARM,AUS,AUT,AZE,BDI,...,GCREURASIA,GCREAP,GCRSASIA,LIC,LMC,UMC,HIC,AVG,data_country,data_year
0,Attribute,Angola,Albania,United Arab Emirates,Argentina,Armenia,Australia,Austria,Azerbaijan,Burundi,...,Eurasia,East Asia and Pacific,South Asia,Low-income,Lower-middle-income,Upper-middle-income,High-income,Sample average,Attribute,Attribute
1,VALUE,,,,,,,,,,...,,,,,,,,,VALUE,VALUE
2,RANK,136,81,25,83,69,16,21,58,135,...,,,,,,,,0,RANK,RANK
3,SCORE,38.11248593,57.61416881,75.00738794,57.20132859,61.27687643,78.74662637,76.60913087,62.71789348,40.25234131,...,59.37969533,69.86862667,54.70460508,43.7445505,51.94942309,59.7912297,72.78080961,60.6396938,SCORE,SCORE
4,DATE DESCRIPTION,2019 edition,2019 edition,2019 edition,2019 edition,2019 edition,2019 edition,2019 edition,2019 edition,2019 edition,...,2019 edition,2019 edition,2019 edition,2019 edition,2019 edition,2019 edition,2019 edition,2019 edition,DATE DESCRIPTION,DATE DESCRIPTION
5,SOURCE,"World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...",...,"World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...",SOURCE,SOURCE
6,SOURCE DATE,,,,,,,,,,...,,,,,,,,,SOURCE DATE,SOURCE DATE
7,NOTE,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,NOTE,NOTE
8,VALUE,,,,,,,,,,...,,,,,,,,,VALUE,VALUE
9,RANK,136,88,3,109,62,22,20,64,134,...,,,,,,,,0,RANK,RANK


In [None]:
# The data needs to be transposed as countries are now in columns, not rows, which could present problems for later.

In [None]:
### 12. Electricity Environmental Sustainability

In [141]:
indicators[11]

# load data
indicator = indicators[11]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Electricity Environmental Sustainability
sustainability_index


In [142]:
df.head(15)

Unnamed: 0,Country Code ISO3,Country,2021 SDG Index Score,2021 SDG Index Rank,Percentage missing values,Spillover Score (0-100),Regional Score (0-100),Regions used for the SDR,Population in 2020,Goal 1 Dash,...,Goal 10 Regional Score,Goal 11 Regional Score,Goal 12 Regional Score,Goal 13 Regional Score,Goal 14 Regional Score,Goal 15 Regional Score,Goal 16 Regional Score,Goal 17 Regional Score,data_country,data_year
0,AFG,Afghanistan,53.9,137.0,9.3,99.3,71.4,E. Europe & C. Asia,38928341,grey,...,72.362762,76.443228,77.629345,83.041913,58.865821,69.499564,72.403938,63.38646,,
1,AGO,Angola,50.3,154.0,0.0,97.0,51.9,Africa,32866268,red,...,37.334205,52.728915,88.024702,96.266527,68.010492,66.835003,56.050983,51.652955,,
2,ALB,Albania,71.0,64.0,3.3,94.3,71.4,E. Europe & C. Asia,2877800,yellow,...,72.362762,76.443228,77.629345,83.041913,58.865821,69.499564,72.403938,63.38646,,
3,AND,Andorra,,,51.2,,71.4,E. Europe & C. Asia,77265,grey,...,,,,,,,,,,
4,ARE,United Arab Emirates,70.2,71.0,11.0,38.8,67.1,MENA,9890400,green,...,66.178091,58.928271,76.842146,71.47551,62.776615,57.356212,69.093831,56.231255,,
5,ARG,Argentina,72.8,52.0,1.1,94.5,68.6,LAC,45195777,yellow,...,27.901676,77.087594,76.727283,85.373283,62.781114,60.904217,60.888934,64.295156,,
6,ARM,Armenia,71.8,58.0,3.5,96.7,71.4,E. Europe & C. Asia,2963234,yellow,...,72.362762,76.443228,77.629345,83.041913,58.865821,69.499564,72.403938,63.38646,,
7,ATG,Antigua and Barbuda,,,30.8,,68.6,LAC,97928,grey,...,,,,,,,,,,
8,AUS,Australia,75.6,35.0,1.1,63.9,77.2,OECD,25499881,yellow,...,74.475203,88.911367,58.055279,64.112721,63.360546,74.102776,82.316815,65.773824,,
9,AUT,Austria,82.1,6.0,3.5,59.5,77.2,OECD,9006400,green,...,74.475203,88.911367,58.055279,64.112721,63.360546,74.102776,82.316815,65.773824,,


In [148]:
# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2021 SDG Index Score'] 
df['Year'] = 2021

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [149]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Country Name.1,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Afghanistan,Afghanistan,2021,Electricity Environmental Sustainability,53.9,2.638655,True
1,Angola,Angola,2021,Electricity Environmental Sustainability,50.3,2.260504,True
2,Albania,Albania,2021,Electricity Environmental Sustainability,71.0,4.434874,True
3,Andorra,Andorra,2021,Electricity Environmental Sustainability,,,True
4,United Arab Emirates,United Arab Emirates,2021,Electricity Environmental Sustainability,70.2,4.35084,True
5,Argentina,Argentina,2021,Electricity Environmental Sustainability,72.8,4.62395,True
6,Armenia,Armenia,2021,Electricity Environmental Sustainability,71.8,4.518908,True
7,Antigua and Barbuda,Antigua and Barbuda,2021,Electricity Environmental Sustainability,,,True
8,Australia,Australia,2021,Electricity Environmental Sustainability,75.6,4.918067,True
9,Austria,Austria,2021,Electricity Environmental Sustainability,82.1,5.60084,True


In [None]:
### 13. Cellphone Signal Density

In [156]:
indicators[12]

# load data
indicator = indicators[12]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Cellphone Signal Density
mobile_density


In [157]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,ISO Code,Country,Region,Year,Network coverage
0,1,AFG,Afghanistan,South Asia,2014,28.919998
1,2,AFG,Afghanistan,South Asia,2015,31.799999
2,3,AFG,Afghanistan,South Asia,2016,33.040001
3,4,AFG,Afghanistan,South Asia,2017,36.34
4,5,AFG,Afghanistan,South Asia,2018,30.931999
5,6,AFG,Afghanistan,South Asia,2019,38.48
6,7,AGO,Angola,Sub-Saharan Africa,2014,51.44577
7,8,AGO,Angola,Sub-Saharan Africa,2015,53.906952
8,9,AGO,Angola,Sub-Saharan Africa,2016,61.17495
9,10,AGO,Angola,Sub-Saharan Africa,2017,60.800003


In [159]:
df = df[(df.Year == 2019)]

# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Network coverage'] 

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [160]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
5,Afghanistan,2019,Cellphone Signal Density,38.48,2.117926,True
11,Angola,2019,Cellphone Signal Density,57.400002,3.311828,True
17,Albania,2019,Cellphone Signal Density,87.186005,5.191403,True
23,United Arab Emirates,2019,Cellphone Signal Density,99.599998,5.974759,True
29,Argentina,2019,Cellphone Signal Density,83.708,4.971932,True
35,Armenia,2019,Cellphone Signal Density,87.599998,5.217527,True
41,Australia,2019,Cellphone Signal Density,99.419998,5.9634,True
47,Austria,2019,Cellphone Signal Density,99.099998,5.943208,True
53,Azerbaijan,2019,Cellphone Signal Density,84.0,4.990358,True
59,Burundi,2019,Cellphone Signal Density,31.365,1.668951,True


In [None]:
### 14. Telecommunication Infrastructure Index (TII)

In [167]:
indicators[13]

# load data
indicator = indicators[13]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Telecommunication Infrastructure Index (TII)
e_government_index


In [162]:
df.head(10)

Unnamed: 0,Survey Year,Country Name,E-Government Rank,E-Government Index,E-Participation Index,Online Service Index,Human Capital Index,Telecommunication Infrastructure Index
0,2020,Iraq,143,0.436,0.3095,0.3353,0.4358,0.537
1,2020,Ireland,27,0.8433,0.8571,0.7706,0.9494,0.81
2,2020,Israel,30,0.8361,0.7143,0.7471,0.8924,0.8689
3,2020,Italy,37,0.8231,0.8214,0.8294,0.8466,0.7932
4,2020,Jamaica,114,0.5392,0.369,0.3882,0.7142,0.5151
5,2020,Japan,14,0.8989,0.9881,0.9059,0.8684,0.9223
6,2020,Jordan,117,0.5309,0.3333,0.3588,0.68,0.554
7,2020,Kazakhstan,29,0.8375,0.881,0.9235,0.8866,0.7024
8,2020,Kenya,116,0.5326,0.5952,0.6765,0.5812,0.3402
9,2020,Kiribati,145,0.432,0.5595,0.4941,0.6778,0.1241


In [168]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Telecommunication Infrastructure Index'] 
df['Year'] = df['Survey Year']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))


In [169]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Iraq,2020,Telecommunication Infrastructure Index (TII),0.537,3.685,True
1,Ireland,2020,Telecommunication Infrastructure Index (TII),0.81,5.05,True
2,Israel,2020,Telecommunication Infrastructure Index (TII),0.8689,5.3445,True
3,Italy,2020,Telecommunication Infrastructure Index (TII),0.7932,4.966,True
4,Jamaica,2020,Telecommunication Infrastructure Index (TII),0.5151,3.5755,True
5,Japan,2020,Telecommunication Infrastructure Index (TII),0.9223,5.6115,True
6,Jordan,2020,Telecommunication Infrastructure Index (TII),0.554,3.77,True
7,Kazakhstan,2020,Telecommunication Infrastructure Index (TII),0.7024,4.512,True
8,Kenya,2020,Telecommunication Infrastructure Index (TII),0.3402,2.701,True
9,Kiribati,2020,Telecommunication Infrastructure Index (TII),0.1241,1.6205,True


In [170]:
### 15. GSMA Mobile Connectivity Index

In [171]:
indicators[14]

# load data
indicator = indicators[14]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

GSMA Mobile Connectivity Index 
countries_mobile_connectivity


In [172]:
df.head(10)

Unnamed: 0,ISO Code,Country,Region,Year,Cluster,Index,Infrastructure,Affordability,Consumer Readiness,Content and Services,...,Gender gap in mobile ownership,TLDs per capita,E-Government Score,Mobile Social Media Penetration,Apps developed per person,Number of apps in national language,Accessibility of top ranked apps,Cybersecurity Index,data_country,data_year
0,AFG,Afghanistan,South Asia,2014,Discoverer,22.12,21.74,31.79,24.4,14.19,...,0.0,39.55,18.11,3.28,20.98,2.44,4.37,26.5,,
1,AFG,Afghanistan,South Asia,2015,Discoverer,22.99,22.82,30.81,25.28,15.71,...,0.0,39.57,24.27,4.36,22.93,2.79,8.03,25.83,,
2,AFG,Afghanistan,South Asia,2016,Discoverer,23.71,26.92,26.75,26.07,16.83,...,0.0,39.58,30.43,6.73,30.31,2.85,5.9,25.17,,
3,AFG,Afghanistan,South Asia,2017,Discoverer,25.82,33.54,27.22,28.56,17.04,...,0.0,39.47,30.5,7.78,31.62,2.91,6.15,24.5,,
4,AFG,Afghanistan,South Asia,2018,Discoverer,28.39,30.91,42.64,29.24,16.87,...,0.0,39.39,30.56,8.54,36.54,2.96,8.66,17.7,,
5,AFG,Afghanistan,South Asia,2019,Discoverer,28.94,32.34,41.53,29.72,17.58,...,0.0,39.41,41.18,9.39,39.36,3.0,5.72,17.7,,
6,AGO,Angola,Sub-Saharan Africa,2014,Discoverer,32.78,25.99,35.99,44.33,27.85,...,51.65,0.0,29.92,3.99,22.74,53.33,49.09,8.8,,
7,AGO,Angola,Sub-Saharan Africa,2015,Emerging,37.18,33.09,42.01,45.41,30.27,...,57.0,0.31,32.35,5.26,22.12,55.08,58.33,8.47,,
8,AGO,Angola,Sub-Saharan Africa,2016,Emerging,39.85,37.8,44.74,46.47,32.11,...,63.73,0.0,34.78,6.27,27.46,56.52,61.73,8.13,,
9,AGO,Angola,Sub-Saharan Africa,2017,Emerging,42.89,48.6,47.94,46.32,31.36,...,53.97,0.0,37.88,4.26,31.94,57.06,55.09,7.8,,


In [173]:
# filter most recent year
df = df[(df.Year==2019)]

# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Index'] 

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [174]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
5,Afghanistan,2019,GSMA Mobile Connectivity Index,28.94,2.041533,True
11,Angola,2019,GSMA Mobile Connectivity Index,43.53,2.979555,True
17,Albania,2019,GSMA Mobile Connectivity Index,67.89,4.545712,True
23,United Arab Emirates,2019,GSMA Mobile Connectivity Index,78.23,5.210492,True
29,Argentina,2019,GSMA Mobile Connectivity Index,67.16,4.498778,True
35,Armenia,2019,GSMA Mobile Connectivity Index,56.24,3.796708,True
41,Australia,2019,GSMA Mobile Connectivity Index,90.51,6.0,True
47,Austria,2019,GSMA Mobile Connectivity Index,84.21,5.594959,True
53,Azerbaijan,2019,GSMA Mobile Connectivity Index,58.28,3.927864,True
59,Burundi,2019,GSMA Mobile Connectivity Index,26.16,1.862801,True


In [None]:
### 16. Spectrum Allocated to Mobile Providers

In [175]:
indicators[15]

# load data
indicator = indicators[15]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Spectrum Allocated to Mobile Providers
spectrum_allocated_mobile_providers


In [176]:
df.head(15)

Unnamed: 0,ISO Code,Country,Region,Year,Cluster,Index,Infrastructure,Affordability,Consumer Readiness,Content and Services,...,Gender gap in mobile ownership,TLDs per capita,E-Government Score,Mobile Social Media Penetration,Apps developed per person,Number of apps in national language,Accessibility of top ranked apps,Cybersecurity Index,data_country,data_year
0,AFG,Afghanistan,South Asia,2014,Discoverer,22.12,21.74,31.79,24.4,14.19,...,0.0,39.55,18.11,3.28,20.98,2.44,4.37,26.5,,
1,AFG,Afghanistan,South Asia,2015,Discoverer,22.99,22.82,30.81,25.28,15.71,...,0.0,39.57,24.27,4.36,22.93,2.79,8.03,25.83,,
2,AFG,Afghanistan,South Asia,2016,Discoverer,23.71,26.92,26.75,26.07,16.83,...,0.0,39.58,30.43,6.73,30.31,2.85,5.9,25.17,,
3,AFG,Afghanistan,South Asia,2017,Discoverer,25.82,33.54,27.22,28.56,17.04,...,0.0,39.47,30.5,7.78,31.62,2.91,6.15,24.5,,
4,AFG,Afghanistan,South Asia,2018,Discoverer,28.39,30.91,42.64,29.24,16.87,...,0.0,39.39,30.56,8.54,36.54,2.96,8.66,17.7,,
5,AFG,Afghanistan,South Asia,2019,Discoverer,28.94,32.34,41.53,29.72,17.58,...,0.0,39.41,41.18,9.39,39.36,3.0,5.72,17.7,,
6,AGO,Angola,Sub-Saharan Africa,2014,Discoverer,32.78,25.99,35.99,44.33,27.85,...,51.65,0.0,29.92,3.99,22.74,53.33,49.09,8.8,,
7,AGO,Angola,Sub-Saharan Africa,2015,Emerging,37.18,33.09,42.01,45.41,30.27,...,57.0,0.31,32.35,5.26,22.12,55.08,58.33,8.47,,
8,AGO,Angola,Sub-Saharan Africa,2016,Emerging,39.85,37.8,44.74,46.47,32.11,...,63.73,0.0,34.78,6.27,27.46,56.52,61.73,8.13,,
9,AGO,Angola,Sub-Saharan Africa,2017,Emerging,42.89,48.6,47.94,46.32,31.36,...,53.97,0.0,37.88,4.26,31.94,57.06,55.09,7.8,,


In [178]:
# filter most recent year
df = df[(df.Year==2019)]

# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Spectrum'] 

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [179]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
5,Afghanistan,2019,Spectrum Allocated to Mobile Providers,17.62,1.577685,True
11,Angola,2019,Spectrum Allocated to Mobile Providers,48.22,3.261224,True
17,Albania,2019,Spectrum Allocated to Mobile Providers,52.85,3.515955,True
23,United Arab Emirates,2019,Spectrum Allocated to Mobile Providers,82.44,5.143926,True
29,Argentina,2019,Spectrum Allocated to Mobile Providers,63.38,4.09529,True
35,Armenia,2019,Spectrum Allocated to Mobile Providers,21.43,1.787302,True
41,Australia,2019,Spectrum Allocated to Mobile Providers,75.45,4.759353,True
47,Austria,2019,Spectrum Allocated to Mobile Providers,73.33,4.642716,True
53,Azerbaijan,2019,Spectrum Allocated to Mobile Providers,15.36,1.453345,True
59,Burundi,2019,Spectrum Allocated to Mobile Providers,16.29,1.504511,True


In [None]:
### 17. Internet Exchange Points (IXPs) 

In [180]:
indicators[16]

# load data
indicator = indicators[16]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))



Internet Exchange Points (IXPs) map
countries_ixp


In [181]:
df['Country Name'] = df['Country']
df.head(15)

Unnamed: 0,Region,Country,City,IXP Name,Participants,Peak,Avg,IPv6,Prefixes,Established,URL,Country Name
0,Asia-Pacific,Afghanistan,Kabul,National Internet Exchange of Afghanistan,20,6.4G,2.19G,,,18-Jul,,Afghanistan
1,Europe,Albania,Tirana,Albanian Neutral Internet eXchange,16,2.42G,35M,,,2018,,Albania
2,,Albania,Tirana,Albania IX,0,,,,,2011,,Albania
3,Africa,Algeria,Algiers,Algeria Internet Exchange,0,,,,,,,Algeria
4,,Angola,Luanda,Angola Internet Exchange,21,1.6G,,,512.0,17-Mar-06,,Angola
5,,Angola,Luanda,Ponto de Intercambio Internet Angola,0,,,,,,,Angola
6,,Angola,Luanda,ANGONIX,21,16G,,,5495.0,9-Mar-15,,Angola
7,Latin America,Argentina,Bahía Blanca,CABASE IXP Bahía Blanca,15,713M,,,,27-Apr-13,,Argentina
8,,Argentina,Bariloche,CABASE IXP Bariloche,8,,,,,15-Sep-14,,Argentina
9,,Argentina,Buenos Aires,CABASE IXP Buenos Aires,125,9.19G,,,17224.0,1-Apr-98,,Argentina


In [182]:
agg_df = df.groupby('Country Name').agg({'Country':'count'})
agg_df

Unnamed: 0_level_0,Country
Country Name,Unnamed: 1_level_1
Afghanistan,1
Albania,2
Algeria,1
Angola,3
Argentina,33
...,...
Uzbekistan,3
Vanuatu,1
Viet Nam,3
Zambia,1


In [183]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

agg_df['higher_is_better'] = True
agg_df['Indicator'] = indicator
agg_df['data_col'] = agg_df['Country'] 
agg_df['Year'] = 2019

min_rank = agg_df['data_col'].min()
max_rank = agg_df['data_col'].max()

# transform 0-1 rank into 1-6
agg_df['new_rank_score'] = agg_df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [184]:
agg_df[['Year','Indicator','data_col','new_rank_score','higher_is_better']].head(150)

Unnamed: 0_level_0,Year,Indicator,data_col,new_rank_score,higher_is_better
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,2019,Internet Exchange Points (IXPs) map,1,1.000000,True
Albania,2019,Internet Exchange Points (IXPs) map,2,1.023148,True
Algeria,2019,Internet Exchange Points (IXPs) map,1,1.000000,True
Angola,2019,Internet Exchange Points (IXPs) map,3,1.046296,True
Argentina,2019,Internet Exchange Points (IXPs) map,33,1.740741,True
...,...,...,...,...,...
Syria,2019,Internet Exchange Points (IXPs) map,1,1.000000,True
Taiwan,2019,Internet Exchange Points (IXPs) map,7,1.138889,True
Tajikistan,2019,Internet Exchange Points (IXPs) map,1,1.000000,True
Tanzania,2019,Internet Exchange Points (IXPs) map,5,1.092593,True


In [None]:
### 18. ISP internet download speed averag

In [185]:
indicators[17]

# load data
indicator = indicators[17]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

ISP internet download speed average 
internet_speed


In [186]:
df.head(15)

Unnamed: 0,Ranking,Year,Country,Download speed (Mbps)
0,1,2021,Monaco,256.7
1,2,2021,Singapore,256.03
2,3,2021,Hong Kong (SAR),248.59
3,4,2021,Thailand,216.16
4,5,2021,Romania,215.3
5,6,2021,Switzerland,214.82
6,7,2021,South Korea,212.83
7,8,2021,Chile,209.45
8,9,2021,Denmark,208.5
9,10,2021,Liechtenstein,207.44


In [187]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Download speed (Mbps)'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [188]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Monaco,2021,ISP internet download speed average,256.7,6.0,True
1,Singapore,2021,ISP internet download speed average,256.03,5.986711,True
2,Hong Kong (SAR),2021,ISP internet download speed average,248.59,5.839145,True
3,Thailand,2021,ISP internet download speed average,216.16,5.195922,True
4,Romania,2021,ISP internet download speed average,215.3,5.178865,True
5,Switzerland,2021,ISP internet download speed average,214.82,5.169344,True
6,South Korea,2021,ISP internet download speed average,212.83,5.129874,True
7,Chile,2021,ISP internet download speed average,209.45,5.062835,True
8,Denmark,2021,ISP internet download speed average,208.5,5.043992,True
9,Liechtenstein,2021,ISP internet download speed average,207.44,5.022968,True


In [None]:
### 19. Mobile dowload speed at the slowest hour

In [189]:
indicators[18]

# load data
indicator = indicators[18]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

# Waiting for internet_speed to be added to the processed folder

Mobile dowload speed at the slowest hour of the day
mobile_latency


In [190]:
df.head(15)

Unnamed: 0.1,Unnamed: 0,ISO Code,Country,Region,Year,Mobile latencies
0,1,AFG,Afghanistan,South Asia,2014,0.0
1,2,AFG,Afghanistan,South Asia,2015,0.0
2,3,AFG,Afghanistan,South Asia,2016,11.842106
3,4,AFG,Afghanistan,South Asia,2017,56.234993
4,5,AFG,Afghanistan,South Asia,2018,49.762146
5,6,AFG,Afghanistan,South Asia,2019,42.627621
6,7,AGO,Angola,Sub-Saharan Africa,2014,0.0
7,8,AGO,Angola,Sub-Saharan Africa,2015,36.975155
8,9,AGO,Angola,Sub-Saharan Africa,2016,50.958073
9,10,AGO,Angola,Sub-Saharan Africa,2017,66.052635


In [191]:
# filter most recent year
df = df[(df.Year==2019)]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Mobile latencies'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))


# invert since to lower rank is better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [192]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
5,Afghanistan,2019,Mobile dowload speed at the slowest hour of th...,42.627621,3.618652,True
11,Angola,2019,Mobile dowload speed at the slowest hour of th...,66.285637,2.297022,True
17,Albania,2019,Mobile dowload speed at the slowest hour of th...,84.636459,1.271872,True
23,United Arab Emirates,2019,Mobile dowload speed at the slowest hour of th...,84.951675,1.254263,True
29,Argentina,2019,Mobile dowload speed at the slowest hour of th...,80.727203,1.490258,True
35,Armenia,2019,Mobile dowload speed at the slowest hour of th...,88.43795,1.059505,True
41,Australia,2019,Mobile dowload speed at the slowest hour of th...,86.069145,1.191836,True
47,Austria,2019,Mobile dowload speed at the slowest hour of th...,85.845047,1.204355,True
53,Azerbaijan,2019,Mobile dowload speed at the slowest hour of th...,85.6875,1.213157,True
59,Burundi,2019,Mobile dowload speed at the slowest hour of th...,51.736294,3.109806,True


In [None]:
### 20. Mobile download speed average

In [193]:
indicators[19]

# load data
indicator = indicators[19]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Mobile download speed average 
mobile_speed


In [194]:
df.head(15)

Unnamed: 0,#,Year,Country,Mbps
0,1,2021,United Arab Emirates,190.03
1,2,2021,South Korea,189.2
2,3,2021,Qatar,170.77
3,4,2021,China,157.72
4,5,2021,Cyprus,154.13
5,6,2021,Norway,152.53
6,7,2021,Saudi Arabia,151.13
7,8,2021,Kuwait,140.16
8,9,2021,Australia,122.27
9,10,2021,Bulgaria,120.44


In [195]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Mbps'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [196]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,United Arab Emirates,2021,Mobile download speed average,190.03,6.0,True
1,South Korea,2021,Mobile download speed average,189.2,5.977362,True
2,Qatar,2021,Mobile download speed average,170.77,5.474689,True
3,China,2021,Mobile download speed average,157.72,5.118754,True
4,Cyprus,2021,Mobile download speed average,154.13,5.020838,True
5,Norway,2021,Mobile download speed average,152.53,4.977198,True
6,Saudi Arabia,2021,Mobile download speed average,151.13,4.939014,True
7,Kuwait,2021,Mobile download speed average,140.16,4.63981,True
8,Australia,2021,Mobile download speed average,122.27,4.151866,True
9,Bulgaria,2021,Mobile download speed average,120.44,4.101953,True


In [None]:
### 21. Individuals using the Internet (% of population)

In [197]:
indicators[20]

# load data
indicator = indicators[20]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Individuals using the Internet (% of population)
ITU_database


In [198]:
df.head(15)

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
0,Angola,Africa,AGO,Female mobile phone ownership as a % of total ...,2010.0,,,
1,Benin,Africa,BEN,Female mobile phone ownership as a % of total ...,2010.0,,,
2,Botswana,Africa,BWA,Female mobile phone ownership as a % of total ...,2010.0,,,
3,Burkina Faso,Africa,BFA,Female mobile phone ownership as a % of total ...,2010.0,,,
4,Burundi,Africa,BDI,Female mobile phone ownership as a % of total ...,2010.0,,,
5,Cabo Verde,Africa,CPV,Female mobile phone ownership as a % of total ...,2010.0,,,
6,Cameroon,Africa,CMR,Female mobile phone ownership as a % of total ...,2010.0,,,
7,Central African Rep.,Africa,CAF,Female mobile phone ownership as a % of total ...,2010.0,,,
8,Chad,Africa,TCD,Female mobile phone ownership as a % of total ...,2010.0,,,
9,Congo (Rep. of the),Africa,COG,Female mobile phone ownership as a % of total ...,2010.0,,,


In [199]:
df = df[(df['Indicator name'] == 'Individuals using the Internet, total (%)')]
df = df[(df.Year==2019)]
df.head(15)

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
47607,Cabo Verde,Africa,CPV,"Individuals using the Internet, total (%)",2019.0,61.943398,,
47608,Central African Rep.,Africa,CAF,"Individuals using the Internet, total (%)",2019.0,,,
47609,Congo (Rep. of the),Africa,COG,"Individuals using the Internet, total (%)",2019.0,,,
47610,Côte d'Ivoire,Africa,CIV,"Individuals using the Internet, total (%)",2019.0,36.288955,,
47611,Equatorial Guinea,Africa,GNQ,"Individuals using the Internet, total (%)",2019.0,,,
47612,Eritrea,Africa,ERI,"Individuals using the Internet, total (%)",2019.0,,,
47613,Eswatini,Africa,SWZ,"Individuals using the Internet, total (%)",2019.0,,,
47614,Kenya,Africa,KEN,"Individuals using the Internet, total (%)",2019.0,22.565119,,
47615,Lesotho,Africa,LSO,"Individuals using the Internet, total (%)",2019.0,42.301734,,
47616,Madagascar,Africa,MDG,"Individuals using the Internet, total (%)",2019.0,,,


In [200]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df.rename(columns = {'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [201]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(150)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
47607,Cabo Verde,2019.0,Individuals using the Internet (% of population),61.943398,3.715247,True
47608,Central African Rep.,2019.0,Individuals using the Internet (% of population),,,True
47609,Congo (Rep. of the),2019.0,Individuals using the Internet (% of population),,,True
47610,Côte d'Ivoire,2019.0,Individuals using the Internet (% of population),36.288955,2.162890,True
47611,Equatorial Guinea,2019.0,Individuals using the Internet (% of population),,,True
...,...,...,...,...,...,...
47739,Saint Vincent and the Grenadines,2019.0,Individuals using the Internet (% of population),,,True
47740,Trinidad and Tobago,2019.0,Individuals using the Internet (% of population),,,True
47741,United States,2019.0,Individuals using the Internet (% of population),89.430285,5.378486,True
47742,Uruguay,2019.0,Individuals using the Internet (% of population),83.351534,5.010659,True


In [None]:
### 22. Mobile-cellular subscriptions per 100 inhabitants

In [202]:
indicators[21]

# load data
indicator = indicators[21]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Mobile cellular subscriptions (per 100 people)
ITU_database


In [203]:
df = df[(df['Indicator name'] == 'Individuals using the Internet, total (%)')]
df = df[(df.Year==2020)]
df.head(15)

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
47744,Mauritius,Africa,MUS,"Individuals using the Internet, total (%)",2020.0,64.884904,,
47745,Uganda,Africa,UGA,"Individuals using the Internet, total (%)",2020.0,,,
47746,Bahrain,Arab States,BHR,"Individuals using the Internet, total (%)",2020.0,99.539512,,
47747,Egypt,Arab States,EGY,"Individuals using the Internet, total (%)",2020.0,71.9142,,
47748,Iraq,Arab States,IRQ,"Individuals using the Internet, total (%)",2020.0,,,
47749,Kuwait,Arab States,KWT,"Individuals using the Internet, total (%)",2020.0,98.599995,,
47750,Morocco,Arab States,MAR,"Individuals using the Internet, total (%)",2020.0,84.120363,,
47751,Oman,Arab States,OMN,"Individuals using the Internet, total (%)",2020.0,95.232293,,
47752,Qatar,Arab States,QAT,"Individuals using the Internet, total (%)",2020.0,99.652794,,
47753,Saudi Arabia,Arab States,SAU,"Individuals using the Internet, total (%)",2020.0,97.862332,,


In [204]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df.rename(columns = {'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [205]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(150)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
47744,Mauritius,2020.0,Mobile cellular subscriptions (per 100 people),64.884904,3.298839,True
47745,Uganda,2020.0,Mobile cellular subscriptions (per 100 people),,,True
47746,Bahrain,2020.0,Mobile cellular subscriptions (per 100 people),99.539512,5.964578,True
47747,Egypt,2020.0,Mobile cellular subscriptions (per 100 people),71.914200,3.839554,True
47748,Iraq,2020.0,Mobile cellular subscriptions (per 100 people),,,True
...,...,...,...,...,...,...
47805,Bolivia (Plurinational State of),2020.0,Mobile cellular subscriptions (per 100 people),55.139051,2.549158,True
47806,Costa Rica,2020.0,Mobile cellular subscriptions (per 100 people),80.530186,4.502322,True
47807,Mexico,2020.0,Mobile cellular subscriptions (per 100 people),71.970000,3.843846,True
47808,Paraguay,2020.0,Mobile cellular subscriptions (per 100 people),74.515240,4.039634,True


In [None]:
### 23. Average fixed broadband download speeds	

In [None]:
indicators[22]

# load data
indicator = indicators[22]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))


In [None]:
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2021
df['Indicator'] = indicator
df['data_col'] = df['broadband'] 
df.rename(columns = {'country':'Country'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 24. Postal Coverage

In [206]:
indicators[23]

# load data
indicator = indicators[23]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Postal Coverage
postal_coverage


In [None]:
df.head(15)

In [None]:
#Choose 'Percent of Population Having Mail Delivered at Home' as the data column
df = df[(df.Indicator=='Percent of Population Having Mail Delivered at Home')]
df.head(15)

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2015
df['Indicator'] = indicator
df['data_col'] = df['2015'] 

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 25. Logistics Performance Index (LPI) 

In [None]:
indicators[24]

# load data
indicator = indicators[24]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2018
df['Indicator'] = indicator
df['data_col'] = df['score']
df.rename(columns={'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 26. National cyber security index

In [None]:
indicators[25]

# load data
indicator = indicators[25]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2021
df['Indicator'] = indicator
df['data_col'] = df['National Cyber Security Index']
df.rename(columns={'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 27. Global Cybersecurity Index (GCI)

In [77]:
indicators[26]

# load data
indicator = indicators[26]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Global Cybersecurity Index (GCI)
dice_export_global_cybersecurity_index


In [78]:
df.head(15)

Unnamed: 0,Country,Score (2020),Rank (2020)
0,United States of America**,100.0,1.0
1,United Kingdom,99.54,2.0
2,Saudi Arabia,99.54,2.0
3,Estonia,99.48,3.0
4,Korea (Rep. of),98.52,4.0
5,Singapore,98.52,4.0
6,Spain,98.52,4.0
7,Russian Federation,98.06,5.0
8,United Arab Emirates,98.06,5.0
9,Malaysia,98.06,5.0


In [79]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2020
df['Indicator'] = indicator
df['data_col'] = df['Score (2020)'] 
df.rename(columns={'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 28. Software Developer Ecosystem size

In [80]:
indicators[27]

# load data
indicator = indicators[27]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Software Developer Ecosystem size
software_developer_ecosystem_size


In [81]:
df.head(15)

Unnamed: 0,Country,GitHub Accounts
0,United States,651017
1,China,183805
2,India,168328
3,United Kingdom,109460
4,Germany,94359
5,Brazil,80903
6,Canada,77318
7,France,66367
8,Russia,58767
9,Australia,41790


In [85]:
df['higher_is_better'] = True
df['Year'] = 2018
df['Indicator'] = indicator
df['data_col'] = df.iloc[:,[1]]
df.rename(columns={'Country':'Country Name'}, inplace=True)
    
min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [86]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,United States,2018,Software Developer Ecosystem size,651017,6.0,True
1,China,2018,Software Developer Ecosystem size,183805,2.348274,True
2,India,2018,Software Developer Ecosystem size,168328,2.227306,True
3,United Kingdom,2018,Software Developer Ecosystem size,109460,1.767194,True
4,Germany,2018,Software Developer Ecosystem size,94359,1.649165,True
5,Brazil,2018,Software Developer Ecosystem size,80903,1.543993,True
6,Canada,2018,Software Developer Ecosystem size,77318,1.515973,True
7,France,2018,Software Developer Ecosystem size,66367,1.43038,True
8,Russia,2018,Software Developer Ecosystem size,58767,1.370978,True
9,Australia,2018,Software Developer Ecosystem size,41790,1.238286,True


In [None]:
### 29. Digital Work Ecosystem size

In [None]:
indicators[28]

# load data
indicator = indicators[28]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = False
df['Year'] = 2020
df['Indicator'] = indicator
df['data_col'] = df.iloc[:,[0]].astype(float)
df.rename(columns={'Country':'Country Name'}, inplace=True)
    
min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# invert since to lower rank is better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)
# Two problems: 1. cannot use the name of the column, have to use iloc, 2. cannot use the index column
# because the number format is not standard, have to use the rank column

In [None]:
### 30. Country, Industry, Skill Migration Data (Skill) 

In [None]:
indicators[29]

# load data
indicator = indicators[29]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
df['Country Name'] = df.iloc[:,[1]]

df_sum = df.groupby('Country Name')[['net_per_10K_2019']].sum()
df_sum

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df_sum['higher_is_better'] = True
df_sum['Year'] = 2019
df_sum['Indicator'] = indicator
df_sum['data_col'] = df_sum['net_per_10K_2019']


min_rank = df_sum['data_col'].min()
max_rank = df_sum['data_col'].max()

# transform 0-1 rank into 1-6
df_sum['new_rank_score'] = df_sum['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df_sum[['Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 31. Country, Industry, Skill Migration Data (Industry)

In [None]:
indicators[30]

# load data
indicator = indicators[30]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))


In [None]:
df.head(15)

In [None]:
df['Country Name'] = df.iloc[:,[1]]

df_sum = df.groupby('Country Name')[['net_per_10K_2019']].sum()
df_sum

In [None]:
## create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df_sum['higher_is_better'] = True
df_sum['Year'] = 2019
df_sum['Indicator'] = indicator
df_sum['data_col'] = df_sum['net_per_10K_2019']


min_rank = df_sum['data_col'].min()
max_rank = df_sum['data_col'].max()

# transform 0-1 rank into 1-6
df_sum['new_rank_score'] = df_sum['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df_sum[['Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 32. Country, Industry, Skill Migration Data (Nation)

In [None]:
indicators[31]

# load data
indicator = indicators[31]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
df['Country Name'] = df.iloc[:,[1]]

df_sum = df.groupby('Country Name')[['net_per_10K_2019']].sum()
df_sum

In [None]:
## create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df_sum['higher_is_better'] = True
df_sum['Year'] = 2019
df_sum['Indicator'] = indicator
df_sum['data_col'] = df_sum['net_per_10K_2019']


min_rank = df_sum['data_col'].min()
max_rank = df_sum['data_col'].max()

# transform 0-1 rank into 1-6
df_sum['new_rank_score'] = df_sum['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df_sum[['Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 33. Digital Finance Ecosystem size

In [None]:
indicators[32]

# load data
indicator = indicators[32]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2021
df['Indicator'] = indicator
df['data_col'] = df.iloc[:,[2]]
df['Country Name'] = df.iloc[:,[1]]

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# Have to shorten the column names to see if this works

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 34. Tech hubs & spaces size

In [None]:
indicators[33]

# load data
indicator = indicators[33]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

#There are two problems:
#1. The column names need some overhaul (moving the second column to the top and make it the column names)
#2. This is a list of cities, not countries, may present probkems later. 

In [None]:
### 35. Banking Ecosystem size

In [None]:
indicators[34]

# load data
indicator = indicators[34]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2017
df['Indicator'] = indicator
df['data_col'] = df['Bank assets, percent of GDP, 2017']
df.rename(columns={'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

# Okay but will have to remove number from the Country column, leaving only characters. 

In [None]:
### 36. Angel Ecosystem size

In [None]:
indicators[35]

# load data
indicator = indicators[35]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2019
df['Indicator'] = indicator
df['data_col'] = df['Business angel investments (in million Euros)']
df.rename(columns={'Country':'Country Name'}, inplace=True)

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

# Okay but there are only EU countries

In [None]:
### 37. Startup Ecosystem size

In [None]:
indicators[36]

# load data
indicator = indicators[36]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

# This is a list of cities, not countries, wonder if it will work 

In [None]:
### 38. Venture Ecosystem size

indicators[37]

# load data
indicator = indicators[37]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

# Similar to the previous one

In [None]:
### 39. International Co-Inventions

In [None]:
indicators[38]

# load data
indicator = indicators[38]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# filter most recent year and global value
df = df[(df.Time==2017)]
df = df[(df.Country!='World')]
df ['Country Name'] = df ['Country']
df

In [None]:
# Create summarization
df_sum = df.groupby('Country Name')[['Value']].sum()
df_sum

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df_sum['higher_is_better'] = True
df_sum['Year'] = 2017
df_sum['Indicator'] = indicator
df_sum['data_col'] = df_sum.iloc[:,[0]]

min_rank = df_sum['data_col'].min()
max_rank = df_sum['data_col'].max()

# transform 0-1 rank into 1-6
df_sum['new_rank_score'] = df_sum['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df_sum[['Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

# Okay but the data is too general with many missing countries