In [1]:
import pandas as pd
import numpy as np

In [2]:
### Get all the pillar names from the excel

In [3]:
names = pd.read_excel('../../UNDP Digital Assessment Data Framework Filename Matching V7.xlsx')

In [4]:
col_names = ['Indicator','check', 'Data Source','Index','Filename','Sub-Pillar']

In [5]:
names = names[col_names]

In [6]:
names.head()

Unnamed: 0,Indicator,check,Data Source,Index,Filename,Sub-Pillar
0,Countries,,United Nations,False,Countries,
1,"Database of Global Administrative Areas (GADM,...",,GADM maps and data,False,,
2,High Resolution Population Density Maps + Demo...,,Facebook,False,,
3,population density vs openstreetmap object den...,,Kontur,False,,
4,Population Density,Infrastructure,World Bank,False,population_density,Connectivity Technology


In [7]:
# get all the files per pillar
data_stats = names.groupby('check').agg({'Filename':'count','Indicator':'count'})

In [8]:
data_stats

Unnamed: 0_level_0,Filename,Indicator
check,Unnamed: 1_level_1,Unnamed: 2_level_1
Business,16,25
Foundations,8,13
Government,9,15
Infrastructure,39,48
People,34,47
Regulation,5,7
Strategy,1,1


In [9]:
### Infrastructure

In [10]:
bnames = names[(names.check=='Infrastructure')&(~names.Filename.isna())]#&(names.Index==False)]
bnames.head(25)

Unnamed: 0,Indicator,check,Data Source,Index,Filename,Sub-Pillar
4,Population Density,Infrastructure,World Bank,False,population_density,Connectivity Technology
6,% of population covered by internet connectivity,Infrastructure,ITU,False,ITU_database,Connectivity Technology
7,% of population covered by mobile 2G+ data con...,Infrastructure,GSMA Mobile Connectivity Index,False,countries_mobile_connectivity,Connectivity Technology
8,% of population covered by mobile 3G+ data con...,Infrastructure,GSMA Mobile Connectivity Index,False,countries_mobile_connectivity,Connectivity Technology
9,% of population covered by mobile 4G+ data con...,Infrastructure,GSMA Mobile Connectivity Index,False,countries_mobile_connectivity,Connectivity Technology
10,% of population covered by mobile 5G+ data con...,Infrastructure,GSMA Mobile Connectivity Index,False,countries_mobile_connectivity,Connectivity Technology
11,Mobile Coverage Maps,Infrastructure,ITU,False,ITU_database,Connectivity Technology
12,Electricity Density,Infrastructure,World Bank,False,population_electricity_coverage,Connectivity Technology
13,% of population covered by electricity,Infrastructure,World Bank,False,population_electricity_coverage,Connectivity Technology
14,Electricity supply quality,Infrastructure,International Energy Agency / Global Competiti...,False,elect_supply_quality,Connectivity Technology


In [11]:
# get list of names for all indicators
indicators = bnames.Indicator.unique()
subpillars = bnames['Sub-Pillar'].unique()

In [12]:
# get all file names
bfiles = bnames.Filename.unique()

In [13]:
bfiles

array(['population_density', 'ITU_database',
       'countries_mobile_connectivity', 'population_electricity_coverage',
       'elect_supply_quality', 'sustainability_index', 'mobile_density',
       'e_government_index', 'spectrum_allocated_mobile_providers',
       'countries_ixp', 'mobile_speed', 'internet_speed',
       'mobile_latency', 'fixed_bdbd_spd_dl_ul', 'postal_coverage',
       'logistics_performance_index', 'national_cybersecurity_index',
       'dice_export_global_cybersecurity_index',
       'software_developer_ecosystem_size',
       'digital_platform_economy_index', 'migration_skill',
       'migration_industry', 'migration_country',
       'global_fintech_ranking', 'tech_hubs', 'banking_sector_size',
       'angel_investment', 'startup_eco_size',
       'international_co_inventions'], dtype=object)

In [14]:
subpillars

array(['Connectivity Technology', 'Innovation Ecosystem'], dtype=object)

In [15]:
# formula for converting scale
def convert_rank(old_value, old_min=1, old_max=7, new_min=1, new_max=6 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value

In [16]:
### 1. Population Density

In [17]:
### 2. % of population covered by internet connectivity

In [18]:
indicators[1]

# load data
indicator = indicators[1]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population covered by internet connectivity
ITU_database


In [19]:
df = df[(df.Year == 2020)]
df = df[(df['Indicator name']== 'Individuals using the Internet, total (%)')]
df

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
47744,Mauritius,Africa,MUS,"Individuals using the Internet, total (%)",2020.0,64.884904,,
47745,Uganda,Africa,UGA,"Individuals using the Internet, total (%)",2020.0,,,
47746,Bahrain,Arab States,BHR,"Individuals using the Internet, total (%)",2020.0,99.539512,,
47747,Egypt,Arab States,EGY,"Individuals using the Internet, total (%)",2020.0,71.914200,,
47748,Iraq,Arab States,IRQ,"Individuals using the Internet, total (%)",2020.0,,,
...,...,...,...,...,...,...,...,...
47805,Bolivia (Plurinational State of),The Americas,BOL,"Individuals using the Internet, total (%)",2020.0,55.139051,,
47806,Costa Rica,The Americas,CRI,"Individuals using the Internet, total (%)",2020.0,80.530186,,
47807,Mexico,The Americas,MEX,"Individuals using the Internet, total (%)",2020.0,71.970000,,
47808,Paraguay,The Americas,PRY,"Individuals using the Internet, total (%)",2020.0,74.515240,,


In [20]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [21]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value']
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [22]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
47744,Mauritius,2020.0,% of population covered by internet connectivity,64.884904,3.298839,True,Connectivity Technology
47745,Uganda,2020.0,% of population covered by internet connectivity,,,True,Connectivity Technology
47746,Bahrain,2020.0,% of population covered by internet connectivity,99.539512,5.964578,True,Connectivity Technology
47747,Egypt,2020.0,% of population covered by internet connectivity,71.914200,3.839554,True,Connectivity Technology
47748,Iraq,2020.0,% of population covered by internet connectivity,,,True,Connectivity Technology
...,...,...,...,...,...,...,...
47805,Bolivia (Plurinational State of),2020.0,% of population covered by internet connectivity,55.139051,2.549158,True,Connectivity Technology
47806,Costa Rica,2020.0,% of population covered by internet connectivity,80.530186,4.502322,True,Connectivity Technology
47807,Mexico,2020.0,% of population covered by internet connectivity,71.970000,3.843846,True,Connectivity Technology
47808,Paraguay,2020.0,% of population covered by internet connectivity,74.515240,4.039634,True,Connectivity Technology


In [23]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [24]:
### 3. % of population covered by mobile 2G

In [25]:
indicators[2]

'% of population covered by mobile 2G+ data connectivity'

In [26]:
# load data
indicator = indicators[2]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population covered by mobile 2G+ data connectivity
countries_mobile_connectivity


In [27]:
df.head(10)

Unnamed: 0,ISO Code,Country,Region,Year,Cluster,Index,Infrastructure,Affordability,Consumer Readiness,Content and Services,...,Gender gap in mobile ownership,TLDs per capita,E-Government Score,Mobile Social Media Penetration,Apps developed per person,Number of apps in national language,Accessibility of top ranked apps,Cybersecurity Index,data_country,data_year
0,AFG,Afghanistan,South Asia,2014,Discoverer,22.12,21.74,31.79,24.4,14.19,...,0.0,39.55,18.11,3.28,20.98,2.44,4.37,26.5,,
1,AFG,Afghanistan,South Asia,2015,Discoverer,22.99,22.82,30.81,25.28,15.71,...,0.0,39.57,24.27,4.36,22.93,2.79,8.03,25.83,,
2,AFG,Afghanistan,South Asia,2016,Discoverer,23.71,26.92,26.75,26.07,16.83,...,0.0,39.58,30.43,6.73,30.31,2.85,5.9,25.17,,
3,AFG,Afghanistan,South Asia,2017,Discoverer,25.82,33.54,27.22,28.56,17.04,...,0.0,39.47,30.5,7.78,31.62,2.91,6.15,24.5,,
4,AFG,Afghanistan,South Asia,2018,Discoverer,28.39,30.91,42.64,29.24,16.87,...,0.0,39.39,30.56,8.54,36.54,2.96,8.66,17.7,,
5,AFG,Afghanistan,South Asia,2019,Discoverer,28.94,32.34,41.53,29.72,17.58,...,0.0,39.41,41.18,9.39,39.36,3.0,5.72,17.7,,
6,AGO,Angola,Sub-Saharan Africa,2014,Discoverer,32.78,25.99,35.99,44.33,27.85,...,51.65,0.0,29.92,3.99,22.74,53.33,49.09,8.8,,
7,AGO,Angola,Sub-Saharan Africa,2015,Emerging,37.18,33.09,42.01,45.41,30.27,...,57.0,0.31,32.35,5.26,22.12,55.08,58.33,8.47,,
8,AGO,Angola,Sub-Saharan Africa,2016,Emerging,39.85,37.8,44.74,46.47,32.11,...,63.73,0.0,34.78,6.27,27.46,56.52,61.73,8.13,,
9,AGO,Angola,Sub-Saharan Africa,2017,Emerging,42.89,48.6,47.94,46.32,31.36,...,53.97,0.0,37.88,4.26,31.94,57.06,55.09,7.8,,


In [28]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [29]:
# filter most recent year
df = df[(df.Year==2019)]


In [30]:
# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2G Coverage'] 
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [31]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
5,Afghanistan,2019,% of population covered by mobile 2G+ data con...,90.00,4.921251,True,Connectivity Technology
11,Angola,2019,% of population covered by mobile 2G+ data con...,90.00,4.921251,True,Connectivity Technology
17,Albania,2019,% of population covered by mobile 2G+ data con...,99.86,5.984898,True,Connectivity Technology
23,United Arab Emirates,2019,% of population covered by mobile 2G+ data con...,100.00,6.000000,True,Connectivity Technology
29,Argentina,2019,% of population covered by mobile 2G+ data con...,98.00,5.784250,True,Connectivity Technology
...,...,...,...,...,...,...,...
995,Samoa,2019,% of population covered by mobile 2G+ data con...,97.00,5.676375,True,Connectivity Technology
1001,Yemen,2019,% of population covered by mobile 2G+ data con...,95.00,5.460626,True,Connectivity Technology
1007,South Africa,2019,% of population covered by mobile 2G+ data con...,99.97,5.996764,True,Connectivity Technology
1013,Zambia,2019,% of population covered by mobile 2G+ data con...,80.40,3.885653,True,Connectivity Technology


In [32]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [33]:
### 4. % of population covered by mobile 3G

In [34]:
indicators[3]

# load data
indicator = indicators[3]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population covered by mobile 3G+ data connectivity
countries_mobile_connectivity


In [35]:
df.head(10)

Unnamed: 0,ISO Code,Country,Region,Year,Cluster,Index,Infrastructure,Affordability,Consumer Readiness,Content and Services,...,Gender gap in mobile ownership,TLDs per capita,E-Government Score,Mobile Social Media Penetration,Apps developed per person,Number of apps in national language,Accessibility of top ranked apps,Cybersecurity Index,data_country,data_year
0,AFG,Afghanistan,South Asia,2014,Discoverer,22.12,21.74,31.79,24.4,14.19,...,0.0,39.55,18.11,3.28,20.98,2.44,4.37,26.5,,
1,AFG,Afghanistan,South Asia,2015,Discoverer,22.99,22.82,30.81,25.28,15.71,...,0.0,39.57,24.27,4.36,22.93,2.79,8.03,25.83,,
2,AFG,Afghanistan,South Asia,2016,Discoverer,23.71,26.92,26.75,26.07,16.83,...,0.0,39.58,30.43,6.73,30.31,2.85,5.9,25.17,,
3,AFG,Afghanistan,South Asia,2017,Discoverer,25.82,33.54,27.22,28.56,17.04,...,0.0,39.47,30.5,7.78,31.62,2.91,6.15,24.5,,
4,AFG,Afghanistan,South Asia,2018,Discoverer,28.39,30.91,42.64,29.24,16.87,...,0.0,39.39,30.56,8.54,36.54,2.96,8.66,17.7,,
5,AFG,Afghanistan,South Asia,2019,Discoverer,28.94,32.34,41.53,29.72,17.58,...,0.0,39.41,41.18,9.39,39.36,3.0,5.72,17.7,,
6,AGO,Angola,Sub-Saharan Africa,2014,Discoverer,32.78,25.99,35.99,44.33,27.85,...,51.65,0.0,29.92,3.99,22.74,53.33,49.09,8.8,,
7,AGO,Angola,Sub-Saharan Africa,2015,Emerging,37.18,33.09,42.01,45.41,30.27,...,57.0,0.31,32.35,5.26,22.12,55.08,58.33,8.47,,
8,AGO,Angola,Sub-Saharan Africa,2016,Emerging,39.85,37.8,44.74,46.47,32.11,...,63.73,0.0,34.78,6.27,27.46,56.52,61.73,8.13,,
9,AGO,Angola,Sub-Saharan Africa,2017,Emerging,42.89,48.6,47.94,46.32,31.36,...,53.97,0.0,37.88,4.26,31.94,57.06,55.09,7.8,,


In [36]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [37]:
# filter most recent year
df = df[(df.Year==2019)]

# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['3G Coverage'] 
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [38]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
5,Afghanistan,2019,% of population covered by mobile 3G+ data con...,58.7,3.282895,True,Connectivity Technology
11,Angola,2019,% of population covered by mobile 3G+ data con...,71.0,4.092105,True,Connectivity Technology
17,Albania,2019,% of population covered by mobile 3G+ data con...,97.0,5.802632,True,Connectivity Technology
23,United Arab Emirates,2019,% of population covered by mobile 3G+ data con...,100.0,6.000000,True,Connectivity Technology
29,Argentina,2019,% of population covered by mobile 3G+ data con...,95.0,5.671053,True,Connectivity Technology
...,...,...,...,...,...,...,...
995,Samoa,2019,% of population covered by mobile 3G+ data con...,95.0,5.671053,True,Connectivity Technology
1001,Yemen,2019,% of population covered by mobile 3G+ data con...,95.0,5.671053,True,Connectivity Technology
1007,South Africa,2019,% of population covered by mobile 3G+ data con...,99.5,5.967105,True,Connectivity Technology
1013,Zambia,2019,% of population covered by mobile 3G+ data con...,71.8,4.144737,True,Connectivity Technology


In [39]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [40]:
### 5. % of population covered by mobile 4G

In [41]:
indicators[4]

# load data
indicator = indicators[4]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population covered by mobile 4G+ data connectivity
countries_mobile_connectivity


In [42]:
df.head(10)

Unnamed: 0,ISO Code,Country,Region,Year,Cluster,Index,Infrastructure,Affordability,Consumer Readiness,Content and Services,...,Gender gap in mobile ownership,TLDs per capita,E-Government Score,Mobile Social Media Penetration,Apps developed per person,Number of apps in national language,Accessibility of top ranked apps,Cybersecurity Index,data_country,data_year
0,AFG,Afghanistan,South Asia,2014,Discoverer,22.12,21.74,31.79,24.4,14.19,...,0.0,39.55,18.11,3.28,20.98,2.44,4.37,26.5,,
1,AFG,Afghanistan,South Asia,2015,Discoverer,22.99,22.82,30.81,25.28,15.71,...,0.0,39.57,24.27,4.36,22.93,2.79,8.03,25.83,,
2,AFG,Afghanistan,South Asia,2016,Discoverer,23.71,26.92,26.75,26.07,16.83,...,0.0,39.58,30.43,6.73,30.31,2.85,5.9,25.17,,
3,AFG,Afghanistan,South Asia,2017,Discoverer,25.82,33.54,27.22,28.56,17.04,...,0.0,39.47,30.5,7.78,31.62,2.91,6.15,24.5,,
4,AFG,Afghanistan,South Asia,2018,Discoverer,28.39,30.91,42.64,29.24,16.87,...,0.0,39.39,30.56,8.54,36.54,2.96,8.66,17.7,,
5,AFG,Afghanistan,South Asia,2019,Discoverer,28.94,32.34,41.53,29.72,17.58,...,0.0,39.41,41.18,9.39,39.36,3.0,5.72,17.7,,
6,AGO,Angola,Sub-Saharan Africa,2014,Discoverer,32.78,25.99,35.99,44.33,27.85,...,51.65,0.0,29.92,3.99,22.74,53.33,49.09,8.8,,
7,AGO,Angola,Sub-Saharan Africa,2015,Emerging,37.18,33.09,42.01,45.41,30.27,...,57.0,0.31,32.35,5.26,22.12,55.08,58.33,8.47,,
8,AGO,Angola,Sub-Saharan Africa,2016,Emerging,39.85,37.8,44.74,46.47,32.11,...,63.73,0.0,34.78,6.27,27.46,56.52,61.73,8.13,,
9,AGO,Angola,Sub-Saharan Africa,2017,Emerging,42.89,48.6,47.94,46.32,31.36,...,53.97,0.0,37.88,4.26,31.94,57.06,55.09,7.8,,


In [43]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [44]:
# filter most recent year
df = df[(df.Year==2019)]

# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['4G Coverage'] 
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [45]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
5,Afghanistan,2019,% of population covered by mobile 4G+ data con...,15.00,1.7500,True,Connectivity Technology
11,Angola,2019,% of population covered by mobile 4G+ data con...,50.00,3.5000,True,Connectivity Technology
17,Albania,2019,% of population covered by mobile 4G+ data con...,96.00,5.8000,True,Connectivity Technology
23,United Arab Emirates,2019,% of population covered by mobile 4G+ data con...,99.00,5.9500,True,Connectivity Technology
29,Argentina,2019,% of population covered by mobile 4G+ data con...,89.77,5.4885,True,Connectivity Technology
...,...,...,...,...,...,...,...
995,Samoa,2019,% of population covered by mobile 4G+ data con...,90.00,5.5000,True,Connectivity Technology
1001,Yemen,2019,% of population covered by mobile 4G+ data con...,0.00,1.0000,True,Connectivity Technology
1007,South Africa,2019,% of population covered by mobile 4G+ data con...,95.70,5.7850,True,Connectivity Technology
1013,Zambia,2019,% of population covered by mobile 4G+ data con...,49.10,3.4550,True,Connectivity Technology


In [46]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [47]:
### 6. % of population covered by mobile 5G

In [48]:
indicators[5]

# load data
indicator = indicators[5]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population covered by mobile 5G+ data connectivity
countries_mobile_connectivity


In [49]:
df.head(10)

Unnamed: 0,ISO Code,Country,Region,Year,Cluster,Index,Infrastructure,Affordability,Consumer Readiness,Content and Services,...,Gender gap in mobile ownership,TLDs per capita,E-Government Score,Mobile Social Media Penetration,Apps developed per person,Number of apps in national language,Accessibility of top ranked apps,Cybersecurity Index,data_country,data_year
0,AFG,Afghanistan,South Asia,2014,Discoverer,22.12,21.74,31.79,24.4,14.19,...,0.0,39.55,18.11,3.28,20.98,2.44,4.37,26.5,,
1,AFG,Afghanistan,South Asia,2015,Discoverer,22.99,22.82,30.81,25.28,15.71,...,0.0,39.57,24.27,4.36,22.93,2.79,8.03,25.83,,
2,AFG,Afghanistan,South Asia,2016,Discoverer,23.71,26.92,26.75,26.07,16.83,...,0.0,39.58,30.43,6.73,30.31,2.85,5.9,25.17,,
3,AFG,Afghanistan,South Asia,2017,Discoverer,25.82,33.54,27.22,28.56,17.04,...,0.0,39.47,30.5,7.78,31.62,2.91,6.15,24.5,,
4,AFG,Afghanistan,South Asia,2018,Discoverer,28.39,30.91,42.64,29.24,16.87,...,0.0,39.39,30.56,8.54,36.54,2.96,8.66,17.7,,
5,AFG,Afghanistan,South Asia,2019,Discoverer,28.94,32.34,41.53,29.72,17.58,...,0.0,39.41,41.18,9.39,39.36,3.0,5.72,17.7,,
6,AGO,Angola,Sub-Saharan Africa,2014,Discoverer,32.78,25.99,35.99,44.33,27.85,...,51.65,0.0,29.92,3.99,22.74,53.33,49.09,8.8,,
7,AGO,Angola,Sub-Saharan Africa,2015,Emerging,37.18,33.09,42.01,45.41,30.27,...,57.0,0.31,32.35,5.26,22.12,55.08,58.33,8.47,,
8,AGO,Angola,Sub-Saharan Africa,2016,Emerging,39.85,37.8,44.74,46.47,32.11,...,63.73,0.0,34.78,6.27,27.46,56.52,61.73,8.13,,
9,AGO,Angola,Sub-Saharan Africa,2017,Emerging,42.89,48.6,47.94,46.32,31.36,...,53.97,0.0,37.88,4.26,31.94,57.06,55.09,7.8,,


In [50]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [51]:
# filter most recent year
df = df[(df.Year==2019)]

# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['5G Coverage'] 
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [52]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
5,Afghanistan,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True,Connectivity Technology
11,Angola,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True,Connectivity Technology
17,Albania,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True,Connectivity Technology
23,United Arab Emirates,2019,% of population covered by mobile 5G+ data con...,100.0,6.0,True,Connectivity Technology
29,Argentina,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True,Connectivity Technology
...,...,...,...,...,...,...,...
995,Samoa,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True,Connectivity Technology
1001,Yemen,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True,Connectivity Technology
1007,South Africa,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True,Connectivity Technology
1013,Zambia,2019,% of population covered by mobile 5G+ data con...,0.0,1.0,True,Connectivity Technology


In [53]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [54]:
### 7. Mobile Coverage Maps

In [55]:
indicators[6]

# load data
indicator = indicators[6]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Mobile Coverage Maps
ITU_database


In [56]:
df.head(15)

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
0,Angola,Africa,AGO,Female mobile phone ownership as a % of total ...,2010.0,,,
1,Benin,Africa,BEN,Female mobile phone ownership as a % of total ...,2010.0,,,
2,Botswana,Africa,BWA,Female mobile phone ownership as a % of total ...,2010.0,,,
3,Burkina Faso,Africa,BFA,Female mobile phone ownership as a % of total ...,2010.0,,,
4,Burundi,Africa,BDI,Female mobile phone ownership as a % of total ...,2010.0,,,
5,Cabo Verde,Africa,CPV,Female mobile phone ownership as a % of total ...,2010.0,,,
6,Cameroon,Africa,CMR,Female mobile phone ownership as a % of total ...,2010.0,,,
7,Central African Rep.,Africa,CAF,Female mobile phone ownership as a % of total ...,2010.0,,,
8,Chad,Africa,TCD,Female mobile phone ownership as a % of total ...,2010.0,,,
9,Congo (Rep. of the),Africa,COG,Female mobile phone ownership as a % of total ...,2010.0,,,


In [57]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [58]:
df = df[(df.Year == 2020)]
df = df[(df['Indicator name']== 'Mobile-cellular subscriptions per 100 inhabitants')]
df

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
32475,Angola,Africa,AGO,Mobile-cellular subscriptions per 100 inhabitants,2020.0,44.559511,,
32476,Benin,Africa,BEN,Mobile-cellular subscriptions per 100 inhabitants,2020.0,91.897280,,
32477,Botswana,Africa,BWA,Mobile-cellular subscriptions per 100 inhabitants,2020.0,162.399011,,
32478,Burkina Faso,Africa,BFA,Mobile-cellular subscriptions per 100 inhabitants,2020.0,105.807440,,
32479,Burundi,Africa,BDI,Mobile-cellular subscriptions per 100 inhabitants,2020.0,55.767172,,
...,...,...,...,...,...,...,...,...
32666,Suriname,The Americas,SUR,Mobile-cellular subscriptions per 100 inhabitants,2020.0,153.305479,,
32667,Trinidad and Tobago,The Americas,TTO,Mobile-cellular subscriptions per 100 inhabitants,2020.0,142.051665,,
32668,United States,The Americas,USA,Mobile-cellular subscriptions per 100 inhabitants,2020.0,,,
32669,Uruguay,The Americas,URY,Mobile-cellular subscriptions per 100 inhabitants,2020.0,,,


In [59]:
# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [60]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
32475,Angola,2020.0,Mobile Coverage Maps,44.559511,1.012692,True,Connectivity Technology
32476,Benin,2020.0,Mobile Coverage Maps,91.897280,1.968150,True,Connectivity Technology
32477,Botswana,2020.0,Mobile Coverage Maps,162.399011,3.391146,True,Connectivity Technology
32478,Burkina Faso,2020.0,Mobile Coverage Maps,105.807440,2.248911,True,Connectivity Technology
32479,Burundi,2020.0,Mobile Coverage Maps,55.767172,1.238906,True,Connectivity Technology
...,...,...,...,...,...,...,...
32666,Suriname,2020.0,Mobile Coverage Maps,153.305479,3.207604,True,Connectivity Technology
32667,Trinidad and Tobago,2020.0,Mobile Coverage Maps,142.051665,2.980458,True,Connectivity Technology
32668,United States,2020.0,Mobile Coverage Maps,,,True,Connectivity Technology
32669,Uruguay,2020.0,Mobile Coverage Maps,,,True,Connectivity Technology


In [61]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [62]:
### 8. Electricity Density

In [63]:
indicators[7]

# load data
indicator = indicators[7]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Electricity Density
population_electricity_coverage


In [64]:
df.head(20)

Unnamed: 0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2017,2018,2019,2020,Country Name,Country Code,Indicator Name,Indicator Code,data_country,data_year
0,,,,,,,,,,,...,100.0,100.0,100.0,,Aruba,ABW,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
1,,,,,,,,,,,...,39.754201,42.168241,43.640661,,Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
2,,,,,,,,,,,...,97.7,98.715622,97.7,,Afghanistan,AFG,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
3,,,,,,,,,,,...,48.848205,51.253253,51.341421,,Africa Western and Central,AFW,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
4,,,,,,,,,,,...,43.00161,45.29,45.670315,,Angola,AGO,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
5,,,,,,,,,,,...,99.89,100.0,100.0,,Albania,ALB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
6,,,,,,,,,,,...,100.0,100.0,100.0,,Andorra,AND,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
7,,,,,,,,,,,...,89.623427,88.306324,89.512282,,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
8,,,,,,,,,,,...,100.0,100.0,100.0,,United Arab Emirates,ARE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
9,,,,,,,,,,,...,100.0,100.0,100.0,,Argentina,ARG,Access to electricity (% of population),EG.ELC.ACCS.ZS,,


In [65]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [66]:
# create standard columns
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2019'] 
df['Year'] = 2019
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [67]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Aruba,2019,Electricity Density,100.000000,6.000000,True,Connectivity Technology
1,Africa Eastern and Southern,2019,Electricity Density,43.640661,2.979006,True,Connectivity Technology
2,Afghanistan,2019,Electricity Density,97.700000,5.876715,True,Connectivity Technology
3,Africa Western and Central,2019,Electricity Density,51.341421,3.391785,True,Connectivity Technology
4,Angola,2019,Electricity Density,45.670315,3.087800,True,Connectivity Technology
...,...,...,...,...,...,...,...
261,Kosovo,2019,Electricity Density,100.000000,6.000000,True,Connectivity Technology
262,"Yemen, Rep.",2019,Electricity Density,72.751701,4.539427,True,Connectivity Technology
263,South Africa,2019,Electricity Density,85.000000,5.195965,True,Connectivity Technology
264,Zambia,2019,Electricity Density,43.000000,2.944665,True,Connectivity Technology


In [68]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [69]:
### 9. % of population covered by electricity

In [70]:
indicators[8]

# load data
indicator = indicators[8]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population covered by electricity
population_electricity_coverage


In [71]:
df.head(15)

Unnamed: 0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2017,2018,2019,2020,Country Name,Country Code,Indicator Name,Indicator Code,data_country,data_year
0,,,,,,,,,,,...,100.0,100.0,100.0,,Aruba,ABW,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
1,,,,,,,,,,,...,39.754201,42.168241,43.640661,,Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
2,,,,,,,,,,,...,97.7,98.715622,97.7,,Afghanistan,AFG,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
3,,,,,,,,,,,...,48.848205,51.253253,51.341421,,Africa Western and Central,AFW,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
4,,,,,,,,,,,...,43.00161,45.29,45.670315,,Angola,AGO,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
5,,,,,,,,,,,...,99.89,100.0,100.0,,Albania,ALB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
6,,,,,,,,,,,...,100.0,100.0,100.0,,Andorra,AND,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
7,,,,,,,,,,,...,89.623427,88.306324,89.512282,,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
8,,,,,,,,,,,...,100.0,100.0,100.0,,United Arab Emirates,ARE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,
9,,,,,,,,,,,...,100.0,100.0,100.0,,Argentina,ARG,Access to electricity (% of population),EG.ELC.ACCS.ZS,,


In [72]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [73]:
# create standard columns
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2019'] 
df['Year'] = 2019
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [74]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df


Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Aruba,2019,% of population covered by electricity,100.000000,6.000000,True,Connectivity Technology
1,Africa Eastern and Southern,2019,% of population covered by electricity,43.640661,2.979006,True,Connectivity Technology
2,Afghanistan,2019,% of population covered by electricity,97.700000,5.876715,True,Connectivity Technology
3,Africa Western and Central,2019,% of population covered by electricity,51.341421,3.391785,True,Connectivity Technology
4,Angola,2019,% of population covered by electricity,45.670315,3.087800,True,Connectivity Technology
...,...,...,...,...,...,...,...
261,Kosovo,2019,% of population covered by electricity,100.000000,6.000000,True,Connectivity Technology
262,"Yemen, Rep.",2019,% of population covered by electricity,72.751701,4.539427,True,Connectivity Technology
263,South Africa,2019,% of population covered by electricity,85.000000,5.195965,True,Connectivity Technology
264,Zambia,2019,% of population covered by electricity,43.000000,2.944665,True,Connectivity Technology


In [75]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [76]:
### 10. Electricity Supply Quality

In [77]:
indicators[9]

# load data
indicator = indicators[9]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Electricity supply quality
elect_supply_quality


In [78]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,AGO,ALB,ARE,ARG,ARM,AUS,AUT,AZE,BDI,...,GCREURASIA,GCREAP,GCRSASIA,LIC,LMC,UMC,HIC,AVG,data_country,data_year
0,Attribute,Angola,Albania,United Arab Emirates,Argentina,Armenia,Australia,Austria,Azerbaijan,Burundi,...,Eurasia,East Asia and Pacific,South Asia,Low-income,Lower-middle-income,Upper-middle-income,High-income,Sample average,Attribute,Attribute
1,VALUE,,,,,,,,,,...,,,,,,,,,VALUE,VALUE
2,RANK,136,81,25,83,69,16,21,58,135,...,,,,,,,,0,RANK,RANK
3,SCORE,38.11248593,57.61416881,75.00738794,57.20132859,61.27687643,78.74662637,76.60913087,62.71789348,40.25234131,...,59.37969533,69.86862667,54.70460508,43.7445505,51.94942309,59.7912297,72.78080961,60.6396938,SCORE,SCORE
4,DATE DESCRIPTION,2019 edition,2019 edition,2019 edition,2019 edition,2019 edition,2019 edition,2019 edition,2019 edition,2019 edition,...,2019 edition,2019 edition,2019 edition,2019 edition,2019 edition,2019 edition,2019 edition,2019 edition,DATE DESCRIPTION,DATE DESCRIPTION
5,SOURCE,"World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...",...,"World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...","World Economic Forum, [i]Global Competitivenes...",SOURCE,SOURCE
6,SOURCE DATE,,,,,,,,,,...,,,,,,,,,SOURCE DATE,SOURCE DATE
7,NOTE,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,See Appendix A of the Global Competitiveness R...,NOTE,NOTE
8,VALUE,,,,,,,,,,...,,,,,,,,,VALUE,VALUE
9,RANK,136,88,3,109,62,22,20,64,134,...,,,,,,,,0,RANK,RANK


In [79]:
# The data needs to be transposed as countries are now in columns, not rows, which could present problems for later.

In [80]:
### 11. Electricity Environmental Sustainability

In [81]:
indicators[10]

# load data
indicator = indicators[10]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Electricity Environmental Sustainability
sustainability_index


In [82]:
df.head(15)

Unnamed: 0,Country Code ISO3,Country,2021 SDG Index Score,2021 SDG Index Rank,Percentage missing values,Spillover Score (0-100),Regional Score (0-100),Regions used for the SDR,Population in 2020,Goal 1 Dash,...,Goal 10 Regional Score,Goal 11 Regional Score,Goal 12 Regional Score,Goal 13 Regional Score,Goal 14 Regional Score,Goal 15 Regional Score,Goal 16 Regional Score,Goal 17 Regional Score,data_country,data_year
0,AFG,Afghanistan,53.9,137.0,9.3,99.3,71.4,E. Europe & C. Asia,38928341,grey,...,72.362762,76.443228,77.629345,83.041913,58.865821,69.499564,72.403938,63.38646,,
1,AGO,Angola,50.3,154.0,0.0,97.0,51.9,Africa,32866268,red,...,37.334205,52.728915,88.024702,96.266527,68.010492,66.835003,56.050983,51.652955,,
2,ALB,Albania,71.0,64.0,3.3,94.3,71.4,E. Europe & C. Asia,2877800,yellow,...,72.362762,76.443228,77.629345,83.041913,58.865821,69.499564,72.403938,63.38646,,
3,AND,Andorra,,,51.2,,71.4,E. Europe & C. Asia,77265,grey,...,,,,,,,,,,
4,ARE,United Arab Emirates,70.2,71.0,11.0,38.8,67.1,MENA,9890400,green,...,66.178091,58.928271,76.842146,71.47551,62.776615,57.356212,69.093831,56.231255,,
5,ARG,Argentina,72.8,52.0,1.1,94.5,68.6,LAC,45195777,yellow,...,27.901676,77.087594,76.727283,85.373283,62.781114,60.904217,60.888934,64.295156,,
6,ARM,Armenia,71.8,58.0,3.5,96.7,71.4,E. Europe & C. Asia,2963234,yellow,...,72.362762,76.443228,77.629345,83.041913,58.865821,69.499564,72.403938,63.38646,,
7,ATG,Antigua and Barbuda,,,30.8,,68.6,LAC,97928,grey,...,,,,,,,,,,
8,AUS,Australia,75.6,35.0,1.1,63.9,77.2,OECD,25499881,yellow,...,74.475203,88.911367,58.055279,64.112721,63.360546,74.102776,82.316815,65.773824,,
9,AUT,Austria,82.1,6.0,3.5,59.5,77.2,OECD,9006400,green,...,74.475203,88.911367,58.055279,64.112721,63.360546,74.102776,82.316815,65.773824,,


In [83]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [84]:
# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2021 SDG Index Score'] 
df['Year'] = 2021
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [85]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Afghanistan,2021,Electricity Environmental Sustainability,53.9,2.638655,True,Connectivity Technology
1,Angola,2021,Electricity Environmental Sustainability,50.3,2.260504,True,Connectivity Technology
2,Albania,2021,Electricity Environmental Sustainability,71.0,4.434874,True,Connectivity Technology
3,Andorra,2021,Electricity Environmental Sustainability,,,True,Connectivity Technology
4,United Arab Emirates,2021,Electricity Environmental Sustainability,70.2,4.350840,True,Connectivity Technology
...,...,...,...,...,...,...,...
200,Sub-Saharan Africa,2021,Electricity Environmental Sustainability,51.9,2.428571,True,Connectivity Technology
201,Low-income Countries,2021,Electricity Environmental Sustainability,51.0,2.334034,True,Connectivity Technology
202,Lower-middle-income Countries,2021,Electricity Environmental Sustainability,60.1,3.289916,True,Connectivity Technology
203,Upper-middle-income Countries,2021,Electricity Environmental Sustainability,70.8,4.413866,True,Connectivity Technology


In [86]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [87]:
### 12. Cellphone Signal Density

In [88]:
indicators[11]

# load data
indicator = indicators[11]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Cellphone Signal Density
mobile_density


In [89]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,ISO Code,Country,Region,Year,Network coverage
0,1,AFG,Afghanistan,South Asia,2014,28.919998
1,2,AFG,Afghanistan,South Asia,2015,31.799999
2,3,AFG,Afghanistan,South Asia,2016,33.040001
3,4,AFG,Afghanistan,South Asia,2017,36.34
4,5,AFG,Afghanistan,South Asia,2018,30.931999
5,6,AFG,Afghanistan,South Asia,2019,38.48
6,7,AGO,Angola,Sub-Saharan Africa,2014,51.44577
7,8,AGO,Angola,Sub-Saharan Africa,2015,53.906952
8,9,AGO,Angola,Sub-Saharan Africa,2016,61.17495
9,10,AGO,Angola,Sub-Saharan Africa,2017,60.800003


In [90]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [91]:
df = df[(df.Year == 2019)]

# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Network coverage'] 
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [92]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
5,Afghanistan,2019,Cellphone Signal Density,38.480000,2.117926,True,Connectivity Technology
11,Angola,2019,Cellphone Signal Density,57.400002,3.311828,True,Connectivity Technology
17,Albania,2019,Cellphone Signal Density,87.186005,5.191403,True,Connectivity Technology
23,United Arab Emirates,2019,Cellphone Signal Density,99.599998,5.974759,True,Connectivity Technology
29,Argentina,2019,Cellphone Signal Density,83.708000,4.971932,True,Connectivity Technology
...,...,...,...,...,...,...,...
995,Samoa,2019,Cellphone Signal Density,83.699997,4.971427,True,Connectivity Technology
1001,Yemen,2019,Cellphone Signal Density,47.500000,2.687112,True,Connectivity Technology
1007,South Africa,2019,Cellphone Signal Density,88.077003,5.247628,True,Connectivity Technology
1013,Zambia,2019,Cellphone Signal Density,56.400002,3.248725,True,Connectivity Technology


In [93]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [94]:
### 13. Telecommunication Infrastructure Index (TII)

In [95]:
indicators[12]

# load data
indicator = indicators[12]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Telecommunication Infrastructure Index (TII)
e_government_index


In [96]:
df.head(10)

Unnamed: 0,Survey Year,Country Name,E-Government Rank,E-Government Index,E-Participation Index,Online Service Index,Human Capital Index,Telecommunication Infrastructure Index
0,2020,Iraq,143,0.436,0.3095,0.3353,0.4358,0.537
1,2020,Ireland,27,0.8433,0.8571,0.7706,0.9494,0.81
2,2020,Israel,30,0.8361,0.7143,0.7471,0.8924,0.8689
3,2020,Italy,37,0.8231,0.8214,0.8294,0.8466,0.7932
4,2020,Jamaica,114,0.5392,0.369,0.3882,0.7142,0.5151
5,2020,Japan,14,0.8989,0.9881,0.9059,0.8684,0.9223
6,2020,Jordan,117,0.5309,0.3333,0.3588,0.68,0.554
7,2020,Kazakhstan,29,0.8375,0.881,0.9235,0.8866,0.7024
8,2020,Kenya,116,0.5326,0.5952,0.6765,0.5812,0.3402
9,2020,Kiribati,145,0.432,0.5595,0.4941,0.6778,0.1241


In [97]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [98]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Telecommunication Infrastructure Index'] 
df['Year'] = df['Survey Year']
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))


In [99]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Iraq,2020,Telecommunication Infrastructure Index (TII),0.5370,3.6850,True,Connectivity Technology
1,Ireland,2020,Telecommunication Infrastructure Index (TII),0.8100,5.0500,True,Connectivity Technology
2,Israel,2020,Telecommunication Infrastructure Index (TII),0.8689,5.3445,True,Connectivity Technology
3,Italy,2020,Telecommunication Infrastructure Index (TII),0.7932,4.9660,True,Connectivity Technology
4,Jamaica,2020,Telecommunication Infrastructure Index (TII),0.5151,3.5755,True,Connectivity Technology
...,...,...,...,...,...,...,...
188,Senegal,2020,Telecommunication Infrastructure Index (TII),0.4358,3.1790,True,Connectivity Technology
189,Serbia,2020,Telecommunication Infrastructure Index (TII),0.6200,4.1000,True,Connectivity Technology
190,Seychelles,2020,Telecommunication Infrastructure Index (TII),0.6925,4.4625,True,Connectivity Technology
191,Singapore,2020,Telecommunication Infrastructure Index (TII),0.8899,5.4495,True,Connectivity Technology


In [100]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [101]:
### 14. GSMA Mobile Connectivity Index

In [102]:
indicators[13]

# load data
indicator = indicators[13]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

GSMA Mobile Connectivity Index 
countries_mobile_connectivity


In [103]:
df.head(10)

Unnamed: 0,ISO Code,Country,Region,Year,Cluster,Index,Infrastructure,Affordability,Consumer Readiness,Content and Services,...,Gender gap in mobile ownership,TLDs per capita,E-Government Score,Mobile Social Media Penetration,Apps developed per person,Number of apps in national language,Accessibility of top ranked apps,Cybersecurity Index,data_country,data_year
0,AFG,Afghanistan,South Asia,2014,Discoverer,22.12,21.74,31.79,24.4,14.19,...,0.0,39.55,18.11,3.28,20.98,2.44,4.37,26.5,,
1,AFG,Afghanistan,South Asia,2015,Discoverer,22.99,22.82,30.81,25.28,15.71,...,0.0,39.57,24.27,4.36,22.93,2.79,8.03,25.83,,
2,AFG,Afghanistan,South Asia,2016,Discoverer,23.71,26.92,26.75,26.07,16.83,...,0.0,39.58,30.43,6.73,30.31,2.85,5.9,25.17,,
3,AFG,Afghanistan,South Asia,2017,Discoverer,25.82,33.54,27.22,28.56,17.04,...,0.0,39.47,30.5,7.78,31.62,2.91,6.15,24.5,,
4,AFG,Afghanistan,South Asia,2018,Discoverer,28.39,30.91,42.64,29.24,16.87,...,0.0,39.39,30.56,8.54,36.54,2.96,8.66,17.7,,
5,AFG,Afghanistan,South Asia,2019,Discoverer,28.94,32.34,41.53,29.72,17.58,...,0.0,39.41,41.18,9.39,39.36,3.0,5.72,17.7,,
6,AGO,Angola,Sub-Saharan Africa,2014,Discoverer,32.78,25.99,35.99,44.33,27.85,...,51.65,0.0,29.92,3.99,22.74,53.33,49.09,8.8,,
7,AGO,Angola,Sub-Saharan Africa,2015,Emerging,37.18,33.09,42.01,45.41,30.27,...,57.0,0.31,32.35,5.26,22.12,55.08,58.33,8.47,,
8,AGO,Angola,Sub-Saharan Africa,2016,Emerging,39.85,37.8,44.74,46.47,32.11,...,63.73,0.0,34.78,6.27,27.46,56.52,61.73,8.13,,
9,AGO,Angola,Sub-Saharan Africa,2017,Emerging,42.89,48.6,47.94,46.32,31.36,...,53.97,0.0,37.88,4.26,31.94,57.06,55.09,7.8,,


In [104]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [105]:
# filter most recent year
df = df[(df.Year==2019)]

# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Index'] 
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [106]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
5,Afghanistan,2019,GSMA Mobile Connectivity Index,28.94,2.041533,True,Connectivity Technology
11,Angola,2019,GSMA Mobile Connectivity Index,43.53,2.979555,True,Connectivity Technology
17,Albania,2019,GSMA Mobile Connectivity Index,67.89,4.545712,True,Connectivity Technology
23,United Arab Emirates,2019,GSMA Mobile Connectivity Index,78.23,5.210492,True,Connectivity Technology
29,Argentina,2019,GSMA Mobile Connectivity Index,67.16,4.498778,True,Connectivity Technology
...,...,...,...,...,...,...,...
995,Samoa,2019,GSMA Mobile Connectivity Index,60.20,4.051305,True,Connectivity Technology
1001,Yemen,2019,GSMA Mobile Connectivity Index,29.71,2.091038,True,Connectivity Technology
1007,South Africa,2019,GSMA Mobile Connectivity Index,60.14,4.047448,True,Connectivity Technology
1013,Zambia,2019,GSMA Mobile Connectivity Index,35.34,2.453002,True,Connectivity Technology


In [107]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [108]:
### 15. Spectrum Allocated to Mobile Providers

In [109]:
indicators[14]

# load data
indicator = indicators[14]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Spectrum Allocated to Mobile Providers
spectrum_allocated_mobile_providers


In [110]:
df.head(15)

Unnamed: 0,ISO Code,Country,Region,Year,Cluster,Index,Infrastructure,Affordability,Consumer Readiness,Content and Services,...,Gender gap in mobile ownership,TLDs per capita,E-Government Score,Mobile Social Media Penetration,Apps developed per person,Number of apps in national language,Accessibility of top ranked apps,Cybersecurity Index,data_country,data_year
0,AFG,Afghanistan,South Asia,2014,Discoverer,22.12,21.74,31.79,24.4,14.19,...,0.0,39.55,18.11,3.28,20.98,2.44,4.37,26.5,,
1,AFG,Afghanistan,South Asia,2015,Discoverer,22.99,22.82,30.81,25.28,15.71,...,0.0,39.57,24.27,4.36,22.93,2.79,8.03,25.83,,
2,AFG,Afghanistan,South Asia,2016,Discoverer,23.71,26.92,26.75,26.07,16.83,...,0.0,39.58,30.43,6.73,30.31,2.85,5.9,25.17,,
3,AFG,Afghanistan,South Asia,2017,Discoverer,25.82,33.54,27.22,28.56,17.04,...,0.0,39.47,30.5,7.78,31.62,2.91,6.15,24.5,,
4,AFG,Afghanistan,South Asia,2018,Discoverer,28.39,30.91,42.64,29.24,16.87,...,0.0,39.39,30.56,8.54,36.54,2.96,8.66,17.7,,
5,AFG,Afghanistan,South Asia,2019,Discoverer,28.94,32.34,41.53,29.72,17.58,...,0.0,39.41,41.18,9.39,39.36,3.0,5.72,17.7,,
6,AGO,Angola,Sub-Saharan Africa,2014,Discoverer,32.78,25.99,35.99,44.33,27.85,...,51.65,0.0,29.92,3.99,22.74,53.33,49.09,8.8,,
7,AGO,Angola,Sub-Saharan Africa,2015,Emerging,37.18,33.09,42.01,45.41,30.27,...,57.0,0.31,32.35,5.26,22.12,55.08,58.33,8.47,,
8,AGO,Angola,Sub-Saharan Africa,2016,Emerging,39.85,37.8,44.74,46.47,32.11,...,63.73,0.0,34.78,6.27,27.46,56.52,61.73,8.13,,
9,AGO,Angola,Sub-Saharan Africa,2017,Emerging,42.89,48.6,47.94,46.32,31.36,...,53.97,0.0,37.88,4.26,31.94,57.06,55.09,7.8,,


In [111]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [112]:
# filter most recent year
df = df[(df.Year==2019)]

# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Spectrum'] 
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [113]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
5,Afghanistan,2019,Spectrum Allocated to Mobile Providers,17.62,1.577685,True,Connectivity Technology
11,Angola,2019,Spectrum Allocated to Mobile Providers,48.22,3.261224,True,Connectivity Technology
17,Albania,2019,Spectrum Allocated to Mobile Providers,52.85,3.515955,True,Connectivity Technology
23,United Arab Emirates,2019,Spectrum Allocated to Mobile Providers,82.44,5.143926,True,Connectivity Technology
29,Argentina,2019,Spectrum Allocated to Mobile Providers,63.38,4.095290,True,Connectivity Technology
...,...,...,...,...,...,...,...
995,Samoa,2019,Spectrum Allocated to Mobile Providers,30.78,2.301717,True,Connectivity Technology
1001,Yemen,2019,Spectrum Allocated to Mobile Providers,18.55,1.628851,True,Connectivity Technology
1007,South Africa,2019,Spectrum Allocated to Mobile Providers,30.66,2.295114,True,Connectivity Technology
1013,Zambia,2019,Spectrum Allocated to Mobile Providers,23.28,1.889085,True,Connectivity Technology


In [114]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [115]:
### 16. Internet Exchange Points (IXPs) 

In [116]:
indicators[15]

# load data
indicator = indicators[15]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))



Internet Exchange Points (IXPs) map
countries_ixp


In [117]:
df['Country Name'] = df['Country']
df.head(15)

Unnamed: 0,Region,Country,City,IXP Name,Participants,Peak,Avg,IPv6,Prefixes,Established,URL,Country Name
0,Asia-Pacific,Afghanistan,Kabul,National Internet Exchange of Afghanistan,20,6.4G,2.19G,,,18-Jul,,Afghanistan
1,Europe,Albania,Tirana,Albanian Neutral Internet eXchange,16,2.42G,35M,,,2018,,Albania
2,,Albania,Tirana,Albania IX,0,,,,,2011,,Albania
3,Africa,Algeria,Algiers,Algeria Internet Exchange,0,,,,,,,Algeria
4,,Angola,Luanda,Angola Internet Exchange,21,1.6G,,,512.0,17-Mar-06,,Angola
5,,Angola,Luanda,Ponto de Intercambio Internet Angola,0,,,,,,,Angola
6,,Angola,Luanda,ANGONIX,21,16G,,,5495.0,9-Mar-15,,Angola
7,Latin America,Argentina,Bahía Blanca,CABASE IXP Bahía Blanca,15,713M,,,,27-Apr-13,,Argentina
8,,Argentina,Bariloche,CABASE IXP Bariloche,8,,,,,15-Sep-14,,Argentina
9,,Argentina,Buenos Aires,CABASE IXP Buenos Aires,125,9.19G,,,17224.0,1-Apr-98,,Argentina


In [118]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [119]:
agg_df = df.groupby('Country Name').agg({'Country':'count'})
agg_df

Unnamed: 0_level_0,Country
Country Name,Unnamed: 1_level_1
Afghanistan,1
Albania,2
Algeria,1
Angola,3
Argentina,33
...,...
Uzbekistan,3
Vanuatu,1
Viet Nam,3
Zambia,1


In [120]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

agg_df['higher_is_better'] = True
agg_df['Indicator'] = indicator
agg_df['data_col'] = agg_df['Country'] 
agg_df['Year'] = 2019
agg_df['Sub-Pillar'] = subpillar

min_rank = agg_df['data_col'].min()
max_rank = agg_df['data_col'].max()

# transform 0-1 rank into 1-6
agg_df['new_rank_score'] = agg_df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [121]:
agg_df = agg_df[['Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
agg_df

Unnamed: 0_level_0,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,2019,Internet Exchange Points (IXPs) map,1,1.000000,True,Connectivity Technology
Albania,2019,Internet Exchange Points (IXPs) map,2,1.023148,True,Connectivity Technology
Algeria,2019,Internet Exchange Points (IXPs) map,1,1.000000,True,Connectivity Technology
Angola,2019,Internet Exchange Points (IXPs) map,3,1.046296,True,Connectivity Technology
Argentina,2019,Internet Exchange Points (IXPs) map,33,1.740741,True,Connectivity Technology
...,...,...,...,...,...,...
Uzbekistan,2019,Internet Exchange Points (IXPs) map,3,1.046296,True,Connectivity Technology
Vanuatu,2019,Internet Exchange Points (IXPs) map,1,1.000000,True,Connectivity Technology
Viet Nam,2019,Internet Exchange Points (IXPs) map,3,1.046296,True,Connectivity Technology
Zambia,2019,Internet Exchange Points (IXPs) map,1,1.000000,True,Connectivity Technology


In [122]:
agg_df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [123]:
### 17. ISP Speeds - Mobile

In [124]:
indicators[16]

# load data
indicator = indicators[16]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

ISP Speeds - Mobile
mobile_speed


In [125]:
df.head(15)

Unnamed: 0,#,Year,Country,Mbps
0,1,2021,United Arab Emirates,190.03
1,2,2021,South Korea,189.2
2,3,2021,Qatar,170.77
3,4,2021,China,157.72
4,5,2021,Cyprus,154.13
5,6,2021,Norway,152.53
6,7,2021,Saudi Arabia,151.13
7,8,2021,Kuwait,140.16
8,9,2021,Australia,122.27
9,10,2021,Bulgaria,120.44


In [126]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [127]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Mbps'] 
df['Country Name'] = df['Country']
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [128]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,United Arab Emirates,2021,ISP Speeds - Mobile,190.03,6.000000,True,Connectivity Technology
1,South Korea,2021,ISP Speeds - Mobile,189.20,5.977362,True,Connectivity Technology
2,Qatar,2021,ISP Speeds - Mobile,170.77,5.474689,True,Connectivity Technology
3,China,2021,ISP Speeds - Mobile,157.72,5.118754,True,Connectivity Technology
4,Cyprus,2021,ISP Speeds - Mobile,154.13,5.020838,True,Connectivity Technology
...,...,...,...,...,...,...,...
134,Bangladesh,2021,ISP Speeds - Mobile,12.60,1.160648,True,Connectivity Technology
135,Zimbabwe,2021,ISP Speeds - Mobile,11.71,1.136374,True,Connectivity Technology
136,Palestine,2021,ISP Speeds - Mobile,7.62,1.024820,True,Connectivity Technology
137,Venezuela,2021,ISP Speeds - Mobile,7.61,1.024547,True,Connectivity Technology


In [129]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [130]:
### 18. ISP internet download speed average

In [131]:
indicators[17]

# load data
indicator = indicators[17]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

ISP internet download speed average 
internet_speed


In [132]:
df.head(15)

Unnamed: 0,Ranking,Year,Country,Download speed (Mbps)
0,1,2021,Monaco,256.7
1,2,2021,Singapore,256.03
2,3,2021,Hong Kong (SAR),248.59
3,4,2021,Thailand,216.16
4,5,2021,Romania,215.3
5,6,2021,Switzerland,214.82
6,7,2021,South Korea,212.83
7,8,2021,Chile,209.45
8,9,2021,Denmark,208.5
9,10,2021,Liechtenstein,207.44


In [133]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [134]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Download speed (Mbps)'] 
df['Country Name'] = df['Country']
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [135]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Monaco,2021,ISP internet download speed average,256.70,6.000000,True,Connectivity Technology
1,Singapore,2021,ISP internet download speed average,256.03,5.986711,True,Connectivity Technology
2,Hong Kong (SAR),2021,ISP internet download speed average,248.59,5.839145,True,Connectivity Technology
3,Thailand,2021,ISP internet download speed average,216.16,5.195922,True,Connectivity Technology
4,Romania,2021,ISP internet download speed average,215.30,5.178865,True,Connectivity Technology
...,...,...,...,...,...,...,...
175,Niger,2021,ISP internet download speed average,8.33,1.073783,True,Connectivity Technology
176,Sudan,2021,ISP internet download speed average,7.48,1.056924,True,Connectivity Technology
177,Yemen,2021,ISP internet download speed average,5.49,1.017454,True,Connectivity Technology
178,Cuba,2021,ISP internet download speed average,4.61,1.000000,True,Connectivity Technology


In [136]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [137]:
### 19. Mobile dowload speed at the slowest hour

In [138]:
indicators[18]

# load data
indicator = indicators[18]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

# Waiting for internet_speed to be added to the processed folder

Mobile dowload speed at the slowest hour of the day
mobile_latency


In [139]:
df.head(15)

Unnamed: 0.1,Unnamed: 0,ISO Code,Country,Region,Year,Mobile latencies
0,1,AFG,Afghanistan,South Asia,2014,0.0
1,2,AFG,Afghanistan,South Asia,2015,0.0
2,3,AFG,Afghanistan,South Asia,2016,11.842106
3,4,AFG,Afghanistan,South Asia,2017,56.234993
4,5,AFG,Afghanistan,South Asia,2018,49.762146
5,6,AFG,Afghanistan,South Asia,2019,42.627621
6,7,AGO,Angola,Sub-Saharan Africa,2014,0.0
7,8,AGO,Angola,Sub-Saharan Africa,2015,36.975155
8,9,AGO,Angola,Sub-Saharan Africa,2016,50.958073
9,10,AGO,Angola,Sub-Saharan Africa,2017,66.052635


In [140]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [141]:
# filter most recent year
df = df[(df.Year==2019)]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Mobile latencies'] 
df['Country Name'] = df['Country']
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))


# invert since to lower rank is better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [142]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
5,Afghanistan,2019,Mobile dowload speed at the slowest hour of th...,42.627621,3.618652,True,Connectivity Technology
11,Angola,2019,Mobile dowload speed at the slowest hour of th...,66.285637,2.297022,True,Connectivity Technology
17,Albania,2019,Mobile dowload speed at the slowest hour of th...,84.636459,1.271872,True,Connectivity Technology
23,United Arab Emirates,2019,Mobile dowload speed at the slowest hour of th...,84.951675,1.254263,True,Connectivity Technology
29,Argentina,2019,Mobile dowload speed at the slowest hour of th...,80.727203,1.490258,True,Connectivity Technology
...,...,...,...,...,...,...,...
995,Samoa,2019,Mobile dowload speed at the slowest hour of th...,70.360039,2.069409,True,Connectivity Technology
1001,Yemen,2019,Mobile dowload speed at the slowest hour of th...,9.473684,5.470762,True,Connectivity Technology
1007,South Africa,2019,Mobile dowload speed at the slowest hour of th...,78.438538,1.618112,True,Connectivity Technology
1013,Zambia,2019,Mobile dowload speed at the slowest hour of th...,60.044987,2.645649,True,Connectivity Technology


In [143]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [144]:
### 20. Mobile download speed average

In [145]:
indicators[19]

# load data
indicator = indicators[19]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Mobile download speed average 
mobile_speed


In [146]:
df.head(15)

Unnamed: 0,#,Year,Country,Mbps
0,1,2021,United Arab Emirates,190.03
1,2,2021,South Korea,189.2
2,3,2021,Qatar,170.77
3,4,2021,China,157.72
4,5,2021,Cyprus,154.13
5,6,2021,Norway,152.53
6,7,2021,Saudi Arabia,151.13
7,8,2021,Kuwait,140.16
8,9,2021,Australia,122.27
9,10,2021,Bulgaria,120.44


In [147]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [148]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Mbps'] 
df['Country Name'] = df['Country']
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [149]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,United Arab Emirates,2021,Mobile download speed average,190.03,6.000000,True,Connectivity Technology
1,South Korea,2021,Mobile download speed average,189.20,5.977362,True,Connectivity Technology
2,Qatar,2021,Mobile download speed average,170.77,5.474689,True,Connectivity Technology
3,China,2021,Mobile download speed average,157.72,5.118754,True,Connectivity Technology
4,Cyprus,2021,Mobile download speed average,154.13,5.020838,True,Connectivity Technology
...,...,...,...,...,...,...,...
134,Bangladesh,2021,Mobile download speed average,12.60,1.160648,True,Connectivity Technology
135,Zimbabwe,2021,Mobile download speed average,11.71,1.136374,True,Connectivity Technology
136,Palestine,2021,Mobile download speed average,7.62,1.024820,True,Connectivity Technology
137,Venezuela,2021,Mobile download speed average,7.61,1.024547,True,Connectivity Technology


In [150]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [151]:
### 21. Individuals using the Internet (% of population)

In [152]:
indicators[20]

# load data
indicator = indicators[20]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Individuals using the Internet (% of population)
ITU_database


In [153]:
df.head(15)

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
0,Angola,Africa,AGO,Female mobile phone ownership as a % of total ...,2010.0,,,
1,Benin,Africa,BEN,Female mobile phone ownership as a % of total ...,2010.0,,,
2,Botswana,Africa,BWA,Female mobile phone ownership as a % of total ...,2010.0,,,
3,Burkina Faso,Africa,BFA,Female mobile phone ownership as a % of total ...,2010.0,,,
4,Burundi,Africa,BDI,Female mobile phone ownership as a % of total ...,2010.0,,,
5,Cabo Verde,Africa,CPV,Female mobile phone ownership as a % of total ...,2010.0,,,
6,Cameroon,Africa,CMR,Female mobile phone ownership as a % of total ...,2010.0,,,
7,Central African Rep.,Africa,CAF,Female mobile phone ownership as a % of total ...,2010.0,,,
8,Chad,Africa,TCD,Female mobile phone ownership as a % of total ...,2010.0,,,
9,Congo (Rep. of the),Africa,COG,Female mobile phone ownership as a % of total ...,2010.0,,,


In [154]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [155]:
df = df[(df['Indicator name'] == 'Individuals using the Internet, total (%)')]
df = df[(df.Year==2019)]
df.head(15)

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
47607,Cabo Verde,Africa,CPV,"Individuals using the Internet, total (%)",2019.0,61.943398,,
47608,Central African Rep.,Africa,CAF,"Individuals using the Internet, total (%)",2019.0,,,
47609,Congo (Rep. of the),Africa,COG,"Individuals using the Internet, total (%)",2019.0,,,
47610,Côte d'Ivoire,Africa,CIV,"Individuals using the Internet, total (%)",2019.0,36.288955,,
47611,Equatorial Guinea,Africa,GNQ,"Individuals using the Internet, total (%)",2019.0,,,
47612,Eritrea,Africa,ERI,"Individuals using the Internet, total (%)",2019.0,,,
47613,Eswatini,Africa,SWZ,"Individuals using the Internet, total (%)",2019.0,,,
47614,Kenya,Africa,KEN,"Individuals using the Internet, total (%)",2019.0,22.565119,,
47615,Lesotho,Africa,LSO,"Individuals using the Internet, total (%)",2019.0,42.301734,,
47616,Madagascar,Africa,MDG,"Individuals using the Internet, total (%)",2019.0,,,


In [156]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df.rename(columns = {'Country':'Country Name'}, inplace=True)
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [157]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
47607,Cabo Verde,2019.0,Individuals using the Internet (% of population),61.943398,3.715247,True,Connectivity Technology
47608,Central African Rep.,2019.0,Individuals using the Internet (% of population),,,True,Connectivity Technology
47609,Congo (Rep. of the),2019.0,Individuals using the Internet (% of population),,,True,Connectivity Technology
47610,Côte d'Ivoire,2019.0,Individuals using the Internet (% of population),36.288955,2.162890,True,Connectivity Technology
47611,Equatorial Guinea,2019.0,Individuals using the Internet (% of population),,,True,Connectivity Technology
...,...,...,...,...,...,...,...
47739,Saint Vincent and the Grenadines,2019.0,Individuals using the Internet (% of population),,,True,Connectivity Technology
47740,Trinidad and Tobago,2019.0,Individuals using the Internet (% of population),,,True,Connectivity Technology
47741,United States,2019.0,Individuals using the Internet (% of population),89.430285,5.378486,True,Connectivity Technology
47742,Uruguay,2019.0,Individuals using the Internet (% of population),83.351534,5.010659,True,Connectivity Technology


In [158]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [159]:
### 22. Mobile-cellular subscriptions per 100 inhabitants

In [160]:
indicators[21]

# load data
indicator = indicators[21]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Mobile cellular subscriptions (per 100 people)
ITU_database


In [161]:
df = df[(df['Indicator name'] == 'Mobile-cellular subscriptions per 100 inhabitants')]
df = df[(df.Year==2020)]
df.head(15)

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
32475,Angola,Africa,AGO,Mobile-cellular subscriptions per 100 inhabitants,2020.0,44.559511,,
32476,Benin,Africa,BEN,Mobile-cellular subscriptions per 100 inhabitants,2020.0,91.89728,,
32477,Botswana,Africa,BWA,Mobile-cellular subscriptions per 100 inhabitants,2020.0,162.399011,,
32478,Burkina Faso,Africa,BFA,Mobile-cellular subscriptions per 100 inhabitants,2020.0,105.80744,,
32479,Burundi,Africa,BDI,Mobile-cellular subscriptions per 100 inhabitants,2020.0,55.767172,,
32480,Cabo Verde,Africa,CPV,Mobile-cellular subscriptions per 100 inhabitants,2020.0,97.975133,,
32481,Cameroon,Africa,CMR,Mobile-cellular subscriptions per 100 inhabitants,2020.0,95.100069,,
32482,Central African Rep.,Africa,CAF,Mobile-cellular subscriptions per 100 inhabitants,2020.0,,,
32483,Chad,Africa,TCD,Mobile-cellular subscriptions per 100 inhabitants,2020.0,52.887026,,
32484,Congo (Rep. of the),Africa,COG,Mobile-cellular subscriptions per 100 inhabitants,2020.0,,,


In [162]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [163]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df.rename(columns = {'Country':'Country Name'}, inplace=True)
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [164]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
32475,Angola,2020.0,Mobile cellular subscriptions (per 100 people),44.559511,1.012692,True,Connectivity Technology
32476,Benin,2020.0,Mobile cellular subscriptions (per 100 people),91.897280,1.968150,True,Connectivity Technology
32477,Botswana,2020.0,Mobile cellular subscriptions (per 100 people),162.399011,3.391146,True,Connectivity Technology
32478,Burkina Faso,2020.0,Mobile cellular subscriptions (per 100 people),105.807440,2.248911,True,Connectivity Technology
32479,Burundi,2020.0,Mobile cellular subscriptions (per 100 people),55.767172,1.238906,True,Connectivity Technology
...,...,...,...,...,...,...,...
32666,Suriname,2020.0,Mobile cellular subscriptions (per 100 people),153.305479,3.207604,True,Connectivity Technology
32667,Trinidad and Tobago,2020.0,Mobile cellular subscriptions (per 100 people),142.051665,2.980458,True,Connectivity Technology
32668,United States,2020.0,Mobile cellular subscriptions (per 100 people),,,True,Connectivity Technology
32669,Uruguay,2020.0,Mobile cellular subscriptions (per 100 people),,,True,Connectivity Technology


In [165]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [166]:
### 23. Average fixed broadband download speeds	

In [167]:
indicators[22]

# load data
indicator = indicators[22]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))


Average fixed broadband download speeds 
fixed_bdbd_spd_dl_ul


In [168]:
df.head(15)

Unnamed: 0,country,broadband,mobile,pop2021
0,Singapore,226.6,60.52,5896.686
1,Hong Kong,210.73,50.96,7552.81
2,Romania,193.47,40.35,19127.774
3,Switzerland,178.81,64.1,8715.494
4,Thailand,175.22,34.38,69950.85
5,France,173.05,49.82,65426.179
6,Denmark,162.08,63.86,5813.298
7,Monaco,162.06,,39.511
8,Hungary,161.51,44.52,9634.164
9,United States,161.14,47.13,332915.073


In [169]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [170]:
# create standard columns

df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Year'] = 2021
df['Indicator'] = indicator
df['data_col'] = df['broadband'] 
df.rename(columns = {'country':'Country Name'}, inplace=True)
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [171]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Singapore,2021,Average fixed broadband download speeds,226.60,6.000000,True,Connectivity Technology
1,Hong Kong,2021,Average fixed broadband download speeds,210.73,5.644489,True,Connectivity Technology
2,Romania,2021,Average fixed broadband download speeds,193.47,5.257841,True,Connectivity Technology
3,Switzerland,2021,Average fixed broadband download speeds,178.81,4.929435,True,Connectivity Technology
4,Thailand,2021,Average fixed broadband download speeds,175.22,4.849014,True,Connectivity Technology
...,...,...,...,...,...,...,...
167,Mauritania,2021,Average fixed broadband download speeds,6.47,1.068772,True,Connectivity Technology
168,Algeria,2021,Average fixed broadband download speeds,4.81,1.031586,True,Connectivity Technology
169,Yemen,2021,Average fixed broadband download speeds,4.25,1.019041,True,Connectivity Technology
170,Turkmenistan,2021,Average fixed broadband download speeds,3.40,1.000000,True,Connectivity Technology


In [172]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [173]:
### 24. Postal Coverage

In [174]:
indicators[23]

# load data
indicator = indicators[23]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Postal Coverage
postal_coverage


In [175]:
df.head(15)

Unnamed: 0,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,Country ISO3,Country Name,Indicator Id,Indicator,Subindicator Type,2013-14
0,,,,,,,,,93.0,,,ABW,Aruba,24707,Percent of Population Having Mail Delivered at...,Value,
1,,,,,,17.0,,,,,,ABW,Aruba,24708,Percent of Income Linked to Parcels and Logist...,Value,
2,,,,,,,,,0.0,,,ABW,Aruba,24710,Percent of the Population Without Postal Services,Value,
3,,,,,,,,,,70.0,,AFG,Afghanistan,24707,Percent of Population Having Mail Delivered at...,Value,
4,,,,,,,,,,74.0,,AFG,Afghanistan,24708,Percent of Income Linked to Parcels and Logist...,Value,
5,,,,,,,,,,,,AFG,Afghanistan,24709,Postal Reliability Index,Index (0-100),17.9
6,,,,,,,,,,10.0,,AFG,Afghanistan,24710,Percent of the Population Without Postal Services,Value,
7,,,,,,,,,,,15.0,AGO,Angola,24707,Percent of Population Having Mail Delivered at...,Value,
8,,,,,,,,,,,8.9,AGO,Angola,24708,Percent of Income Linked to Parcels and Logist...,Value,
9,,,,,,,,,,,,AGO,Angola,24709,Postal Reliability Index,Index (0-100),15.3


In [176]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [177]:
#Choose 'Percent of Population Having Mail Delivered at Home' as the data column
df = df[(df.Indicator=='Percent of Population Having Mail Delivered at Home')]
df.head(15)

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2015
df['Indicator'] = indicator
df['data_col'] = df['2015'] 
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [178]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Aruba,2015,Postal Coverage,,,True,Connectivity Technology
3,Afghanistan,2015,Postal Coverage,,,True,Connectivity Technology
7,Angola,2015,Postal Coverage,15.0,1.75,True,Connectivity Technology
11,Albania,2015,Postal Coverage,,,True,Connectivity Technology
15,United Arab Emirates,2015,Postal Coverage,,,True,Connectivity Technology
...,...,...,...,...,...,...,...
729,Vanuatu,2015,Postal Coverage,,,True,Connectivity Technology
734,"Yemen, Rep.",2015,Postal Coverage,,,True,Connectivity Technology
738,South Africa,2015,Postal Coverage,71.8,4.59,True,Connectivity Technology
742,Zambia,2015,Postal Coverage,18.0,1.90,True,Connectivity Technology


In [179]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [180]:
### 25. Logistics Performance Index (LPI) 

In [181]:
indicators[24]

# load data
indicator = indicators[24]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Logistics Performance Index (LPI) - Infrastructure Rating
logistics_performance_index


In [182]:
df.head(15)

Unnamed: 0,Country,Code,score,lower bound,upper bound,rank,% of highest performer,data_country,data_year
0,Germany,DEU,4.39,1,1,3,100.0,,
1,Sweden,SWE,4.28,2,12,7,95.36,,
2,Belgium,BEL,4.41,2,12,1,94.93,,
3,Austria,AUT,4.25,2,14,12,94.52,,
4,Japan,JPN,4.25,2,10,10,94.51,,
5,Netherlands,NLD,4.25,2,11,11,94.31,,
6,Singapore,SGP,4.32,2,15,6,93.59,,
7,Denmark,DNK,4.41,2,17,2,93.45,,
8,United Kingdom,GBR,4.33,3,11,5,93.3,,
9,Finland,FIN,4.28,1,21,8,92.74,,


In [183]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [184]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2018
df['Indicator'] = indicator
df['data_col'] = df['score']
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [185]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Germany,2018,Logistics Performance Index (LPI) - Infrastruc...,4.39,5.957806,True,Connectivity Technology
1,Sweden,2018,Logistics Performance Index (LPI) - Infrastruc...,4.28,5.725738,True,Connectivity Technology
2,Belgium,2018,Logistics Performance Index (LPI) - Infrastruc...,4.41,6.000000,True,Connectivity Technology
3,Austria,2018,Logistics Performance Index (LPI) - Infrastruc...,4.25,5.662447,True,Connectivity Technology
4,Japan,2018,Logistics Performance Index (LPI) - Infrastruc...,4.25,5.662447,True,Connectivity Technology
...,...,...,...,...,...,...,...
155,Sierra Leone,2018,Logistics Performance Index (LPI) - Infrastruc...,2.34,1.632911,True,Connectivity Technology
156,Niger,2018,Logistics Performance Index (LPI) - Infrastruc...,2.33,1.611814,True,Connectivity Technology
157,Burundi,2018,Logistics Performance Index (LPI) - Infrastruc...,2.17,1.274262,True,Connectivity Technology
158,Angola,2018,Logistics Performance Index (LPI) - Infrastruc...,2.59,2.160338,True,Connectivity Technology


In [186]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [187]:
### 26. National cyber security index

In [188]:
indicators[25]

# load data
indicator = indicators[25]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

National cyber security index
national_cybersecurity_index


In [189]:
df.head(15)

Unnamed: 0,Rank,Country,National Cyber Security Index,Difference,Unnamed: 4
0,1.0,Greece,96.1,32.35,
1,2.0,Czech Republic,92.21,23.25,
2,3.0,Estonia,90.91,15.05,
3,4.0,Portugal,89.61,21.76,
4,5.0,Lithuania,88.31,20.01,
5,6.0,Spain,88.31,15.7,
6,7.0,Poland,87.01,21.66,
7,8.0,Belgium,85.71,11.32,
8,9.0,Finland,85.71,6.23,
9,10.0,France,84.42,6.63,


In [190]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [191]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2021
df['Indicator'] = indicator
df['data_col'] = df['National Cyber Security Index']
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [192]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Greece,2021,National cyber security index,96.10,6.000000,True,Connectivity Technology
1,Czech Republic,2021,National cyber security index,92.21,5.794831,True,Connectivity Technology
2,Estonia,2021,National cyber security index,90.91,5.726266,True,Connectivity Technology
3,Portugal,2021,National cyber security index,89.61,5.657700,True,Connectivity Technology
4,Lithuania,2021,National cyber security index,88.31,5.589135,True,Connectivity Technology
...,...,...,...,...,...,...,...
156,Burundi,2021,National cyber security index,2.60,1.068565,True,Connectivity Technology
157,Solomon Islands,2021,National cyber security index,2.60,1.068565,True,Connectivity Technology
158,Tuvalu,2021,National cyber security index,2.60,1.068565,True,Connectivity Technology
159,South Sudan,2021,National cyber security index,1.30,1.000000,True,Connectivity Technology


In [193]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [194]:
### 27. Global Cybersecurity Index (GCI)

In [195]:
indicators[26]

# load data
indicator = indicators[26]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Global Cybersecurity Index (GCI)
dice_export_global_cybersecurity_index


In [196]:
df.head(15)

Unnamed: 0,Country,Score (2020),Rank (2020)
0,United States of America**,100.0,1.0
1,United Kingdom,99.54,2.0
2,Saudi Arabia,99.54,2.0
3,Estonia,99.48,3.0
4,Korea (Rep. of),98.52,4.0
5,Singapore,98.52,4.0
6,Spain,98.52,4.0
7,Russian Federation,98.06,5.0
8,United Arab Emirates,98.06,5.0
9,Malaysia,98.06,5.0


In [197]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Connectivity Technology


In [198]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2020
df['Indicator'] = indicator
df['data_col'] = df['Score (2020)'] 
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [199]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,United States of America**,2020,Global Cybersecurity Index (GCI),100.00,5.032258,True,Connectivity Technology
1,United Kingdom,2020,Global Cybersecurity Index (GCI),99.54,5.013710,True,Connectivity Technology
2,Saudi Arabia,2020,Global Cybersecurity Index (GCI),99.54,5.013710,True,Connectivity Technology
3,Estonia,2020,Global Cybersecurity Index (GCI),99.48,5.011290,True,Connectivity Technology
4,Korea (Rep. of),2020,Global Cybersecurity Index (GCI),98.52,4.972581,True,Connectivity Technology
...,...,...,...,...,...,...,...
189,Dem. People's Rep. of Korea**,2020,Global Cybersecurity Index (GCI),1.35,1.054435,True,Connectivity Technology
190,Micronesia*,2020,Global Cybersecurity Index (GCI),0.00,1.000000,True,Connectivity Technology
191,Vatican*,2020,Global Cybersecurity Index (GCI),0.00,1.000000,True,Connectivity Technology
192,Yemen*,2020,Global Cybersecurity Index (GCI),0.00,1.000000,True,Connectivity Technology


In [200]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [201]:
### 28. Software Developer Ecosystem size

In [202]:
indicators[27]

# load data
indicator = indicators[27]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Software Developer Ecosystem size
software_developer_ecosystem_size


In [203]:
df.head(15)

Unnamed: 0,Country,GitHub Accounts
0,United States,651017
1,China,183805
2,India,168328
3,United Kingdom,109460
4,Germany,94359
5,Brazil,80903
6,Canada,77318
7,France,66367
8,Russia,58767
9,Australia,41790


In [204]:
subpillars[1]
subpillar = subpillars[1]
print(subpillar)

Innovation Ecosystem


In [205]:
df['higher_is_better'] = True
df['Year'] = 2018
df['Indicator'] = indicator
df['data_col'] = df.iloc[:,[1]]
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['Sub-Pillar'] = subpillar
    
min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [206]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,United States,2018,Software Developer Ecosystem size,651017,6.0,True,Innovation Ecosystem
1,China,2018,Software Developer Ecosystem size,183805,2.348274,True,Innovation Ecosystem
2,India,2018,Software Developer Ecosystem size,168328,2.227306,True,Innovation Ecosystem
3,United Kingdom,2018,Software Developer Ecosystem size,109460,1.767194,True,Innovation Ecosystem
4,Germany,2018,Software Developer Ecosystem size,94359,1.649165,True,Innovation Ecosystem
5,Brazil,2018,Software Developer Ecosystem size,80903,1.543993,True,Innovation Ecosystem
6,Canada,2018,Software Developer Ecosystem size,77318,1.515973,True,Innovation Ecosystem
7,France,2018,Software Developer Ecosystem size,66367,1.43038,True,Innovation Ecosystem
8,Russia,2018,Software Developer Ecosystem size,58767,1.370978,True,Innovation Ecosystem
9,Australia,2018,Software Developer Ecosystem size,41790,1.238286,True,Innovation Ecosystem


In [207]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [208]:
### 29. Digital Work Ecosystem size

In [209]:
indicators[28]

# load data
indicator = indicators[28]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Digital Work Ecosystem size
digital_platform_economy_index


In [210]:
df.head(15)

Unnamed: 0,Rank,Country,DPE 2020,GDP per capita 2017
0,1,United States,85.0,54225
1,2,United Kingdom,82.7,39753
2,3,Netherlands,82.4,48473
3,4,Canada,78.2,44018
4,5,Sweden,76.8,46949
5,6,Switzerland,76.3,57410
6,7,Norway,74.4,64800
7,8,Denmark,71.1,46683
8,9,Australia,69.3,44649
9,10,Finland,68.9,40586


In [211]:
subpillars[1]
subpillar = subpillars[1]
print(subpillar)

Innovation Ecosystem


In [212]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = False
df['Year'] = 2020
df['Indicator'] = indicator
df['data_col'] = df.iloc[:,[0]].astype(float)
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['Sub-Pillar'] = subpillar
    
min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# invert since to lower rank is better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [213]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,United States,2020,Digital Work Ecosystem size,1.0,6.000000,False,Innovation Ecosystem
1,United Kingdom,2020,Digital Work Ecosystem size,2.0,5.956522,False,Innovation Ecosystem
2,Netherlands,2020,Digital Work Ecosystem size,3.0,5.913043,False,Innovation Ecosystem
3,Canada,2020,Digital Work Ecosystem size,4.0,5.869565,False,Innovation Ecosystem
4,Sweden,2020,Digital Work Ecosystem size,5.0,5.826087,False,Innovation Ecosystem
...,...,...,...,...,...,...,...
111,Malawi,2020,Digital Work Ecosystem size,112.0,1.173913,False,Innovation Ecosystem
112,Benin,2020,Digital Work Ecosystem size,113.0,1.130435,False,Innovation Ecosystem
113,Madagascar,2020,Digital Work Ecosystem size,114.0,1.086957,False,Innovation Ecosystem
114,Burundi,2020,Digital Work Ecosystem size,115.0,1.043478,False,Innovation Ecosystem


In [214]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [215]:
### 30. Country, Industry, Skill Migration Data (Skill) 

In [216]:
indicators[29]

# load data
indicator = indicators[29]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Country, Industry, Skill Migration Data (Skill)
migration_skill


In [217]:
df.head(15)

Unnamed: 0,country_code,country_name,wb_income,wb_region,skill_group_id,skill_group_category,skill_group_name,net_per_10K_2015,net_per_10K_2016,net_per_10K_2017,net_per_10K_2018,net_per_10K_2019,Unnamed: 12,data_country,data_year
0,af,Afghanistan,Low income,South Asia,2549.0,Tech Skills,Information Management,-791.59,-705.88,-550.04,-680.92,-1208.79,,,
1,af,Afghanistan,Low income,South Asia,2608.0,Business Skills,Operational Efficiency,-1610.25,-933.55,-776.06,-532.22,-790.09,,,
2,af,Afghanistan,Low income,South Asia,3806.0,Specialized Industry Skills,National Security,-1731.45,-769.68,-756.59,-600.44,-767.64,,,
3,af,Afghanistan,Low income,South Asia,50321.0,Tech Skills,Software Testing,-957.5,-828.54,-964.73,-406.5,-739.51,,,
4,af,Afghanistan,Low income,South Asia,1606.0,Specialized Industry Skills,Navy,-1510.71,-841.17,-842.32,-581.71,-718.64,,,
5,af,Afghanistan,Low income,South Asia,3139.0,Disruptive Tech Skills,Materials Science,-1085.03,-1045.71,-783.03,-473.42,-717.73,,,
6,af,Afghanistan,Low income,South Asia,1315.0,Specialized Industry Skills,Criminal Law,-687.8,-294.93,-769.01,-415.34,-698.84,,,
7,af,Afghanistan,Low income,South Asia,1017.0,Soft Skills,Problem Solving,-906.42,-210.44,-480.92,-126.62,-696.16,,,
8,af,Afghanistan,Low income,South Asia,2130.0,Tech Skills,Software Development Life Cycle (SDLC),-1096.96,-566.7,-478.3,-324.99,-692.24,,,
9,af,Afghanistan,Low income,South Asia,2265.0,Specialized Industry Skills,Cybersecurity,-1046.26,-796.25,-796.72,-517.62,-640.96,,,


In [218]:
subpillars[1]
subpillar = subpillars[1]
print(subpillar)

Innovation Ecosystem


In [219]:
df['Country Name'] = df.iloc[:,[1]]

df_sum = df.groupby('Country Name')[['net_per_10K_2019']].sum()
df_sum

Unnamed: 0_level_0,net_per_10K_2019
Country Name,Unnamed: 1_level_1
Afghanistan,-35926.60
Albania,-20747.41
Algeria,-26729.53
Angola,4332.13
Argentina,-16970.64
...,...
Vietnam,-1952.65
West Bank and Gaza,-13115.14
"Yemen, Rep.",-2249.00
Zambia,2455.50


In [220]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df_sum['higher_is_better'] = True
df_sum['Year'] = 2019
df_sum['Indicator'] = indicator
df_sum['data_col'] = df_sum['net_per_10K_2019']
df_sum['Sub-Pillar'] = subpillar

min_rank = df_sum['data_col'].min()
max_rank = df_sum['data_col'].max()

# transform 0-1 rank into 1-6
df_sum['new_rank_score'] = df_sum['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [221]:
df_sum = df_sum[['Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df_sum

Unnamed: 0_level_0,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,2019,"Country, Industry, Skill Migration Data (Skill)",-35926.60,4.221400,True,Innovation Ecosystem
Albania,2019,"Country, Industry, Skill Migration Data (Skill)",-20747.41,4.449323,True,Innovation Ecosystem
Algeria,2019,"Country, Industry, Skill Migration Data (Skill)",-26729.53,4.359499,True,Innovation Ecosystem
Angola,2019,"Country, Industry, Skill Migration Data (Skill)",4332.13,4.825905,True,Innovation Ecosystem
Argentina,2019,"Country, Industry, Skill Migration Data (Skill)",-16970.64,4.506034,True,Innovation Ecosystem
...,...,...,...,...,...,...
Vietnam,2019,"Country, Industry, Skill Migration Data (Skill)",-1952.65,4.731536,True,Innovation Ecosystem
West Bank and Gaza,2019,"Country, Industry, Skill Migration Data (Skill)",-13115.14,4.563926,True,Innovation Ecosystem
"Yemen, Rep.",2019,"Country, Industry, Skill Migration Data (Skill)",-2249.00,4.727086,True,Innovation Ecosystem
Zambia,2019,"Country, Industry, Skill Migration Data (Skill)",2455.50,4.797727,True,Innovation Ecosystem


In [222]:
df_sum.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [223]:
### 31. Country, Industry, Skill Migration Data (Industry)

In [224]:
indicators[30]

# load data
indicator = indicators[30]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))


Country, Industry, Skill Migration Data (Industry)
migration_industry


In [225]:
df.head(15)

Unnamed: 0,country_code,country_name,wb_income,wb_region,isic_section_index,isic_section_name,industry_id,industry_name,net_per_10K_2015,net_per_10K_2016,net_per_10K_2017,net_per_10K_2018,net_per_10K_2019,Unnamed: 13,data_country,data_year
0,ae,United Arab Emirates,High income,Middle East & North Africa,C,Manufacturing,1,Defense & Space,378.74,127.94,8.2,68.51,49.55,,,
1,ae,United Arab Emirates,High income,Middle East & North Africa,J,Information and communication,3,Computer Hardware,100.97,358.14,112.98,149.57,182.22,,,
2,ae,United Arab Emirates,High income,Middle East & North Africa,J,Information and communication,4,Computer Software,1079.36,848.15,596.48,409.18,407.41,,,
3,ae,United Arab Emirates,High income,Middle East & North Africa,J,Information and communication,5,Computer Networking,401.46,447.39,163.99,236.69,188.07,,,
4,ae,United Arab Emirates,High income,Middle East & North Africa,J,Information and communication,6,Internet,1840.33,1368.42,877.71,852.39,519.4,,,
5,ae,United Arab Emirates,High income,Middle East & North Africa,J,Information and communication,8,Telecommunications,676.91,676.11,365.96,283.59,281.91,,,
6,ae,United Arab Emirates,High income,Middle East & North Africa,M,Professional scientific and technical activities,9,Law Practice,694.5,529.5,362.83,356.56,147.83,,,
7,ae,United Arab Emirates,High income,Middle East & North Africa,M,Professional scientific and technical activities,10,Legal Services,661.1,665.65,462.3,475.62,255.21,,,
8,ae,United Arab Emirates,High income,Middle East & North Africa,M,Professional scientific and technical activities,11,Management Consulting,982.85,694.25,520.28,574.6,406.03,,,
9,ae,United Arab Emirates,High income,Middle East & North Africa,M,Professional scientific and technical activities,12,Biotechnology,1230.17,846.28,500.77,205.78,513.23,,,


In [226]:
subpillars[1]
subpillar = subpillars[1]
print(subpillar)

Innovation Ecosystem


In [227]:
df['Country Name'] = df.iloc[:,[1]]

df_sum = df.groupby('Country Name')[['net_per_10K_2019']].sum()
df_sum

Unnamed: 0_level_0,net_per_10K_2019
Country Name,Unnamed: 1_level_1
Afghanistan,-1369.00
Albania,-2188.08
Algeria,-3243.78
Angola,544.66
Argentina,-4122.87
...,...
Vietnam,-2645.19
West Bank and Gaza,-748.05
"Yemen, Rep.",160.41
Zambia,494.78


In [228]:
## create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df_sum['higher_is_better'] = True
df_sum['Year'] = 2019
df_sum['Indicator'] = indicator
df_sum['data_col'] = df_sum['net_per_10K_2019']
df_sum['Sub-Pillar'] = subpillar

min_rank = df_sum['data_col'].min()
max_rank = df_sum['data_col'].max()

# transform 0-1 rank into 1-6
df_sum['new_rank_score'] = df_sum['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [229]:
df_sum = df_sum[['Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df_sum

Unnamed: 0_level_0,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,2019,"Country, Industry, Skill Migration Data (Indus...",-1369.00,4.082767,True,Innovation Ecosystem
Albania,2019,"Country, Industry, Skill Migration Data (Indus...",-2188.08,4.016637,True,Innovation Ecosystem
Algeria,2019,"Country, Industry, Skill Migration Data (Indus...",-3243.78,3.931403,True,Innovation Ecosystem
Angola,2019,"Country, Industry, Skill Migration Data (Indus...",544.66,4.237269,True,Innovation Ecosystem
Argentina,2019,"Country, Industry, Skill Migration Data (Indus...",-4122.87,3.860429,True,Innovation Ecosystem
...,...,...,...,...,...,...
Vietnam,2019,"Country, Industry, Skill Migration Data (Indus...",-2645.19,3.979732,True,Innovation Ecosystem
West Bank and Gaza,2019,"Country, Industry, Skill Migration Data (Indus...",-748.05,4.132900,True,Innovation Ecosystem
"Yemen, Rep.",2019,"Country, Industry, Skill Migration Data (Indus...",160.41,4.206246,True,Innovation Ecosystem
Zambia,2019,"Country, Industry, Skill Migration Data (Indus...",494.78,4.233242,True,Innovation Ecosystem


In [230]:
df_sum.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [231]:
### 32. Country, Industry, Skill Migration Data (Nation)

In [232]:
indicators[31]

# load data
indicator = indicators[31]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Country, Industry, Skill Migration Data (Country)
migration_country


In [233]:
df.head(15)

Unnamed: 0,base_country_code,base_country_name,base_lat,base_long,base_country_wb_income,base_country_wb_region,target_country_code,target_country_name,target_lat,target_long,target_country_wb_income,target_country_wb_region,net_per_10K_2015,net_per_10K_2016,net_per_10K_2017,net_per_10K_2018,net_per_10K_2019,Unnamed: 17,data_country,data_year
0,ae,United Arab Emirates,23.424076,53.847818,High Income,Middle East & North Africa,af,Afghanistan,33.93911,67.709953,Low Income,South Asia,0.19,0.16,0.11,-0.05,-0.02,,,
1,ae,United Arab Emirates,23.424076,53.847818,High Income,Middle East & North Africa,dz,Algeria,28.033886,1.659626,Upper Middle Income,Middle East & North Africa,0.19,0.25,0.57,0.55,0.78,,,
2,ae,United Arab Emirates,23.424076,53.847818,High Income,Middle East & North Africa,ao,Angola,-11.202692,17.873887,Lower Middle Income,Sub-Saharan Africa,-0.01,0.04,0.11,-0.02,-0.06,,,
3,ae,United Arab Emirates,23.424076,53.847818,High Income,Middle East & North Africa,ar,Argentina,-38.416097,-63.616672,High Income,Latin America & Caribbean,0.16,0.18,0.04,0.01,0.23,,,
4,ae,United Arab Emirates,23.424076,53.847818,High Income,Middle East & North Africa,am,Armenia,40.069099,45.038189,Upper Middle Income,Europe & Central Asia,0.1,0.05,0.03,-0.01,0.02,,,
5,ae,United Arab Emirates,23.424076,53.847818,High Income,Middle East & North Africa,au,Australia,-25.274398,133.775136,High Income,East Asia & Pacific,-1.06,-3.31,-4.01,-4.58,-4.09,,,
6,ae,United Arab Emirates,23.424076,53.847818,High Income,Middle East & North Africa,at,Austria,47.516231,14.550072,High Income,Europe & Central Asia,0.11,-0.08,-0.07,-0.05,-0.16,,,
7,ae,United Arab Emirates,23.424076,53.847818,High Income,Middle East & North Africa,az,Azerbaijan,40.143105,47.576927,Upper Middle Income,Europe & Central Asia,0.24,0.25,0.1,0.05,0.04,,,
8,ae,United Arab Emirates,23.424076,53.847818,High Income,Middle East & North Africa,bh,Bahrain,25.930414,50.637772,High Income,Middle East & North Africa,0.9,0.89,0.38,-0.07,0.15,,,
9,ae,United Arab Emirates,23.424076,53.847818,High Income,Middle East & North Africa,bd,Bangladesh,23.684994,90.356331,Lower Middle Income,South Asia,-0.21,-0.21,-0.26,-0.18,-0.04,,,


In [234]:
subpillars[1]
subpillar = subpillars[1]
print(subpillar)

Innovation Ecosystem


In [235]:
df['Country Name'] = df.iloc[:,[1]]

df_sum = df.groupby('Country Name')[['net_per_10K_2019']].sum()
df_sum

Unnamed: 0_level_0,net_per_10K_2019
Country Name,Unnamed: 1_level_1
Afghanistan,23.01
Albania,-16.58
Algeria,-23.47
Angola,19.07
Argentina,-8.77
...,...
Vietnam,-7.94
West Bank and Gaza,-15.59
"Yemen, Rep.",2.89
Zambia,27.05


In [236]:
## create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df_sum['higher_is_better'] = True
df_sum['Year'] = 2019
df_sum['Indicator'] = indicator
df_sum['data_col'] = df_sum['net_per_10K_2019']
df_sum ['Sub-Pillar'] = subpillar

min_rank = df_sum['data_col'].min()
max_rank = df_sum['data_col'].max()

# transform 0-1 rank into 1-6
df_sum['new_rank_score'] = df_sum['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [237]:
df_sum = df_sum[['Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df_sum

Unnamed: 0_level_0,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,2019,"Country, Industry, Skill Migration Data (Country)",23.01,3.194208,True,Innovation Ecosystem
Albania,2019,"Country, Industry, Skill Migration Data (Country)",-16.58,2.634379,True,Innovation Ecosystem
Algeria,2019,"Country, Industry, Skill Migration Data (Country)",-23.47,2.536950,True,Innovation Ecosystem
Angola,2019,"Country, Industry, Skill Migration Data (Country)",19.07,3.138494,True,Innovation Ecosystem
Argentina,2019,"Country, Industry, Skill Migration Data (Country)",-8.77,2.744817,True,Innovation Ecosystem
...,...,...,...,...,...,...
Vietnam,2019,"Country, Industry, Skill Migration Data (Country)",-7.94,2.756554,True,Innovation Ecosystem
West Bank and Gaza,2019,"Country, Industry, Skill Migration Data (Country)",-15.59,2.648378,True,Innovation Ecosystem
"Yemen, Rep.",2019,"Country, Industry, Skill Migration Data (Country)",2.89,2.909698,True,Innovation Ecosystem
Zambia,2019,"Country, Industry, Skill Migration Data (Country)",27.05,3.251336,True,Innovation Ecosystem


In [238]:
df_sum.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [239]:
### 33. Digital Finance Ecosystem size

In [240]:
indicators[32]

# load data
indicator = indicators[32]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Digital Finance Ecosystem size
global_fintech_ranking


In [241]:
df.head(15)

Unnamed: 0,Rank,Country,Total Score (2021)
0,1,United States,31.789
1,2,United Kingdom,23.262
2,3,Singapore,19.176
3,4,Lithuania,17.343
4,5,Switzerland,16.018
5,6,The Netherlands,14.464
6,7,Sweden,14.272
7,8,Australia,13.555
8,9,Canada,13.322
9,10,Estonia,13.303


In [242]:
subpillars[1]
subpillar = subpillars[1]
print(subpillar)

Innovation Ecosystem


In [243]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2021
df['Indicator'] = indicator
df['data_col'] = df.iloc[:,[2]]
df['Country Name'] = df.iloc[:,[1]]
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# Have to shorten the column names to see if this works

In [244]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,United States,2021,Digital Finance Ecosystem size,31.789,6.000000,True,Innovation Ecosystem
1,United Kingdom,2021,Digital Finance Ecosystem size,23.262,4.469010,True,Innovation Ecosystem
2,Singapore,2021,Digital Finance Ecosystem size,19.176,3.735385,True,Innovation Ecosystem
3,Lithuania,2021,Digital Finance Ecosystem size,17.343,3.406277,True,Innovation Ecosystem
4,Switzerland,2021,Digital Finance Ecosystem size,16.018,3.168378,True,Innovation Ecosystem
...,...,...,...,...,...,...,...
60,Bangladesh,2021,Digital Finance Ecosystem size,5.073,1.203246,True,Innovation Ecosystem
61,Pakistan,2021,Digital Finance Ecosystem size,4.675,1.131787,True,Innovation Ecosystem
62,Uruguay,2021,Digital Finance Ecosystem size,4.562,1.111498,True,Innovation Ecosystem
63,Uganda,2021,Digital Finance Ecosystem size,4.037,1.017236,True,Innovation Ecosystem


In [245]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [246]:
### 34. Tech hubs & spaces size

In [247]:
indicators[33]

# load data
indicator = indicators[33]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Tech hubs & spaces size
tech_hubs


In [248]:
df.head(15)

#There are two problems:
#1. The column names need some overhaul (moving the second column to the top and make it the column names)
#2. This is a list of cities, not countries, may present probkems later. 

Unnamed: 0,City/metropolitan area,Overall Score,Ranking,Score (Research innovation),Ranking (Research innovation),Score (Innovation economy),Ranking (Innovation economy),Score (Innovation ecosystem),Ranking (Innovation ecosystem)
0,San Francisco ? San Jose,100.0,1,91.59,3,100.0,1,100.0,1
1,New York,88.44,2,100.0,1,67.63,11,94.26,2
2,Boston ? Cambridge ? Newton,85.57,3,98.49,2,67.91,10,87.73,4
3,Tokyo,84.75,4,82.99,10,90.92,2,76.37,15
4,Beijing,84.68,5,85.96,8,86.49,3,77.96,11
5,London,80.69,6,88.49,4,63.63,18,88.09,3
6,Seattle ? Tacoma ? Bellevue,77.61,7,81.8,14,69.47,9,80.04,9
7,Los Angeles ? Long Beach ? Anaheim,76.88,8,85.1,9,63.46,19,81.18,6
8,Baltimore ? Washington,76.72,9,87.96,5,63.74,15,77.9,12
9,Chapel Hill ? Durham ? Raleigh,76.58,10,87.13,7,64.2,14,77.81,13


In [249]:
### 35. Banking Ecosystem size

In [250]:
indicators[34]

# load data
indicator = indicators[34]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Banking Ecosystem size
banking_sector_size


In [251]:
df.head(15)

Unnamed: 0,Country,"Bank assets, percent of GDP, 2017"
0,Hong Kong,256.63
1,China,174.54
2,Denmark,172.28
3,New Zealand,157.73
4,Japan,157.51
5,Lebanon,157.18
6,Singapore,151.67
7,Qatar,149.95
8,South Korea,141.52
9,Australia,140.47


In [252]:
subpillars[1]
subpillar = subpillars[1]
print(subpillar)

Innovation Ecosystem


In [253]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2017
df['Indicator'] = indicator
df['data_col'] = df['Bank assets, percent of GDP, 2017']
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [254]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df


Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Hong Kong,2017,Banking Ecosystem size,256.63,6.000000,True,Innovation Ecosystem
1,China,2017,Banking Ecosystem size,174.54,4.380612,True,Innovation Ecosystem
2,Denmark,2017,Banking Ecosystem size,172.28,4.336029,True,Innovation Ecosystem
3,New Zealand,2017,Banking Ecosystem size,157.73,4.049002,True,Innovation Ecosystem
4,Japan,2017,Banking Ecosystem size,157.51,4.044662,True,Innovation Ecosystem
...,...,...,...,...,...,...,...
157,Sierra Leone,2017,Banking Ecosystem size,15.06,1.234554,True,Innovation Ecosystem
158,C.A. Republic,2017,Banking Ecosystem size,14.26,1.218772,True,Innovation Ecosystem
159,Sudan,2017,Banking Ecosystem size,11.86,1.171427,True,Innovation Ecosystem
160,Afghanistan,2017,Banking Ecosystem size,3.43,1.005129,True,Innovation Ecosystem


In [255]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [256]:
### 36. Angel Ecosystem size

In [257]:
indicators[35]

# load data
indicator = indicators[35]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Angel Ecosystem size
angel_investment


In [258]:
df.head(15)

Unnamed: 0,Country,Business angel investments (in million Euros)
0,United Kingdom,153.08
1,Germany,81.77
2,Spain,78.66
3,Finland,54.0
4,Sweden,44.08
5,France,43.0
6,Denmark,33.34
7,Switzerland,32.6
8,Italy,28.2
9,Russia,26.08


In [259]:
subpillars[1]
subpillar = subpillars[1]
print(subpillar)

Innovation Ecosystem


In [260]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df['higher_is_better'] = True
df['Year'] = 2019
df['Indicator'] = indicator
df['data_col'] = df['Business angel investments (in million Euros)']
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [261]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

# Okay but there are only EU countries

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,United Kingdom,2019,Angel Ecosystem size,153.08,6.0,True,Innovation Ecosystem
1,Germany,2019,Angel Ecosystem size,81.77,3.666099,True,Innovation Ecosystem
2,Spain,2019,Angel Ecosystem size,78.66,3.564312,True,Innovation Ecosystem
3,Finland,2019,Angel Ecosystem size,54.0,2.757217,True,Innovation Ecosystem
4,Sweden,2019,Angel Ecosystem size,44.08,2.432546,True,Innovation Ecosystem
5,France,2019,Angel Ecosystem size,43.0,2.397198,True,Innovation Ecosystem
6,Denmark,2019,Angel Ecosystem size,33.34,2.081037,True,Innovation Ecosystem
7,Switzerland,2019,Angel Ecosystem size,32.6,2.056817,True,Innovation Ecosystem
8,Italy,2019,Angel Ecosystem size,28.2,1.91281,True,Innovation Ecosystem
9,Russia,2019,Angel Ecosystem size,26.08,1.843425,True,Innovation Ecosystem


In [262]:
df.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [263]:
### 37. Startup Ecosystem size

In [264]:
indicators[36]

# load data
indicator = indicators[36]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Startup Ecosystem size
startup_eco_size


In [265]:
df.head(15)

# This is a list of cities, not countries, wonder if it will work 

Unnamed: 0,Location,Ranking (2019)
0,Silicon Valley,1
1,New York City,2 (tie)
2,London,2 (tie)
3,Beijing,4
4,Boston,5
5,Tel Aviv - Jerusalem,6 (tie)
6,Los Angeles,6 (tie)
7,Shanghai,8
8,Seattle,9
9,Stockholm,10


In [266]:
### 38. Venture Ecosystem size

indicators[37]

# load data
indicator = indicators[37]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Venture Ecosystem size
startup_eco_size


In [267]:
df.head(15)

# Similar to the previous one

Unnamed: 0,Location,Ranking (2019)
0,Silicon Valley,1
1,New York City,2 (tie)
2,London,2 (tie)
3,Beijing,4
4,Boston,5
5,Tel Aviv - Jerusalem,6 (tie)
6,Los Angeles,6 (tie)
7,Shanghai,8
8,Seattle,9
9,Stockholm,10


In [268]:
### 39. International Co-Inventions

In [269]:
indicators[38]

# load data
indicator = indicators[38]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

International Co-Inventions
international_co_inventions


In [270]:
df.head(15)

Unnamed: 0,KINDPATENT,Patent office,KINDCOOP,Type of International Cooperation in Patenting,LOCATION,Country,PARTNER,Partner Country,KINDDATE,Reference Date,...,Time,Unit Code,Unit,PowerCode Code,PowerCode,Reference Period Code,Reference Period,Value,Flag Codes,Flags
0,EPO_A,Patent applications to the EPO,FOR_APP,Foreign ownership of domestic inventions,AUS,Australia,TOTAL,Total Patents,PRIORITY,Priority date,...,1999,NBR,Number,0,Units,,,1082.0,,
1,EPO_A,Patent applications to the EPO,FOR_APP,Foreign ownership of domestic inventions,AUS,Australia,TOTAL,Total Patents,PRIORITY,Priority date,...,2000,NBR,Number,0,Units,,,1159.0,,
2,EPO_A,Patent applications to the EPO,FOR_APP,Foreign ownership of domestic inventions,AUS,Australia,TOTAL,Total Patents,PRIORITY,Priority date,...,2001,NBR,Number,0,Units,,,1119.0,,
3,EPO_A,Patent applications to the EPO,FOR_APP,Foreign ownership of domestic inventions,AUS,Australia,TOTAL,Total Patents,PRIORITY,Priority date,...,2002,NBR,Number,0,Units,,,1193.0,,
4,EPO_A,Patent applications to the EPO,FOR_APP,Foreign ownership of domestic inventions,AUS,Australia,TOTAL,Total Patents,PRIORITY,Priority date,...,2003,NBR,Number,0,Units,,,1238.0,,
5,EPO_A,Patent applications to the EPO,FOR_APP,Foreign ownership of domestic inventions,AUS,Australia,TOTAL,Total Patents,PRIORITY,Priority date,...,2004,NBR,Number,0,Units,,,1336.0,,
6,EPO_A,Patent applications to the EPO,FOR_APP,Foreign ownership of domestic inventions,AUS,Australia,TOTAL,Total Patents,PRIORITY,Priority date,...,2005,NBR,Number,0,Units,,,1332.0,,
7,EPO_A,Patent applications to the EPO,FOR_APP,Foreign ownership of domestic inventions,AUS,Australia,TOTAL,Total Patents,PRIORITY,Priority date,...,2006,NBR,Number,0,Units,,,1219.0,,
8,EPO_A,Patent applications to the EPO,FOR_APP,Foreign ownership of domestic inventions,AUS,Australia,TOTAL,Total Patents,PRIORITY,Priority date,...,2007,NBR,Number,0,Units,,,1131.0,,
9,EPO_A,Patent applications to the EPO,FOR_APP,Foreign ownership of domestic inventions,AUS,Australia,TOTAL,Total Patents,PRIORITY,Priority date,...,2008,NBR,Number,0,Units,,,1110.0,,


In [271]:
subpillars[1]
subpillar = subpillars[1]
print(subpillar)

Innovation Ecosystem


In [272]:
# filter most recent year and global value
df = df[(df.Time==2017)]
df = df[(df.Country!='World')]
df ['Country Name'] = df ['Country']
df

Unnamed: 0,KINDPATENT,Patent office,KINDCOOP,Type of International Cooperation in Patenting,LOCATION,Country,PARTNER,Partner Country,KINDDATE,Reference Date,...,Unit Code,Unit,PowerCode Code,PowerCode,Reference Period Code,Reference Period,Value,Flag Codes,Flags,Country Name
36,EPO_A,Patent applications to the EPO,FOR_APP,Foreign ownership of domestic inventions,AUS,Australia,TOTAL,Total Patents,APPLICATION,Application date,...,NBR,Number,0,Units,,,1077.0000,,,Australia
73,EPO_A,Patent applications to the EPO,FOR_APP,Foreign ownership of domestic inventions,AUS,Australia,WRD,Total co-operation with abroad,APPLICATION,Application date,...,NBR,Number,0,Units,,,277.0000,,,Australia
110,EPO_A,Patent applications to the EPO,FOR_APP,Foreign ownership of domestic inventions,AUS,Australia,JPN,Japan,APPLICATION,Application date,...,NBR,Number,0,Units,,,11.0000,,,Australia
147,EPO_A,Patent applications to the EPO,FOR_APP,Foreign ownership of domestic inventions,AUS,Australia,USA,United States,APPLICATION,Application date,...,NBR,Number,0,Units,,,137.0000,,,Australia
184,EPO_A,Patent applications to the EPO,FOR_APP,Foreign ownership of domestic inventions,AUS,Australia,EU27,European Union (27),APPLICATION,Application date,...,NBR,Number,0,Units,,,62.0000,,,Australia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27024,USPTO_A,Patent applications to the USPTO,CO_INV_S,% of patents with foreign co-inventor(s),USA,United States,EU27,European Union (27),APPLICATION,Application date,...,PC,Percentage,0,Units,,,3.6198,,,United States
27062,USPTO_A,Patent applications to the USPTO,CO_INV_S,% of patents with foreign co-inventor(s),EU27,European Union (27 countries),WRD,Total co-operation with abroad,APPLICATION,Application date,...,PC,Percentage,0,Units,,,20.9912,,,European Union (27 countries)
27081,USPTO_A,Patent applications to the USPTO,CO_INV_S,% of patents with foreign co-inventor(s),EU27,European Union (27 countries),JPN,Japan,APPLICATION,Application date,...,PC,Percentage,0,Units,,,0.7465,,,European Union (27 countries)
27100,USPTO_A,Patent applications to the USPTO,CO_INV_S,% of patents with foreign co-inventor(s),EU27,European Union (27 countries),USA,United States,APPLICATION,Application date,...,PC,Percentage,0,Units,,,13.4261,,,European Union (27 countries)


In [273]:
# Create summarization
df_sum = df.groupby('Country Name')[['Value']].sum()
df_sum

Unnamed: 0_level_0,Value
Country Name,Unnamed: 1_level_1
Australia,31165.07
Canada,86286.4
European Union (27 countries),797654.3
European Union (28 countries),0.0
Japan,545770.6
Netherlands,74388.52
United States,1224357.0


In [274]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)

df_sum['higher_is_better'] = True
df_sum['Year'] = 2017
df_sum['Indicator'] = indicator
df_sum['data_col'] = df_sum.iloc[:,[0]]
df_sum['Sub-Pillar'] = subpillar

min_rank = df_sum['data_col'].min()
max_rank = df_sum['data_col'].max()

# transform 0-1 rank into 1-6
df_sum['new_rank_score'] = df_sum['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [275]:
df_sum = df_sum[['Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df_sum
# Okay but the data is too general with many missing countries

Unnamed: 0_level_0,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Australia,2017,International Co-Inventions,31165.07,1.127271,True,Innovation Ecosystem
Canada,2017,International Co-Inventions,86286.4,1.352374,True,Innovation Ecosystem
European Union (27 countries),2017,International Co-Inventions,797654.3,4.257441,True,Innovation Ecosystem
European Union (28 countries),2017,International Co-Inventions,0.0,1.0,True,Innovation Ecosystem
Japan,2017,International Co-Inventions,545770.6,3.228805,True,Innovation Ecosystem
Netherlands,2017,International Co-Inventions,74388.52,1.303786,True,Innovation Ecosystem
United States,2017,International Co-Inventions,1224357.0,6.0,True,Innovation Ecosystem


In [276]:
df_sum.to_csv('../indicator_scores/infrastructure_{}_scores.csv'.format(indicator), index=False)

In [277]:
### Score Aggregating

In [278]:
import os

In [279]:
# get list of files in scores folder
scores = os.listdir('../indicator_scores/')
scores = [s for s in scores if s.startswith('infrastructure')]

In [280]:
scores

['infrastructure_% of population covered by electricity_scores.csv',
 'infrastructure_% of population covered by internet connectivity_scores.csv',
 'infrastructure_% of population covered by mobile 2G+ data connectivity_scores.csv',
 'infrastructure_% of population covered by mobile 3G+ data connectivity_scores.csv',
 'infrastructure_% of population covered by mobile 4G+ data connectivity_scores.csv',
 'infrastructure_% of population covered by mobile 5G+ data connectivity_scores.csv',
 'infrastructure_Angel Ecosystem size_scores.csv',
 'infrastructure_Average fixed broadband download speeds _scores.csv',
 'infrastructure_Banking Ecosystem size_scores.csv',
 'infrastructure_Cellphone Signal Density_scores.csv',
 'infrastructure_Country, Industry, Skill Migration Data (Country)_scores.csv',
 'infrastructure_Country, Industry, Skill Migration Data (Industry)_scores.csv',
 'infrastructure_Country, Industry, Skill Migration Data (Skill)_scores.csv',
 'infrastructure_Digital Finance Ecosys

In [281]:
# create a dataframe that concatenates all these file into one table
df = pd.concat([pd.read_csv('../indicator_scores/{}'.format(s)) for s in scores])    

In [282]:
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Aruba,2019.0,% of population covered by electricity,100.000000,6.000000,True,Connectivity Technology
1,Africa Eastern and Southern,2019.0,% of population covered by electricity,43.640661,2.979006,True,Connectivity Technology
2,Afghanistan,2019.0,% of population covered by electricity,97.700000,5.876715,True,Connectivity Technology
3,Africa Western and Central,2019.0,% of population covered by electricity,51.341421,3.391785,True,Connectivity Technology
4,Angola,2019.0,% of population covered by electricity,45.670315,3.087800,True,Connectivity Technology
...,...,...,...,...,...,...,...
188,Senegal,2020.0,Telecommunication Infrastructure Index (TII),0.435800,3.179000,True,Connectivity Technology
189,Serbia,2020.0,Telecommunication Infrastructure Index (TII),0.620000,4.100000,True,Connectivity Technology
190,Seychelles,2020.0,Telecommunication Infrastructure Index (TII),0.692500,4.462500,True,Connectivity Technology
191,Singapore,2020.0,Telecommunication Infrastructure Index (TII),0.889900,5.449500,True,Connectivity Technology


In [283]:
# Data cleaning
df['new_rank_score'] = df['new_rank_score'].fillna(0)
df.sort_values(by=['Country Name'], ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)

# remove trailing whitespaces from country name
df['Country Name'] = df['Country Name'].str.strip()
df['Country Name'] = df['Country Name'].str.strip('**')
df['Country Name'] = df['Country Name'].str.strip('*')


# Replace values that are not truly country names with nan
df['Country Name'] = df['Country Name'].replace('n.a. : non avalaible',np.nan)
df['Country Name'] = df['Country Name'].replace('nan',np.nan)
df['Country Name'] = df['Country Name'].replace('Not classified',np.nan)
df['Country Name'] = df['Country Name'].replace('Source :',np.nan)
df['Country Name'] = df['Country Name'].replace('© Copyright Enerdata. Reproduction and diffusion prohibited (web, photocopy, intranet...) without written permission.',np.nan)

# Dropping the columns having NaN/NaT values
df = df[df['Country Name'].notna()]


df['Country Name'] = df['Country Name'].astype(str, errors = 'ignore')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Country Name'] = df['Country Name'].astype(str, errors = 'ignore')


In [284]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4614 entries, 0 to 4615
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country Name      4614 non-null   object 
 1   Year              4614 non-null   float64
 2   Indicator         4614 non-null   object 
 3   data_col          4349 non-null   float64
 4   new_rank_score    4614 non-null   float64
 5   higher_is_better  4614 non-null   bool   
 6   Sub-Pillar        4614 non-null   object 
dtypes: bool(1), float64(3), object(3)
memory usage: 256.8+ KB


In [285]:
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Afghanistan,2019.0,% of population covered by mobile 3G+ data con...,58.7,3.282895,True,Connectivity Technology
1,Afghanistan,2019.0,% of population covered by mobile 2G+ data con...,90.0,4.921251,True,Connectivity Technology
2,Afghanistan,2019.0,Mobile dowload speed at the slowest hour of th...,42.627621,3.618652,True,Connectivity Technology
3,Afghanistan,2020.0,Global Cybersecurity Index (GCI),5.2,1.209677,True,Connectivity Technology
4,Afghanistan,2021.0,Average fixed broadband download speeds,10.31,1.154794,True,Connectivity Technology
5,Afghanistan,2020.0,Mobile Coverage Maps,58.255812,1.289136,True,Connectivity Technology
6,Afghanistan,2017.0,Banking Ecosystem size,3.43,1.005129,True,Innovation Ecosystem
7,Afghanistan,2019.0,% of population covered by mobile 5G+ data con...,0.0,1.0,True,Connectivity Technology
8,Afghanistan,2019.0,Cellphone Signal Density,38.48,2.117926,True,Connectivity Technology
9,Afghanistan,2020.0,Telecommunication Infrastructure Index (TII),0.1762,1.881,True,Connectivity Technology


In [286]:
sorted(df['Country Name'].unique().tolist())

['Afghanistan',
 'Africa Eastern and Southern',
 'Africa Western and Central',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Ant.& Barb.',
 'Antigua and Barbuda',
 'Arab World',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahamas, The',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belgium Estonia',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bolivia (Plurinational State of)',
 'Bosnia & Herz.',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'British Virgin Islands',
 'Brunei',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burma',
 'Burundi',
 'C.A. Republic',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Caribbean small states',
 'Cayman Islands',
 'Central African Rep.',
 'Central African Republic',
 'Central Europe and the Baltics',
 'Chad',
 'Channel Islands',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo',
 'Congo (Democratic Republic of

In [287]:
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Afghanistan,2019.0,% of population covered by mobile 3G+ data con...,58.7,3.282895,True,Connectivity Technology
1,Afghanistan,2019.0,% of population covered by mobile 2G+ data con...,90.0,4.921251,True,Connectivity Technology
2,Afghanistan,2019.0,Mobile dowload speed at the slowest hour of th...,42.627621,3.618652,True,Connectivity Technology
3,Afghanistan,2020.0,Global Cybersecurity Index (GCI),5.2,1.209677,True,Connectivity Technology
4,Afghanistan,2021.0,Average fixed broadband download speeds,10.31,1.154794,True,Connectivity Technology
5,Afghanistan,2020.0,Mobile Coverage Maps,58.255812,1.289136,True,Connectivity Technology
6,Afghanistan,2017.0,Banking Ecosystem size,3.43,1.005129,True,Innovation Ecosystem
7,Afghanistan,2019.0,% of population covered by mobile 5G+ data con...,0.0,1.0,True,Connectivity Technology
8,Afghanistan,2019.0,Cellphone Signal Density,38.48,2.117926,True,Connectivity Technology
9,Afghanistan,2020.0,Telecommunication Infrastructure Index (TII),0.1762,1.881,True,Connectivity Technology


In [288]:
# average indicator scores per country
agg_df = df.groupby(['Country Name']).agg({'new_rank_score':'mean','data_col':'count'})

In [289]:
agg_df.columns = ['agg_score', 'count_source' ]

In [290]:
max_number_sources = agg_df.describe()['count_source']['max']

In [291]:
agg_df['agg_score_wt'] = agg_df['agg_score']*(agg_df['count_source']/max_number_sources)

In [292]:
agg_df.sort_values(by='agg_score', ascending=False, inplace=True)

In [293]:
agg_df.head(25)

Unnamed: 0_level_0,agg_score,count_source,agg_score_wt
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Turks and Caicos Islands,6.0,2,0.413793
Northern Mariana Islands,6.0,2,0.413793
Euro area,6.0,2,0.413793
St. Martin (French part),6.0,2,0.413793
Curacao,6.0,2,0.413793
Channel Islands,6.0,2,0.413793
Central Europe and the Baltics,6.0,2,0.413793
British Virgin Islands,6.0,2,0.413793
Guam,6.0,2,0.413793
Sint Maarten (Dutch part),6.0,2,0.413793


In [294]:
agg_df.to_csv('../pillar_scores/infrastructure_scores_v0.csv')

In [295]:
### Score Aggregating by Individual Indicators

In [296]:
# get list of files in scores folder
scores = os.listdir('../indicator_scores/')
scores
df = pd.concat([pd.read_csv('../indicator_scores/{}'.format(s)) for s in scores])    
df = df[['Sub-Pillar','Indicator','Country Name','Year','data_col','new_rank_score','higher_is_better']]
df

Unnamed: 0,Sub-Pillar,Indicator,Country Name,Year,data_col,new_rank_score,higher_is_better
0,Technology Adoption,"Cloud Services (Spend, IT Forecast Data)",Germany,2018.0,18.2,4.739130,True
1,Technology Adoption,"Cloud Services (Spend, IT Forecast Data)",Japan,2018.0,20.3,5.195652,True
2,Technology Adoption,"Cloud Services (Spend, IT Forecast Data)",United States,2018.0,18.0,4.695652,True
3,Technology Adoption,"Cloud Services (Spend, IT Forecast Data)",United Kingdom,2018.0,19.8,5.086957,True
4,Technology Adoption,"Cloud Services (Spend, IT Forecast Data)",Australia,2018.0,16.1,4.282609,True
...,...,...,...,...,...,...,...
200,Ambition,SDG Index,Sub-Saharan Africa,2019.0,51.9,2.428571,True
201,Ambition,SDG Index,Low-income Countries,2019.0,51.0,2.334034,True
202,Ambition,SDG Index,Lower-middle-income Countries,2019.0,60.1,3.289916,True
203,Ambition,SDG Index,Upper-middle-income Countries,2019.0,70.8,4.413866,True


In [297]:
# remove trailing whitespaces from country name
df['Country Name'] = df['Country Name'].str.strip()
df['Country Name'] = df['Country Name'].str.strip('**')
df['Country Name'] = df['Country Name'].str.strip('*')

In [298]:
uzb = df[(df['Country Name'] == 'Uzbekistan')]
uzb
uzb.to_csv('../country_scores/Uzbekistan_Indicator.csv')

In [299]:
### Score Aggregating by Subpillars

In [300]:
df.insert(0,'Pillar','Infrastructure')
df

Unnamed: 0,Pillar,Sub-Pillar,Indicator,Country Name,Year,data_col,new_rank_score,higher_is_better
0,Infrastructure,Technology Adoption,"Cloud Services (Spend, IT Forecast Data)",Germany,2018.0,18.2,4.739130,True
1,Infrastructure,Technology Adoption,"Cloud Services (Spend, IT Forecast Data)",Japan,2018.0,20.3,5.195652,True
2,Infrastructure,Technology Adoption,"Cloud Services (Spend, IT Forecast Data)",United States,2018.0,18.0,4.695652,True
3,Infrastructure,Technology Adoption,"Cloud Services (Spend, IT Forecast Data)",United Kingdom,2018.0,19.8,5.086957,True
4,Infrastructure,Technology Adoption,"Cloud Services (Spend, IT Forecast Data)",Australia,2018.0,16.1,4.282609,True
...,...,...,...,...,...,...,...,...
200,Infrastructure,Ambition,SDG Index,Sub-Saharan Africa,2019.0,51.9,2.428571,True
201,Infrastructure,Ambition,SDG Index,Low-income Countries,2019.0,51.0,2.334034,True
202,Infrastructure,Ambition,SDG Index,Lower-middle-income Countries,2019.0,60.1,3.289916,True
203,Infrastructure,Ambition,SDG Index,Upper-middle-income Countries,2019.0,70.8,4.413866,True


In [301]:
sub_df = df.groupby(['Pillar','Sub-Pillar','Country Name']).agg({'new_rank_score':'mean','data_col':'count'})

In [302]:
sub_df.columns = ['agg_score', 'count_source' ]

In [303]:
max_number_sources = sub_df.describe()['count_source']['max']

In [304]:
sub_df['agg_score_wt'] = sub_df['agg_score']*(sub_df['count_source']/max_number_sources)

In [305]:
sub_df.to_csv('../subpillar_score/people_scores_subpillar_v0.csv')

In [306]:
#Test Uzbekistan

In [307]:
# get list of files in scores folder
scores = os.listdir('../subpillar_score/')
scores

df = pd.concat([pd.read_csv('../subpillar_score/{}'.format(s)) for s in scores])    

df

Unnamed: 0,Pillar,Sub-Pillar,Country Name,agg_score,count_source,agg_score_wt
0,Business,Financing Incentives,Albania,3.142857,1,0.392857
1,Business,Financing Incentives,Algeria,4.285714,1,0.535714
2,Business,Financing Incentives,Angola,1.000000,1,0.125000
3,Business,Financing Incentives,Argentina,1.821429,1,0.227679
4,Business,Financing Incentives,Armenia,4.035714,1,0.504464
...,...,...,...,...,...,...
200,Strategy,Ambition,St. Lucia,,0,
201,Strategy,Ambition,St. Vincent and the Grenadines,,0,
202,Strategy,Ambition,Timor-Leste,,0,
203,Strategy,Ambition,Tonga,,0,


In [308]:
uzb = df[(df['Country Name'] == 'Uzbekistan')]
uzb
uzb.to_csv('../country_scores/Uzbekistan.csv')

In [309]:
#Test France

In [310]:
fr = df[(df['Country Name'] == 'France')]
fr
fr.to_csv('../country_scores/France.csv')

In [311]:
#Test Germany

In [312]:
ger = df[(df['Country Name'] == 'Germany')]
ger
ger.to_csv('../country_scores/Germany.csv')

In [313]:
#Test Sweden

In [314]:
sw = df[(df['Country Name'] == 'Sweden')]
sw
sw.to_csv('../country_scores/Sweden.csv')

In [315]:
#Test Japan

In [316]:
jp = df[(df['Country Name'] == 'Japan')]
jp
jp.to_csv('../country_scores/Japan.csv')