In [1]:
import pandas as pd
import numpy as np

### Get all the pillar names from the excel

In [2]:
names = pd.read_excel('../../UNDP Digital Assessment Data Framework Filename Matching V7.xlsx')

In [3]:
col_names = ['Indicator','check', 'Data Source','Index','Filename','Sub-Pillar']

In [4]:
names = names[col_names]

In [5]:
names.head()

Unnamed: 0,Indicator,check,Data Source,Index,Filename,Sub-Pillar
0,Countries,,UN Statistics Division: List of Countries,False,Countries,
1,"Database of Global Administrative Areas (GADM,...",,,False,,
2,High Resolution Population Density Maps + Demo...,,,False,,
3,population density vs openstreetmap object den...,,,False,,
4,Population Density,Infrastructure,World Bank: World Development Indicators,False,population_density,Connectivity Technology


In [6]:
# get all the files per pillar
data_stats = names.groupby('check').agg({'Filename':'count','Indicator':'count'})

In [7]:
data_stats

Unnamed: 0_level_0,Filename,Indicator
check,Unnamed: 1_level_1,Unnamed: 2_level_1
Business,18,27
Foundations,14,22
Government,10,15
Infrastructure,47,58
People,35,49
Regulation,5,8
Strategy,1,1


### Business

In [8]:
bnames = names[(names.check=='Business')&(~names.Filename.isna())]#&(names.Index==False)]

In [9]:
bnames.head(25)

Unnamed: 0,Indicator,check,Data Source,Index,Filename,Sub-Pillar
86,UNCTAD Business-to-Consumer (B2C) E-commerce I...,Business,UNCTAD: Business-to-Consumer (B2C) E-commerce...,True,b2c_ecommerse_idx,Technology Adoption
87,"Networking Services (Spend, IT Forecast Data)",Business,Portulans Institute: Network Readiness Index,True,network_readiness_index,Technology Adoption
92,"Cloud Services (Spend, IT Forecast Data)",Business,Statista,True,cloud_services,Technology Adoption
93,ICT task-intensive jobs as a percentage of tot...,Business,OECD: Going Digital Toolkit,False,ICT_proportion,Technology Adoption
96,Share of business with internet,Business,OECD: ICT Access and Usage by Businesses,False,business_internet,Technology Adoption
97,Share of businesses with broadband,Business,OECD: Innovation Indicators,False,business_broadband,Technology Adoption
98,Share of businesses with online presence,Business,Portulans Institute: Network Readiness Index,False,share_of_businesses_online_presence,Technology Adoption
99,Size of gig economy (% of GDP),Business,Portulans Institute: Network Readiness Index,False,prevalance_gig_economy,Technology Adoption
100,Size of digital economy (% of transactions),Business,Portulans Institute: Network Readiness Index,False,size_digital_economy,Technology Adoption
101,Venture Capital Availability,Business,World Economic Forum: Global Competitiveness I...,True,TCdata360,Financing Incentives


In [10]:
# get list of names for all indicators
indicators = bnames.Indicator.unique()
subpillars = bnames['Sub-Pillar'].unique()

In [11]:
# get all file names
bfiles = bnames.Filename.unique()

In [12]:
bfiles

array(['b2c_ecommerse_idx', 'network_readiness_index', 'cloud_services',
       'ICT_proportion', 'business_internet', 'business_broadband',
       'share_of_businesses_online_presence', 'prevalance_gig_economy',
       'size_digital_economy', 'TCdata360', 'doing_bus_idx',
       'legal_rights_strength', 'time_start_bus', 'ease_doing_bus',
       'ease_of_finding_skilled_employees', 'start_up_investment',
       'global_innovation_dataset'], dtype=object)

In [13]:
subpillars

array(['Technology Adoption', 'Financing Incentives',
       'Startup Environment'], dtype=object)

In [14]:
# ls digital-readiness-assessment-main/processed/

In [15]:
# formula for converting scale
def convert_rank(old_value, old_min=1, old_max=7, new_min=1, new_max=6 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value

### 1. 'UNCTAD Business-to-Consumer (B2C) E-commerce Index'

In [16]:
indicators[0]

'UNCTAD Business-to-Consumer (B2C) E-commerce Index'

In [17]:
# load data
indicator = indicators[0]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

UNCTAD Business-to-Consumer (B2C) E-commerce Index
b2c_ecommerse_idx


In [18]:
df.head()

Unnamed: 0,2020 Rannk,Economy,Share of individuals using the Internet (2019 or latest),"Share of individuals with an account (15+, 2017)","Secure Internet servers (normalize d, 2019)",UPU postal reliability score (2019 or latest),2020 Index value,Index value change (2019-20data),Rank 2019
0,1,Switzerland,97,98,92,97,95.9,-0.1,2
1,2,Netherlands,96,100,94,93,95.8,-0.7,1
2,3,Denmark,97,100,100,81,94.5,0.2,6
3,4,Singapore,89,98,94,97,94.4,-0.4,3
4,5,United Kingdom,96,96,84,98,93.6,-0.8,4


In [19]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Technology Adoption


In [20]:
# create standard columns
df.rename(columns={'Economy':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2020 Index value'] 
df['Sub-Pillar'] = subpillar
df['Year'] = 2020

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [21]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Switzerland,2020,UNCTAD Business-to-Consumer (B2C) E-commerce I...,95.9,6.000000,True,Technology Adoption
1,Netherlands,2020,UNCTAD Business-to-Consumer (B2C) E-commerce I...,95.8,5.994463,True,Technology Adoption
2,Denmark,2020,UNCTAD Business-to-Consumer (B2C) E-commerce I...,94.5,5.922481,True,Technology Adoption
3,Singapore,2020,UNCTAD Business-to-Consumer (B2C) E-commerce I...,94.4,5.916944,True,Technology Adoption
4,United Kingdom,2020,UNCTAD Business-to-Consumer (B2C) E-commerce I...,93.6,5.872647,True,Technology Adoption
...,...,...,...,...,...,...,...
147,Dem. Rep. of the Congo,2020,UNCTAD Business-to-Consumer (B2C) E-commerce I...,12.8,1.398671,True,Technology Adoption
148,Comoros,2020,UNCTAD Business-to-Consumer (B2C) E-commerce I...,12.0,1.354374,True,Technology Adoption
149,Burundi,2020,UNCTAD Business-to-Consumer (B2C) E-commerce I...,8.3,1.149502,True,Technology Adoption
150,Chad,2020,UNCTAD Business-to-Consumer (B2C) E-commerce I...,7.1,1.083056,True,Technology Adoption


In [22]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator))

## 2. Networking Services (Spend, IT Forecast Data)


In [23]:
indicator = indicators[1]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Networking Services (Spend, IT Forecast Data)
network_readiness_index


In [24]:
df.head(16)

Unnamed: 0,Rank,Country,Score,Income Group,Region
0,1,Sweden,82.75,High-income,Europe
1,2,Denmark,82.19,High-income,Europe
2,3,Singapore,81.39,High-income,Asia & Pacific
3,4,Netherlands,81.37,High-income,Europe
4,5,Switzerland,80.41,High-income,Europe
5,6,Finland,80.16,High-income,Europe
6,7,Norway,79.39,High-income,Europe
7,8,United States,78.91,High-income,The Americas
8,9,Germany,77.48,High-income,Europe
9,10,United Kingdom,76.27,High-income,Europe


In [25]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Technology Adoption


In [26]:
# convert rank into 1-6 - in order to get most recent data 
df['data_col'] = df['Score'] 


# going to assume index is between 1-100 but not 100% sure
min_rank = 1 #df['data_col'].min()
max_rank = 100 #df['data_col'].max()

# transform 1-100 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# need to invert score since higher rank is not better 
# df['new_rank_score'] = (6-df['new_rank_score'])+1

In [27]:
# prepare output
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['Indicator'] = indicator
df['higher_is_better'] = True
df['Year'] = 2020
df['Sub-Pillar'] = subpillar

In [28]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Sweden,2020,"Networking Services (Spend, IT Forecast Data)",82.75,5.128788,True,Technology Adoption
1,Denmark,2020,"Networking Services (Spend, IT Forecast Data)",82.19,5.100505,True,Technology Adoption
2,Singapore,2020,"Networking Services (Spend, IT Forecast Data)",81.39,5.060101,True,Technology Adoption
3,Netherlands,2020,"Networking Services (Spend, IT Forecast Data)",81.37,5.059091,True,Technology Adoption
4,Switzerland,2020,"Networking Services (Spend, IT Forecast Data)",80.41,5.010606,True,Technology Adoption
...,...,...,...,...,...,...,...
129,Burundi,2020,"Networking Services (Spend, IT Forecast Data)",22.62,2.091919,True,Technology Adoption
130,Angola,2020,"Networking Services (Spend, IT Forecast Data)",20.96,2.008081,True,Technology Adoption
131,Yemen,2020,"Networking Services (Spend, IT Forecast Data)",18.00,1.858586,True,Technology Adoption
132,"Congo, Dem. Rep.",2020,"Networking Services (Spend, IT Forecast Data)",16.60,1.787879,True,Technology Adoption


In [29]:
# output scores
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator))

### 3. Cloud Services (Spend, IT Forecast Data)


In [30]:
indicator = indicators[2]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Cloud Services (Spend, IT Forecast Data)
cloud_services


In [31]:
# remove nulls
df = df.dropna()
df

Unnamed: 0,Cloud computing policy environment by category - country ranking 2018,Unnamed: 1
2,Germany,18.2
3,Japan,20.3
4,United States,18.0
5,United Kingdom,19.8
6,Australia,16.1
7,Singapore,20.7
8,Canada,17.0
9,France,17.3
10,Italy,15.0
11,Spain,16.6


In [32]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Technology Adoption


In [33]:
# prepare standard columns
df['data_col'] = df['Unnamed: 1'].astype(float)
df['Country Name'] = df.iloc[:,0]
df['Indicator'] = indicator
df['higher_is_better'] = True
df['Year'] = 2018
df['Sub-Pillar'] = subpillar

In [34]:
min_rank = 1
max_rank = df['Country Name'].nunique()

In [35]:
# transform 1-24 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# need to invert score since higher rank is not better 
# df['new_rank_score'] = (6-df['new_rank_score'])+1

In [36]:
# prepare output
df.sort_values(by='new_rank_score', ascending=False)

# prepare output
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]

# output scores
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

In [37]:
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
2,Germany,2018,"Cloud Services (Spend, IT Forecast Data)",18.2,4.73913,True,Technology Adoption
3,Japan,2018,"Cloud Services (Spend, IT Forecast Data)",20.3,5.195652,True,Technology Adoption
4,United States,2018,"Cloud Services (Spend, IT Forecast Data)",18.0,4.695652,True,Technology Adoption
5,United Kingdom,2018,"Cloud Services (Spend, IT Forecast Data)",19.8,5.086957,True,Technology Adoption
6,Australia,2018,"Cloud Services (Spend, IT Forecast Data)",16.1,4.282609,True,Technology Adoption
7,Singapore,2018,"Cloud Services (Spend, IT Forecast Data)",20.7,5.282609,True,Technology Adoption
8,Canada,2018,"Cloud Services (Spend, IT Forecast Data)",17.0,4.478261,True,Technology Adoption
9,France,2018,"Cloud Services (Spend, IT Forecast Data)",17.3,4.543478,True,Technology Adoption
10,Italy,2018,"Cloud Services (Spend, IT Forecast Data)",15.0,4.043478,True,Technology Adoption
11,Spain,2018,"Cloud Services (Spend, IT Forecast Data)",16.6,4.391304,True,Technology Adoption


## 4. ICT task-intensive jobs as a percentage of total employment

In [38]:
indicator = indicators[3]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

ICT task-intensive jobs as a percentage of total employment
ICT_proportion


In [39]:
df.head()

Unnamed: 0,Indicator,Country,Industry,Information and communication technologies,Sex,Measure,Time,Value,Flags
0,ICT specialists within and outside information...,Austria,Total economy,Specialist (ISCO-08: 133+215+251+252+351+352+742),Total,Share of jobs,2011,3.1764,
1,ICT specialists within and outside information...,Austria,Total economy,Specialist (ISCO-08: 133+215+251+252+351+352+742),Total,Share of jobs,2012,3.225967,
2,ICT specialists within and outside information...,Austria,Total economy,Specialist (ISCO-08: 133+215+251+252+351+352+742),Total,Share of jobs,2013,3.346251,
3,ICT specialists within and outside information...,Austria,Total economy,Specialist (ISCO-08: 133+215+251+252+351+352+742),Total,Share of jobs,2014,3.3191,
4,ICT specialists within and outside information...,Austria,Total economy,Specialist (ISCO-08: 133+215+251+252+351+352+742),Total,Share of jobs,2015,3.72934,


In [40]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Technology Adoption


In [41]:
df[(df['Time']==2018)&(df['Information and communication technologies']=='ICT-intensive')].sort_values(by='Value', ascending=False)

Unnamed: 0,Indicator,Country,Industry,Information and communication technologies,Sex,Measure,Time,Value,Flags


In [42]:
# bnames

In [43]:
df['Information and communication technologies'].unique()

array(['Specialist (ISCO-08: 133+215+251+252+351+352+742)',
       'Other ICT-intensive (ISCO-08: 121+122,134+,211+,216+,231+,241+,242+243)',
       'Non-ICT (rest of ISCO-08 occupations)', 'ICT-intensive', 'Total'],
      dtype=object)

In [44]:
df.Sex.unique()

array(['Total'], dtype=object)

In [45]:
# convert to correct types
df['Value'] = df['Value'].astype(float)

In [46]:
df['Value'].describe()

count    985.000000
mean      42.353406
std       42.614469
min        0.890157
25%        5.526795
50%       12.887070
75%       91.318100
max      100.000000
Name: Value, dtype: float64

In [47]:
# filter on relevant years
df = df[(df['Time']==2017)&(df['Information and communication technologies']=='ICT-intensive')]

# convert rank into 1-6 - in order to get most recent data 
df['data_col'] = df['Value'] 

# going to assume index is between 1-100 but not 100% sure
min_rank = 0 #df['data_col'].min()
max_rank = 100 #df['data_col'].max()

# transform 1-147 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# need to invert score since higher rank is not better 
# df['new_rank_score'] = (6-df['new_rank_score'])+1

# df_rank[['Country ISO3', 'Country Name','Indicator','data_col','new_rank_score']].head()

df.rename(columns={'Country':'Country Name'}, inplace=True)
df['Indicator'] = indicator
df['higher_is_better'] = True
df['Year'] = df['Time']
df['Sub-Pillar'] = subpillar

df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


# output scores to csv
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
27,Austria,2017,ICT task-intensive jobs as a percentage of tot...,10.88416,1.544208,True,Technology Adoption
62,Belgium,2017,ICT task-intensive jobs as a percentage of tot...,14.44826,1.722413,True,Technology Adoption
97,Czech Republic,2017,ICT task-intensive jobs as a percentage of tot...,9.219953,1.460998,True,Technology Adoption
157,Estonia,2017,ICT task-intensive jobs as a percentage of tot...,15.76142,1.788071,True,Technology Adoption
217,Finland,2017,ICT task-intensive jobs as a percentage of tot...,15.22048,1.761024,True,Technology Adoption
252,France,2017,ICT task-intensive jobs as a percentage of tot...,12.00835,1.600418,True,Technology Adoption
287,Germany,2017,ICT task-intensive jobs as a percentage of tot...,10.4419,1.522095,True,Technology Adoption
322,Greece,2017,ICT task-intensive jobs as a percentage of tot...,6.675247,1.333762,True,Technology Adoption
357,Hungary,2017,ICT task-intensive jobs as a percentage of tot...,8.369766,1.418488,True,Technology Adoption
392,Iceland,2017,ICT task-intensive jobs as a percentage of tot...,14.27687,1.713843,True,Technology Adoption


In [48]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

## 5. Share of business with internet

In [49]:
indicator = indicators[4]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Share of business with internet
business_internet


In [50]:
df= df.replace('..',np.nan)

In [51]:
df.head(15)

Unnamed: 0,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,Country
0,52.47,54.8,60.17,61.52,67.16,67.81,69.33,76.25,75.58,75.77,76.73,75.62,77.37,79.38,80.37,,Australia
1,72.22,78.81,80.06,79.84,79.85,80.19,82.87,82.01,85.7,86.35,87.46,88.11,85.55,87.92,89.45,90.42,Austria
2,,,,,77.37,78.47,76.6,76.01,78.26,79.15,81.04,81.0,82.6,84.03,86.72,86.62,Belgium
3,64.8,67.5,69.7,,,,,79.8,77.5,,,,78.5,,81.8,,Canada
4,,,,47.87,51.74,54.03,59.41,63.27,66.55,67.0,66.47,67.43,67.17,67.81,,,Colombia
5,,70.08,71.12,73.99,72.66,73.63,77.44,79.67,79.86,82.63,82.57,82.15,82.9,82.79,83.31,83.32,Czech Republic
6,,,,,87.61,87.83,88.68,89.3,91.78,91.4,91.95,93.34,95.09,95.58,93.92,92.77,Denmark
7,52.65,57.86,61.87,65.73,67.53,70.04,72.63,74.97,75.74,77.56,79.73,77.93,78.09,78.36,81.18,79.79,Estonia
8,,,,,84.62,87.32,92.56,91.3,93.64,95.1,95.2,95.33,96.28,95.64,,95.92,Finland
9,,,,,54.05,57.71,60.05,64.48,65.3,63.59,66.82,68.5,66.53,69.41,71.54,70.35,France


In [52]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Technology Adoption


In [53]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2020'].astype(float)
df['Country Name'] = df['Country']
df['Year'] = 2020
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [54]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Australia,2020,Share of business with internet,,,True,Technology Adoption
1,Austria,2020,Share of business with internet,90.42,5.347878,True,Technology Adoption
2,Belgium,2020,Share of business with internet,86.62,4.89732,True,Technology Adoption
3,Canada,2020,Share of business with internet,,,True,Technology Adoption
4,Colombia,2020,Share of business with internet,,,True,Technology Adoption
5,Czech Republic,2020,Share of business with internet,83.32,4.506047,True,Technology Adoption
6,Denmark,2020,Share of business with internet,92.77,5.626512,True,Technology Adoption
7,Estonia,2020,Share of business with internet,79.79,4.087503,True,Technology Adoption
8,Finland,2020,Share of business with internet,95.92,6.0,True,Technology Adoption
9,France,2020,Share of business with internet,70.35,2.968224,True,Technology Adoption


In [55]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

## 6. Share of businesses with broadband

In [56]:
indicator = indicators[5]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Share of businesses with broadband
business_broadband


In [57]:
df.head(15)

Unnamed: 0,2008,2009,2010,Country
0,76.94,76.01,82.06,Austria
1,79.33,77.31,86.52,Czech Republic
2,87.53,86.08,88.08,Estonia
3,,91.68,93.31,France
4,83.46,87.9,89.34,Germany
5,70.37,74.19,79.61,Hungary
6,,,95.43,Iceland
7,,76.11,86.84,Ireland
8,,82.92,84.12,Italy
9,,88.03,87.91,Luxembourg


In [58]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Technology Adoption


In [59]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2010']
df['Country Name'] = df['Country']
df['Year'] = 2010
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [60]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Austria,2010,Share of businesses with broadband,82.06,3.436782,True,Technology Adoption
1,Czech Republic,2010,Share of businesses with broadband,86.52,4.291188,True,Technology Adoption
2,Estonia,2010,Share of businesses with broadband,88.08,4.590038,True,Technology Adoption
3,France,2010,Share of businesses with broadband,93.31,5.591954,True,Technology Adoption
4,Germany,2010,Share of businesses with broadband,89.34,4.831418,True,Technology Adoption
5,Hungary,2010,Share of businesses with broadband,79.61,2.967433,True,Technology Adoption
6,Iceland,2010,Share of businesses with broadband,95.43,5.998084,True,Technology Adoption
7,Ireland,2010,Share of businesses with broadband,86.84,4.35249,True,Technology Adoption
8,Italy,2010,Share of businesses with broadband,84.12,3.831418,True,Technology Adoption
9,Luxembourg,2010,Share of businesses with broadband,87.91,4.557471,True,Technology Adoption


In [61]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

## 7. Share of businesses with online presence

In [62]:
indicator = indicators[6]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))
df.head(15)

Share of businesses with online presence
share_of_businesses_online_presence


Unnamed: 0,RANK,COUNTRY/ECONOMY,VALUE,SCORE,Year
0,1.0,Finland,95.64,100.0,2018
1,2.0,Denmark,93.92,98.01,2018
2,3.0,Japan,92.4,96.24,2018
3,4.0,Netherlands,91.89,95.65,2018
4,5.0,Switzerland,91.74,95.48,2018
5,6.0,Sweden,89.65,93.05,2018
6,7.0,Austria,89.45,92.82,2018
7,8.0,Germany,88.21,91.38,2018
8,9.0,Belgium,86.72,89.65,2018
9,10.0,United Kingdom,83.88,86.35,2018


In [63]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Technology Adoption


In [64]:
# create standard columns
df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['VALUE']
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [65]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Finland,2018,Share of businesses with online presence,95.64,6.0,True,Technology Adoption
1,Denmark,2018,Share of businesses with online presence,93.92,5.900163,True,Technology Adoption
2,Japan,2018,Share of businesses with online presence,92.4,5.811934,True,Technology Adoption
3,Netherlands,2018,Share of businesses with online presence,91.89,5.782331,True,Technology Adoption
4,Switzerland,2018,Share of businesses with online presence,91.74,5.773624,True,Technology Adoption
5,Sweden,2018,Share of businesses with online presence,89.65,5.65231,True,Technology Adoption
6,Austria,2018,Share of businesses with online presence,89.45,5.640701,True,Technology Adoption
7,Germany,2018,Share of businesses with online presence,88.21,5.568725,True,Technology Adoption
8,Belgium,2018,Share of businesses with online presence,86.72,5.482238,True,Technology Adoption
9,United Kingdom,2018,Share of businesses with online presence,83.88,5.31739,True,Technology Adoption


In [66]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

## 8. Size of gig economy (% of GDP)

In [67]:
indicator = indicators[7]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Size of gig economy (% of GDP)
prevalance_gig_economy


In [68]:
df.head(15)

Unnamed: 0,RANK,COUNTRY/ECONOMY,VALUE,SCORE
0,1.0,United States,5.4,100.0
1,2.0,Netherlands,5.22,94.63
2,3.0,United Kingdom,5.19,93.8
3,4.0,Saudi Arabia,5.08,90.33
4,5.0,Malaysia,5.07,90.19
5,6.0,Egypt,5.05,89.46
6,7.0,Israel,5.02,88.42
7,8.0,Canada,4.94,86.07
8,9.0,Singapore,4.92,85.52
9,10.0,United Arab Emirates,4.87,83.82


In [69]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Technology Adoption


In [70]:
# create standard columns
df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['VALUE']
df['Year'] = 2019
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [71]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,United States,2019,Size of gig economy (% of GDP),5.4,6.0,True,Technology Adoption
1,Netherlands,2019,Size of gig economy (% of GDP),5.22,5.72561,True,Technology Adoption
2,United Kingdom,2019,Size of gig economy (% of GDP),5.19,5.679878,True,Technology Adoption
3,Saudi Arabia,2019,Size of gig economy (% of GDP),5.08,5.512195,True,Technology Adoption
4,Malaysia,2019,Size of gig economy (% of GDP),5.07,5.496951,True,Technology Adoption
5,Egypt,2019,Size of gig economy (% of GDP),5.05,5.466463,True,Technology Adoption
6,Israel,2019,Size of gig economy (% of GDP),5.02,5.420732,True,Technology Adoption
7,Canada,2019,Size of gig economy (% of GDP),4.94,5.29878,True,Technology Adoption
8,Singapore,2019,Size of gig economy (% of GDP),4.92,5.268293,True,Technology Adoption
9,United Arab Emirates,2019,Size of gig economy (% of GDP),4.87,5.192073,True,Technology Adoption


In [72]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

## 9. Size of digital economy (% of transactions)


In [73]:
indicator = indicators[8]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Size of digital economy (% of transactions)
size_digital_economy


In [74]:
df

Unnamed: 0,Order,Country Name,Value,Score
0,1.0,Singapore,78.13,100.00
1,2.0,Switzerland,64.57,82.59
2,3.0,"Korea, Rep.",63.66,81.42
3,4.0,Germany,61.45,78.58
4,5.0,Hungary,59.72,76.36
...,...,...,...,...
129,,"Congo, Dem. Rep.",,
130,,Dominican Republic,,
131,,Guinea,,
132,,Lesotho,,


In [75]:
subpillars[0]
subpillar = subpillars[0]
print(subpillar)

Technology Adoption


In [76]:
# create standard columns
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df['Year'] = 2019
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [77]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
0,Singapore,2019,Size of digital economy (% of transactions),78.13,6.0,True,Technology Adoption
1,Switzerland,2019,Size of digital economy (% of transactions),64.57,5.129318,True,Technology Adoption
2,"Korea, Rep.",2019,Size of digital economy (% of transactions),63.66,5.070887,True,Technology Adoption
3,Germany,2019,Size of digital economy (% of transactions),61.45,4.928984,True,Technology Adoption
4,Hungary,2019,Size of digital economy (% of transactions),59.72,4.817902,True,Technology Adoption
5,Japan,2019,Size of digital economy (% of transactions),56.21,4.592526,True,Technology Adoption
6,Ireland,2019,Size of digital economy (% of transactions),54.35,4.473096,True,Technology Adoption
7,Denmark,2019,Size of digital economy (% of transactions),54.22,4.464749,True,Technology Adoption
8,Qatar,2019,Size of digital economy (% of transactions),54.17,4.461538,True,Technology Adoption
9,Sweden,2019,Size of digital economy (% of transactions),53.01,4.387055,True,Technology Adoption


In [78]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

No Country Data

## 10. Venture Capital Availability


In [79]:
indicator = indicators[9]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Venture Capital Availability
TCdata360


In [80]:
df.head(15)

Unnamed: 0,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,...,2008-2009,2009-2010,2010-2011,2011-2012,2012-2013,2013-2014,2014-2015,2015-2016,2016-2017,2017-2018
0,,,,,,,,,,,...,,,40.9,54.127067,,40.9205,38.90301,,,
1,,,,,,,,,,,...,,,66.0,49.0,,87.0,92.0,,,
2,,,,,,,,,,,...,,,3.858249,3.165548,,2.823529,2.601164,,,
3,,,,,,,,,,,...,,,127.0,142.0,,148.0,144.0,,,
4,,,,,,,,,,,...,,,2.797549,2.242699,,2.028571,2.177586,,,
5,,,,,,,,,,,...,,,130.0,142.0,,148.0,144.0,,,
6,,,,,,,,,,,...,,,2.196592,2.090002,,2.030303,2.249658,,,
7,,,,,,,,,,,...,,,138.0,141.0,,148.0,141.0,,,
8,,,,,,,,,,,...,,,2.793415,2.468175,,2.352941,2.260678,,,
9,,,,,,,,,,,...,,,115.0,127.0,,139.0,138.0,,,


In [81]:
subpillars[1]
subpillar = subpillars[1]
print(subpillar)

Financing Incentives


In [82]:
df = df[(df.Indicator == 'Venture capital availability, 1-7 (best)')]
df = df[(df['Subindicator Type'] == '1-7 Best')]
df

Unnamed: 0,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,...,2008-2009,2009-2010,2010-2011,2011-2012,2012-2013,2013-2014,2014-2015,2015-2016,2016-2017,2017-2018
226,,,,,1.802891,1.494084,,2.117647,2.348170,,...,,,,,,,,,,
512,2.696235,2.665708,2.524466,2.321898,2.145235,1.956589,1.844924,1.859132,1.948528,1.898624,...,,,,,,,,,,
796,4.491447,4.544287,4.296279,3.881013,3.716301,3.973436,4.143612,4.123291,4.352251,4.352251,...,,,,,,,,,,
1080,2.893384,2.839896,2.425063,2.131687,1.897883,1.909561,1.822370,1.745999,1.779793,2.009052,...,,,,,,,,,,
1366,2.404752,2.123574,1.999757,1.921578,1.789272,2.136096,2.369044,2.425793,2.355755,2.537145,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41962,3.334126,3.115717,3.177300,3.044030,2.667287,2.311127,2.290705,2.565255,2.690639,2.968303,...,,,,,,,,,,
42246,,,,,,2.587327,2.262899,1.876288,1.692528,,...,,,,,,,,,,
42530,3.770429,3.719011,3.890964,3.374518,3.011638,2.932891,3.054382,3.294088,3.185590,2.962898,...,,,,,,,,,,
42816,1.494845,1.984663,2.526940,2.298351,1.992631,2.073739,2.489051,2.538825,2.372437,2.225287,...,,,,,,,,,,


In [83]:
# create standard columns
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2019'] 
df['Year'] = 2019
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [84]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
226,Angola,2019,Venture Capital Availability,1.705738,1.000000,True,Financing Incentives
512,Albania,2019,Venture Capital Availability,3.014087,2.853740,True,Financing Incentives
796,United Arab Emirates,2019,Venture Capital Availability,4.832029,5.429500,True,Financing Incentives
1080,Argentina,2019,Venture Capital Availability,2.328207,1.881947,True,Financing Incentives
1366,Armenia,2019,Venture Capital Availability,3.311092,3.274554,True,Financing Incentives
...,...,...,...,...,...,...,...
41962,Vietnam,2019,Venture Capital Availability,3.267500,3.212789,True,Financing Incentives
42246,"Yemen, Rep.",2019,Venture Capital Availability,2.163688,1.648849,True,Financing Incentives
42530,South Africa,2019,Venture Capital Availability,3.051583,2.906867,True,Financing Incentives
42816,Zambia,2019,Venture Capital Availability,1.803466,1.138466,True,Financing Incentives


In [85]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

In [86]:
## 11. Doing Business Index

In [87]:
indicator = indicators[10]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Doing Business Index
doing_bus_idx


In [88]:
df

Unnamed: 0,Country code,Economy,Region,Income group,DB Year,Ease of doing business rank (DB19),Ease of doing business score (DB17-19 methodology),Ease of doing business score (DB16 methodology),Ease of doing business score (DB15 methodology),Ease of doing business score (DB10-14 methodology),...,Recovery rate (cents on the dollar),Strength of insolvency framework index (0-16) (DB15-19 methodology),Commencement of proceedings index (0-3) (DB15-19 methodology),Management of debtor's assets index (0-6) (DB15-19 methodology),Reorganization proceedings index (0-3) (DB15-19 methodology),Creditor participation index (0-4) (DB15-19 methodology),Score-Recovery rate (cents on the dollar),Score-Strength of insolvency framework index (0-16) (DB15-19 methodology),data_country,data_year
0,AFG,Afghanistan,South Asia,Low income,2005,,,,,,...,0.0,,,,,,0.00,,,
1,AFG,Afghanistan,South Asia,Low income,2006,,,,,,...,0.0,,,,,,0.00,,,
2,AFG,Afghanistan,South Asia,Low income,2007,,,,,,...,0.0,,,,,,0.00,,,
3,AFG,Afghanistan,South Asia,Low income,2008,,,,,,...,25.1,,,,,,27.07,,,
4,AFG,Afghanistan,South Asia,Low income,2009,,,,,,...,24.3,,,,,,26.16,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3020,ZWE,Zimbabwe,Sub-Saharan Africa,Low income,2015,,,47.11,44.36,,...,13.8,5.0,3.0,2.0,0.0,0.0,14.81,31.25,,
3021,ZWE,Zimbabwe,Sub-Saharan Africa,Low income,2016,,47.74,47.94,,,...,16.1,5.0,3.0,2.0,0.0,0.0,17.38,31.25,,
3022,ZWE,Zimbabwe,Sub-Saharan Africa,Low income,2017,,47.73,,,,...,18.0,5.0,3.0,2.0,0.0,0.0,19.43,31.25,,
3023,ZWE,Zimbabwe,Sub-Saharan Africa,Low income,2018,,48.52,,,,...,19.7,5.0,3.0,2.0,0.0,0.0,21.17,31.25,,


In [89]:
subpillars[2]
subpillar = subpillars[2]
print(subpillar)

Startup Environment


In [90]:
df = df[(df['DB Year'] == 2019)]
df.head(15)

Unnamed: 0,Country code,Economy,Region,Income group,DB Year,Ease of doing business rank (DB19),Ease of doing business score (DB17-19 methodology),Ease of doing business score (DB16 methodology),Ease of doing business score (DB15 methodology),Ease of doing business score (DB10-14 methodology),...,Recovery rate (cents on the dollar),Strength of insolvency framework index (0-16) (DB15-19 methodology),Commencement of proceedings index (0-3) (DB15-19 methodology),Management of debtor's assets index (0-6) (DB15-19 methodology),Reorganization proceedings index (0-3) (DB15-19 methodology),Creditor participation index (0-4) (DB15-19 methodology),Score-Recovery rate (cents on the dollar),Score-Strength of insolvency framework index (0-16) (DB15-19 methodology),data_country,data_year
14,AFG,Afghanistan,South Asia,Low income,2019,167.0,47.77,,,,...,26.5,12.0,2.0,6.0,2.0,2.0,28.57,75.0,,
30,ALB,Albania,Europe & Central Asia,Upper middle income,2019,63.0,69.51,,,,...,44.0,14.0,3.0,6.0,3.0,2.0,47.33,87.5,,
46,DZA,Algeria,Middle East & North Africa,Upper middle income,2019,157.0,49.65,,,,...,50.8,7.0,3.0,2.0,1.0,1.0,54.72,43.75,,
62,AGO,Angola,Sub-Saharan Africa,Lower middle income,2019,173.0,43.86,,,,...,0.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,,
76,ATG,Antigua and Barbuda,Latin America & Caribbean,High income,2019,112.0,59.48,,,,...,36.8,5.0,2.0,2.0,0.0,1.0,39.56,31.25,,
92,ARG,Argentina,Latin America & Caribbean,High income,2019,119.0,58.8,,,,...,21.5,9.5,2.5,4.0,2.0,1.0,23.11,59.38,,
108,ARM,Armenia,Europe & Central Asia,Upper middle income,2019,41.0,75.37,,,,...,38.2,7.5,2.5,2.0,2.0,1.0,41.1,46.88,,
124,AUS,Australia,High income: OECD,High income,2019,18.0,80.13,,,,...,82.7,11.0,2.5,5.0,0.5,3.0,88.99,68.75,,
140,AUT,Austria,High income: OECD,High income,2019,26.0,78.57,,,,...,80.1,11.0,2.5,5.5,1.0,2.0,86.2,68.75,,
156,AZE,Azerbaijan,Europe & Central Asia,Upper middle income,2019,25.0,78.64,,,,...,40.1,13.5,3.0,6.0,1.5,3.0,43.21,84.38,,


In [91]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = False
df['Indicator'] = indicator
df['data_col'] = df.iloc[:,5]
df['Year'] = df.iloc[:,4]
df['Country Name'] = df.iloc[:,1]
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# convert 1-190 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=min_rank,old_max=max_rank))

# invert since to put as higher is  better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['higher_is_better'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Indicator'] = indicator
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['data_col'] = df.iloc[:,5]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value i

In [92]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
14,Afghanistan,2019,Doing Business Index,167.0,1.608466,False,Startup Environment
30,Albania,2019,Doing Business Index,63.0,4.359788,False,Startup Environment
46,Algeria,2019,Doing Business Index,157.0,1.873016,False,Startup Environment
62,Angola,2019,Doing Business Index,173.0,1.449735,False,Startup Environment
76,Antigua and Barbuda,2019,Doing Business Index,112.0,3.063492,False,Startup Environment
92,Argentina,2019,Doing Business Index,119.0,2.878307,False,Startup Environment
108,Armenia,2019,Doing Business Index,41.0,4.941799,False,Startup Environment
124,Australia,2019,Doing Business Index,18.0,5.550265,False,Startup Environment
140,Austria,2019,Doing Business Index,26.0,5.338624,False,Startup Environment
156,Azerbaijan,2019,Doing Business Index,25.0,5.365079,False,Startup Environment


In [93]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

## 12. Strength of Legal Rights 

In [94]:
indicator = indicators[11]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Strength of Legal Rights
legal_rights_strength


In [95]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Series Name    268 non-null    object
 1   Series Code    266 non-null    object
 2   Country Name   266 non-null    object
 3   Country Code   266 non-null    object
 4   1990 [YR1990]  266 non-null    object
 5   2000 [YR2000]  266 non-null    object
 6   2011 [YR2011]  266 non-null    object
 7   2012 [YR2012]  266 non-null    object
 8   2013 [YR2013]  266 non-null    object
 9   2014 [YR2014]  266 non-null    object
 10  2015 [YR2015]  266 non-null    object
 11  2016 [YR2016]  266 non-null    object
 12  2017 [YR2017]  266 non-null    object
 13  2018 [YR2018]  266 non-null    object
 14  2019 [YR2019]  266 non-null    object
 15  2020 [YR2020]  266 non-null    object
dtypes: object(16)
memory usage: 34.0+ KB


In [96]:
subpillars[2]
subpillar = subpillars[2]
print(subpillar)

Startup Environment


#### Find Relevant Columns

In [97]:
df['Series Name'].unique()

array(['Strength of legal rights index (0=weak to 12=strong)', nan,
       'Data from database: World Development Indicators',
       'Last Updated: 06/30/2021'], dtype=object)

In [98]:
# remove unwanted rows
df = df[~df['Series Code'].isna()]
df = df.replace('..', np.nan)

In [99]:
df.loc[0][0]

'Strength of legal rights index (0=weak to 12=strong)'

In [100]:
df.head()

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,1990 [YR1990],2000 [YR2000],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020]
0,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Afghanistan,AFG,,,,,9.0,9.0,9.0,9.0,9.0,10.0,10.0,
1,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Albania,ALB,,,,,7.0,6.0,6.0,6.0,8.0,8.0,8.0,
2,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Algeria,DZA,,,,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,
3,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,American Samoa,ASM,,,,,,,,,,,,
4,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Andorra,AND,,,,,,,,,,,,


In [101]:
df.columns

Index(['Series Name', 'Series Code', 'Country Name', 'Country Code',
       '1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]'],
      dtype='object')

In [102]:
# clean data
df = df.replace('..', np.nan)

df[['1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]']] = df[['1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]']].astype(float)

In [103]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266 entries, 0 to 265
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Series Name    266 non-null    object 
 1   Series Code    266 non-null    object 
 2   Country Name   266 non-null    object 
 3   Country Code   266 non-null    object 
 4   1990 [YR1990]  0 non-null      float64
 5   2000 [YR2000]  0 non-null      float64
 6   2011 [YR2011]  0 non-null      float64
 7   2012 [YR2012]  0 non-null      float64
 8   2013 [YR2013]  236 non-null    float64
 9   2014 [YR2014]  236 non-null    float64
 10  2015 [YR2015]  237 non-null    float64
 11  2016 [YR2016]  237 non-null    float64
 12  2017 [YR2017]  237 non-null    float64
 13  2018 [YR2018]  238 non-null    float64
 14  2019 [YR2019]  238 non-null    float64
 15  2020 [YR2020]  0 non-null      float64
dtypes: float64(12), object(4)
memory usage: 43.4+ KB


In [104]:
# keep only most recent value
df = df[['Series Name', 'Series Code', 'Country Name', 'Country Code','2019 [YR2019]']]

df['higher_is_better'] = True
df['Indicator'] = df['Series Name']
df['data_col'] = df['2019 [YR2019]']
df['Year'] = 2019
df['Sub-Pillar'] = subpillar

#### Convert Scales

In [105]:
# convert 0-12 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=0,old_max=12))

In [106]:
df.head(16)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2019 [YR2019],higher_is_better,Indicator,data_col,Year,Sub-Pillar,new_rank_score
0,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Afghanistan,AFG,10.0,True,Strength of legal rights index (0=weak to 12=s...,10.0,2019,Startup Environment,5.166667
1,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Albania,ALB,8.0,True,Strength of legal rights index (0=weak to 12=s...,8.0,2019,Startup Environment,4.333333
2,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Algeria,DZA,2.0,True,Strength of legal rights index (0=weak to 12=s...,2.0,2019,Startup Environment,1.833333
3,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,American Samoa,ASM,,True,Strength of legal rights index (0=weak to 12=s...,,2019,Startup Environment,
4,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Andorra,AND,,True,Strength of legal rights index (0=weak to 12=s...,,2019,Startup Environment,
5,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Angola,AGO,1.0,True,Strength of legal rights index (0=weak to 12=s...,1.0,2019,Startup Environment,1.416667
6,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Antigua and Barbuda,ATG,5.0,True,Strength of legal rights index (0=weak to 12=s...,5.0,2019,Startup Environment,3.083333
7,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Argentina,ARG,2.0,True,Strength of legal rights index (0=weak to 12=s...,2.0,2019,Startup Environment,1.833333
8,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Armenia,ARM,6.0,True,Strength of legal rights index (0=weak to 12=s...,6.0,2019,Startup Environment,3.5
9,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Aruba,ABW,,True,Strength of legal rights index (0=weak to 12=s...,,2019,Startup Environment,


In [107]:
df.columns

Index(['Series Name', 'Series Code', 'Country Name', 'Country Code',
       '2019 [YR2019]', 'higher_is_better', 'Indicator', 'data_col', 'Year',
       'Sub-Pillar', 'new_rank_score'],
      dtype='object')

#### Prepare Output

In [108]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df.sort_values(by='new_rank_score', ascending=False).head(16)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
12,Azerbaijan,2019,Strength of legal rights index (0=weak to 12=s...,12.0,6.0,True,Startup Environment
132,Montenegro,2019,Strength of legal rights index (0=weak to 12=s...,12.0,6.0,True,Startup Environment
141,New Zealand,2019,Strength of legal rights index (0=weak to 12=s...,12.0,6.0,True,Startup Environment
158,Puerto Rico,2019,Strength of legal rights index (0=weak to 12=s...,12.0,6.0,True,Startup Environment
28,Brunei Darussalam,2019,Strength of legal rights index (0=weak to 12=s...,12.0,6.0,True,Startup Environment
119,Malawi,2019,Strength of legal rights index (0=weak to 12=s...,11.0,5.583333,True,Startup Environment
105,Kosovo,2019,Strength of legal rights index (0=weak to 12=s...,11.0,5.583333,True,Startup Environment
101,Kenya,2019,Strength of legal rights index (0=weak to 12=s...,11.0,5.583333,True,Startup Environment
162,Rwanda,2019,Strength of legal rights index (0=weak to 12=s...,11.0,5.583333,True,Startup Environment
42,Colombia,2019,Strength of legal rights index (0=weak to 12=s...,11.0,5.583333,True,Startup Environment


In [109]:
# output scores
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

### 13. Time to start a business


#### Load Data

In [110]:
indicator = indicators[12]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Time to Start a Business
time_start_bus


In [111]:
# remove unwanted rows
df = df[~df['Series Code'].isna()]
df = df.replace('..', np.nan)

In [112]:
df[['1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]']] = df[['1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]']].astype(float)

In [113]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266 entries, 0 to 265
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Series Name    266 non-null    object 
 1   Series Code    266 non-null    object 
 2   Country Name   266 non-null    object 
 3   Country Code   266 non-null    object 
 4   1990 [YR1990]  0 non-null      float64
 5   2000 [YR2000]  0 non-null      float64
 6   2011 [YR2011]  221 non-null    float64
 7   2012 [YR2012]  225 non-null    float64
 8   2013 [YR2013]  236 non-null    float64
 9   2014 [YR2014]  236 non-null    float64
 10  2015 [YR2015]  237 non-null    float64
 11  2016 [YR2016]  237 non-null    float64
 12  2017 [YR2017]  237 non-null    float64
 13  2018 [YR2018]  238 non-null    float64
 14  2019 [YR2019]  238 non-null    float64
 15  2020 [YR2020]  0 non-null      float64
dtypes: float64(12), object(4)
memory usage: 35.3+ KB


In [114]:
subpillars[2]
subpillar = subpillars[2]
print(subpillar)

Startup Environment


In [115]:
# keep only most recent value
df = df[['Series Name', 'Series Code', 'Country Name', 'Country Code','2019 [YR2019]']]

df['higher_is_better'] = True
df['Indicator'] = df['Series Name']
df['data_col'] = df['2019 [YR2019]']
df['Year'] = 2019
df['Sub-Pillar'] = subpillar

In [116]:
df.head()

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2019 [YR2019],higher_is_better,Indicator,data_col,Year,Sub-Pillar
0,Time required to start a business (days),IC.REG.DURS,Afghanistan,AFG,8.5,True,Time required to start a business (days),8.5,2019,Startup Environment
1,Time required to start a business (days),IC.REG.DURS,Albania,ALB,4.5,True,Time required to start a business (days),4.5,2019,Startup Environment
2,Time required to start a business (days),IC.REG.DURS,Algeria,DZA,18.0,True,Time required to start a business (days),18.0,2019,Startup Environment
3,Time required to start a business (days),IC.REG.DURS,American Samoa,ASM,,True,Time required to start a business (days),,2019,Startup Environment
4,Time required to start a business (days),IC.REG.DURS,Andorra,AND,,True,Time required to start a business (days),,2019,Startup Environment


In [117]:
def map_days_to_scores(number):
    if number<=2:
        return 4
    if number <6 and number>2:
        return 3
    elif number >=6 and number <11:
        return 2
    elif number >=11:
        return 1

In [118]:
# map days to scores 
df['data_col'] = df['data_col'].apply(map_days_to_scores)

In [119]:
# convert 1-3 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=1,old_max=4))

In [120]:
df.head()

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2019 [YR2019],higher_is_better,Indicator,data_col,Year,Sub-Pillar,new_rank_score
0,Time required to start a business (days),IC.REG.DURS,Afghanistan,AFG,8.5,True,Time required to start a business (days),2.0,2019,Startup Environment,2.666667
1,Time required to start a business (days),IC.REG.DURS,Albania,ALB,4.5,True,Time required to start a business (days),3.0,2019,Startup Environment,4.333333
2,Time required to start a business (days),IC.REG.DURS,Algeria,DZA,18.0,True,Time required to start a business (days),1.0,2019,Startup Environment,1.0
3,Time required to start a business (days),IC.REG.DURS,American Samoa,ASM,,True,Time required to start a business (days),,2019,Startup Environment,
4,Time required to start a business (days),IC.REG.DURS,Andorra,AND,,True,Time required to start a business (days),,2019,Startup Environment,


In [121]:
df[df['2019 [YR2019]']<3]

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2019 [YR2019],higher_is_better,Indicator,data_col,Year,Sub-Pillar,new_rank_score
10,Time required to start a business (days),IC.REG.DURS,Australia,AUS,2.0,True,Time required to start a business (days),4.0,2019,Startup Environment,6.0
35,Time required to start a business (days),IC.REG.DURS,Canada,CAN,1.5,True,Time required to start a business (days),4.0,2019,Startup Environment,6.0
72,Time required to start a business (days),IC.REG.DURS,Georgia,GEO,1.0,True,Time required to start a business (days),4.0,2019,Startup Environment,6.0
86,Time required to start a business (days),IC.REG.DURS,"Hong Kong SAR, China",HKG,1.5,True,Time required to start a business (days),4.0,2019,Startup Environment,6.0
141,Time required to start a business (days),IC.REG.DURS,New Zealand,NZL,0.5,True,Time required to start a business (days),4.0,2019,Startup Environment,6.0
171,Time required to start a business (days),IC.REG.DURS,Singapore,SGP,1.5,True,Time required to start a business (days),4.0,2019,Startup Environment,6.0
194,Time required to start a business (days),IC.REG.DURS,Togo,TGO,2.5,True,Time required to start a business (days),3.0,2019,Startup Environment,4.333333
251,Time required to start a business (days),IC.REG.DURS,North America,NAC,2.85,True,Time required to start a business (days),3.0,2019,Startup Environment,4.333333


In [122]:
df['2019 [YR2019]'].describe()

count    238.000000
mean      19.771819
std       23.169794
min        0.500000
25%        8.075000
50%       14.000000
75%       22.991844
max      230.000000
Name: 2019 [YR2019], dtype: float64

In [123]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df.sort_values(by='new_rank_score', ascending=False).head(16)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
171,Singapore,2019,Time required to start a business (days),4.0,6.0,True,Startup Environment
141,New Zealand,2019,Time required to start a business (days),4.0,6.0,True,Startup Environment
10,Australia,2019,Time required to start a business (days),4.0,6.0,True,Startup Environment
86,"Hong Kong SAR, China",2019,Time required to start a business (days),4.0,6.0,True,Startup Environment
35,Canada,2019,Time required to start a business (days),4.0,6.0,True,Startup Environment
72,Georgia,2019,Time required to start a business (days),4.0,6.0,True,Startup Environment
1,Albania,2019,Time required to start a business (days),3.0,4.333333,True,Startup Environment
205,United Kingdom,2019,Time required to start a business (days),3.0,4.333333,True,Startup Environment
31,Burundi,2019,Time required to start a business (days),3.0,4.333333,True,Startup Environment
62,Estonia,2019,Time required to start a business (days),3.0,4.333333,True,Startup Environment


#### Prepare Output

In [124]:
bf

'time_start_bus'

In [125]:
# output scores
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

### 14. Ease doing business


#### Load Data

In [126]:
indicator = indicators[13]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Ease of Doing Business
ease_doing_bus


In [127]:
# remove unwanted rows
df = df.replace('..', np.nan)
df = df[~df['Series Code'].isna()]


In [128]:
df['Series Name'].unique()

array(['Ease of doing business index (1=most business-friendly regulations)'],
      dtype=object)

In [129]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266 entries, 0 to 265
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Series Name    266 non-null    object 
 1   Series Code    266 non-null    object 
 2   Country Name   266 non-null    object 
 3   Country Code   266 non-null    object 
 4   1990 [YR1990]  0 non-null      float64
 5   2000 [YR2000]  0 non-null      float64
 6   2011 [YR2011]  0 non-null      float64
 7   2012 [YR2012]  0 non-null      float64
 8   2013 [YR2013]  0 non-null      float64
 9   2014 [YR2014]  0 non-null      float64
 10  2015 [YR2015]  0 non-null      float64
 11  2016 [YR2016]  0 non-null      float64
 12  2017 [YR2017]  0 non-null      float64
 13  2018 [YR2018]  0 non-null      float64
 14  2019 [YR2019]  189 non-null    object 
 15  2020 [YR2020]  0 non-null      float64
dtypes: float64(11), object(5)
memory usage: 35.3+ KB


In [130]:
subpillars[2]
subpillar = subpillars[2]
print(subpillar)

Startup Environment


In [131]:
# keep only most recent value
df = df[['Series Name', 'Series Code', 'Country Name', 'Country Code','2019 [YR2019]']]

# df['higher_is_better'] = False
df['Indicator'] = df['Series Name']
df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df['2019 [YR2019]']
df['Year'] = 2019
df['Sub-Pillar'] = subpillar

# df['data_norm'] = df['data_norm'] = (df.data_col - df.data_col.mean())/df.data_col.std()

In [132]:
rank_min = df.data_col.min()
rank_max = df.data_col.max()

In [133]:
rank_min, rank_max

(1.0, 190.0)

In [134]:
# convert 1-190 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=rank_min,old_max=rank_max))

In [135]:
# invert since to put as higher is  better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [136]:
df.sort_values(by='new_rank_score', ascending=False).head(16)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2019 [YR2019],Indicator,data_col,Year,Sub-Pillar,new_rank_score
141,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,New Zealand,NZL,1.0,Ease of doing business index (1=most business-...,1.0,2019,Startup Environment,6.0
171,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Singapore,SGP,2.0,Ease of doing business index (1=most business-...,2.0,2019,Startup Environment,5.973545
86,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,"Hong Kong SAR, China",HKG,3.0,Ease of doing business index (1=most business-...,3.0,2019,Startup Environment,5.94709
53,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Denmark,DNK,4.0,Ease of doing business index (1=most business-...,4.0,2019,Startup Environment,5.920635
104,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,"Korea, Rep.",KOR,5.0,Ease of doing business index (1=most business-...,5.0,2019,Startup Environment,5.89418
206,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,United States,USA,6.0,Ease of doing business index (1=most business-...,6.0,2019,Startup Environment,5.867725
72,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Georgia,GEO,7.0,Ease of doing business index (1=most business-...,7.0,2019,Startup Environment,5.84127
205,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,United Kingdom,GBR,8.0,Ease of doing business index (1=most business-...,8.0,2019,Startup Environment,5.814815
147,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Norway,NOR,9.0,Ease of doing business index (1=most business-...,9.0,2019,Startup Environment,5.78836
187,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Sweden,SWE,10.0,Ease of doing business index (1=most business-...,10.0,2019,Startup Environment,5.761905


In [137]:
df['higher_is_better'] = True
df.head(15)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2019 [YR2019],Indicator,data_col,Year,Sub-Pillar,new_rank_score,higher_is_better
0,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Afghanistan,AFG,173.0,Ease of doing business index (1=most business-...,173.0,2019,Startup Environment,1.449735,True
1,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Albania,ALB,82.0,Ease of doing business index (1=most business-...,82.0,2019,Startup Environment,3.857143,True
2,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Algeria,DZA,157.0,Ease of doing business index (1=most business-...,157.0,2019,Startup Environment,1.873016,True
3,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,American Samoa,ASM,,Ease of doing business index (1=most business-...,,2019,Startup Environment,,True
4,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Andorra,AND,,Ease of doing business index (1=most business-...,,2019,Startup Environment,,True
5,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Angola,AGO,177.0,Ease of doing business index (1=most business-...,177.0,2019,Startup Environment,1.343915,True
6,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Antigua and Barbuda,ATG,113.0,Ease of doing business index (1=most business-...,113.0,2019,Startup Environment,3.037037,True
7,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Argentina,ARG,126.0,Ease of doing business index (1=most business-...,126.0,2019,Startup Environment,2.693122,True
8,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Armenia,ARM,47.0,Ease of doing business index (1=most business-...,47.0,2019,Startup Environment,4.783069,True
9,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Aruba,ABW,,Ease of doing business index (1=most business-...,,2019,Startup Environment,,True


#### Prepare Output

In [138]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]

# output scores
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

## 15. Ease of finding skilled employees

In [139]:
indicator = indicators[14]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Ease of finding skilled employees
ease_of_finding_skilled_employees


In [140]:
df.head()

Unnamed: 0,2017,2018,2019,Country,Unnamed: 4
0,3.88,4.03,3.89,Albania,
1,3.98,3.84,4.12,Algeria,
2,No data,2.08,2.76,Angola,
3,4.35,4.1,4.19,Argentina,
4,3.82,3.87,4.03,Armenia,


In [141]:
subpillars[2]
subpillar = subpillars[2]
print(subpillar)

Startup Environment


In [142]:
values = ['2017','2018','2019']

In [143]:
df = df.replace('No data', np.nan)

In [144]:
df[values] = df[values].astype(float)

In [145]:
df.head()

Unnamed: 0,2017,2018,2019,Country,Unnamed: 4
0,3.88,4.03,3.89,Albania,
1,3.98,3.84,4.12,Algeria,
2,,2.08,2.76,Angola,
3,4.35,4.1,4.19,Argentina,
4,3.82,3.87,4.03,Armenia,


In [146]:
df[values].describe()

Unnamed: 0,2017,2018,2019
count,132.0,136.0,137.0
mean,4.183258,4.136176,4.196058
std,0.664458,0.659186,0.589124
min,2.72,2.08,2.76
25%,3.685,3.6375,3.84
50%,4.065,4.095,4.17
75%,4.6925,4.655,4.63
max,5.67,5.75,5.32


In [147]:
# create standard columns
df['data_col'] = df['2019']
df['new_rank_score'] = df['data_col']
df['higher_is_better'] = True
df['Indicator'] = indicator
df['Year'] = 2019
df['Sub-Pillar'] = subpillar

df.rename(columns={'Country':'Country Name'}, inplace=True)


df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]


# output scores to csv
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']].to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

## 16. Amount invested into startups yearly from private, public, blended sources (respectively)


In [148]:
indicator = indicators[15]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Amount invested into startups yearly from private, public, blended sources (respectively)
start_up_investment


In [149]:
df.head(15)

Unnamed: 0,LOCATION,Country,SUBJECT,Subject,STAGES,Development stages,MEASURE,Measure,TIME,Year,Unit Code,Unit,PowerCode Code,PowerCode,Reference Period Code,Reference Period,Value,Flag Codes,Flags
0,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2006,2006,USD,US Dollar,6,Millions,,,456.334579,,
1,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2007,2007,USD,US Dollar,6,Millions,,,680.29317,,
2,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2008,2008,USD,US Dollar,6,Millions,,,755.759626,,
3,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2009,2009,USD,US Dollar,6,Millions,,,532.682779,,
4,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2010,2010,USD,US Dollar,6,Millions,,,367.836251,,
5,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2011,2011,USD,US Dollar,6,Millions,,,246.528233,,
6,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2012,2012,USD,US Dollar,6,Millions,,,331.331196,,
7,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2013,2013,USD,US Dollar,6,Millions,,,252.934084,,
8,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2014,2014,USD,US Dollar,6,Millions,,,265.918369,,
9,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2015,2015,USD,US Dollar,6,Millions,,,288.485377,,


In [150]:
subpillars[2]
subpillar = subpillars[2]
print(subpillar)

Startup Environment


In [151]:
df = df[(df['Development stages']=='Total') & (df.Year == 2019) & (df.MEASURE == 'USD_V')]
df.head(15)

Unnamed: 0,LOCATION,Country,SUBJECT,Subject,STAGES,Development stages,MEASURE,Measure,TIME,Year,Unit Code,Unit,PowerCode Code,PowerCode,Reference Period Code,Reference Period,Value,Flag Codes,Flags
13,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,487.310802,,
120,AUT,Austria,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,90.416881,,
232,BEL,Belgium,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,395.828165,,
343,CAN,Canada,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,3286.872933,,
448,CZE,Czech Republic,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,27.204683,,
538,DNK,Denmark,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,352.437466,,
650,FIN,Finland,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,321.566548,,
762,FRA,France,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,2164.94444,,
874,DEU,Germany,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,2379.672789,,
985,GRC,Greece,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,27.057841,,


In [152]:
# create standard columns
df['Country Name'] = df['Country']
df['Indicator'] = indicator
df['data_col'] = df['Value']
df['higher_is_better'] = True
df['Year'] = 2019
df['Sub-Pillar'] = subpillar

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]

In [153]:
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
13,Australia,2019,Amount invested into startups yearly from priv...,487.310802,1.017817,True,Startup Environment
120,Austria,2019,Amount invested into startups yearly from priv...,90.416881,1.003187,True,Startup Environment
232,Belgium,2019,Amount invested into startups yearly from priv...,395.828165,1.014445,True,Startup Environment
343,Canada,2019,Amount invested into startups yearly from priv...,3286.872933,1.121012,True,Startup Environment
448,Czech Republic,2019,Amount invested into startups yearly from priv...,27.204683,1.000857,True,Startup Environment
538,Denmark,2019,Amount invested into startups yearly from priv...,352.437466,1.012846,True,Startup Environment
650,Finland,2019,Amount invested into startups yearly from priv...,321.566548,1.011708,True,Startup Environment
762,France,2019,Amount invested into startups yearly from priv...,2164.94444,1.079656,True,Startup Environment
874,Germany,2019,Amount invested into startups yearly from priv...,2379.672789,1.087572,True,Startup Environment
985,Greece,2019,Amount invested into startups yearly from priv...,27.057841,1.000852,True,Startup Environment


In [154]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(bf), index=False)

In [155]:
### 17. Regulatory Quality

In [156]:
indicator = indicators[16]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))
df.head(16)

Regulatory Quality
global_innovation_dataset


Unnamed: 0,ISO2,Economy,IndCode,IndNum,IndName,Rank,Score,Value,MinDatCov,Outdated,OverallStreWeak,IncomeStreWeak,DataYear
0,ALB,Albania,Inputs,.1,Innovation Input Sub-index,71.0,39.940929,,,,,,
1,ALB,Albania,Outputs,.2,Innovation Output Sub-index,92.0,16.109792,,,,,,
2,ALB,Albania,Index,0,Global Innovation Index,84.0,28.025361,,,,,,
3,ALB,Albania,P1,1,Institutions,60.0,64.91924,,0.0,,,,
4,ALB,Albania,SP11,1.1.,Political environment,71.0,56.071183,,0.0,,,,
5,ALB,Albania,PolStab,1.1.1,Political and operational stability*,60.0,69.642859,2.1,,0.0,,,2020.0
6,ALB,Albania,GovEff,1.1.2,Government effectiveness*,76.0,49.285344,-0.061331,,0.0,,,2019.0
7,ALB,Albania,SP12,1.2.,Regulatory environment,82.0,58.941532,,0.0,,,,
8,ALB,Albania,RegQua,1.2.1,Regulatory quality*,58.0,50.670701,0.27438,,0.0,,,2019.0
9,ALB,Albania,RuleOL,1.2.2,Rule of law*,85.0,35.889077,-0.411179,,0.0,,,2019.0


In [157]:
subpillars[2]
subpillar = subpillars[2]
print(subpillar)

Startup Environment


In [158]:
df = df[(df.IndName == 'Regulatory quality*')]
df

Unnamed: 0,ISO2,Economy,IndCode,IndNum,IndName,Rank,Score,Value,MinDatCov,Outdated,OverallStreWeak,IncomeStreWeak,DataYear
8,ALB,Albania,RegQua,1.2.1,Regulatory quality*,58.0,50.670701,0.274380,,0.0,,,2019.0
120,DZA,Algeria,RegQua,1.2.1,Regulatory quality*,129.0,9.425411,-1.303379,,0.0,,W,2019.0
232,AGO,Angola,RegQua,1.2.1,Regulatory quality*,124.0,20.130634,-0.893871,,0.0,,,2019.0
344,ARG,Argentina,RegQua,1.2.1,Regulatory quality*,103.0,30.616559,-0.492753,,0.0,,W,2019.0
456,ARM,Armenia,RegQua,1.2.1,Regulatory quality*,59.0,50.020701,0.249515,,0.0,,,2019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14232,UZB,Uzbekistan,RegQua,1.2.1,Regulatory quality*,126.0,17.531597,-0.993293,,0.0,W,W,2019.0
14344,VNM,Viet Nam,RegQua,1.2.1,Regulatory quality*,93.0,36.639718,-0.262348,,0.0,,,2019.0
14456,YEM,Yemen,RegQua,1.2.1,Regulatory quality*,132.0,0.000000,-1.663930,,0.0,W,W,2019.0
14568,ZMB,Zambia,RegQua,1.2.1,Regulatory quality*,105.0,29.008412,-0.554269,,0.0,,,2019.0


In [159]:
df['higher_is_better'] = True
df['Year'] = df['DataYear']
df['Indicator'] = indicator
df['data_col'] = df['Score']
df['Sub-Pillar'] = subpillar
df['Country Name'] = df['Economy']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [160]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
8,Albania,2019.0,Regulatory Quality,50.670701,3.533535,True,Startup Environment
120,Algeria,2019.0,Regulatory Quality,9.425411,1.471271,True,Startup Environment
232,Angola,2019.0,Regulatory Quality,20.130634,2.006532,True,Startup Environment
344,Argentina,2019.0,Regulatory Quality,30.616559,2.530828,True,Startup Environment
456,Armenia,2019.0,Regulatory Quality,50.020701,3.501035,True,Startup Environment
...,...,...,...,...,...,...,...
14232,Uzbekistan,2019.0,Regulatory Quality,17.531597,1.876580,True,Startup Environment
14344,Viet Nam,2019.0,Regulatory Quality,36.639718,2.831986,True,Startup Environment
14456,Yemen,2019.0,Regulatory Quality,0.000000,1.000000,True,Startup Environment
14568,Zambia,2019.0,Regulatory Quality,29.008412,2.450421,True,Startup Environment


In [161]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

In [162]:
### 18. Ease of Getting Credit

In [163]:
indicator = indicators[17]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))
df.head(16)

Ease of Getting Credit
global_innovation_dataset


Unnamed: 0,ISO2,Economy,IndCode,IndNum,IndName,Rank,Score,Value,MinDatCov,Outdated,OverallStreWeak,IncomeStreWeak,DataYear
0,ALB,Albania,Inputs,.1,Innovation Input Sub-index,71.0,39.940929,,,,,,
1,ALB,Albania,Outputs,.2,Innovation Output Sub-index,92.0,16.109792,,,,,,
2,ALB,Albania,Index,0,Global Innovation Index,84.0,28.025361,,,,,,
3,ALB,Albania,P1,1,Institutions,60.0,64.91924,,0.0,,,,
4,ALB,Albania,SP11,1.1.,Political environment,71.0,56.071183,,0.0,,,,
5,ALB,Albania,PolStab,1.1.1,Political and operational stability*,60.0,69.642859,2.1,,0.0,,,2020.0
6,ALB,Albania,GovEff,1.1.2,Government effectiveness*,76.0,49.285344,-0.061331,,0.0,,,2019.0
7,ALB,Albania,SP12,1.2.,Regulatory environment,82.0,58.941532,,0.0,,,,
8,ALB,Albania,RegQua,1.2.1,Regulatory quality*,58.0,50.670701,0.27438,,0.0,,,2019.0
9,ALB,Albania,RuleOL,1.2.2,Rule of law*,85.0,35.889077,-0.411179,,0.0,,,2019.0


In [164]:
subpillars[2]
subpillar = subpillars[2]
print(subpillar)

Startup Environment


In [165]:
df = df[(df.IndName == 'Ease of getting credit*')]
df

Unnamed: 0,ISO2,Economy,IndCode,IndNum,IndName,Rank,Score,Value,MinDatCov,Outdated,OverallStreWeak,IncomeStreWeak,DataYear
46,ALB,Albania,EaseCred,4.1.1,Ease of getting credit*,44.0,70.0,70.0,,0.0,,,2019.0
158,DZA,Algeria,EaseCred,4.1.1,Ease of getting credit*,129.0,10.0,10.0,,0.0,W,W,2019.0
270,AGO,Angola,EaseCred,4.1.1,Ease of getting credit*,131.0,5.0,5.0,,0.0,W,W,2019.0
382,ARG,Argentina,EaseCred,4.1.1,Ease of getting credit*,94.0,50.0,50.0,,0.0,,W,2019.0
494,ARM,Armenia,EaseCred,4.1.1,Ease of getting credit*,44.0,70.0,70.0,,0.0,,,2019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14270,UZB,Uzbekistan,EaseCred,4.1.1,Ease of getting credit*,61.0,65.0,65.0,,0.0,,,2019.0
14382,VNM,Viet Nam,EaseCred,4.1.1,Ease of getting credit*,23.0,80.0,80.0,,0.0,,,2019.0
14494,YEM,Yemen,EaseCred,4.1.1,Ease of getting credit*,132.0,0.0,0.0,,0.0,W,W,2019.0
14606,ZMB,Zambia,EaseCred,4.1.1,Ease of getting credit*,4.0,95.0,95.0,,0.0,S,S,2019.0


In [166]:
df['higher_is_better'] = True
df['Year'] = df['DataYear']
df['Indicator'] = indicator
df['data_col'] = df['Score']
df['Sub-Pillar'] = subpillar
df['Country Name'] = df['Economy']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [167]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better','Sub-Pillar']]
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar
46,Albania,2019.0,Ease of Getting Credit,70.0,4.50,True,Startup Environment
158,Algeria,2019.0,Ease of Getting Credit,10.0,1.50,True,Startup Environment
270,Angola,2019.0,Ease of Getting Credit,5.0,1.25,True,Startup Environment
382,Argentina,2019.0,Ease of Getting Credit,50.0,3.50,True,Startup Environment
494,Armenia,2019.0,Ease of Getting Credit,70.0,4.50,True,Startup Environment
...,...,...,...,...,...,...,...
14270,Uzbekistan,2019.0,Ease of Getting Credit,65.0,4.25,True,Startup Environment
14382,Viet Nam,2019.0,Ease of Getting Credit,80.0,5.00,True,Startup Environment
14494,Yemen,2019.0,Ease of Getting Credit,0.0,1.00,True,Startup Environment
14606,Zambia,2019.0,Ease of Getting Credit,95.0,5.75,True,Startup Environment


In [168]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

### Score Aggregating

In [169]:
import os


In [170]:
# get list of files in scores folder
scores = os.listdir('../indicator_scores/')
scores = [s for s in scores if s.startswith('business')]

In [171]:
scores

['business_Cloud Services (Spend, IT Forecast Data)_scores.csv',
 'business_Doing Business Index_scores.csv',
 'business_Ease of Doing Business_scores.csv',
 'business_Ease of finding skilled employees_scores.csv',
 'business_Ease of Getting Credit_scores.csv',
 'business_global_innovation_dataset_scores.csv',
 'business_ICT task-intensive jobs as a percentage of total employment_scores.csv',
 'business_Networking Services (Spend, IT Forecast Data)_scores.csv',
 'business_Regulatory Quality_scores.csv',
 'business_Share of business with internet_scores.csv',
 'business_Share of businesses with broadband_scores.csv',
 'business_Share of businesses with online presence_scores.csv',
 'business_Size of digital economy (% of transactions)_scores.csv',
 'business_Size of gig economy (% of GDP)_scores.csv',
 'business_start_up_investment_scores.csv',
 'business_Strength of Legal Rights_scores.csv',
 'business_Time to Start a Business_scores.csv',
 'business_UNCTAD Business-to-Consumer (B2C) E

In [172]:
# create a dataframe that concatenates all these file into one table
df = pd.concat([pd.read_csv('../indicator_scores/{}'.format(s)) for s in scores])    

In [173]:
df

Unnamed: 0.1,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar,Unnamed: 0
0,Germany,2018.0,"Cloud Services (Spend, IT Forecast Data)",18.200000,4.739130,True,Technology Adoption,
1,Japan,2018.0,"Cloud Services (Spend, IT Forecast Data)",20.300000,5.195652,True,Technology Adoption,
2,United States,2018.0,"Cloud Services (Spend, IT Forecast Data)",18.000000,4.695652,True,Technology Adoption,
3,United Kingdom,2018.0,"Cloud Services (Spend, IT Forecast Data)",19.800000,5.086957,True,Technology Adoption,
4,Australia,2018.0,"Cloud Services (Spend, IT Forecast Data)",16.100000,4.282609,True,Technology Adoption,
...,...,...,...,...,...,...,...,...
147,Vietnam,2019.0,Venture Capital Availability,3.267500,3.212789,True,Financing Incentives,
148,"Yemen, Rep.",2019.0,Venture Capital Availability,2.163688,1.648849,True,Financing Incentives,
149,South Africa,2019.0,Venture Capital Availability,3.051583,2.906867,True,Financing Incentives,
150,Zambia,2019.0,Venture Capital Availability,1.803466,1.138466,True,Financing Incentives,


In [174]:
# Data cleaning
df['new_rank_score'] = df['new_rank_score'].fillna(0)
df.sort_values(by=['Country Name'], ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)

In [175]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2527 entries, 0 to 2526
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country Name      2527 non-null   object 
 1   Year              2527 non-null   float64
 2   Indicator         2527 non-null   object 
 3   data_col          2318 non-null   float64
 4   new_rank_score    2527 non-null   float64
 5   higher_is_better  2527 non-null   bool   
 6   Sub-Pillar        2527 non-null   object 
 7   Unnamed: 0        286 non-null    float64
dtypes: bool(1), float64(4), object(3)
memory usage: 140.8+ KB


In [176]:
df

Unnamed: 0.1,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar,Unnamed: 0
0,Afghanistan,2020.0,UNCTAD Business-to-Consumer (B2C) E-commerce I...,17.100000,1.636766,True,Technology Adoption,142.0
1,Afghanistan,2019.0,Doing Business Index,167.000000,1.608466,False,Startup Environment,
2,Afghanistan,2019.0,Strength of legal rights index (0=weak to 12=s...,10.000000,5.166667,True,Startup Environment,
3,Afghanistan,2019.0,Time required to start a business (days),2.000000,2.666667,True,Startup Environment,
4,Afghanistan,2019.0,Ease of doing business index (1=most business-...,173.000000,1.449735,True,Startup Environment,
...,...,...,...,...,...,...,...,...
2522,Zimbabwe,2018.0,Share of businesses with online presence,38.700000,2.694915,True,Technology Adoption,
2523,Zimbabwe,2019.0,Ease of finding skilled employees,4.380000,4.380000,True,Startup Environment,
2524,Zimbabwe,2019.0,Regulatory Quality,5.247431,1.262372,True,Startup Environment,
2525,Zimbabwe,2019.0,Size of gig economy (% of GDP),2.120000,1.000000,True,Technology Adoption,


In [177]:
df.head(15)

Unnamed: 0.1,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better,Sub-Pillar,Unnamed: 0
0,Afghanistan,2020.0,UNCTAD Business-to-Consumer (B2C) E-commerce I...,17.1,1.636766,True,Technology Adoption,142.0
1,Afghanistan,2019.0,Doing Business Index,167.0,1.608466,False,Startup Environment,
2,Afghanistan,2019.0,Strength of legal rights index (0=weak to 12=s...,10.0,5.166667,True,Startup Environment,
3,Afghanistan,2019.0,Time required to start a business (days),2.0,2.666667,True,Startup Environment,
4,Afghanistan,2019.0,Ease of doing business index (1=most business-...,173.0,1.449735,True,Startup Environment,
5,Africa Eastern and Southern,2019.0,Ease of doing business index (1=most business-...,,0.0,True,Startup Environment,
6,Africa Eastern and Southern,2019.0,Strength of legal rights index (0=weak to 12=s...,4.538462,2.891026,True,Startup Environment,
7,Africa Eastern and Southern,2019.0,Time required to start a business (days),1.0,1.0,True,Startup Environment,
8,Africa Western and Central,2019.0,Ease of doing business index (1=most business-...,,0.0,True,Startup Environment,
9,Africa Western and Central,2019.0,Time required to start a business (days),1.0,1.0,True,Startup Environment,


In [178]:
df.describe()

Unnamed: 0.1,Year,data_col,new_rank_score,Unnamed: 0
count,2527.0,2318.0,2527.0,286.0
mean,2018.976256,107.002363,3.025301,71.283217
std,0.932941,2820.345919,1.601642,41.839462
min,2010.0,0.0,0.0,0.0
25%,2019.0,4.001038,1.833333,35.25
50%,2019.0,26.016879,3.130474,71.0
75%,2019.0,66.6925,4.333333,106.75
max,2020.0,135648.690714,6.0,151.0


In [179]:
df['Country Name'] = df['Country Name'].astype(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2527 entries, 0 to 2526
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country Name      2527 non-null   object 
 1   Year              2527 non-null   float64
 2   Indicator         2527 non-null   object 
 3   data_col          2318 non-null   float64
 4   new_rank_score    2527 non-null   float64
 5   higher_is_better  2527 non-null   bool   
 6   Sub-Pillar        2527 non-null   object 
 7   Unnamed: 0        286 non-null    float64
dtypes: bool(1), float64(4), object(3)
memory usage: 140.8+ KB


In [180]:
# checking country names
sorted(df['Country Name'].unique().tolist())

['Afghanistan',
 'Africa Eastern and Southern',
 'Africa Western and Central',
 'Albania',
 'Albania ',
 'Algeria',
 'Algeria ',
 'American Samoa',
 'Andorra',
 'Angola',
 'Angola ',
 'Antigua and Barbuda',
 'Arab World',
 'Argentina',
 'Argentina ',
 'Armenia',
 'Armenia ',
 'Aruba',
 'Australia',
 'Australia ',
 'Austria',
 'Austria ',
 'Azerbaijan',
 'Azerbaijan ',
 'B:',
 'Bahamas, The',
 'Bahrain',
 'Bahrain ',
 'Bangladesh',
 'Bangladesh ',
 'Bangladesh Chittagong',
 'Bangladesh Dhaka',
 'Barbados',
 'Belarus',
 'Belarus ',
 'Belgium',
 'Belgium ',
 'Belize',
 'Benin',
 'Benin ',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bolivia ',
 'Bolivia (Plurinational State of)',
 'Bosnia and Herzegovina',
 'Bosnia and Herzegovina ',
 'Botswana',
 'Botswana ',
 'Brazil',
 'Brazil ',
 'Brazil Rio de Janeiro',
 'Brazil São Paulo',
 'British Virgin Islands',
 'Brunei Darussalam',
 'Bulgaria',
 'Bulgaria ',
 'Burkina Faso',
 'Burkina Faso ',
 'Burundi',
 'Burundi ',
 'Cabo Verde',
 'Cabo Verde ',
 'C

In [181]:
# remove trailing whitespaces from country name
df['Country Name'] = df['Country Name'].str.strip()
df['Country Name'] = df['Country Name'].str.strip('**')
df['Country Name'] = df['Country Name'].str.strip('*')

In [182]:
# checking country names
sorted(df['Country Name'].unique().tolist())

['Afghanistan',
 'Africa Eastern and Southern',
 'Africa Western and Central',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Arab World',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'B:',
 'Bahamas, The',
 'Bahrain',
 'Bangladesh',
 'Bangladesh Chittagong',
 'Bangladesh Dhaka',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bolivia (Plurinational State of)',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brazil Rio de Janeiro',
 'Brazil São Paulo',
 'British Virgin Islands',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Caribbean small states',
 'Cayman Islands',
 'Central African Republic',
 'Central Europe and the Baltics',
 'Chad',
 'Channel Islands',
 'Chile',
 'China',
 'China Beijing',
 'China Shanghai',
 'China, Hong Kong SAR',
 'Colombia',
 'Comoros',
 'Congo',
 'Congo, Dem. 

In [183]:
# average indicator scores per country
agg_df = df.groupby(['Country Name']).agg({'new_rank_score':'mean','data_col':'count'})

In [184]:
agg_df.columns = ['agg_score', 'count_source' ]

In [185]:
max_number_sources = agg_df.describe()['count_source']['max']

In [186]:
agg_df['agg_score_wt'] = agg_df['agg_score']*(agg_df['count_source']/max_number_sources)

In [187]:
agg_df.sort_values(by='agg_score', ascending=False, inplace=True)

In [188]:
agg_df.head(25)

Unnamed: 0_level_0,agg_score,count_source,agg_score_wt
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"China, Hong Kong SAR",5.772979,1,0.303841
"Korea, Republic of",5.662237,1,0.298012
"Hong Kong SAR, China",5.495062,5,1.446069
Czechia,5.440753,1,0.286355
Hong Kong,5.428253,3,0.857093
United States of America,5.336368,4,1.123446
Singapore,5.138992,14,3.786626
"Taiwan, China",5.067787,3,0.800177
New Zealand,4.8723,16,4.102989
Kosovo,4.824405,4,1.015664


In [189]:
agg_df.to_csv('../pillar_scores/business_scores_v0.csv')

In [190]:
### Score Aggregating by Subpillars

In [191]:
df.insert(0,'Pillar','Business')
df

# remove trailing whitespaces from country name
df['Country Name'] = df['Country Name'].str.strip()
df['Country Name'] = df['Country Name'].str.strip('**')
df['Country Name'] = df['Country Name'].str.strip('*')

In [192]:
sub_df = df.groupby(['Pillar','Sub-Pillar','Country Name']).agg({'new_rank_score':'mean','data_col':'count'})

In [193]:
sub_df.columns = ['agg_score', 'count_source' ]

In [194]:
max_number_sources = sub_df.describe()['count_source']['max']

In [195]:
sub_df['agg_score_wt'] = sub_df['agg_score']*(sub_df['count_source']/max_number_sources)

In [196]:
sub_df.to_csv('../subpillar_score/business_scores_subpillar_v0.csv')