In [1]:
import pandas as pd
import numpy as np

### Get all the pillar names from the excel

In [2]:
names = pd.read_excel('../../UNDP Digital Assessment Data Framework Filename Matching V7.xlsx')

In [3]:
col_names = ['Indicator','check', 'Data Source','Index','Filename']

In [4]:
names = names[col_names]

In [5]:
names.head()

Unnamed: 0,Indicator,check,Data Source,Index,Filename
0,Countries,,United Nations,False,Countries
1,"Database of Global Administrative Areas (GADM,...",,GADM maps and data,False,
2,High Resolution Population Density Maps + Demo...,,Facebook,False,
3,population density vs openstreetmap object den...,,Kontur,False,
4,Population Density,Infrastructure,World Bank,False,population_density


In [6]:
# get all the files per pillar
data_stats = names.groupby('check').agg({'Filename':'count','Indicator':'count'})

In [7]:
data_stats

Unnamed: 0_level_0,Filename,Indicator
check,Unnamed: 1_level_1,Unnamed: 2_level_1
Business,20,25
Foundations,9,12
Government,10,15
Infrastructure,39,48
People,39,47
Regulation,6,7
Strategy,1,1


### Business

In [8]:
bnames = names[(names.check=='Business')&(~names.Filename.isna())]#&(names.Index==False)]

In [9]:
bnames.head(25)

Unnamed: 0,Indicator,check,Data Source,Index,Filename
75,UNCTAD Business-to-Consumer (B2C) E-commerce I...,Business,UNCTAD/World Bank,True,b2c_ecommerse_idx
76,"Networking Services (Spend, IT Forecast Data)",Business,Portulans Institute,True,network_readiness_index
77,ICT goods exports,Business,UNCTAD,False,ict_goods
78,ICT goods imports,Business,UNCTAD,False,ict_goods
79,ICT service exports,Business,UNCTAD,False,ict_services
80,ICT service imports,Business,UNCTAD,False,ict_services
81,"Cloud Services (Spend, IT Forecast Data)",Business,Statista/Business Software Alliance,True,cloud_services
82,ICT task-intensive jobs as a percentage of tot...,Business,OECD,False,ICT_proportion
85,Share of business with internet,Business,OECD,False,business_internet
86,Share of businesses with broadband,Business,World Bank,False,business_broadband


In [10]:
# get list of names for all indicators
indicators = bnames.Indicator.unique()

In [11]:
# get all file names
bfiles = bnames.Filename.unique()

In [12]:
bfiles

array(['b2c_ecommerse_idx', 'network_readiness_index', 'ict_goods',
       'ict_services', 'cloud_services', 'ICT_proportion',
       'business_internet', 'business_broadband',
       'share_of_businesses_online_presence', 'prevalance_gig_economy',
       'size_digital_economy', 'venture_cap_avail',
       'legal_rights_strength', 'time_start_bus', 'ease_doing_bus',
       'ease_of_finding_skilled_employees', 'start_up_investment',
       'doing_bus_idx'], dtype=object)

In [13]:
# ls digital-readiness-assessment-main/processed/

In [14]:
##ict_goods and services not in process data

In [15]:
# formula for converting scale
def convert_rank(old_value, old_min=1, old_max=7, new_min=1, new_max=6 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value

### 1. 'UNCTAD Business-to-Consumer (B2C) E-commerce Index'

In [16]:
indicators[0]

'UNCTAD Business-to-Consumer (B2C) E-commerce Index'

In [17]:
# load data
indicator = indicators[0]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

UNCTAD Business-to-Consumer (B2C) E-commerce Index
b2c_ecommerse_idx


In [18]:
df.Indicator.unique()

array(['Country rank and value in the UNCTAD B2C E-commerce Index'],
      dtype=object)

In [19]:
df.head()

Unnamed: 0,2015,2016,2017,Country ISO3,Country Name,Indicator Id,Indicator,Subindicator Type
0,14.1,17.0,,AFG,Afghanistan,24717,Country rank and value in the UNCTAD B2C E-com...,Value
1,,130.0,132.0,AFG,Afghanistan,24718,Country rank and value in the UNCTAD B2C E-com...,Rank
2,21.1,29.0,,AGO,Angola,24717,Country rank and value in the UNCTAD B2C E-com...,Value
3,,113.0,113.0,AGO,Angola,24718,Country rank and value in the UNCTAD B2C E-com...,Rank
4,51.0,62.0,,ALB,Albania,24717,Country rank and value in the UNCTAD B2C E-com...,Value


In [20]:
# two sub indicators per country
df['Subindicator Type'].unique()

array(['Value', 'Rank'], dtype=object)

In [21]:
# list of columns with data
value_cols = ['2015','2016','2017']

In [22]:
# value doesn't have data for 2017
df[df['Subindicator Type']=='Value'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147 entries, 0 to 292
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   2015               137 non-null    float64
 1   2016               136 non-null    float64
 2   2017               0 non-null      float64
 3   Country ISO3       147 non-null    object 
 4   Country Name       147 non-null    object 
 5   Indicator Id       147 non-null    int64  
 6   Indicator          147 non-null    object 
 7   Subindicator Type  147 non-null    object 
dtypes: float64(3), int64(1), object(4)
memory usage: 10.3+ KB


In [23]:
df[df['Subindicator Type']=='Value'].describe()

Unnamed: 0,2015,2016,2017,Indicator Id
count,137.0,136.0,0.0,147.0
mean,47.151095,53.713382,,24717.0
std,22.978008,26.604495,,0.0
min,6.5,3.0,,24717.0
25%,28.7,32.0,,24717.0
50%,47.2,53.0,,24717.0
75%,65.2,78.25,,24717.0
max,89.7,96.5,,24717.0


In [24]:
# Rank does have 2017 data
df[df['Subindicator Type']=='Rank'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147 entries, 1 to 293
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   2015               0 non-null      float64
 1   2016               137 non-null    float64
 2   2017               136 non-null    float64
 3   Country ISO3       147 non-null    object 
 4   Country Name       147 non-null    object 
 5   Indicator Id       147 non-null    int64  
 6   Indicator          147 non-null    object 
 7   Subindicator Type  147 non-null    object 
dtypes: float64(3), int64(1), object(4)
memory usage: 10.3+ KB


In [25]:
df[df['Subindicator Type']=='Rank'].describe()

Unnamed: 0,2015,2016,2017,Indicator Id
count,0.0,137.0,136.0,147.0
mean,,69.0,73.066176,24718.0
std,,39.692569,41.872682,0.0
min,,1.0,1.0,24718.0
25%,,35.0,36.75,24718.0
50%,,69.0,72.5,24718.0
75%,,103.0,110.25,24718.0
max,,137.0,144.0,24718.0


Going to use the rank column for now since it has more recent data

In [26]:
df_rank = df[df['Subindicator Type']=='Rank'].copy()

In [27]:
# convert rank into 1-6 - in order to get most recent data 
df_rank['data_col'] = df_rank['2017'] 

min_rank = df_rank['data_col'].min()
max_rank = df_rank['data_col'].max()

# transform 1-147 rank into 1-6
df_rank['new_rank_score'] = df_rank['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# need to invert score since higher rank is not better 
df_rank['new_rank_score'] = (6-df_rank['new_rank_score'])+1

In [28]:
# prep output

df_rank.rename(columns={'Country ISO3':'Country Code'}, inplace=True)

df_rank['higher_is_better'] = True


# output scores to csv
df_rank[['Country Code', 'Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

In [29]:
df_rank[['Country Code', 'Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


Unnamed: 0,Country Code,Country Name,Indicator,data_col,new_rank_score,higher_is_better
1,AFG,Afghanistan,Country rank and value in the UNCTAD B2C E-com...,132.0,1.41958,True
3,AGO,Angola,Country rank and value in the UNCTAD B2C E-com...,113.0,2.083916,True
5,ALB,Albania,Country rank and value in the UNCTAD B2C E-com...,59.0,3.972028,True
7,ARE,United Arab Emirates,Country rank and value in the UNCTAD B2C E-com...,23.0,5.230769,True
9,ARG,Argentina,Country rank and value in the UNCTAD B2C E-com...,81.0,3.202797,True
11,ARM,Armenia,Country rank and value in the UNCTAD B2C E-com...,78.0,3.307692,True
13,AUS,Australia,Country rank and value in the UNCTAD B2C E-com...,14.0,5.545455,True
15,AUT,Austria,Country rank and value in the UNCTAD B2C E-com...,17.0,5.440559,True
17,AZE,Azerbaijan,Country rank and value in the UNCTAD B2C E-com...,68.0,3.657343,True
19,BDI,Burundi,Country rank and value in the UNCTAD B2C E-com...,140.0,1.13986,True


## 2. Networking Services (Spend, IT Forecast Data)


In [30]:
indicator = indicators[1]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Networking Services (Spend, IT Forecast Data)
network_readiness_index


In [31]:
# no DATE associated wiht the data

In [32]:
df.head(16)

Unnamed: 0,Rank,Country,Score,Income Group,Region
0,1,Sweden,82.75,High-income,Europe
1,2,Denmark,82.19,High-income,Europe
2,3,Singapore,81.39,High-income,Asia & Pacific
3,4,Netherlands,81.37,High-income,Europe
4,5,Switzerland,80.41,High-income,Europe
5,6,Finland,80.16,High-income,Europe
6,7,Norway,79.39,High-income,Europe
7,8,United States,78.91,High-income,The Americas
8,9,Germany,77.48,High-income,Europe
9,10,United Kingdom,76.27,High-income,Europe


In [33]:
# going to use the  score column since this is already an Index

In [34]:
# convert rank into 1-6 - in order to get most recent data 
df['data_col'] = df['Score'] 

# going to assume index is between 1-100 but not 100% sure
min_rank = 1 #df['data_col'].min()
max_rank = 100 #df['data_col'].max()

# transform 1-100 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# need to invert score since higher rank is not better 
# df['new_rank_score'] = (6-df['new_rank_score'])+1

In [35]:
# prepare output
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['Indicator'] = indicator
df['higher_is_better'] = True

df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


# output scores to csv
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

## 3. ICT Services Imports

In [36]:
indicator = indicators[2]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

ICT goods exports 
ict_goods


The *ICT goods* file is actually *ICT services* data. Label is incorrect.

In [37]:
df.head()

Unnamed: 0,Year,Economy,Economy Label,Partner,Partner Label,Flow,Flow Label,IctProductCategory,IctProductCategory Label,Percentage of total merchandise trade,Percentage of total merchandise trade Footnote
0,2000,0,World,0,World,1,Imports,ICT00,Total ICT goods,16.05745,
1,2000,0,World,0,World,2,Exports,ICT00,Total ICT goods,16.0222,
2,2000,0,World,0,World,3,Re-exports,ICT00,Total ICT goods,29.61627,
3,2000,0,World,0,World,24,Re-imports,ICT00,Total ICT goods,31.78913,
4,2000,8,Albania,0,World,1,Imports,ICT00,Total ICT goods,3.33276,


In [38]:
dcol = 'Percentage of total merchandise trade'
indicol = 'IctProductCategory Label'
cname = 'Economy Label'

# filter most recent year and imports
df = df[(df.Year==2019)&(df['Flow Label']=='Imports')]

# create the standard columns
df['higher_is_better'] = True
df['Indicator'] =  df[indicol]+' '+ df['Flow Label']
df['Country Name'] = df[cname]
# df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df[dcol]


# convert 1-100 %  into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=0,old_max=100))

df.sort_values(by='new_rank_score', ascending=False)

# prepare output
df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# output scores
df.to_csv('../indicator_scores/{}_scores.csv'.format(bf+'_imports'), index=False)

In [39]:
df.head(15)

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
21134,World,Total ICT goods Imports,13.36253,1.668127,True
21138,Antigua and Barbuda,Total ICT goods Imports,3.57755,1.178878,True
21141,Azerbaijan,Total ICT goods Imports,3.94651,1.197326,True
21143,Argentina,Total ICT goods Imports,8.09647,1.404824,True
21146,Australia,Total ICT goods Imports,10.35458,1.517729,True
21149,Austria,Total ICT goods Imports,5.23378,1.261689,True
21151,Armenia,Total ICT goods Imports,5.21115,1.260557,True
21155,Barbados,Total ICT goods Imports,3.08026,1.154013,True
21157,Belgium,Total ICT goods Imports,3.36165,1.168083,True
21159,Bermuda,Total ICT goods Imports,2.02207,1.101104,True


## 4. ICT Services Exports

In [40]:
indicator = indicators[3]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

ICT goods imports 
ict_goods


The *ICT goods* file is actually *ICT services* data. Label is incorrect.

In [41]:
dcol = 'Percentage of total trade in services'
indicol = 'Category Label'
cname = 'Economy Label'

# filter most recent year
df = df[(df.Year==2019)&(df['Flow Label']=='Exports')]

# create standard columns
df['higher_is_better'] = True
df['Indicator'] =  df[indicol]+' '+ df['Flow Label']
df['Country Name'] = df[cname]
# df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df[dcol]


# convert 1-100 %  into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=0,old_max=100))

df.sort_values(by='new_rank_score', ascending=False)

# prepare output
df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# output scores
df.to_csv('../indicator_scores/{}_scores.csv'.format(bf+'_exports'), index=False)

KeyError: 'Category Label'

## 5. ICT Goods Exports

In [None]:
indicator = indicators[4]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df[(df.Year==2019)&(df['Flow Label']=='Exports')].sort_values(by=dcol, ascending=False)

In [None]:
dcol = 'Percentage of total merchandise trade'
indicol = 'IctProductCategory Label'
cname = 'Economy Label'

# filter most recent year and exports
df = df[(df.Year==2019)&(df['Flow Label']=='Exports')]

# create standard columns
df['higher_is_better'] = True
df['Indicator'] =  df[indicol]+' '+ df['Flow Label']
df['Country Name'] = df[cname]
# df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df[dcol]


# convert 1-100 %  into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=0,old_max=100))

df.sort_values(by='new_rank_score', ascending=False)

# prepare output
df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# output scores
df.to_csv('../indicator_scores/{}_scores.csv'.format(bf+'_exports'), index=False)

In [None]:
df.head()

## 6. ICT Goods Imports

In [None]:
indicator = indicators[5]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head()

In [None]:
dcol = 'Percentage of total merchandise trade'
indicol = 'IctProductCategory Label'
cname = 'Economy Label'

# filter most recent year and exports
df = df[(df.Year==2019)&(df['Flow Label']=='Imports')]

# create standard columns
df['higher_is_better'] = True
df['Indicator'] =  df[indicol]+' '+ df['Flow Label']
df['Country Name'] = df[cname]
# df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df[dcol]


# convert 1-100 %  into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=0,old_max=100))

df.sort_values(by='new_rank_score', ascending=False)

# prepare output
df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# output scores
df.to_csv('../indicator_scores/{}_scores.csv'.format(bf+'_imports'), index=False)

Question: do we want to use imports or exports?

## 7. Cloud Services (Spend, IT Forecast Data)


In [None]:
indicator = indicators[6]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
# remove nulls
df = df.dropna()

In [None]:
# prepare standard columns
df['data_col'] = df['Unnamed: 1'].astype(float)
df['Indicator'] = indicator
df['higher_is_better'] = True

In [None]:
# create country name column
df['Country Name'] = df['Cloud computing policy environment by category - country ranking 2018']

In [None]:
min_rank = 1
max_rank = df['Country Name'].nunique()

In [None]:
# transform 1-24 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# need to invert score since higher rank is not better 
# df['new_rank_score'] = (6-df['new_rank_score'])+1

In [None]:
# prepare output
df.sort_values(by='new_rank_score', ascending=False)

# prepare output
df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# output scores
df.to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

## 8. ICT task-intensive jobs as a percentage of total employment

In [None]:
indicator = indicators[7]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head()

In [None]:
df[(df['Time']==2018)&(df['Information and communication technologies']=='ICT-intensive')].sort_values(by='Value', ascending=False)

In [None]:
# bnames

In [None]:
df['Information and communication technologies'].unique()

In [None]:
df.Sex.unique()

In [None]:
# convert to correct types
df['Value'] = df['Value'].astype(float)

In [None]:
df['Value'].describe()

In [None]:
# filter on relevant years
df = df[(df['Time']==2017)&(df['Information and communication technologies']=='ICT-intensive')]

# convert rank into 1-6 - in order to get most recent data 
df['data_col'] = df['Value'] 

# going to assume index is between 1-100 but not 100% sure
min_rank = 0 #df['data_col'].min()
max_rank = 100 #df['data_col'].max()

# transform 1-147 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# need to invert score since higher rank is not better 
# df['new_rank_score'] = (6-df['new_rank_score'])+1

# df_rank[['Country ISO3', 'Country Name','Indicator','data_col','new_rank_score']].head()

df.rename(columns={'Country':'Country Name'}, inplace=True)
df['Indicator'] = indicator
df['higher_is_better'] = True

df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


# output scores to csv
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

## 9. Share of business with internet

In [None]:
indicator = indicators[8]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

## 10. Share of businesses with broadband

In [None]:
indicator = indicators[9]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

No data found

## 11. Share of businesses with online presence

In [None]:
indicator = indicators[10]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df['Unnamed: 1'].unique()

 No data in the file

## 12. Share of businesses with online presence

In [None]:
indicator = indicators[11]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.iloc[0,0]

In [None]:
df.head()

### No country mapping

## 13. Size of digital economy (% of transactions)


In [None]:
indicator = indicators[12]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df


No Country Data

## 14. Venture Capital Availability


In [None]:
indicator = indicators[13]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.info()

#### What fields should we use?

In [None]:
df.Indicator.unique()  
# relavent indicator is:        'Venture capital availability, 1-7 (best)', id 529

In [None]:
df[df.Indicator=='Venture capital availability, 1-7 (best)']

# there are two indicator subtypes: Index and Rank
df[df.Indicator=='Venture capital availability, 1-7 (best)']['Subindicator Type'].unique()


In [None]:
df['data_col'] = df['2019']#.apply(lambda row: convert_rank(row))

In [None]:
# create two data frames for each subtype
df_rank = df[(df.Indicator=='Venture capital availability, 1-7 (best)')&(df['Subindicator Type']=='Rank')]
df_index = df[(df.Indicator=='Venture capital availability, 1-7 (best)')&(df['Subindicator Type']=='Index 1-7 (best)')]

In [None]:
df_index.head()

In [None]:
df_rank.head()

In [None]:
# 152 countries in data
df_rank['Country Name'].nunique()

In [None]:
# list of columns with data
value_cols = ['2007-2008', '2008-2009', '2009-2010',
       '2010-2011', '2011-2012', '2012-2013', '2013-2014', '2014-2015',
       '2015-2016', '2016-2017', '2017-2018', '2017', '2018', '2019']

In [None]:
# quick snapshot of dataframes
df_rank[value_cols].describe()

In [None]:
# index values have no 2019 data
df_index[value_cols].describe()

Two datasets, 1 rank (1-140ish per year), 1 index (1-7).  
Index is normally prefered but doesn't have data for 2019.  
Will use rank for this reason. Transformation should be the approx the same.

In [None]:
df_rank.data_col.describe()

In [None]:
# transform 1-141 rank into 1-6
min_rank = df_rank.data_col.min()
max_rank = df_rank.data_col.max()
df_rank['new_rank_score'] = df_rank['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
# need to invert score since higher rank is not better 
df_rank['new_rank_score'] = (6-df_rank['new_rank_score'])+1

###### Going to test how rank and index conversions compare.  

Unlike index scores, Rank scores don't necessarily have equal weightings as you go up. E.g. the difference betwee ranks 15-16 is not the same as ranks 17-18. But for our conversion we will have to treat them as equal, so I am keen to see how this affects results compared to the index scores.  
To do this I construct a simple test example of an older data column 2007-2008

In [None]:
df_rank['2007-2008'].describe()

In [None]:
# convert rank into 1-6
df_rank['test0'] = df_rank['2007-2008'].apply(lambda row: convert_rank(row, 1, 130))
df_rank['test0'] = (6-df_rank['test0'])+1

In [None]:
df_index['test0'] = df_index['2007-2008'].apply(lambda row: convert_rank(row, 1, 7))

In [None]:
new_df = df_index[['test0','Country Name']].merge(df_rank[['test0','Country Name']], suffixes=['index','rank'],on='Country Name')

In [None]:
new_df.corr()

The two are correlated enough to not dramatically impact results. But could be worth revising in the future when there is more recent index data.

#### Prepare Output

In [None]:
df_rank[['Country ISO3', 'Country Name','Indicator','data_col','new_rank_score']].head()

df_rank.rename(columns={'Country ISO3':'Country Code'}, inplace=True)

df_rank['higher_is_better'] = True

df_rank[['Country Code', 'Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)



In [None]:

# output scores to csv
df_rank[['Country Code', 'Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

### 15. Legal Rights Strength

In [None]:
indicator = indicators[14]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.info()

#### Find Relevant Columns

In [None]:
df['Series Name'].unique()

In [None]:
# remove unwanted rows
df = df[~df['Series Code'].isna()]
df = df.replace('..', np.nan)

In [None]:
df.loc[0][0]

In [None]:
df.head()

In [None]:
df.columns

In [None]:
# clean data
df = df.replace('..', np.nan)

df[['1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]']] = df[['1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]']].astype(float)

In [None]:
df.info()

In [None]:
# keep only most recent value
df = df[['Series Name', 'Series Code', 'Country Name', 'Country Code','2019 [YR2019]']]

df['higher_is_better'] = True
df['Indicator'] = df['Series Name']
df['data_col'] = df['2019 [YR2019]']


#### Convert Scales

In [None]:
# convert 0-12 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=0,old_max=12))

In [None]:
df.head(16)

In [None]:
df.columns

#### Prepare Output

In [None]:
df = df[['Country Name', 'Country Code','Indicator', 'data_col', 'new_rank_score','higher_is_better']]
df.sort_values(by='new_rank_score', ascending=False).head(16)

In [None]:
# output scores
df.to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

### 16. Time to start business


#### Load Data

In [None]:
indicator = indicators[15]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
# remove unwanted rows
df = df[~df['Series Code'].isna()]
df = df.replace('..', np.nan)

In [None]:
df[['1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]']] = df[['1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]']].astype(float)

In [None]:
df.info()

In [None]:
# keep only most recent value
df = df[['Series Name', 'Series Code', 'Country Name', 'Country Code','2019 [YR2019]']]

df['higher_is_better'] = True
df['Indicator'] = df['Series Name']
df['data_col'] = df['2019 [YR2019]']


In [None]:
df.head()

In [None]:
def map_days_to_scores(number):
    if number<=2:
        return 4
    if number <6 and number>2:
        return 3
    elif number >=6 and number <11:
        return 2
    elif number >=11:
        return 1

In [None]:
# map days to scores 
df['data_col'] = df['data_col'].apply(map_days_to_scores)

In [None]:
# convert 1-3 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=1,old_max=4))

In [None]:
df.head()

In [None]:
df[df['2019 [YR2019]']<3]

In [None]:
df['2019 [YR2019]'].describe()

In [None]:
df = df[['Country Name', 'Country Code','Indicator', 'data_col', 'new_rank_score','higher_is_better']]
df.sort_values(by='new_rank_score', ascending=False).head(16)

#### Prepare Output

In [None]:
bf

In [None]:
# output scores
df.to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

### 17. Ease doing business


#### Load Data

In [None]:
indicator = indicators[16]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
# remove unwanted rows
df = df.replace('..', np.nan)
df = df[~df['Series Code'].isna()]


In [None]:
df['Series Name'].unique()

In [None]:
df.info()

In [None]:
# keep only most recent value
df = df[['Series Name', 'Series Code', 'Country Name', 'Country Code','2019 [YR2019]']]

# df['higher_is_better'] = False
df['Indicator'] = df['Series Name']
df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df['2019 [YR2019]']
# df['data_norm'] = df['data_norm'] = (df.data_col - df.data_col.mean())/df.data_col.std()

In [None]:
rank_min = df.data_col.min()
rank_max = df.data_col.max()

In [None]:
rank_min, rank_max

In [None]:
# convert 1-190 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=rank_min,old_max=rank_max))

In [None]:
# invert since to put as higher is  better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [None]:
df.sort_values(by='new_rank_score', ascending=False).head(16)

In [None]:
df['higher_is_better'] = True


#### Prepare Output

In [None]:
df = df[['Country Name', 'Country Code','Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# output scores
df.to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

## 18. Ease of finding skilled employees

In [None]:
indicator = indicators[17]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head()

In [None]:
values = ['2017','2018','2019']

In [None]:
df = df.replace('No data', np.nan)

In [None]:
df[values] = df[values].astype(float)

In [None]:
df.head()

In [None]:
df[values].describe()

In [None]:
# create standard columns
df['data_col'] = df['2019']
df['new_rank_score'] = df['data_col']
df['higher_is_better'] = True
df['Indicator'] = indicator


df.rename(columns={'Country':'Country Name'}, inplace=True)


df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


# output scores to csv
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

## 19. Amount invested into startups yearly from private, public, blended sources (respectively)


In [None]:
indicator = indicators[18]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

No data

## 20. Doing Business Index


In [None]:
indicator = indicators[19]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
bnames

In [None]:
df

### Score Aggregating

In [None]:
import os


In [None]:
# get list of files in scores folder
scores = os.listdir('../indicator_scores/')

In [None]:
scores

In [None]:
# create a dataframe that concatenates all these file into one table
df = pd.concat([pd.read_csv('../indicator_scores/{}'.format(s)) for s in scores])    

In [None]:
df

In [None]:
# Data cleaning
df['new_rank_score'] = df['new_rank_score'].fillna(0)
df.sort_values(by=['Country Name'], ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df.info()

In [None]:
df

In [None]:
df.head(15)

In [None]:
df.describe()

In [None]:
# checking country names
sorted(df['Country Name'].unique().tolist())

In [None]:
# remove trailing whitespaces from country name
df['Country Name'] = df['Country Name'].str.strip()


In [None]:
# checking country names
sorted(df['Country Name'].unique().tolist())

In [None]:
# average indicator scores per country
agg_df = df.groupby(['Country Name']).agg({'new_rank_score':'mean','data_col':'count'})

In [None]:
agg_df.columns = ['agg_score', 'count_source' ]

In [None]:
max_number_sources = agg_df.describe()['count_source']['max']

In [None]:
agg_df['agg_score_wt'] = agg_df['agg_score']*(agg_df['count_source']/max_number_sources)

In [None]:
agg_df.sort_values(by='agg_score', ascending=False, inplace=True)

In [None]:
agg_df.head(25)

In [None]:
agg_df.to_csv('../pillar_scores/business_scores_v0.csv')