In [1]:
import pandas as pd
import numpy as np

In [2]:
### Get all the pillar names from the excel

In [3]:
names = pd.read_excel('../../UNDP Digital Assessment Data Framework Filename Matching V7.xlsx')

In [4]:
col_names = ['Indicator','check', 'Data Source','Index','Filename']

In [5]:
names = names[col_names]

In [6]:
names.head()

Unnamed: 0,Indicator,check,Data Source,Index,Filename
0,Countries,,United Nations,False,Countries
1,"Database of Global Administrative Areas (GADM,...",,GADM maps and data,False,
2,High Resolution Population Density Maps + Demo...,,Facebook,False,
3,population density vs openstreetmap object den...,,Kontur,False,
4,Population Density,Infrastructure,World Bank,False,population_density


In [7]:
# get all the files per pillar
data_stats = names.groupby('check').agg({'Filename':'count','Indicator':'count'})

In [8]:
data_stats

Unnamed: 0_level_0,Filename,Indicator
check,Unnamed: 1_level_1,Unnamed: 2_level_1
Business,20,25
Foundations,9,12
Government,10,15
Infrastructure,39,48
People,39,47
Regulation,6,7
Strategy,1,1


In [9]:
### People

In [10]:
bnames = names[(names.check=='People')&(~names.Filename.isna())]#&(names.Index==False)]
bnames

Unnamed: 0,Indicator,check,Data Source,Index,Filename
99,Human Capital Index (HCI),People,DESA,True,e_government_index
100,% of population using internet (all),People,ITU,False,ITU_database
101,% of population using internet (female),People,ITU,False,ITU_database
102,% of population using internet (male),People,ITU,False,ITU_database
103,SDG 4.4 Digital literacy data,People,UNESCO,False,SDG_digital_literacy_data
104,UNDP Human Development Index (HDI),People,UNDP,True,undp_human_developmnt
105,Facebook Social Connectedness Index,People,Facebook,True,fb_social_connectedness
106,Share of individuals using the Internet to int...,People,OECD,False,population_interacting_public_officials
107,Level of satisfaction for online public servic...,People,Boston Consulting Group/SalesForce,False,digital_public_service_use
108,Number of mobile apps available in national la...,People,GSMA Mobile Connectivity Index,False,apps_in_national_language


In [11]:
# get list of names for all indicators
indicators = bnames.Indicator.unique()

In [12]:
# get all file names
bfiles = bnames.Filename.unique()

In [13]:
bfiles

array(['e_government_index', 'ITU_database', 'SDG_digital_literacy_data',
       'undp_human_developmnt', 'fb_social_connectedness',
       'population_interacting_public_officials',
       'digital_public_service_use', 'apps_in_national_language',
       'time_spent_online', 'happiness_score', 'cryptocurrency_adoption',
       'not_buying_online_concern_about_returning',
       'not_buying_online_concern_about_security',
       'ewaste_per_inhabitant', 'automation_led_unemployment',
       'cyberbullying_rate', 'global_wellbeing_initiative',
       'financial_inclusiveness', 'individuals_buying_online_frequency',
       'e-commerce_activity', 'top_sites', 'youtube_searches',
       'google_trends', 'FB_users', 'gender_gaps',
       'population_digital_financial_services',
       'tax_percent_mobile_ownership', 'population_with_smartphones',
       'Chainalysis_2020_Geography_Cryptocurrency_Report'], dtype=object)

In [14]:
# formula for converting scale
def convert_rank(old_value, old_min=1, old_max=7, new_min=1, new_max=6 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value

In [15]:
### 1. Human Capital Index (HCI)

In [16]:
indicators[0]

# load data
indicator = indicators[0]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Human Capital Index (HCI)
e_government_index


In [17]:
df.head(10)

Unnamed: 0,Survey Year,Country Name,E-Government Rank,E-Government Index,E-Participation Index,Online Service Index,Human Capital Index,Telecommunication Infrastructure Index
0,2020,Iraq,143,0.436,0.3095,0.3353,0.4358,0.537
1,2020,Ireland,27,0.8433,0.8571,0.7706,0.9494,0.81
2,2020,Israel,30,0.8361,0.7143,0.7471,0.8924,0.8689
3,2020,Italy,37,0.8231,0.8214,0.8294,0.8466,0.7932
4,2020,Jamaica,114,0.5392,0.369,0.3882,0.7142,0.5151
5,2020,Japan,14,0.8989,0.9881,0.9059,0.8684,0.9223
6,2020,Jordan,117,0.5309,0.3333,0.3588,0.68,0.554
7,2020,Kazakhstan,29,0.8375,0.881,0.9235,0.8866,0.7024
8,2020,Kenya,116,0.5326,0.5952,0.6765,0.5812,0.3402
9,2020,Kiribati,145,0.432,0.5595,0.4941,0.6778,0.1241


In [18]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Human Capital Index'] 
df['Year'] = df['Survey Year']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [19]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Iraq,2020,Human Capital Index (HCI),0.4358,3.179,True
1,Ireland,2020,Human Capital Index (HCI),0.9494,5.747,True
2,Israel,2020,Human Capital Index (HCI),0.8924,5.462,True
3,Italy,2020,Human Capital Index (HCI),0.8466,5.233,True
4,Jamaica,2020,Human Capital Index (HCI),0.7142,4.571,True
5,Japan,2020,Human Capital Index (HCI),0.8684,5.342,True
6,Jordan,2020,Human Capital Index (HCI),0.68,4.4,True
7,Kazakhstan,2020,Human Capital Index (HCI),0.8866,5.433,True
8,Kenya,2020,Human Capital Index (HCI),0.5812,3.906,True
9,Kiribati,2020,Human Capital Index (HCI),0.6778,4.389,True


In [20]:
### 2. % of population using internet (all)

In [21]:
indicators[1]

# load data
indicator = indicators[1]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population using internet (all)
ITU_database


In [22]:
df.head(150)

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
0,Angola,Africa,AGO,Female mobile phone ownership as a % of total ...,2010.0,,,
1,Benin,Africa,BEN,Female mobile phone ownership as a % of total ...,2010.0,,,
2,Botswana,Africa,BWA,Female mobile phone ownership as a % of total ...,2010.0,,,
3,Burkina Faso,Africa,BFA,Female mobile phone ownership as a % of total ...,2010.0,,,
4,Burundi,Africa,BDI,Female mobile phone ownership as a % of total ...,2010.0,,,
...,...,...,...,...,...,...,...,...
145,North Macedonia,Europe,MKD,Female mobile phone ownership as a % of total ...,2010.0,,,
146,Norway,Europe,NOR,Female mobile phone ownership as a % of total ...,2010.0,,,
147,Poland,Europe,POL,Female mobile phone ownership as a % of total ...,2010.0,,,
148,Portugal,Europe,PRT,Female mobile phone ownership as a % of total ...,2010.0,,,


In [23]:
df = df[(df.Year==2019)]
df = df[(df.iloc[:,3] =='Internet users: 25-74 years as a % of all 25-74 years')]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df['Country Name'] = df['Country']


min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [24]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(100)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
7750,Angola,2019.0,% of population using internet (all),,,True
7751,Benin,2019.0,% of population using internet (all),,,True
7752,Botswana,2019.0,% of population using internet (all),,,True
7753,Burkina Faso,2019.0,% of population using internet (all),,,True
7754,Burundi,2019.0,% of population using internet (all),,,True
...,...,...,...,...,...,...
7845,Samoa,2019.0,% of population using internet (all),,,True
7846,Singapore,2019.0,% of population using internet (all),,,True
7847,Solomon Islands,2019.0,% of population using internet (all),,,True
7848,Sri Lanka,2019.0,% of population using internet (all),,,True


In [25]:
### 3. % of population using internet (female)

In [26]:
indicators[2]

# load data
indicator = indicators[2]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population using internet (female)
ITU_database


In [27]:
df.head(10)

# Must convert the string in the dataset to float

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
0,Angola,Africa,AGO,Female mobile phone ownership as a % of total ...,2010.0,,,
1,Benin,Africa,BEN,Female mobile phone ownership as a % of total ...,2010.0,,,
2,Botswana,Africa,BWA,Female mobile phone ownership as a % of total ...,2010.0,,,
3,Burkina Faso,Africa,BFA,Female mobile phone ownership as a % of total ...,2010.0,,,
4,Burundi,Africa,BDI,Female mobile phone ownership as a % of total ...,2010.0,,,
5,Cabo Verde,Africa,CPV,Female mobile phone ownership as a % of total ...,2010.0,,,
6,Cameroon,Africa,CMR,Female mobile phone ownership as a % of total ...,2010.0,,,
7,Central African Rep.,Africa,CAF,Female mobile phone ownership as a % of total ...,2010.0,,,
8,Chad,Africa,TCD,Female mobile phone ownership as a % of total ...,2010.0,,,
9,Congo (Rep. of the),Africa,COG,Female mobile phone ownership as a % of total ...,2010.0,,,


In [28]:
df = df[(df.iloc[:,3] =='Female Internet users as a % of total female population')]
df = df[(df.Year==2019)]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value']
df['Country Name'] = df['Country']


min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [29]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
40307,Cabo Verde,2019.0,% of population using internet (female),60.965017,3.76639,True
40308,Côte d'Ivoire,2019.0,% of population using internet (female),32.924445,2.151554,True
40309,Kenya,2019.0,% of population using internet (female),20.125122,1.414451,True
40310,Lesotho,2019.0,% of population using internet (female),44.922768,2.842529,True
40311,Mauritius,2019.0,% of population using internet (female),60.130645,3.718339,True
40312,Bahrain,2019.0,% of population using internet (female),99.316403,5.975016,True
40313,Egypt,2019.0,% of population using internet (female),52.98699,3.306941,True
40314,Kuwait,2019.0,% of population using internet (female),99.579959,5.990194,True
40315,Morocco,2019.0,% of population using internet (female),70.16713,4.296332,True
40316,Oman,2019.0,% of population using internet (female),97.153466,5.850455,True


In [30]:
### 4. % of population using internet (male)

In [31]:
indicators[3]

# load data
indicator = indicators[3]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population using internet (male)
ITU_database


In [32]:
df.head(10)

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
0,Angola,Africa,AGO,Female mobile phone ownership as a % of total ...,2010.0,,,
1,Benin,Africa,BEN,Female mobile phone ownership as a % of total ...,2010.0,,,
2,Botswana,Africa,BWA,Female mobile phone ownership as a % of total ...,2010.0,,,
3,Burkina Faso,Africa,BFA,Female mobile phone ownership as a % of total ...,2010.0,,,
4,Burundi,Africa,BDI,Female mobile phone ownership as a % of total ...,2010.0,,,
5,Cabo Verde,Africa,CPV,Female mobile phone ownership as a % of total ...,2010.0,,,
6,Cameroon,Africa,CMR,Female mobile phone ownership as a % of total ...,2010.0,,,
7,Central African Rep.,Africa,CAF,Female mobile phone ownership as a % of total ...,2010.0,,,
8,Chad,Africa,TCD,Female mobile phone ownership as a % of total ...,2010.0,,,
9,Congo (Rep. of the),Africa,COG,Female mobile phone ownership as a % of total ...,2010.0,,,


In [33]:
df = df[(df.iloc[:,3] =='Male Internet users as a % of total male population')]
df = df[(df.Year==2019)]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value']
df['Country Name'] = df['Country']


min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [34]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
41078,Cabo Verde,2019.0,% of population using internet (male),62.921166,3.643039,True
41079,Côte d'Ivoire,2019.0,% of population using internet (male),39.808871,2.173867,True
41080,Kenya,2019.0,% of population using internet (male),25.071668,1.237071,True
41081,Lesotho,2019.0,% of population using internet (male),38.349985,2.08113,True
41082,Mauritius,2019.0,% of population using internet (male),63.393093,3.673038,True
41083,Bahrain,2019.0,% of population using internet (male),99.910314,5.994317,True
41084,Egypt,2019.0,% of population using internet (male),61.541347,3.555329,True
41085,Kuwait,2019.0,% of population using internet (male),99.521506,5.969602,True
41086,Morocco,2019.0,% of population using internet (male),78.636833,4.642032,True
41087,Oman,2019.0,% of population using internet (male),94.404849,5.644353,True


In [35]:
### 5. SDG 4.4 Digital literacy data

In [36]:
indicators[4]

# load data
indicator = indicators[4]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

# The indicators are all jumbled, need to unclutter this

SDG 4.4 Digital literacy data
SDG_digital_literacy_data


In [37]:
df.head(15)

Unnamed: 0,SDG_IND,Indicator,LOCATION,Country,TIME,Time,Value,Flag Codes,Flags
0,ICTSKILLTRANSFERFILE_M,Proportion of youth and adults who have transf...,BRA,Brazil,2014,2014,23.31007,,
1,ICTSKILLTRANSFERFILE_M,Proportion of youth and adults who have transf...,BRA,Brazil,2016,2016,21.53173,,
2,ICTSKILLTRANSFERFILE_M,Proportion of youth and adults who have transf...,BRA,Brazil,2017,2017,21.1488,,
3,ICTSKILLTRANSFERFILE_M,Proportion of youth and adults who have transf...,BRA,Brazil,2018,2018,21.84886,,
4,ICTSKILLDUPLIC_M,Proportion of youth and adults who have used c...,BRA,Brazil,2014,2014,25.64427,,
5,ICTSKILLDUPLIC_M,Proportion of youth and adults who have used c...,BRA,Brazil,2016,2016,22.01463,,
6,ICTSKILLDUPLIC_M,Proportion of youth and adults who have used c...,BRA,Brazil,2017,2017,22.69577,,
7,ICTSKILLDUPLIC_M,Proportion of youth and adults who have used c...,BRA,Brazil,2018,2018,22.59234,,
8,ICTSKILLTRANSFERFILE,Proportion of youth and adults who have transf...,TUR,Turkey,2014,2014,27.72807,,
9,ICTSKILLTRANSFERFILE,Proportion of youth and adults who have transf...,TUR,Turkey,2015,2015,25.66142,,


In [38]:
df = df[(df.Time == 2019)]
df.head(15)

Unnamed: 0,SDG_IND,Indicator,LOCATION,Country,TIME,Time,Value,Flag Codes,Flags
12,ICTSKILLTRANSFERFILE,Proportion of youth and adults who have transf...,TUR,Turkey,2019,2019,38.1844,,
35,ICTSKILLSOFTWARE,"Proportion of youth and adults who have found,...",THA,Thailand,2019,2019,5.3,,
75,ICTSKILLTRANSFERFILE,Proportion of youth and adults who have transf...,PAK,Pakistan,2019,2019,2.7,,
90,ICTSKILLPROGLANG,Proportion of youth and adults who have wrote ...,PRT,Portugal,2019,2019,8.2,,
103,ICTSKILLSOFTWARE,"Proportion of youth and adults who have found,...",FIN,Finland,2019,2019,63.1,,
121,ICTSKILLTRANSFERFILE,Proportion of youth and adults who have transf...,FRA,France,2019,2019,54.6,,
126,ICTSKILLTRANSFERFILE,Proportion of youth and adults who have transf...,ESP,Spain,2019,2019,54.67742,,
133,ICTSKILLATTACH,Proportion of youth and adults who have sent e...,KAZ,Kazakhstan,2019,2019,51.7,,
143,ICTSKILLPROGLANG,Proportion of youth and adults who have wrote ...,KAZ,Kazakhstan,2019,2019,6.3,,
152,ICTSKILLPROGLANG,Proportion of youth and adults who have wrote ...,BEL,Belgium,2019,2019,4.5,,


In [39]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value']
df['Country Name'] = df['Country']
df['Year'] = df['Time']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [40]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(10)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
12,Turkey,2019,SDG 4.4 Digital literacy data,38.1844,2.920344,True
35,Thailand,2019,SDG 4.4 Digital literacy data,5.3,1.257836,True
75,Pakistan,2019,SDG 4.4 Digital literacy data,2.7,1.12639,True
90,Portugal,2019,SDG 4.4 Digital literacy data,8.2,1.404449,True
103,Finland,2019,SDG 4.4 Digital literacy data,63.1,4.17998,True
121,France,2019,SDG 4.4 Digital literacy data,54.6,3.750253,True
126,Spain,2019,SDG 4.4 Digital literacy data,54.67742,3.754167,True
133,Kazakhstan,2019,SDG 4.4 Digital literacy data,51.7,3.60364,True
143,Kazakhstan,2019,SDG 4.4 Digital literacy data,6.3,1.308392,True
152,Belgium,2019,SDG 4.4 Digital literacy data,4.5,1.217391,True


In [41]:
### 6. UNDP Human Development Index (HDI)

In [42]:
indicators[5]

# load data
indicator = indicators[5]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

UNDP Human Development Index (HDI) 
undp_human_developmnt


In [43]:
df.head(15)

Unnamed: 0,HDI rank,Country,Value,Unnamed: 3,(years),(2017 PPP $),data_country,data_year
0,,,2019.0,,2019.0,2019.0,,
1,,VERY HIGH HUMAN DEVELOPMENT,,,,,,
2,1.0,Norway,0.957,,12.89775,66494.25217,,
3,2.0,Ireland,0.955,,12.6663305,68370.58737,,
4,2.0,Switzerland,0.955,,13.38081241,69393.52076,,
5,4.0,"Hong Kong, China (SAR)",0.949,,12.27996,62984.76553,,
6,4.0,Iceland,0.949,,12.77278684,54682.38057,,
7,6.0,Germany,0.947,,14.15168,55314.35355,,
8,7.0,Sweden,0.945,,12.54847,54507.80504,,
9,8.0,Australia,0.944,,12.72469119,48084.84207,,


In [44]:
# choose only the rows where column HDI rank is numeric

df = df.iloc[0:194,:]
df = df[pd.to_numeric(df['HDI rank'], errors='coerce').notnull()]



In [45]:
df.head(67)

Unnamed: 0,HDI rank,Country,Value,Unnamed: 3,(years),(2017 PPP $),data_country,data_year
2,1.0,Norway,0.957,,12.89775,66494.25217,,
3,2.0,Ireland,0.955,,12.6663305,68370.58737,,
4,2.0,Switzerland,0.955,,13.38081241,69393.52076,,
5,4.0,"Hong Kong, China (SAR)",0.949,,12.27996,62984.76553,,
6,4.0,Iceland,0.949,,12.77278684,54682.38057,,
...,...,...,...,...,...,...,...,...
64,62.0,Malaysia,0.81,,10.37283,27534.09856,,
65,64.0,Kuwait,0.806,,7.275667996,58590.08219,,
66,64.0,Serbia,0.806,,11.19411,17191.66873,,
67,66.0,Mauritius,0.804,,9.54,25266.21195,,


In [46]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = False
df['Indicator'] = indicator
df['data_col'] = df['HDI rank']
df['Country Name'] = df['Country']
df['Year'] = 2019


min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# invert since to put as higher is  better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [47]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
2,Norway,2019,UNDP Human Development Index (HDI),1.0,6.0,False
3,Ireland,2019,UNDP Human Development Index (HDI),2.0,5.973404,False
4,Switzerland,2019,UNDP Human Development Index (HDI),2.0,5.973404,False
5,"Hong Kong, China (SAR)",2019,UNDP Human Development Index (HDI),4.0,5.920213,False
6,Iceland,2019,UNDP Human Development Index (HDI),4.0,5.920213,False
7,Germany,2019,UNDP Human Development Index (HDI),6.0,5.867021,False
8,Sweden,2019,UNDP Human Development Index (HDI),7.0,5.840426,False
9,Australia,2019,UNDP Human Development Index (HDI),8.0,5.81383,False
10,Netherlands,2019,UNDP Human Development Index (HDI),8.0,5.81383,False
11,Denmark,2019,UNDP Human Development Index (HDI),10.0,5.760638,False


In [48]:
### 7. Facebook Social Connectedness Index

In [49]:
indicators[6]

# load data
indicator = indicators[6]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Facebook Social Connectedness Index
fb_social_connectedness


In [50]:
df.head(25)

Unnamed: 0,user_loc,fr_loc,scaled_sci
0,1001,AE,8729
1,1001,AG,95256
2,1001,AL,3122
3,1001,AM,3470
4,1001,AO,2839
5,1001,AR,3729
6,1001,AT,6977
7,1001,AU,21136
8,1001,AW,27607
9,1001,AZ,1108


In [51]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['scaled_sci'] 
df['Country Name'] = df['fr_loc']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [52]:
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(30)

# Need to find a way to convert ISO codes to full country names

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
0,AE,Facebook Social Connectedness Index,8729,1.000044,True
1,AG,Facebook Social Connectedness Index,95256,1.000476,True
2,AL,Facebook Social Connectedness Index,3122,1.000016,True
3,AM,Facebook Social Connectedness Index,3470,1.000017,True
4,AO,Facebook Social Connectedness Index,2839,1.000014,True
5,AR,Facebook Social Connectedness Index,3729,1.000019,True
6,AT,Facebook Social Connectedness Index,6977,1.000035,True
7,AU,Facebook Social Connectedness Index,21136,1.000106,True
8,AW,Facebook Social Connectedness Index,27607,1.000138,True
9,AZ,Facebook Social Connectedness Index,1108,1.000006,True


In [53]:
### 8. Share of individuals using the Internet to interact with officials

In [54]:
indicators[7]

# load data
indicator = indicators[7]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Share of individuals using the Internet to interact with public authorities
population_interacting_public_officials


In [55]:
df.head(15)

Unnamed: 0,Country,Indicator,Breakdowns,Time,Unit,PowerCode,Reference Period,Value,Flags
0,Australia,Individuals using the Internet for downloading...,All (individuals aged 16-74),2010,Percentage,Units,,38.11,Difference in methodology
1,Australia,Individuals using the Internet for downloading...,All (individuals aged 16-74),2012,Percentage,Units,,49.96,Difference in methodology
2,Austria,Individuals using the Internet for visiting or...,All (individuals aged 16-74),2005,Percentage,Units,,29.194,
3,Austria,Individuals using the Internet for visiting or...,All (individuals aged 16-74),2006,Percentage,Units,,32.9733,
4,Austria,Individuals using the Internet for visiting or...,All (individuals aged 16-74),2007,Percentage,Units,,27.4741,
5,Austria,Individuals using the Internet for visiting or...,All (individuals aged 16-74),2008,Percentage,Units,,51.2252,Break
6,Austria,Individuals using the Internet for visiting or...,All (individuals aged 16-74),2009,Percentage,Units,,48.8515,
7,Austria,Individuals using the Internet for visiting or...,All (individuals aged 16-74),2010,Percentage,Units,,51.0458,
8,Austria,Individuals using the Internet for visiting or...,All (individuals aged 16-74),2011,Percentage,Units,,51.2893,
9,Austria,Individuals using the Internet for visiting or...,All (individuals aged 16-74),2012,Percentage,Units,,52.8274,


In [56]:
# filter most recent year
df = df[(df.Time==2019)]
df = df[(df.Indicator=='Individuals using the Internet for visiting or interacting with public authorities websites - last 12 m (%)')]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df['Year'] = df['Time']
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [57]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)


Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
16,Austria,2019,Share of individuals using the Internet to int...,69.7355,4.715671,True
64,Belgium,2019,Share of individuals using the Internet to int...,58.605,4.063974,True
109,Brazil,2019,Share of individuals using the Internet to int...,34.194129,2.634704,True
148,Colombia,2019,Share of individuals using the Internet to int...,6.274581,1.0,True
169,Czech Republic,2019,Share of individuals using the Internet to int...,53.7766,3.781269,True
216,Denmark,2019,Share of individuals using the Internet to int...,91.6709,6.0,True
262,Estonia,2019,Share of individuals using the Internet to int...,80.0156,5.317576,True
310,Finland,2019,Share of individuals using the Internet to int...,87.2954,5.743812,True
357,France,2019,Share of individuals using the Internet to int...,74.7242,5.007762,True
397,Germany,2019,Share of individuals using the Internet to int...,59.0977,4.092822,True


In [58]:
### 9. Level of satisfaction for online public service

In [59]:
indicators[8]

# load data
indicator = indicators[8]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Level of satisfaction for online public services (% of users, by type of interaction and service)
digital_public_service_use


In [60]:
df.head(10)

Unnamed: 0,Country,Net Perception (%)
0,Estonia,67
1,UAE,61
2,Saudi Arabia,59
3,Singapore,54
4,China,53
5,New Zealand,52
6,Netherlands,51
7,Qatar,51
8,Canada,40
9,Denmark,48


In [61]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Net Perception (%)'] 
df['Year'] = 2020
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# Need to replace the % mark 

In [62]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Estonia,2020,Level of satisfaction for online public servic...,67,6.0,True
1,UAE,2020,Level of satisfaction for online public servic...,61,5.655172,True
2,Saudi Arabia,2020,Level of satisfaction for online public servic...,59,5.54023,True
3,Singapore,2020,Level of satisfaction for online public servic...,54,5.252874,True
4,China,2020,Level of satisfaction for online public servic...,53,5.195402,True
5,New Zealand,2020,Level of satisfaction for online public servic...,52,5.137931,True
6,Netherlands,2020,Level of satisfaction for online public servic...,51,5.08046,True
7,Qatar,2020,Level of satisfaction for online public servic...,51,5.08046,True
8,Canada,2020,Level of satisfaction for online public servic...,40,4.448276,True
9,Denmark,2020,Level of satisfaction for online public servic...,48,4.908046,True


In [63]:
### 10. Number of mobile apps available in national language

In [64]:
indicators[9]

# load data
indicator = indicators[9]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Number of mobile apps available in national language(s)
apps_in_national_language


In [65]:
df.head(15)

Unnamed: 0.1,Unnamed: 0,ISO Code,Country,Region,Year,Number of apps in national language
0,1,AFG,Afghanistan,South Asia,2014,2.444741
1,2,AFG,Afghanistan,South Asia,2015,2.793221
2,3,AFG,Afghanistan,South Asia,2016,2.849881
3,4,AFG,Afghanistan,South Asia,2017,2.913741
4,5,AFG,Afghanistan,South Asia,2018,2.961247
5,6,AFG,Afghanistan,South Asia,2019,3.0
6,7,AGO,Angola,Sub-Saharan Africa,2014,53.333237
7,8,AGO,Angola,Sub-Saharan Africa,2015,55.08091
8,9,AGO,Angola,Sub-Saharan Africa,2016,56.516411
9,10,AGO,Angola,Sub-Saharan Africa,2017,57.061077


In [66]:
# filter most recent year and global value
df = df[(df.Year==2019)]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Number of apps in national language'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [67]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
5,Afghanistan,2019,Number of mobile apps available in national la...,3.0,1.0,True
11,Angola,2019,Number of mobile apps available in national la...,57.9785,3.833943,True
17,Albania,2019,Number of mobile apps available in national la...,67.286446,4.313734,True
23,United Arab Emirates,2019,Number of mobile apps available in national la...,78.453438,4.889352,True
29,Argentina,2019,Number of mobile apps available in national la...,89.724289,5.470324,True
35,Armenia,2019,Number of mobile apps available in national la...,12.541476,1.491829,True
41,Australia,2019,Number of mobile apps available in national la...,100.0,6.0,True
47,Austria,2019,Number of mobile apps available in national la...,89.197319,5.443161,True
53,Azerbaijan,2019,Number of mobile apps available in national la...,25.400829,2.154682,True
59,Burundi,2019,Number of mobile apps available in national la...,6.738045,1.192683,True


In [68]:
### 11. Device Addiction (time of use on internet)

In [69]:
indicators[10]

# load data
indicator = indicators[10]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Device Addiction (time of use on internet / on devices)
time_spent_online


In [70]:
df.head(15)

# Need to move the row further down

Unnamed: 0,Country,"Average daily time spent using the internet by online users worldwide as of 3rd quarter 2020, by region (in hours.minutes)"
0,Philippines,10.56
1,Brazil,10.08
2,Colombia,10.07
3,South Africa,10.06
4,Argentina,9.39
5,Malaysia,9.17
6,Mexico,9.01
7,Indonesia,8.52
8,Thailand,8.44
9,Taiwan,8.08


In [71]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Average daily time spent using the internet by online users worldwide as of 3rd quarter 2020, by region (in hours.minutes)'] 
df['Country Name'] = df['Country']
df['Year'] = 2020

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [72]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Philippines,2020,Device Addiction (time of use on internet / on...,10.56,6.0,True
1,Brazil,2020,Device Addiction (time of use on internet / on...,10.08,5.619651,True
2,Colombia,2020,Device Addiction (time of use on internet / on...,10.07,5.611727,True
3,South Africa,2020,Device Addiction (time of use on internet / on...,10.06,5.603803,True
4,Argentina,2020,Device Addiction (time of use on internet / on...,9.39,5.0729,True
5,Malaysia,2020,Device Addiction (time of use on internet / on...,9.17,4.898574,True
6,Mexico,2020,Device Addiction (time of use on internet / on...,9.01,4.771791,True
7,Indonesia,2020,Device Addiction (time of use on internet / on...,8.52,4.383518,True
8,Thailand,2020,Device Addiction (time of use on internet / on...,8.44,4.320127,True
9,Taiwan,2020,Device Addiction (time of use on internet / on...,8.08,4.034865,True


In [73]:
### 12. Gross National Wellbeing

In [74]:
indicators[11]

# load data
indicator = indicators[11]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

# happiness_score is missing looks like the problem is in a redundant space before it in the filename matching

Gross National Wellbeing
happiness_score


In [75]:
df.head(15)

Unnamed: 0,RANK,COUNTRY/ECONOMY,VALUE,SCORE
0,1.0,Finland,7.78,100.0
1,2.0,Switzerland,7.69,98.31
2,3.0,Denmark,7.69,98.28
3,4.0,Iceland,7.53,95.13
4,5.0,Norway,7.44,93.35
5,6.0,Netherlands,7.43,93.02
6,7.0,Luxembourg,7.4,92.6
7,8.0,Sweden,7.4,92.49
8,9.0,Ireland,7.25,89.67
9,10.0,Australia,7.23,89.26


In [76]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['VALUE'] 
df['Country Name'] = df['COUNTRY/ECONOMY']
df['Year'] = 2019

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [77]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Finland,2019,Gross National Wellbeing,7.78,6.0,True
1,Switzerland,2019,Gross National Wellbeing,7.69,5.911591,True
2,Denmark,2019,Gross National Wellbeing,7.69,5.911591,True
3,Iceland,2019,Gross National Wellbeing,7.53,5.75442,True
4,Norway,2019,Gross National Wellbeing,7.44,5.666012,True
5,Netherlands,2019,Gross National Wellbeing,7.43,5.656189,True
6,Luxembourg,2019,Gross National Wellbeing,7.4,5.626719,True
7,Sweden,2019,Gross National Wellbeing,7.4,5.626719,True
8,Ireland,2019,Gross National Wellbeing,7.25,5.479371,True
9,Australia,2019,Gross National Wellbeing,7.23,5.459725,True


In [78]:
### 13. % of internet users who own cryptocurrency

In [79]:
indicators[12]

# load data
indicator = indicators[12]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of internet users who own cryptocurrency
cryptocurrency_adoption


In [80]:
df.head(15)

Unnamed: 0,Country,Share of respondents who indicated they either owned or used cryptocurrencies in 55 countries worldwide in 2020,Units
0,Nigeria,31.9,in %
1,Vietnam,21.1,in %
2,Philippines,19.8,in %
3,South Africa,17.8,in %
4,Thailand,17.6,in %
5,Peru,16.1,in %
6,Turkey,16.1,in %
7,Colombia,15.3,in %
8,Argentina,14.4,in %
9,Indonesia,13.0,in %


In [81]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Share of respondents who indicated they either owned or used cryptocurrencies in 55 countries worldwide in 2020'] 
df['Country Name'] = df['Country']
df['Year'] = 2020

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [82]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Nigeria,2020,% of internet users who own cryptocurrency,31.9,6.0,True
1,Vietnam,2020,% of internet users who own cryptocurrency,21.1,4.085106,True
2,Philippines,2020,% of internet users who own cryptocurrency,19.8,3.85461,True
3,South Africa,2020,% of internet users who own cryptocurrency,17.8,3.5,True
4,Thailand,2020,% of internet users who own cryptocurrency,17.6,3.464539,True
5,Peru,2020,% of internet users who own cryptocurrency,16.1,3.198582,True
6,Turkey,2020,% of internet users who own cryptocurrency,16.1,3.198582,True
7,Colombia,2020,% of internet users who own cryptocurrency,15.3,3.056738,True
8,Argentina,2020,% of internet users who own cryptocurrency,14.4,2.897163,True
9,Indonesia,2020,% of internet users who own cryptocurrency,13.0,2.648936,True


In [83]:
### 14. Percentage of individuals not buying online due to concerns about returning products

In [84]:
indicators[13]

# load data
indicator = indicators[13]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Percentage of individuals not buying online due to concerns about returning products
not_buying_online_concern_about_returning


In [85]:
df.head(15)

Unnamed: 0,Indicator,Country,Variable,Unit,Scope,Time,Value,Flags
0,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,All individuals (aged 16-74),2005,10.3582,
1,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,All individuals (aged 16-74),2006,14.6973,
2,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,All individuals (aged 16-74),2009,25.081,
3,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,All individuals (aged 16-74),2015,17.0493,
4,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,All individuals (aged 16-74),2017,15.11467,
5,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,All individuals (aged 16-74),2019,16.72749,
6,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,Individuals aged 16-24,2005,7.9119,
7,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,Individuals aged 16-24,2006,13.7518,
8,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,Individuals aged 16-24,2009,24.0688,
9,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,Individuals aged 16-24,2015,12.671,


In [86]:
# filter most recent year and global value
df = df[(df.Time==2019)]
df = df[(df.Scope =='All individuals (aged 16-74)')]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = False
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df['Year'] = df['Time']
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# convert 1-190 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=min_rank,old_max=max_rank))

# invert since to put as higher is  better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [87]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
5,Austria,2019,Percentage of individuals not buying online du...,16.72749,4.589811,False
22,Belgium,2019,Percentage of individuals not buying online du...,9.16428,5.380248,False
37,Czech Republic,2019,Percentage of individuals not buying online du...,8.857784,5.41228,False
51,Denmark,2019,Percentage of individuals not buying online du...,13.27322,4.950819,False
67,Estonia,2019,Percentage of individuals not buying online du...,3.968984,5.923212,False
121,European Union (28 countries),2019,Percentage of individuals not buying online du...,17.25651,4.534523,False
135,Finland,2019,Percentage of individuals not buying online du...,46.96071,1.430114,False
152,France,2019,Percentage of individuals not buying online du...,14.23637,4.85016,False
167,Germany,2019,Percentage of individuals not buying online du...,16.45007,4.618804,False
184,Greece,2019,Percentage of individuals not buying online du...,9.599324,5.334781,False


In [88]:
### 15. Percentage of individuals not buying online due to payment security concerns

In [89]:
indicators[14]

# load data
indicator = indicators[14]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Percentage of individuals not buying online due to payment security concerns
not_buying_online_concern_about_security


In [90]:
df.head(15)

Unnamed: 0,Indicator,Country,Variable,Unit,Scope,Time,Value,Flags
0,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,All individuals (aged 16-74),2009,38.8093,
1,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,All individuals (aged 16-74),2015,34.8985,
2,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,All individuals (aged 16-74),2017,33.04743,
3,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,All individuals (aged 16-74),2019,31.19888,
4,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,Individuals aged 16-24,2009,38.3401,
5,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,Individuals aged 16-24,2015,34.9687,
6,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,Individuals aged 16-24,2017,16.65675,
7,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,Individuals aged 16-24,2019,12.88121,
8,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,Individuals aged 55-74,2009,40.5761,
9,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,Individuals aged 55-74,2015,42.602,


In [91]:
# filter most recent year and global value
df = df[(df.Time==2019)]
df = df[(df.Scope =='All individuals (aged 16-74)')]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = False
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df['Year'] = df['Time']
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# convert 1-190 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=min_rank,old_max=max_rank))

# invert since to put as higher is  better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [92]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
3,Austria,2019,Percentage of individuals not buying online du...,31.19888,3.765819,False
15,Belgium,2019,Percentage of individuals not buying online du...,18.12961,4.853663,False
27,Czech Republic,2019,Percentage of individuals not buying online du...,9.814267,5.545805,False
39,Denmark,2019,Percentage of individuals not buying online du...,18.77055,4.800313,False
51,Estonia,2019,Percentage of individuals not buying online du...,4.357608,6.0,False
84,European Union (28 countries),2019,Percentage of individuals not buying online du...,24.3086,4.339344,False
96,Finland,2019,Percentage of individuals not buying online du...,64.42721,1.0,False
108,France,2019,Percentage of individuals not buying online du...,40.84349,2.963033,False
120,Germany,2019,Percentage of individuals not buying online du...,25.12184,4.271652,False
132,Greece,2019,Percentage of individuals not buying online du...,23.22381,4.429638,False


In [93]:
### 16. E-waste generated, kilograms per inhabitant

In [94]:
indicators[15]

# load data
indicator = indicators[15]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

E-waste generated, kilograms per inhabitant
ewaste_per_inhabitant


In [95]:
df.head(15)

Unnamed: 0,iso3c,region_id,country_name,income_id,gdp,composition_food_organic_waste_percent,composition_glass_percent,composition_metal_percent,composition_other_percent,composition_paper_cardboard_percent,...,waste_treatment_controlled_landfill_percent,waste_treatment_incineration_percent,waste_treatment_landfill_unspecified_percent,waste_treatment_open_dump_percent,waste_treatment_other_percent,waste_treatment_recycling_percent,waste_treatment_sanitary_landfill_landfill_gas_system_percent,waste_treatment_unaccounted_for_percent,waste_treatment_waterways_marine_percent,where_where_is_this_data_measured
0,ABW,LCN,Aruba,HIC,35563.3125,,,,,,...,,,,,,11.0,,89.0,,
1,AFG,SAS,Afghanistan,LIC,2057.062256,,,,,,...,,,,,,,,,,Other
2,AGO,SSF,Angola,LMC,8036.69043,51.8,6.7,4.4,11.5,11.9,...,,,,,,,,,,
3,ALB,ECS,Albania,UMC,13724.058594,51.4,4.5,4.8,15.21,9.9,...,,,,,,,,,,Some disposal sites
4,AND,ECS,Andorra,HIC,43711.800781,31.2,8.2,2.6,11.6,35.1,...,,52.1,,,,,,47.9,,
5,ARE,MEA,United Arab Emirates,HIC,67119.132812,39.0,4.0,3.0,10.0,25.0,...,,,9.0,62.0,,20.0,,,,
6,ARG,LCN,Argentina,HIC,23550.099609,38.74,3.16,1.84,15.36,13.96,...,8.9,,,22.6,,6.0,62.5,,,Other
7,ARM,ECS,Armenia,UMC,11019.838867,57.0,3.2,3.4,17.4,6.7,...,,,,100.0,,,,,,Other
8,ASM,EAS,American Samoa,UMC,11113.442383,19.7,3.4,7.9,25.6,26.4,...,,,,,,,,,,
9,ATG,LCN,Antigua and Barbuda,HIC,17965.501953,46.0,7.0,7.0,12.0,15.0,...,98.68,,,,,,,1.14,0.1,Disposal Site


In [96]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
# Use special_waste_e_waste_tons_year times one thousand and divde by total population
df['data_col'] = df['special_waste_e_waste_tons_year']*1000/df['population_population_number_of_people'] 
df['Year'] = 2021
df['Country Name'] = df['country_name']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# convert 1-190 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=min_rank,old_max=max_rank))

In [97]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Aruba,2021,"E-waste generated, kilograms per inhabitant",,,True
1,Afghanistan,2021,"E-waste generated, kilograms per inhabitant",0.5771,1.040883,True
2,Angola,2021,"E-waste generated, kilograms per inhabitant",3.665901,1.261006,True
3,Albania,2021,"E-waste generated, kilograms per inhabitant",7.00724,1.499126,True
4,Andorra,2021,"E-waste generated, kilograms per inhabitant",,,True
5,United Arab Emirates,2021,"E-waste generated, kilograms per inhabitant",13.714713,1.977133,True
6,Argentina,2021,"E-waste generated, kilograms per inhabitant",6.786638,1.483405,True
7,Armenia,2021,"E-waste generated, kilograms per inhabitant",4.817254,1.343057,True
8,American Samoa,2021,"E-waste generated, kilograms per inhabitant",,,True
9,Antigua and Barbuda,2021,"E-waste generated, kilograms per inhabitant",11.366337,1.809777,True


In [98]:
### 17. Automation-led unemployment

In [99]:
indicators[16]

# load data
indicator = indicators[16]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Automation-led unemployment
automation_led_unemployment


In [100]:
df.head(15)

Unnamed: 0,Country,No. of employees potentially automable (millions),Total employees (millions),Potential Rate of Automation (%)
0,Japan,35.6,63.9,55.71
1,Thailand,21.0,38.4,54.69
2,Senegal,2.2,4.07,54.0
3,Colombia,9.3,17.5,53.14
4,Peru,6.9,13.0,53.08
5,Taiwan,5.2,9.8,53.06
6,Kenya,7.4,14.2,52.11
7,South Korea,12.5,24.0,52.08
8,Sweden,2.1,4.04,52.0
9,Costa Rica,1.1,2.12,52.0


In [101]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Potential Rate of Automation (%)'] 
df['Year'] = 2018
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# convert 1-190 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=min_rank,old_max=max_rank))

In [102]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Japan,2018,Automation-led unemployment,55.71,6.0,True
1,Thailand,2018,Automation-led unemployment,54.69,5.656797,True
2,Senegal,2018,Automation-led unemployment,54.0,5.42463,True
3,Colombia,2018,Automation-led unemployment,53.14,5.135262,True
4,Peru,2018,Automation-led unemployment,53.08,5.115074,True
5,Taiwan,2018,Automation-led unemployment,53.06,5.108345,True
6,Kenya,2018,Automation-led unemployment,52.11,4.788694,True
7,South Korea,2018,Automation-led unemployment,52.08,4.7786,True
8,Sweden,2018,Automation-led unemployment,52.0,4.751682,True
9,Costa Rica,2018,Automation-led unemployment,52.0,4.751682,True


In [103]:
### 18. Cyberbullying

In [104]:
indicators[17]

# load data
indicator = indicators[17]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Cyberbullying
cyberbullying_rate


In [105]:
df.head(15)

# Need to move the top row down further

Unnamed: 0,2011,2016,2018,Country
0,32,32,37,India
1,20,19,29,Brazil
2,15,34,26,United States
3,12,13,25,Belgium
4,10,25,26,South Africa
5,--,--,23,Malaysia
6,14,20,23,Sweden
7,18,17,20,Canada
8,5,14,20,Turkey
9,18,17,19,Saudi Arabia


In [106]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2018'] 
df['Year'] = 2018
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# convert 1-190 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=min_rank,old_max=max_rank))

# invert since to put as higher is  better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [107]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,India,2018,Cyberbullying,37,1.0,True
1,Brazil,2018,Cyberbullying,29,2.111111,True
2,United States,2018,Cyberbullying,26,2.527778,True
3,Belgium,2018,Cyberbullying,25,2.666667,True
4,South Africa,2018,Cyberbullying,26,2.527778,True
5,Malaysia,2018,Cyberbullying,23,2.944444,True
6,Sweden,2018,Cyberbullying,23,2.944444,True
7,Canada,2018,Cyberbullying,20,3.361111,True
8,Turkey,2018,Cyberbullying,20,3.361111,True
9,Saudi Arabia,2018,Cyberbullying,19,3.5,True


In [108]:
### 19. Global Wellbeing Initiative

In [109]:
indicators[18]

# load data
indicator = indicators[18]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

# Strange the global_wellbeing_initiative is clearly in the data manifest and the link works well 
# but it has not been moved to the processed folder

Global Wellbeing Initiative (World Happiness Index)
global_wellbeing_initiative


In [110]:
### 20. Financial Inclusiveness

In [111]:
indicators[19]

# load data
indicator = indicators[19]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)


df = pd.read_csv('../../processed/{}.csv'.format(bf))
# Strange the global_wellbeing_initiative is clearly in the data manifest and the link works well 
# but it has not been moved to the processed folder

Financial Inclusiveness
financial_inclusiveness


In [112]:
df.head(15)

Unnamed: 0.1,Unnamed: 0,Account (% age 15+),"Account, male (% age 15+)","Account, in labor force (% age 15+)","Account, out of labor force (% age 15+)","Account, female (% age 15+)","Account, young adults (% ages 15-24)","Account, older adults (% ages 25+)","Account, primary education or less (% ages 15+)","Account, secondary education or more (% ages 15+)",...,"Mobile money account, female (% age 15+)","Mobile money account, young adults (% age 15-24)","Mobile money account, older adults (% age 25+)","Mobile money account, primary education or less (% age 15+)","Mobile money account, secondary education or less (% age 15+)","Mobile money account, income, poorest 40% (% age 15+)","Mobile money account, income, richest 60% (% age 15+)","Mobile money account, rural (% age 15+)",data_country,data_year
0,Low income,9%,15%,15%,2%,3%,6%,11%,5%,30%,...,,,,,,,,,,Low income
1,Low income,10%,16%,15%,4%,4%,7%,12%,5%,23%,...,0%,0%,0%,0%,0%,0%,1%,0%,,Low income
2,Low income,15%,23%,25%,4%,7%,10%,18%,9%,31%,...,1%,0%,1%,0%,2%,0%,1%,1%,,Low income
3,Lower middle income,39%,39%,46%,31%,39%,30%,45%,35%,44%,...,,,,,,,,,,Lower middle income
4,Lower middle income,29%,36%,36%,12%,22%,15%,38%,14%,70%,...,,,,,,,,,,Lower middle income
5,Upper middle income,28%,34%,36%,15%,23%,26%,29%,15%,36%,...,,,,,,,,,,Upper middle income
6,Upper middle income,38%,43%,44%,29%,34%,30%,40%,24%,56%,...,,,,,,,,,,Upper middle income
7,Upper middle income,40%,42%,53%,27%,38%,32%,43%,27%,56%,...,2%,6%,1%,1%,4%,0%,4%,2%,,Upper middle income
8,,22%,30%,33%,11%,14%,15%,26%,14%,31%,...,,,,,,,,,,
9,,30%,38%,42%,18%,22%,21%,34%,21%,39%,...,,,,,,,,,,


In [113]:
### 21. E-commerce activity (% of individuals buying online and frequency)

In [114]:
indicators[20]

# load data
indicator = indicators[20]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

# Added _ between spaces to see if this changes anything

E-commerce activity (% of individuals buying online and frequency)
individuals_buying_online_frequency


In [115]:
df.head(15)

# Move the top row down further

Unnamed: 0,RANK,COUNTRY/ECONOMY,VALUE (%),SCORE,Year
0,1.0,Denmark,77.97,100.0,2017
1,2.0,Netherlands,75.72,97.11,2017
2,3.0,Norway,75.63,96.99,2017
3,4.0,United Kingdom,74.74,95.86,2017
4,5.0,"Korea, Rep.",72.47,92.94,2017
5,6.0,Sweden,71.65,91.9,2017
6,7.0,United States,70.43,90.33,2017
7,8.0,New Zealand,69.11,88.63,2017
8,9.0,Canada,68.57,87.95,2017
9,10.0,Australia,67.69,86.81,2017


In [116]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['VALUE (%)'] 
df['Country Name'] = df['COUNTRY/ECONOMY']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# convert 1-190 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=min_rank,old_max=max_rank))

In [117]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Denmark,2017,E-commerce activity (% of individuals buying o...,77.97,6.0,True
1,Netherlands,2017,E-commerce activity (% of individuals buying o...,75.72,5.855658,True
2,Norway,2017,E-commerce activity (% of individuals buying o...,75.63,5.849885,True
3,United Kingdom,2017,E-commerce activity (% of individuals buying o...,74.74,5.792789,True
4,"Korea, Rep.",2017,E-commerce activity (% of individuals buying o...,72.47,5.647164,True
5,Sweden,2017,E-commerce activity (% of individuals buying o...,71.65,5.59456,True
6,United States,2017,E-commerce activity (% of individuals buying o...,70.43,5.516295,True
7,New Zealand,2017,E-commerce activity (% of individuals buying o...,69.11,5.431614,True
8,Canada,2017,E-commerce activity (% of individuals buying o...,68.57,5.396972,True
9,Australia,2017,E-commerce activity (% of individuals buying o...,67.69,5.340518,True


In [118]:
### 22. E-commerce activity (Types of goods and services purchased online) 

In [119]:
indicators[21]

# load data
indicator = indicators[21]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

# Move the top row down further
# This is a list of categories, wonder if it will do

E-commerce activity (Types of goods and services purchased online) 
e-commerce_activity


In [120]:
df.head(15)

Unnamed: 0,Category,"Average e-commerce spending per online shopper worldwide per visit as of 1st quarter 2021, by category (in U.S. dollars)"
0,Luxury apparel,3.45
1,Active apparel,2.96
2,General apparel,2.7
3,All verticals,3.39
4,Health and beauty,2.4
5,Home and appliances,2.08


In [121]:
### 23. Top Visited websites

In [122]:
indicators[22]

# load data
indicator = indicators[22]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Top Visited websites
top_sites


In [123]:
df.head(15)

Unnamed: 0,1,google.com
0,2,youtube.com
1,3,facebook.com
2,4,baidu.com
3,5,wikipedia.org
4,6,yahoo.com
5,7,google.co.in
6,8,reddit.com
7,9,qq.com
8,10,amazon.com
9,11,taobao.com


In [124]:
### 24. Top YouTube Searches

In [125]:
indicators[23]

# load data
indicator = indicators[23]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Top YouTube Searches
youtube_searches


In [126]:
df.head(15)

Unnamed: 0,#,Keyword,Search Volume
0,1,bts,16723304
1,2,pewdiepie,16495659
2,3,asmr,14655088
3,4,billie eilish,13801247
4,5,baby shark,12110100
5,6,old town road,10456524
6,7,music,10232134
7,8,badabun,10188997
8,9,blackpink,9580131
9,10,fortnite,9117342


In [127]:
### 25. Top Google searches

In [128]:
indicators[24]

# load data
indicator = indicators[24]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Top Google searches
google_trends


In [129]:
df.head(15)

Unnamed: 0.1,Unnamed: 0,data_country,data_year
0,13.6,13.6,13.6
1,4.1,4.1,4.1
2,4.1,4.1,4.1
3,4.1,4.1,4.1
4,2.2,2.2,2.2
5,2.2,2.2,2.2
6,1.8,1.8,1.8
7,1.5,1.5,1.5
8,1.5,1.5,1.5
9,1.2,1.2,1.2


In [130]:
### 26. Internet Usage

In [139]:
indicators[25]

# load data
indicator = indicators[25]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Internet Usage
ITU_database


In [140]:
df.head(15)

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
0,Angola,Africa,AGO,Female mobile phone ownership as a % of total ...,2010.0,,,
1,Benin,Africa,BEN,Female mobile phone ownership as a % of total ...,2010.0,,,
2,Botswana,Africa,BWA,Female mobile phone ownership as a % of total ...,2010.0,,,
3,Burkina Faso,Africa,BFA,Female mobile phone ownership as a % of total ...,2010.0,,,
4,Burundi,Africa,BDI,Female mobile phone ownership as a % of total ...,2010.0,,,
5,Cabo Verde,Africa,CPV,Female mobile phone ownership as a % of total ...,2010.0,,,
6,Cameroon,Africa,CMR,Female mobile phone ownership as a % of total ...,2010.0,,,
7,Central African Rep.,Africa,CAF,Female mobile phone ownership as a % of total ...,2010.0,,,
8,Chad,Africa,TCD,Female mobile phone ownership as a % of total ...,2010.0,,,
9,Congo (Rep. of the),Africa,COG,Female mobile phone ownership as a % of total ...,2010.0,,,


In [141]:
df = df[(df['Indicator name'] == 'Individuals using the Internet, total (%)')]

In [142]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df['Year'] = 2019
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# convert 1-190 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=min_rank,old_max=max_rank))

In [144]:
df[['Country Name','Year','data_col','new_rank_score','higher_is_better']].head(150)

Unnamed: 0,Country Name,Year,data_col,new_rank_score,higher_is_better
45916,Angola,2019,2.80,1.1400,True
45917,Benin,2019,3.13,1.1565,True
45918,Botswana,2019,6.00,1.3000,True
45919,Burkina Faso,2019,2.40,1.1200,True
45920,Burundi,2019,1.00,1.0500,True
...,...,...,...,...,...
46061,Norway,2019,93.39,5.6695,True
46062,Poland,2019,62.32,4.1160,True
46063,Portugal,2019,53.30,3.6650,True
46064,Romania,2019,39.93,2.9965,True


In [None]:
### 27. Households with a computer and with Internet Access

In [158]:
indicators[26]

# load data
indicator = indicators[26]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Households with a computer and with Internet access
ITU_database


In [159]:
df.head(15)

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
0,Angola,Africa,AGO,Female mobile phone ownership as a % of total ...,2010.0,,,
1,Benin,Africa,BEN,Female mobile phone ownership as a % of total ...,2010.0,,,
2,Botswana,Africa,BWA,Female mobile phone ownership as a % of total ...,2010.0,,,
3,Burkina Faso,Africa,BFA,Female mobile phone ownership as a % of total ...,2010.0,,,
4,Burundi,Africa,BDI,Female mobile phone ownership as a % of total ...,2010.0,,,
5,Cabo Verde,Africa,CPV,Female mobile phone ownership as a % of total ...,2010.0,,,
6,Cameroon,Africa,CMR,Female mobile phone ownership as a % of total ...,2010.0,,,
7,Central African Rep.,Africa,CAF,Female mobile phone ownership as a % of total ...,2010.0,,,
8,Chad,Africa,TCD,Female mobile phone ownership as a % of total ...,2010.0,,,
9,Congo (Rep. of the),Africa,COG,Female mobile phone ownership as a % of total ...,2010.0,,,


In [160]:
df = df[(df['Indicator name'] == 'Households with Internet access at home (%)')]
df = df[(df.Year == 2019)]
df

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
44796,Angola,Africa,AGO,Households with Internet access at home (%),2018.0,6.711401,,
44797,Benin,Africa,BEN,Households with Internet access at home (%),2018.0,,,
44798,Botswana,Africa,BWA,Households with Internet access at home (%),2018.0,,,
44799,Burkina Faso,Africa,BFA,Households with Internet access at home (%),2018.0,,,
44800,Burundi,Africa,BDI,Households with Internet access at home (%),2018.0,,,
...,...,...,...,...,...,...,...,...
44987,Suriname,The Americas,SUR,Households with Internet access at home (%),2018.0,52.752486,,
44988,Trinidad and Tobago,The Americas,TTO,Households with Internet access at home (%),2018.0,,,
44989,United States,The Americas,USA,Households with Internet access at home (%),2018.0,85.349794,,
44990,Uruguay,The Americas,URY,Households with Internet access at home (%),2018.0,68.412062,,


In [161]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [162]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']]

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
44796,Angola,2018.0,Households with a computer and with Internet a...,6.711401,1.004915,True
44797,Benin,2018.0,Households with a computer and with Internet a...,,,True
44798,Botswana,2018.0,Households with a computer and with Internet a...,,,True
44799,Burkina Faso,2018.0,Households with a computer and with Internet a...,,,True
44800,Burundi,2018.0,Households with a computer and with Internet a...,,,True
...,...,...,...,...,...,...
44987,Suriname,2018.0,Households with a computer and with Internet a...,52.752486,3.470159,True
44988,Trinidad and Tobago,2018.0,Households with a computer and with Internet a...,,,True
44989,United States,2018.0,Households with a computer and with Internet a...,85.349794,5.215563,True
44990,Uruguay,2018.0,Households with a computer and with Internet a...,68.412062,4.308642,True


In [None]:
### 28. % of population using Facebook

In [163]:
indicators[27]

# load data
indicator = indicators[27]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population using Facebook
FB_users


In [164]:
df.head(15)

Unnamed: 0,Country,Facebook Users,Population in Thousands (2021),Percentage of Facebook Users
0,India,251000000,1393409,18.01
1,United States,240000000,332915,72.09
2,Brazil,139000000,213993,64.96
3,Indonesia,136960000,276362,49.56
4,Mexico,78000000,130262,59.88
5,Philippines,71760000,111047,64.62
6,Vietnam,66720000,98169,67.96
7,Thailand,46000000,69951,65.76
8,United Kingdom,44000000,68207,64.51
9,Turkey,44000000,85043,51.74


In [165]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Percentage of Facebook Users'] 
df['Year'] = 2021
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [166]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,India,2021,% of population using Facebook,18.01,1.897091,True
1,United States,2021,% of population using Facebook,72.09,4.590855,True
2,Brazil,2021,% of population using Facebook,64.96,4.235704,True
3,Indonesia,2021,% of population using Facebook,49.56,3.468619,True
4,Mexico,2021,% of population using Facebook,59.88,3.982666,True
5,Philippines,2021,% of population using Facebook,64.62,4.218769,True
6,Vietnam,2021,% of population using Facebook,67.96,4.385136,True
7,Thailand,2021,% of population using Facebook,65.76,4.275553,True
8,United Kingdom,2021,% of population using Facebook,64.51,4.213289,True
9,Turkey,2021,% of population using Facebook,51.74,3.577207,True


In [None]:
### 29. Gender gap for social media use

In [167]:
indicators[28]

# load data
indicator = indicators[28]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Gender gap for social media use
gender_gaps


In [168]:
df.head(15)

Unnamed: 0,ISO Code,Country,Region,Year,Gender parity in account ownership,Gender gap in social media use,Gender gap in mobile ownership
0,AFG,Afghanistan,South Asia,2014,11.109999,0.0,0.0
1,AFG,Afghanistan,South Asia,2015,14.087173,0.0,0.0
2,AFG,Afghanistan,South Asia,2016,17.06435,0.0,0.0
3,AFG,Afghanistan,South Asia,2017,20.041523,0.0,0.0
4,AFG,Afghanistan,South Asia,2018,20.041523,0.0,0.0
5,AFG,Afghanistan,South Asia,2019,20.041523,0.0,0.0
6,AGO,Angola,Sub-Saharan Africa,2014,55.237572,46.42857,51.645042
7,AGO,Angola,Sub-Saharan Africa,2015,55.237572,46.42857,57.001461
8,AGO,Angola,Sub-Saharan Africa,2016,55.237572,46.42857,63.725491
9,AGO,Angola,Sub-Saharan Africa,2017,55.237572,38.646553,53.966476


In [169]:
df = df[df.Year==2019]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Gender gap in social media use'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [170]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
5,Afghanistan,2019,Gender gap for social media use,0.0,1.0,True
11,Angola,2019,Gender gap for social media use,49.552494,3.477625,True
17,Albania,2019,Gender gap for social media use,41.40612,3.070306,True
23,United Arab Emirates,2019,Gender gap for social media use,13.890497,1.694525,True
29,Argentina,2019,Gender gap for social media use,100.0,6.0,True
35,Armenia,2019,Gender gap for social media use,88.382561,5.419128,True
41,Australia,2019,Gender gap for social media use,100.0,6.0,True
47,Austria,2019,Gender gap for social media use,93.794891,5.689745,True
53,Azerbaijan,2019,Gender gap for social media use,82.5784,5.12892,True
59,Burundi,2019,Gender gap for social media use,20.199215,2.009961,True


In [None]:
### 30. % of population using digital financial services

In [171]:
indicators[29]

# load data
indicator = indicators[29]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population using digital financial services
population_digital_financial_services


In [172]:
df.head(15)

# May have to convert the data to csv file as the original file is missing many important columns

Unnamed: 0.1,Unnamed: 0,Account (% age 15+),"Account, male (% age 15+)","Account, in labor force (% age 15+)","Account, out of labor force (% age 15+)","Account, female (% age 15+)","Account, young adults (% ages 15-24)","Account, older adults (% ages 25+)","Account, primary education or less (% ages 15+)","Account, secondary education or more (% ages 15+)",...,"Mobile money account, female (% age 15+)","Mobile money account, young adults (% age 15-24)","Mobile money account, older adults (% age 25+)","Mobile money account, primary education or less (% age 15+)","Mobile money account, secondary education or less (% age 15+)","Mobile money account, income, poorest 40% (% age 15+)","Mobile money account, income, richest 60% (% age 15+)","Mobile money account, rural (% age 15+)",data_country,data_year
0,Low income,9%,15%,15%,2%,3%,6%,11%,5%,30%,...,,,,,,,,,Low income,Low income
1,Low income,10%,16%,15%,4%,4%,7%,12%,5%,23%,...,0%,0%,0%,0%,0%,0%,1%,0%,Low income,Low income
2,Low income,15%,23%,25%,4%,7%,10%,18%,9%,31%,...,1%,0%,1%,0%,2%,0%,1%,1%,Low income,Low income
3,Lower middle income,39%,39%,46%,31%,39%,30%,45%,35%,44%,...,,,,,,,,,Lower middle income,Lower middle income
4,Lower middle income,29%,36%,36%,12%,22%,15%,38%,14%,70%,...,,,,,,,,,Lower middle income,Lower middle income
5,Upper middle income,28%,34%,36%,15%,23%,26%,29%,15%,36%,...,,,,,,,,,Upper middle income,Upper middle income
6,Upper middle income,38%,43%,44%,29%,34%,30%,40%,24%,56%,...,,,,,,,,,Upper middle income,Upper middle income
7,Upper middle income,40%,42%,53%,27%,38%,32%,43%,27%,56%,...,2%,6%,1%,1%,4%,0%,4%,2%,Upper middle income,Upper middle income
8,,22%,30%,33%,11%,14%,15%,26%,14%,31%,...,,,,,,,,,,
9,,30%,38%,42%,18%,22%,21%,34%,21%,39%,...,,,,,,,,,,


In [None]:
### 31. Mobile Broadband Pricing (pre-paid)

In [173]:
indicators[30]

# load data
indicator = indicators[30]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

# File not found despite its presence in the personal repo and the data manifest

Mobile Broadband Pricing (pre-paid)
ITU_database


In [175]:
df.head(15)

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
0,Angola,Africa,AGO,Female mobile phone ownership as a % of total ...,2010.0,,,
1,Benin,Africa,BEN,Female mobile phone ownership as a % of total ...,2010.0,,,
2,Botswana,Africa,BWA,Female mobile phone ownership as a % of total ...,2010.0,,,
3,Burkina Faso,Africa,BFA,Female mobile phone ownership as a % of total ...,2010.0,,,
4,Burundi,Africa,BDI,Female mobile phone ownership as a % of total ...,2010.0,,,
5,Cabo Verde,Africa,CPV,Female mobile phone ownership as a % of total ...,2010.0,,,
6,Cameroon,Africa,CMR,Female mobile phone ownership as a % of total ...,2010.0,,,
7,Central African Rep.,Africa,CAF,Female mobile phone ownership as a % of total ...,2010.0,,,
8,Chad,Africa,TCD,Female mobile phone ownership as a % of total ...,2010.0,,,
9,Congo (Rep. of the),Africa,COG,Female mobile phone ownership as a % of total ...,2010.0,,,


In [177]:
df = df[df.Year==2019]
df = df[df['Indicator name'] == 'Mobile broadband basket as a % of GNI p.c.']
df

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
17974,Angola,Africa,AGO,Mobile broadband basket as a % of GNI p.c.,2019.0,5.63,,
17975,Benin,Africa,BEN,Mobile broadband basket as a % of GNI p.c.,2019.0,6.21,,
17976,Botswana,Africa,BWA,Mobile broadband basket as a % of GNI p.c.,2019.0,2.87,,
17977,Burkina Faso,Africa,BFA,Mobile broadband basket as a % of GNI p.c.,2019.0,19.63,,
17978,Burundi,Africa,BDI,Mobile broadband basket as a % of GNI p.c.,2019.0,20.43,,
...,...,...,...,...,...,...,...,...
18161,Saint Vincent and the Grenadines,The Americas,VCT,Mobile broadband basket as a % of GNI p.c.,2019.0,4.59,,
18162,Suriname,The Americas,SUR,Mobile broadband basket as a % of GNI p.c.,2019.0,1.93,,
18163,Trinidad and Tobago,The Americas,TTO,Mobile broadband basket as a % of GNI p.c.,2019.0,3.07,,
18164,United States,The Americas,USA,Mobile broadband basket as a % of GNI p.c.,2019.0,0.42,,


In [178]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# invert since to put as higher is  better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [179]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
17974,Angola,2019.0,Mobile Broadband Pricing (pre-paid),5.63,5.553571,True
17975,Benin,2019.0,Mobile Broadband Pricing (pre-paid),6.21,5.506062,True
17976,Botswana,2019.0,Mobile Broadband Pricing (pre-paid),2.87,5.779653,True
17977,Burkina Faso,2019.0,Mobile Broadband Pricing (pre-paid),19.63,4.406782,True
17978,Burundi,2019.0,Mobile Broadband Pricing (pre-paid),20.43,4.341252,True
17979,Cabo Verde,2019.0,Mobile Broadband Pricing (pre-paid),3.16,5.755898,True
17980,Cameroon,2019.0,Mobile Broadband Pricing (pre-paid),4.67,5.632208,True
17981,Central African Rep.,2019.0,Mobile Broadband Pricing (pre-paid),26.99,3.803899,True
17982,Chad,2019.0,Mobile Broadband Pricing (pre-paid),32.23,3.374672,True
17983,Congo (Rep. of the),2019.0,Mobile Broadband Pricing (pre-paid),13.17,4.935944,True


In [None]:
### 32. Tax as % of total cost of mobile ownership

In [180]:
indicators[31]

# load data
indicator = indicators[31]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Tax as % of total cost of mobile ownership
tax_percent_mobile_ownership


In [181]:
df.head(15)

Unnamed: 0,ISO Code,Country,Region,Year,Cluster,Index,Infrastructure,Affordability,Consumer Readiness,Content and Services,...,Gender gap in mobile ownership,TLDs per capita,E-Government Score,Mobile Social Media Penetration,Apps developed per person,Number of apps in national language,Accessibility of top ranked apps,Cybersecurity Index,data_country,data_year
0,AFG,Afghanistan,South Asia,2014,Discoverer,22.12,21.74,31.79,24.4,14.19,...,0.0,39.55,18.11,3.28,20.98,2.44,4.37,26.5,,
1,AFG,Afghanistan,South Asia,2015,Discoverer,22.99,22.82,30.81,25.28,15.71,...,0.0,39.57,24.27,4.36,22.93,2.79,8.03,25.83,,
2,AFG,Afghanistan,South Asia,2016,Discoverer,23.71,26.92,26.75,26.07,16.83,...,0.0,39.58,30.43,6.73,30.31,2.85,5.9,25.17,,
3,AFG,Afghanistan,South Asia,2017,Discoverer,25.82,33.54,27.22,28.56,17.04,...,0.0,39.47,30.5,7.78,31.62,2.91,6.15,24.5,,
4,AFG,Afghanistan,South Asia,2018,Discoverer,28.39,30.91,42.64,29.24,16.87,...,0.0,39.39,30.56,8.54,36.54,2.96,8.66,17.7,,
5,AFG,Afghanistan,South Asia,2019,Discoverer,28.94,32.34,41.53,29.72,17.58,...,0.0,39.41,41.18,9.39,39.36,3.0,5.72,17.7,,
6,AGO,Angola,Sub-Saharan Africa,2014,Discoverer,32.78,25.99,35.99,44.33,27.85,...,51.65,0.0,29.92,3.99,22.74,53.33,49.09,8.8,,
7,AGO,Angola,Sub-Saharan Africa,2015,Emerging,37.18,33.09,42.01,45.41,30.27,...,57.0,0.31,32.35,5.26,22.12,55.08,58.33,8.47,,
8,AGO,Angola,Sub-Saharan Africa,2016,Emerging,39.85,37.8,44.74,46.47,32.11,...,63.73,0.0,34.78,6.27,27.46,56.52,61.73,8.13,,
9,AGO,Angola,Sub-Saharan Africa,2017,Emerging,42.89,48.6,47.94,46.32,31.36,...,53.97,0.0,37.88,4.26,31.94,57.06,55.09,7.8,,


In [182]:
df = df[df.Year==2019]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = False
df['Indicator'] = indicator
df['data_col'] = df['Tax as a % of TCMO'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# invert since to put as higher is  better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [183]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
5,Afghanistan,2019,Tax as % of total cost of mobile ownership,79.16,2.042,False
11,Angola,2019,Tax as % of total cost of mobile ownership,65.83,2.7085,False
17,Albania,2019,Tax as % of total cost of mobile ownership,50.0,3.5,False
23,United Arab Emirates,2019,Tax as % of total cost of mobile ownership,82.31,1.8845,False
29,Argentina,2019,Tax as % of total cost of mobile ownership,5.23,5.7385,False
35,Armenia,2019,Tax as % of total cost of mobile ownership,57.58,3.121,False
41,Australia,2019,Tax as % of total cost of mobile ownership,75.0,2.25,False
47,Austria,2019,Tax as % of total cost of mobile ownership,50.0,3.5,False
53,Azerbaijan,2019,Tax as % of total cost of mobile ownership,36.41,4.1795,False
59,Burundi,2019,Tax as % of total cost of mobile ownership,44.57,3.7715,False


In [None]:
### 33. % of population with a SIM card

In [189]:
indicators[32]

# load data
indicator = indicators[32]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population with a SIM card
ITU_database


In [191]:
df = df[df.Year==2019]
df = df[df['Indicator name'] == 'Mobile-cellular subscriptions per 100 inhabitants']
df

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
32279,Angola,Africa,AGO,Mobile-cellular subscriptions per 100 inhabitants,2019.0,46.598638,,
32280,Benin,Africa,BEN,Mobile-cellular subscriptions per 100 inhabitants,2019.0,87.702013,,
32281,Botswana,Africa,BWA,Mobile-cellular subscriptions per 100 inhabitants,2019.0,162.641181,,
32282,Burkina Faso,Africa,BFA,Mobile-cellular subscriptions per 100 inhabitants,2019.0,100.212239,,
32283,Burundi,Africa,BDI,Mobile-cellular subscriptions per 100 inhabitants,2019.0,56.649700,,
...,...,...,...,...,...,...,...,...
32470,Suriname,The Americas,SUR,Mobile-cellular subscriptions per 100 inhabitants,2019.0,139.986790,,
32471,Trinidad and Tobago,The Americas,TTO,Mobile-cellular subscriptions per 100 inhabitants,2019.0,155.108737,,
32472,United States,The Americas,USA,Mobile-cellular subscriptions per 100 inhabitants,2019.0,134.458879,,
32473,Uruguay,The Americas,URY,Mobile-cellular subscriptions per 100 inhabitants,2019.0,138.074936,,


In [194]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [195]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
32279,Angola,2019.0,% of population with a SIM card,46.598638,1.493811,True
32280,Benin,2019.0,% of population with a SIM card,87.702013,2.25939,True
32281,Botswana,2019.0,% of population with a SIM card,162.641181,3.655183,True
32282,Burkina Faso,2019.0,% of population with a SIM card,100.212239,2.492401,True
32283,Burundi,2019.0,% of population with a SIM card,56.6497,1.681019,True
32284,Cabo Verde,2019.0,% of population with a SIM card,108.318438,2.643385,True
32285,Cameroon,2019.0,% of population with a SIM card,82.703748,2.166294,True
32286,Central African Rep.,2019.0,% of population with a SIM card,33.619216,1.25206,True
32287,Chad,2019.0,% of population with a SIM card,48.064831,1.52112,True
32288,Congo (Rep. of the),2019.0,% of population with a SIM card,,,True


In [None]:
### 34. % of population with a smartphone

In [196]:
indicators[33]

# load data
indicator = indicators[33]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population with a smartphone
population_with_smartphones


In [197]:
df.head(15)

Unnamed: 0.1,Unnamed: 0,Country,Total Population,Smartphone Penetration Rate,Smartphone Users
0,1,China,1.44B,0.64,918.45M
1,2,India,1.38B,0.32,439.42M
2,3,United States,331M,0.82,270M
3,4,Indonesia,273.52M,0.59,160.23M
4,5,Brazil,212.56M,0.51,109.34M
5,6,Russia,145.93M,0.69,99.93M
6,7,Japan,126.48M,0.63,80M
7,8,Mexico,128.93M,0.54,70.14M
8,9,Germany,83.78M,0.78,65.24M
9,10,Vietnam,97.34M,0.63,61.37M


In [202]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Smartphone Penetration Rate'] 
df['Country Name'] = df['Country']
df['Year'] = 2020

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [203]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,China,2020,% of population with a smartphone,0.64,4.59375,True
1,India,2020,% of population with a smartphone,0.32,2.09375,True
2,United States,2020,% of population with a smartphone,0.82,6.0,True
3,Indonesia,2020,% of population with a smartphone,0.59,4.203125,True
4,Brazil,2020,% of population with a smartphone,0.51,3.578125,True
5,Russia,2020,% of population with a smartphone,0.69,4.984375,True
6,Japan,2020,% of population with a smartphone,0.63,4.515625,True
7,Mexico,2020,% of population with a smartphone,0.54,3.8125,True
8,Germany,2020,% of population with a smartphone,0.78,5.6875,True
9,Vietnam,2020,% of population with a smartphone,0.63,4.515625,True


In [None]:
### 35. Gender gap in internet usage

In [204]:
indicators[34]

# load data
indicator = indicators[34]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Gender gap in internet usage
ITU_database


In [206]:
df.head(15)

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
0,Angola,Africa,AGO,Female mobile phone ownership as a % of total ...,2010.0,,,
1,Benin,Africa,BEN,Female mobile phone ownership as a % of total ...,2010.0,,,
2,Botswana,Africa,BWA,Female mobile phone ownership as a % of total ...,2010.0,,,
3,Burkina Faso,Africa,BFA,Female mobile phone ownership as a % of total ...,2010.0,,,
4,Burundi,Africa,BDI,Female mobile phone ownership as a % of total ...,2010.0,,,
5,Cabo Verde,Africa,CPV,Female mobile phone ownership as a % of total ...,2010.0,,,
6,Cameroon,Africa,CMR,Female mobile phone ownership as a % of total ...,2010.0,,,
7,Central African Rep.,Africa,CAF,Female mobile phone ownership as a % of total ...,2010.0,,,
8,Chad,Africa,TCD,Female mobile phone ownership as a % of total ...,2010.0,,,
9,Congo (Rep. of the),Africa,COG,Female mobile phone ownership as a % of total ...,2010.0,,,


In [209]:
df = df[(df['Indicator name'] == 'Female Internet users as a % of total female population')]
df = df[(df.Year == 2019)]
df

Unnamed: 0,Country,Region,ISO,Indicator name,Year,Value,data_country,data_year
40307,Cabo Verde,Africa,CPV,Female Internet users as a % of total female p...,2019.0,60.965017,,
40308,Côte d'Ivoire,Africa,CIV,Female Internet users as a % of total female p...,2019.0,32.924445,,
40309,Kenya,Africa,KEN,Female Internet users as a % of total female p...,2019.0,20.125122,,
40310,Lesotho,Africa,LSO,Female Internet users as a % of total female p...,2019.0,44.922768,,
40311,Mauritius,Africa,MUS,Female Internet users as a % of total female p...,2019.0,60.130645,,
...,...,...,...,...,...,...,...,...
40388,Mexico,The Americas,MEX,Female Internet users as a % of total female p...,2019.0,68.574653,,
40389,Panama,The Americas,PAN,Female Internet users as a % of total female p...,2019.0,63.811494,,
40390,Paraguay,The Americas,PRY,Female Internet users as a % of total female p...,2019.0,69.017230,,
40391,Peru,The Americas,PER,Female Internet users as a % of total female p...,2019.0,56.992144,,


In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
### 36. Gender gap in mobile usage

In [None]:
indicators[35]

# load data
indicator = indicators[35]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
df = df[(df.Year==2019)]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Gender gap in mobile ownership'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 37. Mobile Device Penetration

In [None]:
indicators[36]

# load data
indicator = indicators[36]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))


In [None]:
df.head(15)

In [None]:
df = df[(df.Year==2019)]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Mobile Ownership'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

In [None]:
### 38. Mobile Device Penetration (female)

In [None]:
indicators[37]

# load data
indicator = indicators[37]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))


In [None]:
df.head(15)

In [None]:
df = df[(df.Year==2019)]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Gender gap in mobile ownership'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)