Using data in file 'data/world_bank_projects.json':
<br>1. Find the 10 countries with the most projects
<br>2. Find the top 10 major project themes (using column 'mjtheme_namecode')
<br>3. In step 2 above, you will notice that some entries have only the code, with the name missing. Create a dataframe with the missing names filled in.

In [None]:
## import modules.
import json

import numpy as np
import pandas as pd

## `pandas.io.json.json_normalize` has been deprecated since pandas 1.0 in favour
## of the top-level `pandas.json_normalize`, and newer releases no longer export
## it from `pandas.io.json`. Prefer the top-level function and fall back to the
## legacy location only on old installations.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

## loading data for the problems.
## forward slashes are accepted on every platform (the backslash form only
## resolves on Windows); JSON text is UTF-8 by specification, so the encoding
## is stated explicitly instead of relying on the locale default.
with open('data/world_bank_projects.json', encoding='utf-8') as json_file:
    json_data = json.load(json_file)

## use normalization to create a flat table from the nested records.
table = json_normalize(json_data)

## table overview.
## DataFrame.info() prints its report itself and returns None, so it is not
## wrapped in print() (that would emit a stray "None").
table.info()
print(list(table.columns.values))

In [None]:
## exercise 1.
## table for exercise 1: table_ex1 (country and project name, ordered by country).
table_ex1 = table[['countryname', 'project_name']].sort_values(by='countryname')
## DataFrame.info() prints its report itself and returns None, so it is not
## wrapped in print() (that would emit a stray "None").
table_ex1.info()
print(table_ex1.describe())

## sanity check before counting one row as one project: collect the rows whose
## 'project_name' repeats an earlier one.
duplicate_projects = table_ex1[table_ex1['project_name'].duplicated()]

## if no duplicates, count countries in 'countryname': ex1.
if duplicate_projects.empty:
    ## value_counts() already returns the counts in descending order.
    ex1 = table_ex1['countryname'].value_counts()
    print('\n' + 'The 10 countries with most projects are...')
    print(ex1.head(10))
else:
    print('Attention! Duplicates!')

In [None]:
## exercise 2.
## table for exercise 2: table_ex2.
## one row per entry of each project's 'mjtheme_namecode' list, carrying the
## project's 'countryname' and 'project_name' along as metadata columns.
## pd.json_normalize is the supported top-level entry point (the
## pandas.io.json import path is deprecated).
table_ex2 = pd.json_normalize(json_data, 'mjtheme_namecode', ['countryname', 'project_name'])
## DataFrame.info() prints its report itself and returns None, so it is not
## wrapped in print() (that would emit a stray "None").
table_ex2.info()
print(table_ex2[['code', 'name']].describe())
print('Weird, "code" has 11 but "name" has 12 unique values...')

## counting codes in 'code': ex2_code.
## value_counts() already returns the counts in descending order.
ex2_code = table_ex2['code'].value_counts()
print('\n' + 'According to "code", the top 10 major project themes are...')
print(ex2_code.head(10))

## counting names in 'name': ex2_name.
ex2_name = table_ex2['name'].value_counts()
print('\n' + 'But according to "name", the top 10 major project themes are...')
print(ex2_name.head(10))

## filtering rows whose name is the empty string and counting them by code: ex2_missname.
ex2_missname = table_ex2.loc[table_ex2['name'] == '', ['name', 'code']]
print('\n' + 'Attention! Missing names!')
print(ex2_missname.groupby(['code']).count())

In [None]:
## exercise 3.
## changing type of 'code' to int in table_ex2.
table_ex2['code'] = table_ex2['code'].astype(int)

## distinct (name, code) pairs taken only from rows whose name is present, so
## the empty-name rows cannot leak into the lookup tables below.
named_rows = table_ex2.loc[table_ex2['name'] != '', ['name', 'code']].drop_duplicates()

## identifying code of each name: name_code.
name_code = named_rows.set_index('name').sort_index()
print(name_code)

## identifying name of each code: code_name.
code_name = named_rows.set_index('code').sort_index()
print(code_name)

## converting code_name to a {code: name} dictionary: code_dict.
code_dict = code_name['name'].to_dict()
print(code_dict)

## table for exercise 3: table_ex3.
## an independent copy, so table_ex2 keeps the names as they were loaded
## (plain assignment would only create a second reference to the same frame).
table_ex3 = table_ex2.copy()

## using code_dict to substitute missing names in 'name'.
## a single .loc assignment with a boolean mask writes into table_ex3 itself;
## chained indexing (frame['name'][i] = ...) raises SettingWithCopyWarning and
## has no effect when pandas copy-on-write is enabled.
missing_name = table_ex3['name'] == ''
## codes that never appear with a name map to NaN; keep those rows empty.
table_ex3.loc[missing_name, 'name'] = (
    table_ex3.loc[missing_name, 'code'].map(code_dict).fillna('')
)

## counting codes in 'code': ex3_code.
## value_counts() already returns the counts in descending order.
ex3_code = table_ex3['code'].value_counts()
print('\n' + 'According to "code", the top 10 major project themes are...')
print(ex3_code.head(10))

## counting names in 'name': ex3_name.
ex3_name = table_ex3['name'].value_counts()
print('\n' + 'And according to "name", the top 10 major project themes are...')
print(ex3_name.head(10))