Import relevant packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import missingno

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Raise pandas' display limits (up to 500 rows / 500 columns, 1000-character
# width) so that larger frames are less likely to be truncated when shown.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Data

Source: https://www.kaggle.com/gsutters/the-human-freedom-index/version/2

In [None]:
# Load the raw Human Freedom Index CSV; the relative path is resolved against
# the notebook's current working directory.
complete_data = pd.read_csv('../data/original/hfi_cc_2018.csv')

In [None]:
# Preview the first rows of the raw data.
complete_data.head()

### Basic categories and their full forms

- pf (`pf_score, pf_rank`) - **Personal Freedom (pf)**
    - `pf_rol` - Rule of Law (rol)
    - `pf_ss` - Security and Safety (ss)
    - `pf_movement` - Freedom of Movement
    - `pf_religion` - Religious Freedom
    - `pf_association` - Freedom to Associate and Assemble with peaceful individuals or organizations
    - `pf_expression` - Freedom of Expression
    - `pf_identity` - Identity and Relationships

- ef (`ef_score, ef_rank`) - **Economic Freedom (ef)**
    - `ef_government` - Size of Government
    - `ef_legal` - Legal System and Property Rights
    - `ef_money` - Sound Money
    - `ef_trade` - Freedom to Trade Internationally
    - `ef_regulation` - Regulation

- hf (`hf_score, hf_rank, hf_quartile`) - **Human Freedom (hf)**

### Viewing the data

In [None]:
# Report the dataset's dimensions: number of rows, then number of columns.
shape = complete_data.shape
n_rows, n_cols = shape
print(f'Dataset has {n_rows} rows')
print(f'Dataset has {n_cols} columns')

##### Data types of the variables

In [None]:
# Per-column dtypes as inferred by pandas when the CSV was read.
complete_data.dtypes

##### Summary statistics

In [None]:
# Summary statistics; by default describe() covers only the numeric columns.
complete_data.describe()

- As expected, the score variables lie between 0 and 10; the exceptions are `year` and the rank and quartile columns.
- Every variable summarized above except `year` has some missing values.

##### Missing data matrix

In [None]:
# Plot the missing-data matrix for every column of the raw data.
missingno.matrix(complete_data)

### Creating df focusing on Human Freedom

In [None]:
# Identifier columns plus the overall Human Freedom score, rank and quartile.
hf_columns = [
    'year',
    'ISO_code',
    'countries',
    'region',
    'hf_score',
    'hf_rank',
    'hf_quartile',
]

# filter() keeps the listed columns that exist in the frame, in the order
# given; names that are not present are skipped without raising.
hf = complete_data.filter(items=hf_columns, axis=1)

##### Which rows have NaNs?

In [None]:
# Plot the missing-data matrix for the Human Freedom subset.
missingno.matrix(hf)

In [None]:
# Show the rows of hf that have at least one missing value, ordered by
# country and then by year.
has_nan = hf.isnull().any(axis=1)
hf[has_nan].sort_values(by=['countries', 'year'])

##### Remove rows with NaNs

In [None]:
# Keep only the rows that have no missing value in any selected column.
hf = hf.dropna()

In [None]:
# Write the cleaned frame to human_freedom.csv in the current working
# directory, leaving out the DataFrame index.
hf.to_csv('human_freedom.csv', index=False)

In [None]:
# Preview the first rows of the cleaned Human Freedom subset.
hf.head()

### Creating df focusing on Personal and Economic freedom

In [None]:
# Identifier columns plus the top-level Personal and Economic Freedom scores.
first_layer_columns = [
    'year',
    'ISO_code',
    'countries',
    'region',
    'pf_score',
    'ef_score',
]

# filter() keeps the listed columns that exist in the frame, in the order
# given; names that are not present are skipped without raising.
first_layer = complete_data.filter(items=first_layer_columns, axis=1)

In [None]:
# Plot the missing-data matrix for the Personal/Economic Freedom subset.
missingno.matrix(first_layer)

In [None]:
# Keep only the rows that have no missing value in any selected column.
first_layer = first_layer.dropna()

In [None]:
# Write the cleaned frame to personal_economic_freedom.csv in the current
# working directory, leaving out the DataFrame index.
first_layer.to_csv('personal_economic_freedom.csv', index=False)

In [None]:
# Preview the first rows of the cleaned Personal/Economic Freedom subset.
first_layer.head()