# THE HUMAN FREEDOM INDEX

## Introduction

#### Import relevant packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import missingno

%matplotlib inline

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Data

Source: https://www.kaggle.com/gsutters/the-human-freedom-index/version/2#_=_

In [2]:
hf_df = pd.read_csv('../data/human_freedom.csv')

### Viewing the data

In [3]:
hf_df.head()

Unnamed: 0,year,ISO_code,countries,region,hf_score,hf_rank,hf_quartile
0,2016,ALB,Albania,Eastern Europe,7.56814,48.0,2.0
1,2016,DZA,Algeria,Middle East & North Africa,5.135886,155.0,4.0
2,2016,AGO,Angola,Sub-Saharan Africa,5.640662,142.0,4.0
3,2016,ARG,Argentina,Latin America & the Caribbean,6.469848,107.0,3.0
4,2016,ARM,Armenia,Caucasus & Central Asia,7.241402,57.0,2.0


##### Data types of the variables

In [4]:
hf_df.dtypes

year             int64
ISO_code        object
countries       object
region          object
hf_score       float64
hf_rank        float64
hf_quartile    float64
dtype: object

##### Summary statistics

In [5]:
hf_df.describe()

Unnamed: 0,year,hf_score,hf_rank,hf_quartile
count,1378.0,1378.0,1378.0,1378.0
mean,2012.111756,6.993444,77.007983,2.490566
std,2.56913,1.025811,44.506549,1.119698
min,2008.0,3.765827,1.0,1.0
25%,2010.0,6.336685,39.0,1.0
50%,2012.0,6.92384,76.0,2.0
75%,2014.0,7.89466,115.0,3.0
max,2016.0,9.126313,162.0,4.0


### Remove country data that is not complete for all years

In [6]:
country_missing_data = hf_df[hf_df.isnull().any(axis=1)]['countries'].unique()
country_missing_data

NameError: name 'hf' is not defined

In [None]:
hf_country_years_complete = hf[~hf['countries'].isin(country_missing_data)]

In [None]:
hf_country_years_complete['year'].value_counts()

In [None]:
hf_country_years_complete.groupby(['year']).mean()

In [None]:
7.010904 - 7.069785

### HFI for the world over the years

In [None]:
hf_country_years_complete.groupby(['year']).mean().plot.line(y='hf_score', use_index=True)

### HFI by region over the years

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
for index, group in hf_country_years_complete.groupby(['region']):
    group_agg = group.groupby(['year']).aggregate(np.mean)
    group_agg.plot(y='hf_score', label=index, ax=ax)
ax.legend(bbox_to_anchor=(1, 1))
plt.show()

- Western Europe has much more freedom than Eastern Europe
- 2012 onwards Middle East & North Africa has less freedom than Sub-Saharan Africa

### Top 10 countries over the years

In [None]:
hf.sort_values(by=['hf_rank','year'])

In [None]:
for i in range(2008, 2017):
    print('\033[1m' + str(i) + '\033[0m')
    print(hf.loc[hf['year'] == i].sort_values(by=['hf_rank']).head(10))

### Bottom 10 countries over the years

In [None]:
for year in range(2008, 2017):
    print('\033[1m' + str(year) + '\033[0m')
    print(hf.loc[hf['year'] == year].sort_values(by=['hf_rank']).dropna().tail(10))

### HFI Univariate Analysis

In [None]:
sns.distplot(hf['hf_score'].dropna())

In [None]:
for year in range(2008,2017):
    # Subset of year
    subset = hf[hf['year'] == year]
    
    # Draw the density plot
    sns.distplot(subset['hf_score'], hist = False, kde = True,
                 kde_kws = {'linewidth': 3},
                 label = year)
    
# Plot formatting
plt.legend(prop={'size': 16}, title = 'Year')
plt.title('Density Plot of HFI for different years')
plt.xlabel('HFI')
plt.ylabel('Density')

In [None]:
sns.boxplot(x="year", y="hf_score", data=hf)

- HFI follows a bimodal distribution
- The distribution is pretty similar every year
- Year 2013 and 2016 have outliers, each of these years has one country that has a much lower HFI score than the overall distribution pattern for that year 
- The later years (2012, 2016) have longer lower whiskers than upper whiskers. This indicates that the HFI scores vary more for the least positive quartile group than the most positive quartile group. This is most noticed in 2014, 2015.
- The upper half of the box plot for all the years is bigger than the lower half. This indicates that the countries with more freedom vary more than the countries with less freedom.
- The min HFI score has decreased over the years

### Min HFI score over the years

In [None]:
hf.groupby(['year'], sort=False)['hf_score'].min()

In [None]:
3.765827 - 4.823528

### Max HFI score over the years

In [None]:
hf.groupby(['year'], sort=False)['hf_score'].max()

In [None]:
8.887410 - 9.126313

### HFI distribution per region

In [None]:
i = 0
sns.set(rc={'figure.figsize':(15,8)})
for region in hf['region'].unique():
    plt.figure(i)
    print(region)
    
    subset = hf[hf['region'] == region]
    sns.distplot(subset['hf_score'], hist = False, kde = True,
                 kde_kws = {'linewidth': 3},
                 label = region)
    i+=1
    plt.title('Density Plot of HFI for ' + region)
    plt.xlabel('HFI')
    plt.ylabel('Density')
    plt.show()

### HFI distribution per region by year

In [None]:
i = 0
sns.set(rc={'figure.figsize':(15,8)})
for region in hf['region'].unique():
    plt.figure(i)
    print(region)
    sns.boxplot(x="year", y="hf_score", data=hf[hf['region'] == region], palette="Set3")
    i+=1
    plt.show()

#### Eastern Europe
- Negative Outlier: Russia (Ukraine (2014, 2015))

#### Sub-Saharan Africa
- Positive Outlier: Mauritius

#### Latin America & the Caribbean
- Positive Outlier: Chile, Uruguay (Costa Rica (2016))
- Negative Outlier: Venezuela

#### Western Europe
- Negative Outlier: Iceland

#### South Asia
- Positive Outlier: Singapore
- Negative Outlier: Myanmar, Pakistan

#### East Asia
- Positive Outlier: Hong Kong
- Negative Outlier: China

### HFI distibution per region by country

In [None]:
i = 0
sns.set(rc={'figure.figsize':(15,8)})
for region in hf['region'].unique():
    plt.figure(i)
    print(region)
    sns.boxplot(x="countries", y="hf_score", data=hf[hf['region'] == region], palette="Set3")
    i+=1
    plt.show()

### HFI distribution of the top 15 influential countries

In [None]:
most_influential_countries_iso = [
    'USA', 'RUS','CHN','GBR','DEU',
    'FRA','JPN','ISR','SAU','ARE',
    'CAN','KOR','IRN','TUR','IND']

most_influential_countries = hf[hf['ISO_code'].isin(most_influential_countries_iso)]

In [None]:
most_influential_countries['countries'].unique()

In [None]:
sns.boxplot(x="ISO_code", y="hf_score", data=most_influential_countries, palette="Set3")

## Colonies
https://ourworldindata.org/colonialism

In [None]:
len(complete_data['countries'].unique())

In [None]:
colonies = pd.read_csv('temp.csv', sep=';')