# OECD Data Preprocess

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load every raw OECD extract from the local data/ directory, one DataFrame per CSV file.
economic_indicator_full = pd.read_csv('data/economic_indicator.csv')
population_full = pd.read_csv('data/population.csv')
cpi_full = pd.read_csv('data/cpi.csv')
# NOTE: the variable says "production" but the file is gdp_and_productivity.csv.
gdp_and_production_full = pd.read_csv('data/gdp_and_productivity.csv')
health_full = pd.read_csv('data/health.csv')
ppp_full = pd.read_csv('data/ppp.csv')
productivity_ulc_full = pd.read_csv('data/productivity_ulc.csv')
resident_full = pd.read_csv('data/resident.csv')

In [3]:
# Keep only the label, country, period and value columns of each dataset.
# The schemas differ slightly: population has no label column, health is
# labelled by 'Function' and dated by 'Year', and ppp is labelled by 'Indicator'.
economic_indicator = economic_indicator_full[['Subject', 'Country', 'Time', 'Value']]
population = population_full[['Country', 'Time', 'Value']]
cpi = cpi_full[['Subject', 'Country', 'Time', 'Value']]
gdp_production = gdp_and_production_full[['Subject', 'Country', 'Time', 'Value']]
health = health_full[['Function', 'Country', 'Year', 'Value']]
ppp = ppp_full[['Indicator', 'Country', 'Time', 'Value']]
productivity_ulc = productivity_ulc_full[['Subject', 'Country', 'Time', 'Value']]
resident = resident_full[['Subject', 'Country', 'Time', 'Value']]

Now we check whether all the datasets cover the same countries.

In [4]:
def assert_same_countries(list1, list2):
    """
    Report whether two datasets cover exactly the same countries.

    Despite the name, this function never raises: it prints a report and
    returns None. Nothing is printed when both datasets contain the same
    set of countries.

    Parameters
    ----------
    list1, list2 : pandas.DataFrame
        Datasets that each have a 'Country' column.

    Notes
    -----
    The comparison is made on the country names themselves, not merely on
    how many there are, so two datasets holding the same number of
    *different* countries are reported as mismatched. Countries absent from
    ``list2`` are printed under 'list 2 missing:' and countries absent from
    ``list1`` under 'list 1 missing:', each in order of first appearance.
    """
    countries1 = list1['Country'].unique()
    countries2 = list2['Country'].unique()

    # Sets give constant-time membership tests; the arrays keep the order of
    # first appearance for the printed report.
    known1 = set(countries1)
    known2 = set(countries2)

    missing_from_2 = [country for country in countries1 if country not in known2]
    missing_from_1 = [country for country in countries2 if country not in known1]

    if not missing_from_1 and not missing_from_2:
        return

    print("Country not the same")

    if missing_from_2:
        print('list 2 missing:')
        for country in missing_from_2:
            print(country)

    if missing_from_1:
        print('list 1 missing:')
        for country in missing_from_1:
            print(country)

Now we use `assert_same_countries` to see if the data have the same countries.

In [5]:
# Compare the countries in the economic indicators with those in the population data.
assert_same_countries(economic_indicator, population)

In [6]:
# Compare the countries in the economic indicators with those in the CPI data.
assert_same_countries(economic_indicator, cpi)

In [7]:
# Compare the countries in the economic indicators with those in the GDP/productivity data.
assert_same_countries(economic_indicator, gdp_production)

Country not the same
list 2 missing:
Argentina
Saudi Arabia


In [8]:
# Compare the countries in the economic indicators with those in the health data.
assert_same_countries(economic_indicator, health)

Country not the same
list 2 missing:
Argentina
Saudi Arabia


In [9]:
# Compare the countries in the economic indicators with those in the PPP data.
assert_same_countries(economic_indicator, ppp)

Country not the same
list 2 missing:
Argentina
Saudi Arabia
Costa Rica


In [10]:
# Compare the countries in the economic indicators with those in the productivity/ULC data.
assert_same_countries(economic_indicator, productivity_ulc)

Country not the same
list 2 missing:
Argentina
Saudi Arabia
Costa Rica


In [11]:
# Compare the countries in the economic indicators with those in the resident data.
assert_same_countries(economic_indicator, resident)

Country not the same
list 2 missing:
Argentina
Costa Rica


From the output above, we see that some datasets are missing Argentina, Saudi Arabia, or Costa Rica. Therefore, these three countries are dropped from every dataset.

In [12]:
df_list = []

# Drop the rows of the three countries that the checks above reported as
# missing from at least one dataset, so every dataset covers the same set.
for data in [economic_indicator, population, cpi, gdp_production, health, ppp, productivity_ulc, resident]:
    is_excluded = data['Country'].isin(['Argentina', 'Saudi Arabia', 'Costa Rica'])
    df_list.append(data.loc[~is_excluded])

In [13]:
# Rebind each dataset name to its filtered frame. The order here must match
# the order in which the frames were appended to df_list.
(economic_indicator,
 population,
 cpi,
 gdp_production,
 health,
 ppp,
 productivity_ulc,
 resident) = df_list

In [14]:
# Re-run the country check against every other dataset; no output means
# no mismatch was reported.
for other in (population, cpi, gdp_production, health, ppp, productivity_ulc, resident):
    assert_same_countries(economic_indicator, other)

## Transforming the Data

In [15]:
def create_two_dimensional_arr(subject, countries, data):
    """
    Average the 'Value' column for every (subject, country) pair.

    Parameters
    ----------
    subject : iterable
        Subject labels to average over (values of the 'Subject' column).
        The name is singular for backward compatibility, but it is
        iterated as a collection.
    countries : iterable
        Country names (values of the 'Country' column).
    data : pandas.DataFrame
        Frame with 'Subject', 'Country' and 'Value' columns.

    Returns
    -------
    list of list
        One inner list per subject, in the order given; each inner list
        holds one mean per country, in the order given. A pair with no
        matching rows yields NaN.
    """
    avg_value_subject = []
    # Iterate the argument itself; the function must not depend on a
    # same-named notebook global being defined.
    for subject_name in subject:
        # Filter by subject once instead of once per country.
        subject_rows = data.loc[data['Subject'] == subject_name]
        averages = [
            subject_rows.loc[subject_rows['Country'] == country, 'Value'].mean()
            for country in countries
        ]
        avg_value_subject.append(averages)
    return avg_value_subject

In [18]:
# Build one table of per-country averages for every dataset that has a
# 'Subject' column; the resulting frames are collected in avg_data.
avg_data = []
data = [economic_indicator, cpi, gdp_production, productivity_ulc, resident]

for datum in data:
    subjects = datum['Subject'].unique()
    # Sorted so the row labels come out in alphabetical country order.
    countries = sorted(datum['Country'].unique())
    
    # One inner list of per-country means for each subject.
    index_arr = create_two_dimensional_arr(subjects, countries, datum)
    
    # Sanity-check the shape before turning it into a DataFrame.
    assert len(index_arr[0]) == len(countries)
    assert len(index_arr) == len(subjects)
    
    # NOTE: `df` is a plain dict (subject -> list of means), not a DataFrame.
    df = dict(zip(subjects, index_arr))
    # One column per subject, one row per country.
    df_index = pd.DataFrame(df, index=countries)
    avg_data.append(df_index)

In [19]:
# Name each averaged table; the order must match the order of the datasets
# from which avg_data was built.
(economic_indicator_index,
 cpi_index,
 gdp_production_index,
 productivity_ulc_index,
 resident_index) = avg_data

### Population

In [20]:
# Inspect the dtypes and non-null counts of the filtered population data.
population.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71982 entries, 0 to 77003
Data columns (total 3 columns):
Country    71982 non-null object
Time       71982 non-null int64
Value      71982 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 2.2+ MB


In [21]:
# Preview the first rows of the population data.
population.head()

Unnamed: 0,Country,Time,Value
0,Australia,2001,9712875.0
1,Australia,2002,9819727.0
2,Australia,2003,9933140.0
3,Australia,2004,10036771.0
4,Australia,2005,10157211.0


In [22]:
# Alphabetically sorted list of the countries present in the population data.
countries = sorted(population['Country'].unique())

In [23]:
# For each country, split its population values into consecutive chunks of
# 18 rows and record the mean period-over-period relative change per chunk.
# NOTE(review): 18 is presumably the number of observations per series in
# the extract, and the rows are assumed to be ordered series by series -
# confirm both against the data.
mean_values = []

for country in countries:
    country_values = population.loc[population['Country'] == country, ['Value']]
    chunk_means = []
    for start in range(0, len(country_values), 18):
        chunk = country_values[start:start + 18]
        # The first row of each chunk has no predecessor, so its change is
        # NaN and is dropped before averaging.
        chunk_means.append(chunk.pct_change().dropna().values.mean())
    mean_values.append(chunk_means)

In [24]:
# Map each country to its list of per-chunk mean changes.
population_mean_dict = {
    name: means for name, means in zip(countries, mean_values)
}

In [25]:
# Columns are countries here; transpose so that each country becomes a row
# and each chunk becomes a column.
avg_population = pd.DataFrame(population_mean_dict)
population_index = avg_population.transpose()

In [27]:
# Preview the per-country table of mean population changes.
population_index.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,83,84,85,86,87,88,89,90,91,92
Australia,0.015409,0.015399,0.015404,0.012788,0.013029,0.012912,0.010611,0.010593,0.010602,0.006907,...,0.014254,-0.00016,0.002136,0.000976,-0.000817,0.019901,-0.001224,-0.001631,0.022263,0.025514
Austria,0.004681,0.006498,0.005567,0.00343,0.004044,0.003745,-0.006905,-0.006279,-0.006584,-0.006874,...,0.012429,-0.00167,0.001448,-0.000265,0.059131,-0.000793,-0.001393,0.053378,-0.00024,0.071006
Belgium,0.005712,0.006472,0.006085,0.003969,0.004259,0.004118,0.005497,0.005562,0.00553,0.002429,...,0.007542,0.001139,0.003637,0.002316,0.035183,-0.001672,-0.000723,0.030503,-0.001192,0.025675
Brazil,0.010273,0.009609,0.009947,-0.009491,-0.009041,-0.009261,-0.01058,-0.00932,-0.00994,-0.009198,...,0.021941,-0.017064,-0.018461,-0.017705,0.004375,-0.030872,-0.031504,-0.030182,0.004035,0.004729
Canada,0.010372,0.010666,0.010517,0.006302,0.006351,0.006327,0.000711,0.000369,0.000536,-0.002076,...,0.019703,0.001331,0.001964,0.001635,0.025217,-0.001391,0.02982,-0.001677,0.027329,-0.001532


### Health

### Purchasing Power Parity