<h1> BART for inequalities </h1>

In [14]:
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

<h2> Preprocessing </h2>

Life expectancy for all countries from the World Bank

In [2]:
life_exp_df = pd.read_csv("wb_life_expectancy.csv", skiprows = 4)
life_exp_df = life_exp_df[["Country Name", "2017"]]
life_exp_df.rename(columns = {"2017":"Life Expectancy 2017"}, inplace=True)
life_exp_df = life_exp_df.dropna()
life_exp_df

Unnamed: 0,Country Name,Life Expectancy 2017
0,Aruba,76.010000
1,Afghanistan,64.130000
2,Angola,60.379000
3,Albania,78.333000
5,Arab World,71.622526
...,...,...
259,Kosovo,71.946341
260,"Yemen, Rep.",66.086000
261,South Africa,63.538000
262,Zambia,63.043000


Income distribution for all countries and world regions from WID.world. The distribution is split into bottom 50 percent, 50-90 percent (middle class), top 10 percent and the top 1 percent share

In [3]:
income_df = pd.read_csv("wid_income_dist.csv", skiprows = 1, sep = ";", header = None)
income_df = income_df[[0, 2, 4]]
income_df.columns = ["Region Name", "percentile", "Income Share"]
income_df = income_df.dropna() # Only keep regions with all 4 parts of the income distribution
income_df = income_df.pivot(index='Region Name', columns='percentile')['Income Share'] # reshape, col per share
income_df

percentile,p0p50,p50p90,p90p100,p99p100
Region Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,0.088212,0.368794,0.542994,0.190221
Albania,0.209400,0.470900,0.319700,0.082100
Algeria,0.207066,0.420077,0.372856,0.097033
Angola,0.130631,0.380834,0.488535,0.151751
Austria,0.234300,0.449100,0.316600,0.092700
...,...,...,...,...
United Kingdom,0.206100,0.439300,0.354600,0.126100
Western Africa,0.116490,0.375802,0.507708,0.164721
Zambia,0.073127,0.311930,0.614943,0.230787
Zanzibar,0.154000,0.365000,0.481000,0.161700


Merge the life expectancy and income dataframes on country

In [4]:
le_income_df = life_exp_df.merge(income_df, left_on = "Country Name", right_on = "Region Name")
le_income_df

Unnamed: 0,Country Name,Life Expectancy 2017,p0p50,p50p90,p90p100,p99p100
0,Angola,60.379000,0.130631,0.380834,0.488535,0.151751
1,Albania,78.333000,0.209400,0.470900,0.319700,0.082100
2,Austria,81.641463,0.234300,0.449100,0.316600,0.092700
3,Burundi,60.898000,0.151344,0.371082,0.477574,0.145485
4,Belgium,81.439024,0.205900,0.480100,0.313900,0.077700
...,...,...,...,...,...,...
79,Tanzania,64.479000,0.153972,0.365047,0.480980,0.161714
80,Uganda,62.516000,0.131229,0.353945,0.514826,0.168541
81,South Africa,63.538000,0.062700,0.286500,0.650800,0.192100
82,Zambia,63.043000,0.073127,0.311930,0.614943,0.230787


In [5]:
X = le_income_df[["p0p50", "p50p90", "p90p100", "p99p100"]]
y = le_income_df[["Life Expectancy 2017"]]

<h2> Random Forest implementation </h2>

In [77]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
rf_pipeline = Pipeline(steps=[("model", RandomForestRegressor(n_estimators = 10, random_state = 0))
                             ])

In [78]:
from sklearn.model_selection import cross_val_score
# Multiply by -1 since sklearn calculates *negative* MAE
scores = -1 * cross_val_score(rf_pipeline, X, y,
                              cv = 5,
                              scoring = "neg_mean_absolute_error")

print("MAE scores:\n", scores)
print("Average MAE score (across experiments):")
print(scores.mean())

MAE scores:
 [3.59117346 2.56071964 5.68925601 3.39699125 4.89878293]
Average MAE score (across experiments):
4.027384656285378


In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {"n_estimators": np.arange(4, 100 + 1, 3)}
grid_search = GridSearchCV(rf_pipeline, parameters)
grid_search.fit(X, y)
grid_search.best_params_

In [None]:
rf_pipeline = Pipeline(steps=[("model", RandomForestRegressor(n_estimators = grid_search.best_params_, random_state = 0))
                             ])
                             
scores = -1 * cross_val_score(rf_pipeline, X, y,
                              cv = 5,
                              scoring = "neg_mean_absolute_error")

print("MAE scores:\n", scores)
print("Average MAE score (across experiments):")
print(scores.mean())

<h2> bartpy implementation </h2>

In [32]:
le_income_df[["p99p100"]].reset_index(drop=True)

Unnamed: 0,p99p100
0,0.151751
1,0.082100
2,0.092700
3,0.145485
4,0.077700
...,...
79,0.161714
80,0.168541
81,0.192100
82,0.230787


In [66]:
X = le_income_df[["p0p50", "p50p90", "p90p100", "p90p100"]]
y = le_income_df["Life Expectancy 2017"].values

In [75]:
bart_pipeline = Pipeline(steps=[("model", SklearnModel(n_burn = 100, n_chains = 1, n_jobs = 1, n_samples = 1000, n_trees = 10))
                             ])

In [74]:
model = SklearnModel(n_burn = 100, n_chains = 1, n_jobs = 1, n_samples = 1000, n_trees = 10)
model.fit(X, y)
model.predict(X)

8%|▊         | 8/100 [00:00<00:01, 79.35it/s]Starting burn
100%|██████████| 100/100 [00:00<00:00, 105.41it/s]
  1%|          | 7/1000 [00:00<00:16, 59.42it/s]Starting sampling
100%|██████████| 1000/1000 [00:09<00:00, 109.92it/s]


array([64.64455203, 79.99541098, 79.99541098, 64.64455203, 79.99541098,
       62.37464835, 64.64455203, 78.08964777, 79.99541098, 61.42783276,
       61.42783276, 79.99541098, 62.07349535, 62.07349535, 62.37464835,
       78.08964777, 79.99541098, 78.08964777, 64.64455203, 79.99541098,
       78.08964777, 64.64455203, 80.69085015, 80.29472466, 64.64455203,
       80.69085015, 79.99541098, 79.99541098, 65.15210269, 78.08964777,
       62.07349535, 65.15210269, 61.42783276, 62.37464835, 76.31641407,
       79.99541098, 79.99541098, 80.69085015, 79.99541098, 79.99541098,
       64.64455203, 65.15210269, 64.64455203, 62.37464835, 76.31641407,
       80.69085015, 78.08964777, 64.64455203, 79.99541098, 64.94570504,
       65.15210269, 80.29472466, 79.99541098, 61.42783276, 66.9253364 ,
       64.64455203, 61.42783276, 61.42783276, 65.15210269, 64.64455203,
       79.99541098, 79.99541098, 78.08964777, 78.08964777, 65.15210269,
       62.37464835, 65.15210269, 64.64455203, 65.15210269, 64.94

In [86]:
parameters = {"n_trees": np.arange(4, 100 + 1, 3)}
grid_search = GridSearchCV(model, parameters)
grid_search.fit(X, y)
grid_search.best_params_

25%|██▌       | 25/100 [00:00<00:00, 248.54it/s]Starting burn
100%|██████████| 100/100 [00:00<00:00, 203.25it/s]
  2%|▏         | 20/1000 [00:00<00:05, 190.95it/s]Starting sampling
100%|██████████| 1000/1000 [00:03<00:00, 283.38it/s]
 45%|████▌     | 45/100 [00:00<00:00, 444.17it/s]Starting burn
100%|██████████| 100/100 [00:00<00:00, 428.75it/s]
  4%|▍         | 40/1000 [00:00<00:02, 395.36it/s]Starting sampling
100%|██████████| 1000/1000 [00:02<00:00, 373.78it/s]
 38%|███▊      | 38/100 [00:00<00:00, 373.74it/s]Starting burn
100%|██████████| 100/100 [00:00<00:00, 332.16it/s]
  3%|▎         | 32/1000 [00:00<00:03, 314.31it/s]Starting sampling
100%|██████████| 1000/1000 [00:02<00:00, 408.57it/s]
 33%|███▎      | 33/100 [00:00<00:00, 327.98it/s]Starting burn
100%|██████████| 100/100 [00:00<00:00, 363.41it/s]
  4%|▍         | 40/1000 [00:00<00:02, 399.13it/s]Starting sampling
100%|██████████| 1000/1000 [00:02<00:00, 366.34it/s]
 33%|███▎      | 33/100 [00:00<00:00, 325.26it/s]Starting bur

KeyboardInterrupt: 

In [None]:
bart_pipeline = Pipeline(steps=[("model", SklearnModel(n_burn = 100, n_chains = 1, n_jobs = 1, n_samples = 1000, n_trees = grid_search.best_params_))
                               ])
scores = -1 * cross_val_score(bart_pipeline, X, y,
                              cv = 5,
                              scoring = "neg_mean_absolute_error")
                              
print("MAE scores:\n", scores)
print("Average MAE score (across experiments):")
print(scores.mean())

In [None]:
from bartpy.extensions.baseestimator import ResidualBART
model = ResidualBART(base_estimator = LinearModel())
model.fit(X, y)

<h2> Predictions </h2>

In [76]:
scores = -1 * cross_val_score(bart_pipeline, X, y,
                              cv = 5,
                              scoring = "neg_mean_absolute_error")
                              
print("MAE scores:\n", scores)
print("Average MAE score (across experiments):")
print(scores.mean())

0%|          | 0/100 [00:00<?, ?it/s]Starting burn
100%|██████████| 100/100 [00:01<00:00, 61.96it/s]
  2%|▏         | 15/1000 [00:00<00:06, 143.47it/s]Starting sampling
100%|██████████| 1000/1000 [00:06<00:00, 153.35it/s]
 17%|█▋        | 17/100 [00:00<00:00, 164.84it/s]Starting burn
100%|██████████| 100/100 [00:00<00:00, 167.79it/s]
  2%|▏         | 16/1000 [00:00<00:06, 156.90it/s]Starting sampling
100%|██████████| 1000/1000 [00:06<00:00, 160.89it/s]
 16%|█▌        | 16/100 [00:00<00:00, 155.79it/s]Starting burn
100%|██████████| 100/100 [00:00<00:00, 153.77it/s]
  3%|▎         | 31/1000 [00:00<00:06, 153.71it/s]Starting sampling
100%|██████████| 1000/1000 [00:06<00:00, 160.70it/s]
 13%|█▎        | 13/100 [00:00<00:00, 123.50it/s]Starting burn
100%|██████████| 100/100 [00:00<00:00, 153.66it/s]
  3%|▎         | 32/1000 [00:00<00:06, 142.89it/s]Starting sampling
100%|██████████| 1000/1000 [00:06<00:00, 160.72it/s]
 13%|█▎        | 13/100 [00:00<00:00, 124.70it/s]Starting burn
100%|█████