In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

from sklearn import tree
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression

%matplotlib inline

In [2]:
hfi_df = pd.read_csv('hfi_cc_2018.csv')
hfi_df.isna().sum()

year                                    0
ISO_code                                0
countries                               0
region                                  0
pf_rol_procedural                     578
pf_rol_civil                          578
pf_rol_criminal                       578
pf_rol                                 80
pf_ss_homicide                         80
pf_ss_disappearances_disap             89
pf_ss_disappearances_violent           80
pf_ss_disappearances_organized        179
pf_ss_disappearances_fatalities        80
pf_ss_disappearances_injuries          80
pf_ss_disappearances                   80
pf_ss_women_fgm                       172
pf_ss_women_missing                   120
pf_ss_women_inheritance_widows        541
pf_ss_women_inheritance_daughters     541
pf_ss_women_inheritance               119
pf_ss_women                           100
pf_ss                                  80
pf_movement_domestic                   98
pf_movement_foreign               

In [3]:
def impute_ranked(df, cols, ranks):
    ''' A function to impute null values in a data frame based on a ranked column
    :param df: df, a pandas dataframe with null values for scores that correspond to a rank
    :param cols: array-like, a list or array containing the columns for which to impute nulls
    :param ranks: str, the column name wherein ranks are found
    '''
    df['pct_off_mean'] = (df[ranks].max() - df[ranks]) / df[ranks].mean()
    for col in cols:
        df[col].fillna(np.nanmean(df[col])*df['pct_off_mean'], inplace=True)
    df = df.drop(columns=['pct_off_mean'])
    return df

In [4]:
pf_cols = list(hfi_df.columns[4:61])
ef_cols = list(hfi_df.columns[64:-6])

hf_df = impute_ranked(hfi_df, pf_cols, 'pf_rank')
hf_df = impute_ranked(hfi_df, ef_cols, 'ef_rank')

In [5]:
hf16_df = hf_df.iloc[:162]
hf16_df.head()

Unnamed: 0,year,ISO_code,countries,region,pf_rol_procedural,pf_rol_civil,pf_rol_criminal,pf_rol,pf_ss_homicide,pf_ss_disappearances_disap,...,ef_regulation_business_bribes,ef_regulation_business_licensing,ef_regulation_business_compliance,ef_regulation_business,ef_regulation,ef_score,ef_rank,hf_score,hf_rank,hf_quartile
0,2016,ALB,Albania,Eastern Europe,6.661503,4.547244,4.666508,5.291752,8.920429,10.0,...,4.050196,7.324582,7.074366,6.705863,6.906901,7.54,34.0,7.56814,48.0,2.0
1,2016,DZA,Algeria,Middle East & North Africa,1.085714,1.063457,0.979795,3.819566,9.456254,10.0,...,3.765515,8.523503,7.029528,5.676956,5.268992,4.99,159.0,5.135886,155.0,4.0
2,2016,AGO,Angola,Sub-Saharan Africa,3.257143,3.19037,2.939384,3.451814,8.06026,5.0,...,1.94554,8.096776,6.782923,4.930271,5.5185,5.17,155.0,5.640662,142.0,4.0
3,2016,ARG,Argentina,Latin America & the Caribbean,7.098483,5.79196,4.34393,5.744791,7.622974,10.0,...,3.260044,5.253411,6.508295,5.535831,5.369019,4.84,160.0,6.469848,107.0,3.0
4,2016,ARM,Armenia,Caucasus & Central Asia,5.645715,5.529975,5.094932,5.003205,8.80875,10.0,...,4.575152,9.319612,6.491481,6.79753,7.378069,7.57,29.0,7.241402,57.0,2.0


In [6]:
hf16_df = hf16_df.drop(columns=['year', 'ISO_code', 'hf_rank', 'hf_quartile'])

In [7]:
X = hf16_df.drop(columns=['countries', 'region', 'hf_score']).copy()
Y = hf16_df['hf_score']

In [8]:
sel = VarianceThreshold(threshold=(.7 * (.3)))
sel.fit_transform(X)
X = X[X.columns[sel.get_support(indices=True)]]
X.head()

Unnamed: 0,pf_rol_procedural,pf_rol_civil,pf_rol_criminal,pf_rol,pf_ss_homicide,pf_ss_disappearances_disap,pf_ss_disappearances_violent,pf_ss_disappearances_organized,pf_ss_disappearances_fatalities,pf_ss_disappearances_injuries,...,ef_regulation_business_adm,ef_regulation_business_bureaucracy,ef_regulation_business_start,ef_regulation_business_bribes,ef_regulation_business_licensing,ef_regulation_business_compliance,ef_regulation_business,ef_regulation,ef_score,ef_rank
0,6.661503,4.547244,4.666508,5.291752,8.920429,10.0,10.0,10.0,10.0,10.0,...,6.072172,6.0,9.713864,4.050196,7.324582,7.074366,6.705863,6.906901,7.54,34.0
1,1.085714,1.063457,0.979795,3.819566,9.456254,10.0,9.29403,5.0,9.926119,9.990149,...,3.722341,1.777778,9.24307,3.765515,8.523503,7.029528,5.676956,5.268992,4.99,159.0
2,3.257143,3.19037,2.939384,3.451814,8.06026,5.0,10.0,7.5,10.0,10.0,...,2.758428,1.333333,8.664627,1.94554,8.096776,6.782923,4.930271,5.5185,5.17,155.0
3,7.098483,5.79196,4.34393,5.744791,7.622974,10.0,10.0,7.5,10.0,9.990877,...,2.404211,6.666667,9.122357,3.260044,5.253411,6.508295,5.535831,5.369019,4.84,160.0
4,5.645715,5.529975,5.094932,5.003205,8.80875,10.0,10.0,7.5,9.316196,9.93162,...,4.552464,6.0,9.846472,4.575152,9.319612,6.491481,6.79753,7.378069,7.57,29.0


In [9]:
X_new = SelectKBest(f_regression, k=10).fit(X, Y).get_support(indices=True)
X_new = X[X.columns[X_new]]
X_new.head()

Unnamed: 0,pf_religion_estop_establish,pf_religion_estop_operate,pf_association_political_establish,pf_association_political_operate,pf_association_prof_establish,pf_association_prof_operate,pf_association_sport_establish,pf_association_sport_operate,pf_score,pf_rank
0,10.23069,10.191724,10.193283,9.04459,10.095091,9.462297,11.329507,9.937672,7.596281,57.0
1,1.461527,1.455961,1.456183,1.292084,1.442156,1.351757,1.618501,1.419667,5.281772,147.0
2,4.384581,4.367882,4.36855,3.876253,4.326468,4.05527,4.855503,4.259002,6.111324,117.0
3,11.692217,11.647685,11.649466,10.336675,11.537247,10.814054,12.948008,11.357339,8.099696,42.0
4,7.599941,7.570995,7.572153,6.718839,7.49921,7.029135,8.416205,7.38227,6.912804,84.0


In [13]:
start_time = time.time()

# Initialize and train our tree.
decision_tree = tree.DecisionTreeRegressor(
    criterion='mse',
    max_features=1,
    max_depth=4,
    random_state = 1337
)
print(cross_val_score(decision_tree, X, Y, cv=10))
print("--- {} seconds ---".format(time.time() - start_time))

[0.61296806 0.4684326  0.86519239 0.53024644 0.66724877 0.68785903
 0.70667466 0.477272   0.78618926 0.3983105 ]
--- 0.10795283317565918 seconds ---


In [14]:
start_time = time.time()

# Initialize and train our tree.
decision_tree = tree.DecisionTreeRegressor(
    criterion='mse',
    max_features=1,
    max_depth=5,
    random_state = 1337
)
print(cross_val_score(decision_tree, X, Y, cv=10))
print("--- {} seconds ---".format(time.time() - start_time))

[ 0.6451816   0.69780031  0.86889505  0.26416441  0.78589672  0.78261771
  0.8181951   0.79882209  0.69025645 -0.09929465]
--- 0.09595489501953125 seconds ---


In [15]:
start_time = time.time()

# Initialize and train our tree.
decision_tree = tree.DecisionTreeRegressor(
    criterion='mse',
    max_features=1,
    max_depth=6,
    random_state = 1337
)
print(cross_val_score(decision_tree, X, Y, cv=10))
print("--- {} seconds ---".format(time.time() - start_time))

[0.40293843 0.5763866  0.80338402 0.71817905 0.75639492 0.61739129
 0.80651228 0.28803927 0.70796095 0.36764673]
--- 0.08398222923278809 seconds ---


In [17]:
start_time = time.time()

# Initialize and train our tree.
decision_tree = tree.DecisionTreeRegressor(
    criterion='mse',
    max_features=1,
    min_impurity_decrease=0.01,
    random_state = 1337
)
print(cross_val_score(decision_tree, X, Y, cv=10))
print("--- {} seconds ---".format(time.time() - start_time))

[0.46123523 0.57177282 0.84634558 0.41398313 0.31858696 0.66689563
 0.18716265 0.55935908 0.76795773 0.36085335]
--- 0.11597847938537598 seconds ---


In [18]:
start_time = time.time()

# Initialize and train our tree.
decision_tree = tree.DecisionTreeRegressor(
    criterion='mse',
    max_features=2,
    min_impurity_decrease=0.01,
    random_state = 1337
)
print(cross_val_score(decision_tree, X, Y, cv=10))
print("--- {} seconds ---".format(time.time() - start_time))

[0.5551775  0.66276574 0.90355637 0.61512192 0.80330265 0.56164852
 0.8435088  0.65996954 0.84864763 0.46436041]
--- 0.08398008346557617 seconds ---


In [19]:
start_time = time.time()

# Initialize and train our tree.
decision_tree = tree.DecisionTreeRegressor(
    criterion='mse',
    max_features=3,
    min_impurity_decrease=0.01,
    random_state = 1337
)
print(cross_val_score(decision_tree, X, Y, cv=10))
print("--- {} seconds ---".format(time.time() - start_time))

[0.67693077 0.67590859 0.83024761 0.77073278 0.74800179 0.71230524
 0.91050916 0.71560307 0.78174525 0.34875258]
--- 0.09198164939880371 seconds ---


In [20]:
start_time = time.time()

# Initialize and train our tree.
decision_tree = tree.DecisionTreeRegressor(
    criterion='mse',
    max_features=4,
    min_impurity_decrease=0.01,
    random_state = 1337
)
print(cross_val_score(decision_tree, X, Y, cv=10))
print("--- {} seconds ---".format(time.time() - start_time))

[0.60947094 0.70911664 0.8701506  0.74259131 0.82295118 0.94076066
 0.89509992 0.81399167 0.87768526 0.75246612]
--- 0.09195804595947266 seconds ---


In [25]:
start_time = time.time()

# Initialize and train our tree.
decision_tree = tree.DecisionTreeRegressor(
    criterion='mse',
    min_impurity_decrease=0.01
)
print(cross_val_score(decision_tree, X, Y, cv=10))
print("--- {} seconds ---".format(time.time() - start_time))

[0.85580195 0.82279953 0.91547373 0.87057221 0.8927533  0.94859957
 0.91092626 0.8327066  0.82397504 0.86610616]
--- 0.1723625659942627 seconds ---


In [26]:
start_time = time.time()

# Initialize and train our tree.
rfr = ensemble.RandomForestRegressor(
    n_estimators=10,
    criterion='mse',
    min_impurity_decrease=0.01
)
print(cross_val_score(rfr, X, Y, cv=10))
print("--- {} seconds ---".format(time.time() - start_time))

[0.85677725 0.92880318 0.97109009 0.90966752 0.89627151 0.97986327
 0.94944856 0.88718904 0.87267065 0.88720567]
--- 0.6447522640228271 seconds ---


In [27]:
start_time = time.time()

# Initialize and train our tree.
rfr = ensemble.RandomForestRegressor(
    n_estimators=20,
    criterion='mse',
    min_impurity_decrease=0.01
)
print(cross_val_score(rfr, X, Y, cv=10))
print("--- {} seconds ---".format(time.time() - start_time))

[0.86603276 0.954126   0.97569997 0.90010407 0.95374948 0.98471559
 0.94045687 0.93773182 0.91058582 0.87160582]
--- 1.1057302951812744 seconds ---
