In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree     import DecisionTreeRegressor
import pandas as pd
import numpy as np
import sys
from sklearn.model_selection import train_test_split
import time
from operator import itemgetter

# --- Data configuration -----------------------------------------------------
# Columns of the prepared CSV that are fed to the model as features.
COLS = [
    'density',
    'median_age',
    'median_hh_income',
    'pct_hispanic',
    'pct_white',
    'pct_black',
]
# Passed to train_test_split as test_size, i.e. the fraction of rows held
# out for testing (the remainder is used for training).
TEST_PCT = 0.70

# --- Model configuration ----------------------------------------------------
# Number of boosting rounds (AdaBoostRegressor n_estimators).
NUM_TREES = 400
# Depth limit of each DecisionTreeRegressor base learner.
MAX_DEPTH = 12
# Loss used by AdaBoostRegressor when updating sample weights.
LOSS = 'square'

In [None]:
# Load the data
# The data directory depends on the host OS: a Windows drive path when
# sys.platform starts with 'win', otherwise a Linux home-directory path.
if sys.platform.startswith('win'):
    data_loc = 'D:/Data/Loyalty Vision/'
else:
    data_loc = "/home/tom/ML/data/CBG/"

filenm = "CBG_prepared.csv"
df = pd.read_csv(data_loc + filenm, delimiter=',')

# Work on a random subset of at most 10,000 rows (for Grainger).
# DataFrame.sample draws without replacement by default and raises
# ValueError when asked for more rows than the frame holds, so the request
# is capped at the number of rows actually loaded.
df = df.sample(min(10000, len(df)))

In [None]:
# Split into Train and Test
# TEST_PCT is the share of rows assigned to the test partition.
df_train, df_test = train_test_split(df, test_size=TEST_PCT)

# Features are the COLS columns; the regression target is 'spend_per_hh'.
train_x, train_y = df_train[COLS], df_train['spend_per_hh']
test_x, test_y = df_test[COLS], df_test['spend_per_hh']

print('Record counts    Training: {:,}   Test  {:,}'.format(len(df_train), len(df_test)))

In [None]:
# Run the job
start = time.time()

# Fit regression model
regr = AdaBoostRegressor(DecisionTreeRegressor(
    max_depth=MAX_DEPTH), loss=LOSS, n_estimators=NUM_TREES)
regr.fit(train_x, train_y)
print('Time to train: {:.0f} seconds'.format(time.time() - start))

# See how accurate it was through the R-squared value
start = time.time()
score = regr.score(test_x, test_y)
print('R-squared: {:.2}'.format(score))
print('Time to score: {:.0f} seconds'.format(time.time() - start))

# Manually compute the R-squared value to confirm with "score()"
y_preds = regr.predict(test_x)
y_preds = pd.DataFrame(y_preds, columns=['pred'])

actual = pd.DataFrame({'spend_per_hh': pd.Series(test_y)})
actual = actual.reset_index(drop=True)
actual.columns =['actual']

both = y_preds.join(actual)

both['sum_error'] = (both['actual'] - both['pred']) ** 2
both['sum_explained'] = (both['actual'] - both['actual'].mean()) ** 2
print('Manual R-squared calculation:  {:.2}'.format(1 - both['sum_error'].sum()/both['sum_explained'].sum()))
!aplay /usr/share/sounds/bicycle_bell.wav

In [None]:
# This will show the relative importance of each feature
# Pair every feature name with the importance reported by the fitted model.
relative_importance = list(zip(COLS, regr.feature_importances_))

print('{:<20}{}'.format('Feature', 'Relative Importance'))

# Most important feature first.
ranked = sorted(relative_importance, key=lambda pair: pair[1], reverse=True)
for feature, importance in ranked:
    print('{:<20}{:.2}'.format(feature, importance))