## Data Processing

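Load the California housing dataset using scikit-learn's `fetch_california_housing`. A closely related version of the data is also available on Kaggle:
https://www.kaggle.com/camnugent/california-housing-prices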


In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error


# Fetch the California housing data: X holds the feature table and y the
# target values, both sharing the same row index.
data = datasets.fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, index=X.index)

# Report the size of the feature table.
print(f'{X.shape[0]} rows')
print(f'{X.shape[1]} columns')


In [None]:
# Preview the first rows of the feature table.
X.head()

In [None]:
# Data Preprocessing
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.33, random_state=42)

# Standardise the features. The scaler is fitted on the training rows only and
# then reused for the test rows, so both splits share one scale and no test-set
# statistics leak into preprocessing. `xTrainScaler` is kept because later cells
# use it to map scaled values back to the original units.
xTrainScaler = preprocessing.StandardScaler()
xTrain = pd.DataFrame(xTrainScaler.fit_transform(xTrain), columns=X.columns)
xTest = pd.DataFrame(xTrainScaler.transform(xTest), columns=X.columns)

# Standardise the target the same way: fit on the training target, apply to both.
# Scaling the test target with its own statistics would put predictions and
# ground truth on different scales and distort the reported error.
yTrainScaler = preprocessing.StandardScaler()
yTrain = pd.Series(
    yTrainScaler.fit_transform(yTrain.values.reshape(-1, 1)).flatten(),
    index=xTrain.index,
)
yTest = pd.Series(
    yTrainScaler.transform(yTest.values.reshape(-1, 1)).flatten(),
    index=xTest.index,
)

## Install ControlBurn package

https://pypi.org/project/ControlBurn/


In [None]:
!pip install ControlBurn==0.0.9
from ControlBurn.ControlBurn import ControlBurnRegressor

## Fit a ControlBurnRegressor

Build forest via double bag-boosting and select features using lasso.

In [None]:
# Fit ControlBurn on the scaled training data.
cb = ControlBurnRegressor(alpha=0.02, build_forest_method='doublebagboost')
cb.fit(xTrain, yTrain)

# Summarise the fitted model: size of the full forest, size of the selected
# subforest, and the features that were selected.
print(f'Number of trees grown: {len(cb.forest)}')
print(f'Number of trees selected: {len(cb.subforest)}')
print(f'Features selected {cb.features_selected_!s}')

In [None]:
# Score the fitted model on the held-out rows.
pred = cb.predict(xTest)
test_mse = mean_squared_error(yTest, pred)
print(f'MSE of polished model: {test_mse!s}')

## Interpretability Plots
Print a list of features used in each tree in the selected subforest.

In [None]:
# `tree_list` and `cols` are reused by the plotting cells further down.
tree_list = cb.subforest
cols = X.columns

# One line per selected tree: the features with non-zero importance in that tree.
for selected_tree in tree_list:
    has_importance = selected_tree.feature_importances_ > 0
    print(cols[has_importance].values)

### Single Feature Trees
For single feature trees, plot the contribution to the prediction as a function of the feature value.

In [None]:
# plot for single feature f(x) plots
# Weights of the trees that received a positive weight.
# NOTE(review): assumes these weights are in the same order as cb.subforest — confirm.
sub_weights = cb.weights[cb.weights>0]

# Scaled feature values at which every single-feature tree is evaluated.
grid = np.linspace(-1, 1, 1000)

for feat in cols:
    # Input frame in which only `feat` varies; every other feature is held at 0.
    x_temp = pd.DataFrame(0.0, index=range(len(grid)), columns=X.columns)
    x_temp[feat] = grid

    # Weighted predictions of the trees whose only feature is `feat`.
    contributions = []
    for tree, weight in zip(tree_list, sub_weights):
        used = cols[tree.feature_importances_ > 0]
        if len(used) == 1 and used[0] == feat:
            contributions.append(tree.predict(x_temp) * weight)

    # A feature without any single-feature tree has nothing to plot; summing an
    # empty list would yield a scalar and plt.plot would fail on the shape mismatch.
    if not contributions:
        continue

    pred_all = np.sum(contributions, axis=0)

    # One figure per feature so the curves and axis labels do not overwrite each other.
    plt.figure()
    plt.plot(grid, pred_all)
    plt.xlabel(feat)
    plt.ylabel('Contribution to Prediction')
    plt.show()
    

### Two Feature Trees (pairwise feature interactions)

The heat map below shows how often each pair of features appears together in the same tree. This is useful for detecting feature interactions.

In [None]:
import itertools
import seaborn as sns
from itertools import combinations,permutations
import matplotlib.pyplot as plt



# For every selected tree, the set of features it has non-zero importance for.
features_per_tree = [set(cols[tree.feature_importances_ > 0]) for tree in tree_list]

# Every ordered pair of distinct features, and the number of trees using both.
pairs = list(permutations(cols, 2))
counts = [
    sum(1 for used in features_per_tree if first in used and second in used)
    for first, second in pairs
]
counter = pd.DataFrame(pairs, columns=['Feature 1', 'Feature 2']).assign(count=counts)

# Reshape to a feature-by-feature matrix of co-occurrence counts.
counter = counter.pivot_table(index='Feature 1', columns='Feature 2', values='count')

# Hide the diagonal and upper triangle so each pair is drawn once.
mask = np.triu(np.ones(counter.shape, dtype=bool))
sns.heatmap(counter, mask=mask, cmap='Blues')

There is a strong feature interaction between Latitude and Longitude. To visualize this effect, we create contribution plots using the two-feature trees that include only Latitude and Longitude.

These plots are similar to partial dependence plots:
https://christophm.github.io/interpretable-ml-book/pdp.html

In [None]:
# Evaluate the Latitude/Longitude-only trees on a regular grid in scaled space.
# itertools.product yields the full 200 x 200 grid; permutations would drop
# every point whose two coordinates are equal and leave holes on the diagonal.
grid = np.linspace(-3,3,200)
pairs = list(itertools.product(grid, repeat=2))
x_temp = pd.DataFrame(pairs,columns = ['Latitude','Longitude'])

# Hold all remaining features at 0 and restore the original column order.
for i in cols:
    if i not in ['Latitude','Longitude']:
        x_temp[i] = 0
x_temp = x_temp[X.columns]

# Sum the weighted predictions of the trees that use exactly Latitude and Longitude.
# NOTE(review): assumes sub_weights is in the same order as tree_list — confirm.
pred_all = []
for tree, weight in zip(tree_list, sub_weights):
    used = set(cols[tree.feature_importances_>0])
    if used == {'Latitude','Longitude'}:
        pred_all.append(tree.predict(x_temp)*weight)
pred_all = np.sum(pred_all,axis = 0)

#Unscale the data for easier interpretation
temp = pd.DataFrame(xTrainScaler.inverse_transform(x_temp), columns = X.columns)
df = temp[['Latitude', 'Longitude']].round(3)
# np.round also handles the scalar 0.0 that results when no tree matched.
df['contribution'] = np.round(pred_all, 3)

# `df` (Latitude, Longitude, contribution) is reused by the map cell below.
df_plot = df.pivot_table(index='Latitude', columns='Longitude', values='contribution')
sns.heatmap(df_plot , cmap = 'RdBu', fmt='.4f')

Converting this plot to a map yields the plot below. Red indicates a positive contribution to housing cost, yellow a neutral one, and green a reduction in housing cost.

In [None]:
import folium
# Split the contributions into three equal-width bins and give each a colour.
map1 = df.assign(
    color=pd.cut(df['contribution'], bins=3, labels=['green', 'yellow', 'red'])
)

# Base map with a fixed centre and zoom level.
mapit = folium.Map(location=[36.7783, -119.4179], zoom_start=6)

# Add one small circle per grid point, coloured by its contribution bin.
for lat, lon, colour in zip(map1['Latitude'], map1['Longitude'], map1['color']):
    folium.CircleMarker([lat, lon], radius=.5, color=colour).add_to(mapit)
    

In [None]:
# Display the folium map built in the previous cell as this cell's output.
mapit

These results are consistent with the California housing market, where houses in San Francisco and Los Angeles are very expensive but housing prices drop as you move inland.
