<a href="https://colab.research.google.com/github/swilsonmfc/automl/blob/main/VTreat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# VTreat

![](https://github.com/WinVector/vtreat/raw/master/tools/vtreat.png)

# Install

In [1]:
!pip install vtreat

Collecting vtreat
  Downloading https://files.pythonhosted.org/packages/82/17/bbe94e262011f059e2f0a1fd12d560878d8c0aa931022c3a7a1428e41d00/vtreat-0.4.9-py3-none-any.whl
Installing collected packages: vtreat
Successfully installed vtreat-0.4.9


In [2]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/47/80/8e9c57ec32dfed6ba2922bc5c96462cbf8596ce1a6f5de532ad1e43e53fe/catboost-0.25.1-cp37-none-manylinux1_x86_64.whl (67.3MB)
[K     |████████████████████████████████| 67.3MB 67kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.25.1


# Setup

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import vtreat
from catboost import CatBoostRegressor
from sklearn.linear_model import Lasso

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

# Data

In [4]:
df = pd.read_csv('https://query.data.world/s/5cqzv6uhbwh6bjphv3kea6yo2qbr4t', engine='python', encoding='latin-1')

In [5]:
df.dtypes

avgAnnCount                float64
avgDeathsPerYear             int64
TARGET_deathRate           float64
incidenceRate              float64
medIncome                    int64
popEst2015                   int64
povertyPercent             float64
studyPerCap                float64
binnedInc                   object
MedianAge                  float64
MedianAgeMale              float64
MedianAgeFemale            float64
Geography                   object
AvgHouseholdSize           float64
PercentMarried             float64
PctNoHS18_24               float64
PctHS18_24                 float64
PctSomeCol18_24            float64
PctBachDeg18_24            float64
PctHS25_Over               float64
PctBachDeg25_Over          float64
PctEmployed16_Over         float64
PctUnemployed16_Over       float64
PctPrivateCoverage         float64
PctPrivateCoverageAlone    float64
PctEmpPrivCoverage         float64
PctPublicCoverage          float64
PctPublicCoverageAlone     float64
PctWhite            

## Categoricals

In [6]:
# Break out State and Categories for Income
df['binnedInc'] = df.binnedInc.astype('category')
df['Geography'] = df.Geography.str.split(',', expand=True)[1]

In [7]:
# Separate DF for one hot
df_ohe = df.copy()
states = pd.get_dummies(df_ohe.Geography, prefix='state')
income = pd.get_dummies(df_ohe.binnedInc, prefix='income')
df_ohe = pd.concat([df_ohe, states, income], axis=1)
df_ohe = df_ohe.drop(columns=['Geography', 'binnedInc'])
df_ohe

Unnamed: 0,avgAnnCount,avgDeathsPerYear,TARGET_deathRate,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,MedianAge,MedianAgeMale,MedianAgeFemale,AvgHouseholdSize,PercentMarried,PctNoHS18_24,PctHS18_24,PctSomeCol18_24,PctBachDeg18_24,PctHS25_Over,PctBachDeg25_Over,PctEmployed16_Over,PctUnemployed16_Over,PctPrivateCoverage,PctPrivateCoverageAlone,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate,state_ Alabama,state_ Alaska,state_ Arizona,state_ Arkansas,state_ California,state_ Colorado,state_ Connecticut,state_ Delaware,...,state_ Massachusetts,state_ Michigan,state_ Minnesota,state_ Mississippi,state_ Missouri,state_ Montana,state_ Nebraska,state_ Nevada,state_ New Hampshire,state_ New Jersey,state_ New Mexico,state_ New York,state_ North Carolina,state_ North Dakota,state_ Ohio,state_ Oklahoma,state_ Oregon,state_ Pennsylvania,state_ Rhode Island,state_ South Carolina,state_ South Dakota,state_ Tennessee,state_ Texas,state_ Utah,state_ Vermont,state_ Virginia,state_ Washington,state_ West Virginia,state_ Wisconsin,state_ Wyoming,"income_(34218.1, 37413.8]","income_(37413.8, 40362.7]","income_(40362.7, 42724.4]","income_(42724.4, 45201]","income_(45201, 48021.6]","income_(48021.6, 51046.4]","income_(51046.4, 54545.6]","income_(54545.6, 61494.5]","income_(61494.5, 125635]","income_[22640, 34218.1]"
0,1397.000000,469,164.9,489.800000,61898,260131,11.2,499.748204,39.3,36.9,41.7,2.54,52.5,11.5,39.5,42.1,6.9,23.2,19.6,51.9,8.0,75.1,,41.6,32.9,14.0,81.780529,2.594728,4.821857,1.843479,52.856076,6.118831,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
1,173.000000,70,161.3,411.600000,48127,43269,18.6,23.111234,33.0,32.2,33.7,2.34,44.5,6.1,22.4,64.0,7.5,26.0,22.7,55.9,7.8,70.2,53.8,43.6,31.1,15.3,89.228509,0.969102,2.246233,3.741352,45.372500,4.333096,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,102.000000,50,174.7,349.700000,49348,21026,14.6,47.560164,45.0,44.0,45.8,2.62,54.2,24.0,36.6,,9.5,29.0,16.0,45.9,7.0,63.7,43.5,34.9,42.1,21.1,90.922190,0.739673,0.465898,2.747358,54.444868,3.729488,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,427.000000,202,194.8,430.400000,44243,75882,17.1,342.637253,42.8,42.2,43.4,2.52,52.7,20.2,41.2,36.1,2.5,31.6,9.3,48.3,12.1,58.4,40.3,35.0,45.3,25.0,91.744686,0.782626,1.161359,1.362643,51.021514,4.603841,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
4,57.000000,26,144.4,350.100000,49955,10321,12.5,0.000000,48.3,47.8,48.9,2.34,57.8,14.9,43.0,40.0,2.0,33.4,15.0,48.2,4.8,61.6,43.9,35.1,44.0,22.7,94.104024,0.270192,0.665830,0.492135,54.027460,6.796657,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3042,1962.667684,15,149.6,453.549422,46961,6343,12.4,0.000000,44.2,41.1,48.8,2.08,51.0,20.9,27.9,,8.4,32.2,15.2,51.7,4.3,78.3,54.9,44.6,31.7,13.2,90.280811,3.837754,0.327613,1.700468,51.063830,7.773512,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3043,1962.667684,43,150.1,453.549422,48609,37118,18.8,377.175494,30.4,29.3,31.4,2.90,52.6,26.7,33.9,35.6,3.8,23.1,12.4,70.1,4.6,64.5,53.3,48.6,28.8,17.7,75.706245,2.326771,4.044920,14.130288,52.007937,8.186470,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3044,1962.667684,46,153.9,453.549422,51144,34536,15.0,1968.959926,30.9,30.5,31.2,3.04,54.8,19.7,44.5,33.3,2.5,23.0,12.8,64.8,6.4,62.0,52.6,47.8,26.6,16.8,87.961629,2.313188,1.316472,5.680705,55.153949,7.809192,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3045,1962.667684,52,175.0,453.549422,50745,25609,13.3,0.000000,39.0,36.9,40.5,2.56,58.8,10.9,37.2,,4.1,36.1,14.4,,6.9,75.9,56.3,49.6,29.5,14.0,92.905681,1.176562,0.244632,2.131790,58.484232,7.582938,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


## Missing

In [8]:
df.isna().sum()

avgAnnCount                   0
avgDeathsPerYear              0
TARGET_deathRate              0
incidenceRate                 0
medIncome                     0
popEst2015                    0
povertyPercent                0
studyPerCap                   0
binnedInc                     0
MedianAge                     0
MedianAgeMale                 0
MedianAgeFemale               0
Geography                     0
AvgHouseholdSize              0
PercentMarried                0
PctNoHS18_24                  0
PctHS18_24                    0
PctSomeCol18_24            2285
PctBachDeg18_24               0
PctHS25_Over                  0
PctBachDeg25_Over             0
PctEmployed16_Over          152
PctUnemployed16_Over          0
PctPrivateCoverage            0
PctPrivateCoverageAlone     609
PctEmpPrivCoverage            0
PctPublicCoverage             0
PctPublicCoverageAlone        0
PctWhite                      0
PctBlack                      0
PctAsian                      0
PctOther

# Train - Test

In [9]:
X = df.copy().drop(columns=['TARGET_deathRate'])
y = df['TARGET_deathRate']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=900)

X_ohe = df_ohe.copy().drop(columns=['TARGET_deathRate'])
y_ohe = df_ohe['TARGET_deathRate']
X_train_ohe, X_test_ohe, y_train_ohe, y_test_ohe = train_test_split(X_ohe, y_ohe, random_state=900)

In [10]:
results_df = pd.DataFrame(columns=['MAE', 'MSE', 'RMSE'])

# Baseline

In [11]:
nb = y_train.mean()

In [12]:
pred = [nb] * len(y_test)
mae  = mean_absolute_error(y_test, pred)
mse  = mean_squared_error(y_test, pred)
rmse = mse ** 0.5 
results_df.loc['Baseline'] = [mae, mse, rmse]
print(f'MAE: {mae} RMSE: {rmse}')

MAE: 21.257974235715064 RMSE: 27.75972698276685


# Linear

In [13]:
X_train_lin = X_train_ohe.copy()
X_test_lin  = X_test_ohe.copy()

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train_lin = imp_mean.fit_transform(X_train_lin)
X_test_lin  = imp_mean.transform(X_test_lin)

In [14]:
linear = Lasso(random_state=1417)
linear.fit(X_train_lin, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=1417,
      selection='cyclic', tol=0.0001, warm_start=False)

In [15]:
pred = linear.predict(X_test_lin)
mae  = mean_absolute_error(y_test, pred)
mse  = mean_squared_error(y_test, pred)
rmse = mse ** 0.5 
results_df.loc['Lasso'] = [mae, mse, rmse]
print(f'MAE: {mae} RMSE: {rmse}')

MAE: 14.318445861324768 RMSE: 19.496719260643168


# VTreat
* DataFrame Processor
* Built on Pandas
  * .score_frame_ method shows treatments

## Missing Data
* Handle the myriad of missingness (NA, NaN, Inf)
* Replace missing values with typical value (mean)
* Indicator column to identify replacements
* Indicator columns flagged as ColumnName_isBad

## Categorical Levels
* Comparable to OneHotEncoder(drop=None, handle_unknown='ignore')
* New level gets all zero values

## High Cardinality
* Use threshold of prevalence 1/(Max Levels)
* Combine remaining levels

## Impact Coding
* Compute global mean of dependent variable
* Measure group-wise mean
* Add feature for impact (diff between global mean and group mean)
* Compute using cross-fold to minimize leaks

# Linear + VTreat

In [16]:
X_train_lin = X_train.copy()
X_test_lin  = X_test.copy()

In [17]:
plan = vtreat.NumericOutcomeTreatment()
X_train_vt = plan.fit_transform(X_train_lin, y_train)
X_test_vt  = plan.transform(X_test_lin)

In [18]:
plan.score_frame_

Unnamed: 0,variable,orig_variable,treatment,y_aware,has_range,PearsonR,R2,significance,vcount,default_threshold,recommended
0,PctEmployed16_Over_is_bad,PctEmployed16_Over,missing_indicator,False,True,0.003951,0.000016,8.502805e-01,3.0,0.055556,False
1,PctPrivateCoverageAlone_is_bad,PctPrivateCoverageAlone,missing_indicator,False,True,-0.024562,0.000603,2.405416e-01,3.0,0.055556,False
2,PctSomeCol18_24_is_bad,PctSomeCol18_24,missing_indicator,False,True,0.015221,0.000232,4.670800e-01,3.0,0.055556,False
3,MedianAgeMale,MedianAgeMale,clean_copy,False,True,0.005712,0.000033,7.849420e-01,31.0,0.005376,False
4,popEst2015,popEst2015,clean_copy,False,True,-0.121582,0.014782,5.532642e-09,31.0,0.005376,True
...,...,...,...,...,...,...,...,...,...,...,...
68,Geography_lev_ Alabama,Geography,indicator_code,False,True,0.071197,0.005069,6.597283e-04,33.0,0.005051,True
69,Geography_lev_ Florida,Geography,indicator_code,False,True,-0.023332,0.000544,2.649198e-01,33.0,0.005051,False
70,Geography_lev_ Arkansas,Geography,indicator_code,False,True,0.109804,0.012057,1.426104e-07,33.0,0.005051,True
71,Geography_lev_ Louisiana,Geography,indicator_code,False,True,0.106026,0.011242,3.778901e-07,33.0,0.005051,True


In [19]:
linear_vt = Lasso(random_state=1417)
linear_vt.fit(X_train_vt, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=1417,
      selection='cyclic', tol=0.0001, warm_start=False)

In [20]:
pred = linear_vt.predict(X_test_vt)
mae  = mean_absolute_error(y_test, pred)
mse  = mean_squared_error(y_test, pred)
rmse = mse ** 0.5 
results_df.loc['Lasso + VTreat'] = [mae, mse, rmse]
print(f'MAE: {mae} RMSE: {rmse}')

MAE: 13.655462301801807 RMSE: 18.752024051912205


# Catboost

In [21]:
X_train_cat, X_eval_cat, y_train_cat, y_eval_cat = train_test_split(X_train, y_train)

In [22]:
categoricals = ['binnedInc', 'Geography']
boost = CatBoostRegressor(cat_features=categoricals, random_state=1417)
boost.fit(X_train_cat, y_train_cat, eval_set=(X_eval_cat, y_eval_cat), early_stopping_rounds=5)

Learning rate set to 0.051011
0:	learn: 27.3425497	test: 26.9555804	best: 26.9555804 (0)	total: 60.3ms	remaining: 1m
1:	learn: 26.8549959	test: 26.5282385	best: 26.5282385 (1)	total: 71.9ms	remaining: 35.9s
2:	learn: 26.3624929	test: 26.0601019	best: 26.0601019 (2)	total: 82.9ms	remaining: 27.5s
3:	learn: 25.9429607	test: 25.6494179	best: 25.6494179 (3)	total: 98.5ms	remaining: 24.5s
4:	learn: 25.5330652	test: 25.2678094	best: 25.2678094 (4)	total: 109ms	remaining: 21.7s
5:	learn: 25.1369531	test: 24.9553817	best: 24.9553817 (5)	total: 120ms	remaining: 19.8s
6:	learn: 24.7623548	test: 24.6082903	best: 24.6082903 (6)	total: 130ms	remaining: 18.5s
7:	learn: 24.4104792	test: 24.2962265	best: 24.2962265 (7)	total: 141ms	remaining: 17.5s
8:	learn: 24.0827627	test: 23.9997968	best: 23.9997968 (8)	total: 153ms	remaining: 16.9s
9:	learn: 23.7730932	test: 23.7282493	best: 23.7282493 (9)	total: 164ms	remaining: 16.2s
10:	learn: 23.4430311	test: 23.4467746	best: 23.4467746 (10)	total: 174ms	remai

<catboost.core.CatBoostRegressor at 0x7f58c41b2c10>

In [23]:
pred = boost.predict(X_test)
mae  = mean_absolute_error(y_test, pred)
mse  = mean_squared_error(y_test, pred)
rmse = mse ** 0.5 
results_df.loc['Catboost'] = [mae, mse, rmse]
print(f'MAE: {mae} RMSE: {rmse}')

MAE: 12.979106844629145 RMSE: 18.373122521805445


# Catboost + VTreat

In [24]:
plan = vtreat.NumericOutcomeTreatment()
X_train_vt = plan.fit_transform(X_train, y_train)
X_test_vt  = plan.transform(X_test)
X_train_vt, X_eval_vt, y_train_vt, y_eval_vt = train_test_split(X_train_vt, y_train)

In [25]:
plan.score_frame_

Unnamed: 0,variable,orig_variable,treatment,y_aware,has_range,PearsonR,R2,significance,vcount,default_threshold,recommended
0,PctEmployed16_Over_is_bad,PctEmployed16_Over,missing_indicator,False,True,0.003951,0.000016,8.502805e-01,3.0,0.055556,False
1,PctPrivateCoverageAlone_is_bad,PctPrivateCoverageAlone,missing_indicator,False,True,-0.024562,0.000603,2.405416e-01,3.0,0.055556,False
2,PctSomeCol18_24_is_bad,PctSomeCol18_24,missing_indicator,False,True,0.015221,0.000232,4.670800e-01,3.0,0.055556,False
3,MedianAgeMale,MedianAgeMale,clean_copy,False,True,0.005712,0.000033,7.849420e-01,31.0,0.005376,False
4,popEst2015,popEst2015,clean_copy,False,True,-0.121582,0.014782,5.532642e-09,31.0,0.005376,True
...,...,...,...,...,...,...,...,...,...,...,...
68,Geography_lev_ Alabama,Geography,indicator_code,False,True,0.071197,0.005069,6.597283e-04,33.0,0.005051,True
69,Geography_lev_ Florida,Geography,indicator_code,False,True,-0.023332,0.000544,2.649198e-01,33.0,0.005051,False
70,Geography_lev_ Arkansas,Geography,indicator_code,False,True,0.109804,0.012057,1.426104e-07,33.0,0.005051,True
71,Geography_lev_ Louisiana,Geography,indicator_code,False,True,0.106026,0.011242,3.778901e-07,33.0,0.005051,True


In [26]:
vboost = CatBoostRegressor(random_state=1417)
vboost.fit(X_train_vt, y_train_vt, eval_set=(X_eval_vt, y_eval_vt), early_stopping_rounds=5)

Learning rate set to 0.051011
0:	learn: 27.3695132	test: 26.5186952	best: 26.5186952 (0)	total: 8.65ms	remaining: 8.64s
1:	learn: 26.8626698	test: 26.0577041	best: 26.0577041 (1)	total: 18ms	remaining: 8.98s
2:	learn: 26.4593870	test: 25.6672186	best: 25.6672186 (2)	total: 26.8ms	remaining: 8.92s
3:	learn: 25.9818574	test: 25.2243262	best: 25.2243262 (3)	total: 36ms	remaining: 8.96s
4:	learn: 25.5304543	test: 24.8160017	best: 24.8160017 (4)	total: 45.1ms	remaining: 8.98s
5:	learn: 25.1189020	test: 24.4403627	best: 24.4403627 (5)	total: 54.2ms	remaining: 8.99s
6:	learn: 24.7268012	test: 24.0689234	best: 24.0689234 (6)	total: 63.1ms	remaining: 8.95s
7:	learn: 24.3257313	test: 23.7208634	best: 23.7208634 (7)	total: 72.1ms	remaining: 8.94s
8:	learn: 23.9686279	test: 23.3827158	best: 23.3827158 (8)	total: 80.9ms	remaining: 8.91s
9:	learn: 23.6054573	test: 23.0308881	best: 23.0308881 (9)	total: 90.8ms	remaining: 8.98s
10:	learn: 23.2728963	test: 22.7243505	best: 22.7243505 (10)	total: 99.8ms

<catboost.core.CatBoostRegressor at 0x7f58c3b79490>

In [27]:
pred = vboost.predict(X_test_vt)
mae  = mean_absolute_error(y_test, pred)
mse  = mean_squared_error(y_test, pred)
rmse = mse ** 0.5 
results_df.loc['Catboost + VTreat'] = [mae, mse, rmse]
print(f'MAE: {mae} RMSE: {rmse}')

MAE: 12.800726641238946 RMSE: 17.940377729386928


# Results

In [28]:
results_df

Unnamed: 0,MAE,MSE,RMSE
Baseline,21.257974,770.602442,27.759727
Lasso,14.318446,380.122062,19.496719
Lasso + VTreat,13.655462,351.638406,18.752024
Catboost,12.979107,337.571631,18.373123
Catboost + VTreat,12.800727,321.857153,17.940378


# Improving

In [29]:
improve_boost = CatBoostRegressor(random_state=1417, 
                                  depth=6)
improve_boost.fit(X_train_vt, y_train_vt, eval_set=(X_eval_vt, y_eval_vt), early_stopping_rounds=10)

Learning rate set to 0.051011
0:	learn: 27.3695132	test: 26.5186952	best: 26.5186952 (0)	total: 8.86ms	remaining: 8.85s
1:	learn: 26.8626698	test: 26.0577041	best: 26.0577041 (1)	total: 18.1ms	remaining: 9.01s
2:	learn: 26.4593870	test: 25.6672186	best: 25.6672186 (2)	total: 27.1ms	remaining: 9.01s
3:	learn: 25.9818574	test: 25.2243262	best: 25.2243262 (3)	total: 35.7ms	remaining: 8.9s
4:	learn: 25.5304543	test: 24.8160017	best: 24.8160017 (4)	total: 44.4ms	remaining: 8.83s
5:	learn: 25.1189020	test: 24.4403627	best: 24.4403627 (5)	total: 53.1ms	remaining: 8.79s
6:	learn: 24.7268012	test: 24.0689234	best: 24.0689234 (6)	total: 62.1ms	remaining: 8.81s
7:	learn: 24.3257313	test: 23.7208634	best: 23.7208634 (7)	total: 70.9ms	remaining: 8.79s
8:	learn: 23.9686279	test: 23.3827158	best: 23.3827158 (8)	total: 79.6ms	remaining: 8.77s
9:	learn: 23.6054573	test: 23.0308881	best: 23.0308881 (9)	total: 88.4ms	remaining: 8.75s
10:	learn: 23.2728963	test: 22.7243505	best: 22.7243505 (10)	total: 97.

<catboost.core.CatBoostRegressor at 0x7f58c3d6f890>

In [30]:
pred = improve_boost.predict(X_test_vt)
mae  = mean_absolute_error(y_test, pred)
mse  = mean_squared_error(y_test, pred)
rmse = mse ** 0.5 
results_df.loc['Catboost + VTreat'] = [mae, mse, rmse]
print(f'MAE: {mae} RMSE: {rmse}')

MAE: 12.676799799822662 RMSE: 17.79196082137901


# Notes
* VTreat feature engineering has some good ideas in it
* Easy to build pipelines 
* Feature selection built in