In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.base import is_regressor

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.feature_selection import RFE

from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option('display.max_colwidth', 50)
seed = 42 

In [2]:
# Load the raw data
df = pd.read_csv("Datasets\WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv\WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv")

### Preprocessing

In [3]:
# Split the dataset by Number of Policies
policies = df['Number of Policies'].unique()

subsets = {}

for policy in policies:
    subsets[policy] = df[df['Number of Policies'] == policy]

In [4]:
# From the subsets dict we create individual datasets
df_1 = subsets[1]
df_2 = subsets[2]
df_3 = subsets[3]
df_4 = subsets[4]
df_5 = subsets[5]
df_6 = subsets[6]
df_7 = subsets[7]
df_8 = subsets[8]
df_9 = subsets[9]

In [5]:
# Create a list of all the subsets to work with
subset_list = [df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9]

In [6]:
# Set Customer as index for each subset and drop Effective To Date
def customer_as_index(data):
    for i, d in enumerate(data):
        temp_d = d.copy()
        temp_d.set_index("Customer", inplace=True)
        temp_d.drop(["Effective To Date", "Number of Policies"], axis=1, inplace=True)
        data[i] = temp_d
        
customer_as_index(subset_list)

In [7]:
df_1 = subset_list[0]
df_2 = subset_list[1]
df_3 = subset_list[2]
df_4 = subset_list[3]
df_5 = subset_list[4]
df_6 = subset_list[5]
df_7 = subset_list[6]
df_8 = subset_list[7]
df_9 = subset_list[8]

### Get feature importance when predicting Customer Lifetime Value

In [8]:
df_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3251 entries, BU79786 to Y167826
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   State                          3251 non-null   object 
 1   Customer Lifetime Value        3251 non-null   float64
 2   Response                       3251 non-null   object 
 3   Coverage                       3251 non-null   object 
 4   Education                      3251 non-null   object 
 5   EmploymentStatus               3251 non-null   object 
 6   Gender                         3251 non-null   object 
 7   Income                         3251 non-null   int64  
 8   Location Code                  3251 non-null   object 
 9   Marital Status                 3251 non-null   object 
 10  Monthly Premium Auto           3251 non-null   int64  
 11  Months Since Last Claim        3251 non-null   int64  
 12  Months Since Policy Inception  3251 non-null

In [9]:
# Define which columns are numerical and which are categorical
num_features = ['Income', 'Monthly Premium Auto',
       'Months Since Last Claim', 'Months Since Policy Inception',
       'Number of Open Complaints', 'Total Claim Amount']
cat_features = ['State', 'Response', 'Coverage', 'Education', 'EmploymentStatus',
       'Gender', 'Location Code', 'Marital Status', 'Policy Type', 'Policy',
       'Renew Offer Type', 'Sales Channel', 'Vehicle Class', 'Vehicle Size']

# Define the ColumnTransformer to apply different preprocessing to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ])

# Define the RFE with Linear Regression as the estimator
estimator = [LinearRegression(), 
             RandomForestRegressor(n_estimators=100, criterion="squared_error", max_depth=20), 
             XGBRegressor()]


def subsets_pipeline(data):
    set_name = []
    model_name = []
    model_rmse = []
    model_mape = []
    model_features = []
    for idx, d in enumerate(data):
        X = d.drop(["Customer Lifetime Value"], axis=1)
        y = d["Customer Lifetime Value"]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
        
        for est in estimator:
            selector = RFE(est, n_features_to_select=20, step=1)
            pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('selector', selector),
                           ('model', est)])
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            rmse = np.mean(mean_squared_error(y_test, y_pred))
            mape = mean_absolute_percentage_error(y_test, y_pred)
            set_name.append(f"Dataset {idx + 1}")
            model_name.append(est)
            model_rmse.append(rmse)
            model_mape.append(mape)
            if isinstance(est, LinearRegression):
                model_features.append(pipeline.named_steps['model'].coef_)
            elif hasattr(est, 'feature_importances_') and is_regressor(est):
                model_features.append(pipeline.named_steps['model'].feature_importances_)
            else:
                model_features.append(None)
    table = pd.DataFrame({
        'Subset': set_name,
        'Model': model_name,
        'RMSE': model_rmse,
        'MAPE': model_mape,
        'Features': model_features
    })
    return table 

In [10]:
# data_sets = [df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9]
# subsets_pipeline(data_sets)

Print feature importance table for each model for each subset. 

In [11]:
# Define which columns are numerical and which are categorical
num_features = ['Income', 'Monthly Premium Auto',
       'Months Since Last Claim', 'Months Since Policy Inception',
       'Number of Open Complaints', 'Total Claim Amount']
cat_features = ['State', 'Response', 'Coverage', 'Education', 'EmploymentStatus',
       'Gender', 'Location Code', 'Marital Status', 'Policy Type', 'Policy',
       'Renew Offer Type', 'Sales Channel', 'Vehicle Class', 'Vehicle Size']

# Define the ColumnTransformer to apply different preprocessing to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ])

# Define the RFE with Linear Regression as the estimator
estimator = [LinearRegression(), 
             RandomForestRegressor(n_estimators=100, criterion="squared_error", max_depth=20), 
             XGBRegressor()]


def subsets_pipeline(data):
    set_name = []
    model_name = []
    model_rmse = []
    model_mape = []
    model_features = []
    for idx, d in enumerate(data):
        X = d.drop(["Customer Lifetime Value"], axis=1)
        y = d["Customer Lifetime Value"]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
        
        for est in estimator:
            selector = RFE(est, n_features_to_select=5, step=1)
            pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('selector', selector),
                           ('model', est)])
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            rmse = np.mean(mean_squared_error(y_test, y_pred))
            mape = mean_absolute_percentage_error(y_test, y_pred)
            
            set_name = f"Dataset {idx + 1}"
            model_name = est
            model_rmse = rmse
            model_mape = mape
            print(f"\n\n{set_name} - {model_name} - {rmse} - {mape}")
            
            feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
            feature_importances = pipeline.named_steps['selector'].ranking_
            
            top_features = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=False)[:10]
            
            print(top_features)

In [13]:
data_sets = [df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9]
# subsets_pipeline(data_sets)

We can see that Income and Monthly Premium are quite consistently the most important features. 

Let's create models to predict Income and Monthly Premium, then get feature importance from those models. 

### Feature importance for predicting Income

Prepare the subset data to get feature importance for predicting Income

In [17]:
# Remove CLV from each subset
inc_sets = {}
for idx, d in enumerate(data_sets):
    inc_sets[f"inc_set_{idx + 1}"] = d.drop(["Customer Lifetime Value"], axis=1)

In [19]:
# Create new subsets from each of the dictionary values
inc_subset_list = list(inc_sets.values())

In [20]:
# Define which columns are numerical and which are categorical
num_features = ['Monthly Premium Auto',
       'Months Since Last Claim', 'Months Since Policy Inception',
       'Number of Open Complaints', 'Total Claim Amount']
cat_features = ['State', 'Response', 'Coverage', 'Education', 'EmploymentStatus',
       'Gender', 'Location Code', 'Marital Status', 'Policy Type', 'Policy',
       'Renew Offer Type', 'Sales Channel', 'Vehicle Class', 'Vehicle Size']

# Define the ColumnTransformer to apply different preprocessing to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ])

# Define the RFE with Linear Regression as the estimator
estimator = [LinearRegression(), 
             RandomForestRegressor(n_estimators=100, criterion="squared_error", max_depth=20), 
             XGBRegressor()]


def inc_pipeline(data):
    inc_dict = {}
    set_name = []
    for idx, d in enumerate(data):
        X = d.drop(["Income"], axis=1)
        y = d["Income"]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
        
        for est in estimator:
            selector = RFE(est, n_features_to_select=20, step=1)
            pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('selector', selector),
                           ('model', est)])
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            # Get the evaluation metrics
            rmse = np.mean(mean_squared_error(y_test, y_pred))
            mape = mean_absolute_percentage_error(y_test, y_pred)
            # Get the feature names
            feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
            feature_importances = pipeline.named_steps['selector'].ranking_
            top_features = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=False)[:10]
            
            inc_dict[f"inc_set {idx + 1} - {est} - {rmse} - {mape}"] = top_features
            
    table = pd.DataFrame(inc_dict)
    return table 

In [21]:
inc_pipeline(inc_subset_list)

Unnamed: 0,inc_set 1 - LinearRegression() - 860334351.0983104 - 4.091277440656694e+19,inc_set 1 - RandomForestRegressor(max_depth=20) - 261152648.17097747 - 0.2508736034713302,"inc_set 1 - XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) - 319653425.3421351 - 3.616122175395065e+18",inc_set 2 - LinearRegression() - 876773028.8496732 - 4.102541815848286e+19,inc_set 2 - RandomForestRegressor(max_depth=20) - 266976707.26073793 - 0.23930977957029187,"inc_set 2 - XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) - 294819764.83577335 - 2.9093546733407867e+18",inc_set 3 - LinearRegression() - 734980688.4145299 - 3.026665300341821e+19,inc_set 3 - RandomForestRegressor(max_depth=20) - 309746337.06098664 - 0.2714710675314498,"inc_set 3 - XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) - 386086059.1945321 - 2.9915185313744553e+18",inc_set 4 - LinearRegression() - 242918122.6707317 - 6.692568734059258e+18,inc_set 4 - RandomForestRegressor(max_depth=20) - 189222141.8600512 - 0.19474464678252254,"inc_set 4 - XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) - 237299358.20869097 - 2.7003033763261363e+18",inc_set 5 - LinearRegression() - 428532123.0731707 - 4.256670555118572e+18,inc_set 5 - RandomForestRegressor(max_depth=20) - 400272376.0780241 - 0.34910121712111036,"inc_set 5 - XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) - 446751609.1249721 - 1.1400654545283014e+18",inc_set 6 - LinearRegression() - 277913889.02666664 - 5.764607523034235e+18,inc_set 6 - RandomForestRegressor(max_depth=20) - 234003059.20275202 - 0.21272018977420704,"inc_set 6 - XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) - 261812504.4152377 - 3.7865382905695575e+18",inc_set 7 - LinearRegression() - 286105688.5862069 - 5.300788526928032e+18,inc_set 7 - RandomForestRegressor(max_depth=20) - 277995884.3733976 - 0.2761168707998907,"inc_set 7 - XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) - 292391028.87540346 - 2.9469696488596644e+18",inc_set 8 - LinearRegression() - 858382984.8051949 - 2.551400316688399e+19,inc_set 8 - RandomForestRegressor(max_depth=20) - 313348352.86119354 - 0.3073692587166291,"inc_set 8 - XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) - 389968927.7844454 - 2.3883446080676347e+18",inc_set 9 - LinearRegression() - 642903378.3571428 - 2.8905389151214522e+19,inc_set 9 - RandomForestRegressor(max_depth=20) - 260461539.79780832 - 0.254388843236782,"inc_set 9 - XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) - 296023694.35768574 - 2.01480146752178e+18"
0,"(cat__Policy Type_Corporate Auto, 1)","(num__Monthly Premium Auto, 1)","(num__Months Since Policy Inception, 1)","(cat__Marital Status_Divorced, 1)","(num__Monthly Premium Auto, 1)","(num__Months Since Policy Inception, 1)","(cat__Location Code_Rural, 1)","(num__Monthly Premium Auto, 1)","(num__Total Claim Amount, 1)","(cat__Response_No, 1)","(num__Monthly Premium Auto, 1)","(num__Monthly Premium Auto, 1)","(cat__Coverage_Basic, 1)","(num__Monthly Premium Auto, 1)","(num__Months Since Policy Inception, 1)","(cat__Coverage_Basic, 1)","(num__Monthly Premium Auto, 1)","(num__Months Since Policy Inception, 1)","(cat__State_Arizona, 1)","(num__Monthly Premium Auto, 1)","(num__Months Since Policy Inception, 1)","(cat__Response_No, 1)","(num__Monthly Premium Auto, 1)","(num__Months Since Policy Inception, 1)","(cat__State_Arizona, 1)","(num__Monthly Premium Auto, 1)","(num__Months Since Last Claim, 1)"
1,"(cat__Policy Type_Personal Auto, 1)","(num__Months Since Last Claim, 1)","(num__Total Claim Amount, 1)","(cat__Marital Status_Married, 1)","(num__Months Since Last Claim, 1)","(num__Total Claim Amount, 1)","(cat__Location Code_Suburban, 1)","(num__Months Since Last Claim, 1)","(cat__State_Arizona, 1)","(cat__Response_Yes, 1)","(num__Months Since Last Claim, 1)","(num__Months Since Last Claim, 1)","(cat__Coverage_Extended, 1)","(num__Months Since Last Claim, 1)","(num__Number of Open Complaints, 1)","(cat__Coverage_Extended, 1)","(num__Months Since Last Claim, 1)","(num__Total Claim Amount, 1)","(cat__State_California, 1)","(num__Months Since Last Claim, 1)","(num__Number of Open Complaints, 1)","(cat__Response_Yes, 1)","(num__Months Since Last Claim, 1)","(num__Number of Open Complaints, 1)","(cat__State_California, 1)","(num__Months Since Last Claim, 1)","(num__Months Since Policy Inception, 1)"
2,"(cat__Policy_Special L1, 1)","(num__Months Since Policy Inception, 1)","(cat__Response_No, 1)","(cat__Marital Status_Single, 1)","(num__Months Since Policy Inception, 1)","(cat__State_Washington, 1)","(cat__Location Code_Urban, 1)","(num__Months Since Policy Inception, 1)","(cat__Response_No, 1)","(cat__Coverage_Basic, 1)","(num__Months Since Policy Inception, 1)","(num__Total Claim Amount, 1)","(cat__Coverage_Premium, 1)","(num__Months Since Policy Inception, 1)","(num__Total Claim Amount, 1)","(cat__Coverage_Premium, 1)","(num__Months Since Policy Inception, 1)","(cat__State_Arizona, 1)","(cat__State_Nevada, 1)","(num__Months Since Policy Inception, 1)","(num__Total Claim Amount, 1)","(cat__Education_Bachelor, 1)","(num__Months Since Policy Inception, 1)","(num__Total Claim Amount, 1)","(cat__State_Nevada, 1)","(num__Months Since Policy Inception, 1)","(num__Number of Open Complaints, 1)"
3,"(cat__Policy_Special L2, 1)","(num__Number of Open Complaints, 1)","(cat__Coverage_Basic, 1)","(cat__Policy Type_Corporate Auto, 1)","(num__Number of Open Complaints, 1)","(cat__Response_No, 1)","(cat__Marital Status_Divorced, 1)","(num__Number of Open Complaints, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Coverage_Extended, 1)","(num__Number of Open Complaints, 1)","(cat__State_California, 1)","(cat__EmploymentStatus_Disabled, 1)","(num__Number of Open Complaints, 1)","(cat__State_Arizona, 1)","(cat__EmploymentStatus_Disabled, 1)","(num__Number of Open Complaints, 1)","(cat__State_California, 1)","(cat__State_Oregon, 1)","(num__Number of Open Complaints, 1)","(cat__Coverage_Extended, 1)","(cat__Education_College, 1)","(num__Number of Open Complaints, 1)","(cat__State_Arizona, 1)","(cat__State_Oregon, 1)","(num__Number of Open Complaints, 1)","(num__Total Claim Amount, 1)"
4,"(cat__Policy_Special L3, 1)","(num__Total Claim Amount, 1)","(cat__Coverage_Extended, 1)","(cat__Policy Type_Personal Auto, 1)","(num__Total Claim Amount, 1)","(cat__Coverage_Basic, 1)","(cat__Marital Status_Married, 1)","(num__Total Claim Amount, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Coverage_Premium, 1)","(num__Total Claim Amount, 1)","(cat__Response_No, 1)","(cat__EmploymentStatus_Employed, 1)","(num__Total Claim Amount, 1)","(cat__State_Nevada, 1)","(cat__EmploymentStatus_Employed, 1)","(num__Total Claim Amount, 1)","(cat__Response_No, 1)","(cat__State_Washington, 1)","(num__Total Claim Amount, 1)","(cat__Education_Bachelor, 1)","(cat__Education_High School or Below, 1)","(num__Total Claim Amount, 1)","(cat__State_California, 1)","(cat__State_Washington, 1)","(num__Total Claim Amount, 1)","(cat__State_Arizona, 1)"
5,"(cat__Renew Offer Type_Offer1, 1)","(cat__State_Oregon, 1)","(cat__Coverage_Premium, 1)","(cat__Policy Type_Special Auto, 1)","(cat__State_California, 1)","(cat__Coverage_Extended, 1)","(cat__Marital Status_Single, 1)","(cat__State_California, 1)","(cat__Gender_F, 1)","(cat__Education_Bachelor, 1)","(cat__State_California, 1)","(cat__Coverage_Premium, 1)","(cat__EmploymentStatus_Medical Leave, 1)","(cat__State_California, 1)","(cat__Response_No, 1)","(cat__EmploymentStatus_Medical Leave, 1)","(cat__Response_No, 1)","(cat__Education_High School or Below, 1)","(cat__Coverage_Basic, 1)","(cat__State_California, 1)","(cat__Education_Master, 1)","(cat__Location Code_Rural, 1)","(cat__State_Arizona, 1)","(cat__Response_No, 1)","(cat__Response_No, 1)","(cat__State_Nevada, 1)","(cat__State_Nevada, 1)"
6,"(cat__Renew Offer Type_Offer2, 1)","(cat__Response_Yes, 1)","(cat__Education_Doctor, 1)","(cat__Policy_Special L1, 1)","(cat__Response_No, 1)","(cat__Education_Master, 1)","(cat__Policy Type_Personal Auto, 1)","(cat__State_Oregon, 1)","(cat__Location Code_Suburban, 1)","(cat__Education_College, 1)","(cat__Education_Bachelor, 1)","(cat__Education_College, 1)","(cat__EmploymentStatus_Retired, 1)","(cat__Coverage_Extended, 1)","(cat__Coverage_Extended, 1)","(cat__EmploymentStatus_Retired, 1)","(cat__Response_Yes, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Coverage_Extended, 1)","(cat__Education_High School or Below, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Location Code_Suburban, 1)","(cat__State_California, 1)","(cat__Education_Bachelor, 1)","(cat__Response_Yes, 1)","(cat__Education_Bachelor, 1)","(cat__Response_No, 1)"
7,"(cat__Renew Offer Type_Offer3, 1)","(cat__Education_Bachelor, 1)","(cat__Education_High School or Below, 1)","(cat__Policy_Special L2, 1)","(cat__Education_Bachelor, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Policy_Corporate L2, 1)","(cat__Education_Bachelor, 1)","(cat__Marital Status_Divorced, 1)","(cat__Education_Doctor, 1)","(cat__Education_High School or Below, 1)","(cat__Education_High School or Below, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Education_College, 1)","(cat__Education_High School or Below, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Education_College, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Coverage_Premium, 1)","(cat__Education_Master, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Location Code_Urban, 1)","(cat__Response_Yes, 1)","(cat__Education_High School or Below, 1)","(cat__Education_Bachelor, 1)","(cat__Education_College, 1)","(cat__Education_High School or Below, 1)"
8,"(cat__Renew Offer Type_Offer4, 1)","(cat__Education_High School or Below, 1)","(cat__Education_Master, 1)","(cat__Policy_Special L3, 1)","(cat__Education_High School or Below, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Policy_Corporate L3, 1)","(cat__Education_College, 1)","(cat__Marital Status_Married, 1)","(cat__Education_High School or Below, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Location Code_Rural, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Location Code_Rural, 1)","(cat__Education_High School or Below, 1)","(cat__Gender_F, 1)","(cat__EmploymentStatus_Disabled, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Location Code_Suburban, 1)","(cat__Marital Status_Divorced, 1)","(cat__Education_Bachelor, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Education_College, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__EmploymentStatus_Employed, 1)"
9,"(cat__Sales Channel_Call Center, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Renew Offer Type_Offer1, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Gender_F, 1)","(cat__Renew Offer Type_Offer1, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Policy_Corporate L3, 1)","(cat__Education_Master, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Location Code_Suburban, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Location Code_Suburban, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Location Code_Suburban, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Marital Status_Divorced, 1)","(cat__Marital Status_Married, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Education_Doctor, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__EmploymentStatus_Unemployed, 1)"


We can see that Monthly Premium Auto is the most important feature in predicting Income. 

### Feature importance in predicting Monthly Premium

In [22]:
# Remove CLV and Income from each subset
mpa_sets = {}
for idx, d in enumerate(data_sets):
    mpa_sets[f"inc_set_{idx + 1}"] = d.drop(["Customer Lifetime Value", "Income"], axis=1)

In [23]:
# Create new subsets from each of the dictionary values
mpa_subset_list = list(mpa_sets.values())

In [24]:
# Define which columns are numerical and which are categorical
num_features = ['Months Since Last Claim', 'Months Since Policy Inception',
       'Number of Open Complaints', 'Total Claim Amount']
cat_features = ['State', 'Response', 'Coverage', 'Education', 'EmploymentStatus',
       'Gender', 'Location Code', 'Marital Status', 'Policy Type', 'Policy',
       'Renew Offer Type', 'Sales Channel', 'Vehicle Class', 'Vehicle Size']

# Define the ColumnTransformer to apply different preprocessing to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ])

# Define the RFE with Linear Regression as the estimator
estimator = [LinearRegression(), 
             RandomForestRegressor(n_estimators=100, criterion="squared_error", max_depth=20), 
             XGBRegressor()]


def inc_pipeline(data):
    mpa_dict = {}
    set_name = []
    for idx, d in enumerate(data):
        X = d.drop(["Monthly Premium Auto"], axis=1)
        y = d["Monthly Premium Auto"]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
        
        for est in estimator:
            selector = RFE(est, n_features_to_select=20, step=1)
            pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('selector', selector),
                           ('model', est)])
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            # Get the evaluation metrics
            rmse = np.mean(mean_squared_error(y_test, y_pred))
            mape = mean_absolute_percentage_error(y_test, y_pred)
            # Get the feature names
            feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
            feature_importances = pipeline.named_steps['selector'].ranking_
            top_features = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=False)[:10]
            
            mpa_dict[f"inc_set {idx + 1} - {est} - {rmse} - {mape}"] = top_features
            
    table = pd.DataFrame(mpa_dict)
    return table 

In [25]:
inc_pipeline(mpa_subset_list)

Unnamed: 0,inc_set 1 - LinearRegression() - 49.44839740729651 - 0.05325745219950998,inc_set 1 - RandomForestRegressor(max_depth=20) - 42.529707849686325 - 0.03462791525060119,"inc_set 1 - XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) - 71.47602603998926 - 0.038780969920706225",inc_set 2 - LinearRegression() - 51.201714749421846 - 0.054035338319401445,inc_set 2 - RandomForestRegressor(max_depth=20) - 34.31288465238133 - 0.0358526501050307,"inc_set 2 - XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) - 49.00394852119296 - 0.03923036223028722",inc_set 3 - LinearRegression() - 39.63566011449668 - 0.052345399899652846,inc_set 3 - RandomForestRegressor(max_depth=20) - 46.75532018936316 - 0.03913938361246579,"inc_set 3 - XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) - 65.33394031173894 - 0.04388778974404338",inc_set 4 - LinearRegression() - 37.02831368814125 - 0.05056662431778528,inc_set 4 - RandomForestRegressor(max_depth=20) - 210.2976768292683 - 0.05542460500560493,"inc_set 4 - XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) - 336.22130912783905 - 0.07112982602557032",inc_set 5 - LinearRegression() - 1100.0213414634147 - 0.3137825403346142,inc_set 5 - RandomForestRegressor(max_depth=20) - 275.4199090243903 - 0.06418984718715603,"inc_set 5 - XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) - 219.75317496596074 - 0.05981084907998181",inc_set 6 - LinearRegression() - 36.27177946378639 - 0.05130256310204211,inc_set 6 - RandomForestRegressor(max_depth=20) - 237.9935013333333 - 0.07220920932224988,"inc_set 6 - XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) - 419.7501588894811 - 0.07961948708803188",inc_set 7 - LinearRegression() - 44.782662849358076 - 0.05669469036703782,inc_set 7 - RandomForestRegressor(max_depth=20) - 317.61517586206895 - 0.06902750544118927,"inc_set 7 - XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) - 198.55460642134716 - 0.0494558128791969",inc_set 8 - LinearRegression() - 1102.9943181818182 - 0.2589231285755949,inc_set 8 - RandomForestRegressor(max_depth=20) - 332.5263439094518 - 0.0652142663454325,"inc_set 8 - XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) - 273.2490993708944 - 0.06470533237697107",inc_set 9 - LinearRegression() - 51.72903124209498 - 0.05446509880355702,inc_set 9 - RandomForestRegressor(max_depth=20) - 218.66592619047614 - 0.05245096766154998,"inc_set 9 - XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) - 181.63515715734943 - 0.06385945852987361"
0,"(num__Total Claim Amount, 1)","(num__Months Since Last Claim, 1)","(num__Months Since Last Claim, 1)","(num__Total Claim Amount, 1)","(num__Months Since Last Claim, 1)","(num__Total Claim Amount, 1)","(num__Total Claim Amount, 1)","(num__Months Since Last Claim, 1)","(num__Total Claim Amount, 1)","(num__Total Claim Amount, 1)","(num__Months Since Last Claim, 1)","(num__Total Claim Amount, 1)","(cat__Response_No, 1)","(num__Months Since Last Claim, 1)","(num__Total Claim Amount, 1)","(num__Total Claim Amount, 1)","(num__Months Since Last Claim, 1)","(num__Months Since Policy Inception, 1)","(num__Total Claim Amount, 1)","(num__Months Since Last Claim, 1)","(num__Months Since Policy Inception, 1)","(cat__State_Arizona, 1)","(num__Months Since Last Claim, 1)","(num__Months Since Last Claim, 1)","(num__Total Claim Amount, 1)","(num__Months Since Last Claim, 1)","(num__Number of Open Complaints, 1)"
1,"(cat__Coverage_Basic, 1)","(num__Months Since Policy Inception, 1)","(num__Months Since Policy Inception, 1)","(cat__State_Oregon, 1)","(num__Months Since Policy Inception, 1)","(cat__State_Arizona, 1)","(cat__Response_No, 1)","(num__Months Since Policy Inception, 1)","(cat__State_Arizona, 1)","(cat__State_Washington, 1)","(num__Months Since Policy Inception, 1)","(cat__State_Washington, 1)","(cat__Response_Yes, 1)","(num__Months Since Policy Inception, 1)","(cat__Response_No, 1)","(cat__Response_No, 1)","(num__Months Since Policy Inception, 1)","(num__Total Claim Amount, 1)","(cat__State_Nevada, 1)","(num__Months Since Policy Inception, 1)","(num__Number of Open Complaints, 1)","(cat__State_California, 1)","(num__Months Since Policy Inception, 1)","(num__Total Claim Amount, 1)","(cat__Coverage_Basic, 1)","(num__Months Since Policy Inception, 1)","(num__Total Claim Amount, 1)"
2,"(cat__Coverage_Extended, 1)","(num__Number of Open Complaints, 1)","(num__Total Claim Amount, 1)","(cat__Coverage_Basic, 1)","(num__Number of Open Complaints, 1)","(cat__Response_No, 1)","(cat__Coverage_Basic, 1)","(num__Total Claim Amount, 1)","(cat__Response_No, 1)","(cat__Coverage_Basic, 1)","(num__Total Claim Amount, 1)","(cat__Response_No, 1)","(cat__Education_Bachelor, 1)","(num__Total Claim Amount, 1)","(cat__Coverage_Basic, 1)","(cat__Coverage_Basic, 1)","(num__Total Claim Amount, 1)","(cat__State_California, 1)","(cat__Coverage_Basic, 1)","(num__Number of Open Complaints, 1)","(num__Total Claim Amount, 1)","(cat__State_Nevada, 1)","(num__Total Claim Amount, 1)","(cat__Response_No, 1)","(cat__Coverage_Extended, 1)","(num__Number of Open Complaints, 1)","(cat__State_California, 1)"
3,"(cat__Coverage_Premium, 1)","(num__Total Claim Amount, 1)","(cat__Response_No, 1)","(cat__Coverage_Extended, 1)","(num__Total Claim Amount, 1)","(cat__Coverage_Basic, 1)","(cat__Coverage_Extended, 1)","(cat__Coverage_Basic, 1)","(cat__Coverage_Basic, 1)","(cat__Coverage_Premium, 1)","(cat__Coverage_Basic, 1)","(cat__Coverage_Basic, 1)","(cat__Education_College, 1)","(cat__Coverage_Basic, 1)","(cat__Coverage_Extended, 1)","(cat__Coverage_Premium, 1)","(cat__State_California, 1)","(cat__Coverage_Basic, 1)","(cat__Coverage_Extended, 1)","(num__Total Claim Amount, 1)","(cat__Response_No, 1)","(cat__State_Oregon, 1)","(cat__State_Arizona, 1)","(cat__Coverage_Basic, 1)","(cat__Coverage_Premium, 1)","(num__Total Claim Amount, 1)","(cat__State_Nevada, 1)"
4,"(cat__Education_Doctor, 1)","(cat__Response_No, 1)","(cat__Coverage_Basic, 1)","(cat__Coverage_Premium, 1)","(cat__Coverage_Basic, 1)","(cat__Coverage_Extended, 1)","(cat__Coverage_Premium, 1)","(cat__Coverage_Extended, 1)","(cat__Coverage_Extended, 1)","(cat__EmploymentStatus_Retired, 1)","(cat__Coverage_Extended, 1)","(cat__Coverage_Extended, 1)","(cat__Education_Doctor, 1)","(cat__Coverage_Extended, 1)","(cat__Coverage_Premium, 1)","(cat__Education_Doctor, 1)","(cat__Coverage_Basic, 1)","(cat__Coverage_Extended, 1)","(cat__Coverage_Premium, 1)","(cat__State_Arizona, 1)","(cat__Coverage_Basic, 1)","(cat__State_Washington, 1)","(cat__Coverage_Basic, 1)","(cat__Coverage_Extended, 1)","(cat__Education_Doctor, 1)","(cat__State_Oregon, 1)","(cat__Coverage_Basic, 1)"
5,"(cat__EmploymentStatus_Disabled, 1)","(cat__Coverage_Basic, 1)","(cat__Coverage_Extended, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Coverage_Extended, 1)","(cat__Coverage_Premium, 1)","(cat__Education_Doctor, 1)","(cat__Coverage_Premium, 1)","(cat__Coverage_Premium, 1)","(cat__Location Code_Rural, 1)","(cat__Coverage_Premium, 1)","(cat__Coverage_Premium, 1)","(cat__Education_High School or Below, 1)","(cat__Coverage_Premium, 1)","(cat__Education_High School or Below, 1)","(cat__Education_Master, 1)","(cat__Coverage_Extended, 1)","(cat__Coverage_Premium, 1)","(cat__Education_Master, 1)","(cat__Coverage_Basic, 1)","(cat__Coverage_Extended, 1)","(cat__Education_Bachelor, 1)","(cat__Coverage_Extended, 1)","(cat__Coverage_Premium, 1)","(cat__Location Code_Rural, 1)","(cat__State_Washington, 1)","(cat__Coverage_Extended, 1)"
6,"(cat__EmploymentStatus_Unemployed, 1)","(cat__Coverage_Extended, 1)","(cat__Coverage_Premium, 1)","(cat__Location Code_Rural, 1)","(cat__Coverage_Premium, 1)","(cat__EmploymentStatus_Retired, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Education_Bachelor, 1)","(cat__Education_Bachelor, 1)","(cat__Location Code_Suburban, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Education_College, 1)","(cat__Education_Master, 1)","(cat__Education_High School or Below, 1)","(cat__Gender_F, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Coverage_Premium, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__EmploymentStatus_Disabled, 1)","(cat__Coverage_Extended, 1)","(cat__Coverage_Premium, 1)","(cat__Education_Doctor, 1)","(cat__Coverage_Premium, 1)","(cat__Education_Doctor, 1)","(cat__Policy Type_Corporate Auto, 1)","(cat__Coverage_Basic, 1)","(cat__Coverage_Premium, 1)"
7,"(cat__Location Code_Rural, 1)","(cat__Coverage_Premium, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Location Code_Suburban, 1)","(cat__Education_College, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Location Code_Rural, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Marital Status_Divorced, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__EmploymentStatus_Disabled, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Location Code_Suburban, 1)","(cat__EmploymentStatus_Medical Leave, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Coverage_Premium, 1)","(cat__Education_Doctor, 1)","(cat__Education_High School or Below, 1)","(cat__Education_High School or Below, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Policy Type_Special Auto, 1)","(cat__Coverage_Extended, 1)","(cat__Education_Doctor, 1)"
8,"(cat__Location Code_Suburban, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Marital Status_Single, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Location Code_Rural, 1)","(cat__Location Code_Suburban, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Location Code_Suburban, 1)","(cat__Policy_Special L1, 1)","(cat__Location Code_Suburban, 1)","(cat__EmploymentStatus_Medical Leave, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Gender_M, 1)","(cat__Location Code_Urban, 1)","(cat__EmploymentStatus_Retired, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Gender_F, 1)","(cat__Location Code_Rural, 1)","(cat__Education_College, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Education_Master, 1)","(cat__EmploymentStatus_Employed, 1)","(cat__Location Code_Rural, 1)","(cat__Policy_Corporate L1, 1)","(cat__Coverage_Premium, 1)","(cat__Education_Master, 1)"
9,"(cat__Policy_Corporate L1, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Location Code_Rural, 1)","(cat__Policy Type_Special Auto, 1)","(cat__Location Code_Suburban, 1)","(cat__Location Code_Suburban, 1)","(cat__Policy_Corporate L1, 1)","(cat__Location Code_Suburban, 1)","(cat__Renew Offer Type_Offer1, 1)","(cat__Policy_Special L2, 1)","(cat__Marital Status_Single, 1)","(cat__Location Code_Suburban, 1)","(cat__EmploymentStatus_Medical Leave, 1)","(cat__Location Code_Suburban, 1)","(cat__Marital Status_Married, 1)","(cat__Location Code_Suburban, 1)","(cat__Gender_F, 1)","(cat__Location Code_Suburban, 1)","(cat__Policy Type_Corporate Auto, 1)","(cat__Education_Master, 1)","(cat__Location Code_Rural, 1)","(cat__Gender_F, 1)","(cat__EmploymentStatus_Unemployed, 1)","(cat__Location Code_Suburban, 1)","(cat__Policy_Personal L1, 1)","(cat__Education_Bachelor, 1)","(cat__Location Code_Suburban, 1)"


### Use a voting mechanism to determine most important targetable feature values to get highest CLV

- Add weights (increase in frequency) to features from subsets with higher average CLV
- Get the most frequent values for targetable features that are important