In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.feature_selection import RFE

from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
seed = 42 

In [18]:
# Load the raw data
df = pd.read_csv("Datasets\WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv\WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv")

### Preprocessing

In [19]:
# Split the dataset by Number of Policies
policies = df['Number of Policies'].unique()

subsets = {}

for policy in policies:
    subsets[policy] = df[df['Number of Policies'] == policy]

In [20]:
# From the subsets dict we create individual datasets
df_1 = subsets[1]
df_2 = subsets[2]
df_3 = subsets[3]
df_4 = subsets[4]
df_5 = subsets[5]
df_6 = subsets[6]
df_7 = subsets[7]
df_8 = subsets[8]
df_9 = subsets[9]

In [21]:
# Create a list of all the subsets to work with
subset_list = [df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9]

In [22]:
# Set Customer as index for each subset and drop Effective To Date
def customer_as_index(data):
    for i, d in enumerate(data):
        temp_d = d.copy()
        temp_d.set_index("Customer", inplace=True)
        temp_d.drop(["Effective To Date", "Number of Policies"], axis=1, inplace=True)
        data[i] = temp_d
        
customer_as_index(subset_list)

In [28]:
df_1 = subset_list[0]
df_2 = subset_list[1]
df_3 = subset_list[2]
df_4 = subset_list[3]
df_5 = subset_list[4]
df_6 = subset_list[5]
df_7 = subset_list[6]
df_8 = subset_list[7]
df_9 = subset_list[8]

### Get feature importance when predicting Customer Lifetime Value

In [31]:
df_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3251 entries, BU79786 to Y167826
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   State                          3251 non-null   object 
 1   Customer Lifetime Value        3251 non-null   float64
 2   Response                       3251 non-null   object 
 3   Coverage                       3251 non-null   object 
 4   Education                      3251 non-null   object 
 5   EmploymentStatus               3251 non-null   object 
 6   Gender                         3251 non-null   object 
 7   Income                         3251 non-null   int64  
 8   Location Code                  3251 non-null   object 
 9   Marital Status                 3251 non-null   object 
 10  Monthly Premium Auto           3251 non-null   int64  
 11  Months Since Last Claim        3251 non-null   int64  
 12  Months Since Policy Inception  3251 non-null

In [44]:
# Define which columns are numerical and which are categorical
num_features = ['Income', 'Monthly Premium Auto',
       'Months Since Last Claim', 'Months Since Policy Inception',
       'Number of Open Complaints', 'Number of Policies',
       'Total Claim Amount']
cat_features = ['State', 'Response', 'Coverage', 'Education', 'EmploymentStatus',
       'Gender', 'Location Code', 'Marital Status', 'Policy Type', 'Policy',
       'Renew Offer Type', 'Sales Channel', 'Vehicle Class', 'Vehicle Size']

# Define the ColumnTransformer to apply different preprocessing to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ])

# Define the RFE with Linear Regression as the estimator
estimator = [LinearRegression(), 
             RandomForestRegressor(n_estimators=100, criterion="squared_error", max_depth=20), 
             XGBRegressor()]


def subsets_pipeline(data):
    set_name = []
    model_name = []
    model_rmse = []
    model_mape = []
    for idx, d in enumerate(data):
        X = d.drop(["Customer Lifetime Value"], axis=1)
        y = d["Customer Lifetime Value"]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
        
        for est in estimator:
            selector = RFE(est, n_features_to_select=20, step=1)
            pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('selector', selector),
                           ('model', est)])
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            rmse = np.mean(mean_squared_error(y_test, y_pred))
            mape = mean_absolute_percentage_error(y_test, y_pred)
            set_name.append(f"Dataset {idx + 1}")
            model_name.append(est)
            model_rmse.append(rmse)
            model_mape.append(mape)
    table = pd.DataFrame({
        'Subset': set_name,
        'Model': model_name,
        'RMSE': model_rmse,
        'MAPE': model_mape
    })
    return table 

In [45]:
data_sets = [df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9]
subsets_pipeline(data_sets)

Unnamed: 0,Subset,Model,RMSE,MAPE
0,Dataset 1,LinearRegression(),1590102.0,0.232561
1,Dataset 1,"(DecisionTreeRegressor(max_depth=20, max_featu...",15192.38,0.015673
2,Dataset 1,"XGBRegressor(base_score=None, booster=None, ca...",9769.409,0.013335
3,Dataset 2,LinearRegression(),73494060.0,0.430921
4,Dataset 2,"(DecisionTreeRegressor(max_depth=20, max_featu...",65363690.0,0.393267
5,Dataset 2,"XGBRegressor(base_score=None, booster=None, ca...",74592830.0,0.393598
6,Dataset 3,LinearRegression(),4446170.0,0.243039
7,Dataset 3,"(DecisionTreeRegressor(max_depth=20, max_featu...",76997.16,0.018498
8,Dataset 3,"XGBRegressor(base_score=None, booster=None, ca...",45072.06,0.015375
9,Dataset 4,LinearRegression(),1207825.0,0.080012
