# Building the Best Regression Model for Customer Spend

In [14]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split  # for train and test split
from sklearn.linear_model import LinearRegression  # import linear regression from sklearn library
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error  # method to calculate RMSE from the linear regression


In [3]:
# Load the customer spend dataset, preview its first rows, and report its size.
df = pd.read_csv("spend_age_income_ed.csv")
display(df.head())  # head() defaults to the first 5 rows
row_count = len(df)
print('length of the data frame is', row_count)

Unnamed: 0,spend,age,income,years_of_education
0,3304.0,36.0,45125.0,12
1,3709.0,43.0,41695.0,10
2,3305.0,47.0,39253.0,17
3,2170.0,33.0,32384.0,13
4,2113.0,30.0,33182.0,10


length of the data frame is 1000


In [5]:
# Predictor columns and the target column the models will be fitted on.
feature_columns = ['age', 'income', 'years_of_education']
x_var = df[feature_columns]
y_var = df['spend']

In [11]:
# Split predictors and target into train/test partitions; random_state is
# fixed so the same rows land in each partition on every run.
x_train, x_test, y_train, y_test = train_test_split(x_var, y_var, random_state=10)

# Report the shape of each partition as a quick sanity check.
for label, part in [
    ('x_train', x_train),
    ('x_test', x_test),
    ('y_train', y_train),
    ('y_test', y_test),
]:
    print(f'length of {label}:', part.shape)

length of x_train: (750, 3)
length of x_test: (250, 3)
length of y_train: (750,)
length of y_test: (250,)


In [15]:
# Baseline: linear regression fitted on the training partition.
linreg_model = LinearRegression().fit(x_train, y_train)

# Single regression tree models with max_depth 2 and 5.
# random_state is fixed so that ties between equally good splits are broken
# the same way on every run, matching the seeded random forests below.
treemodel_dep2 = DecisionTreeRegressor(max_depth=2, random_state=10).fit(x_train, y_train)
treemodel_dep5 = DecisionTreeRegressor(max_depth=5, random_state=10).fit(x_train, y_train)

# Random forest models with max_depth 2 and 5.
randforest_dep2 = RandomForestRegressor(max_depth=2, random_state=10).fit(x_train, y_train)
randforest_dep5 = RandomForestRegressor(max_depth=5, random_state=10).fit(x_train, y_train)

    Calculate and print out the RMSE on the test data for all five models.

In [41]:
# Compute the RMSE on the test data for each fitted model
# and collect the results in a DataFrame.

def rmse_calc(models, features=None, target=None):
    """Compute the RMSE of each fitted model and tabulate the results.

    Parameters
    ----------
    models : iterable
        Fitted estimators exposing a ``predict`` method.
    features : optional
        Predictors to evaluate on. Defaults to the notebook-level ``x_test``.
    target : optional
        True values matching ``features``. Defaults to the notebook-level
        ``y_test``.

    Returns
    -------
    pandas.DataFrame
        One row per model, in input order, with columns ``model``
        (``str(model)``) and ``rmse_val``. The columns are present even
        when ``models`` is empty.
    """
    # Fall back to the notebook globals so existing `rmse_calc(models)` calls
    # keep working unchanged.
    features = x_test if features is None else features
    target = y_test if target is None else target

    # Collect plain records and build the frame once instead of growing a
    # DataFrame row by row with .loc.
    records = []
    for model in models:
        print(model)
        predictions = model.predict(features)
        # mean_squared_error is documented as (y_true, y_pred); the square
        # root of the MSE is the RMSE.
        rmse_val = mean_squared_error(target, predictions) ** 0.5
        records.append({'model': str(model), 'rmse_val': rmse_val})

    return pd.DataFrame(records, columns=['model', 'rmse_val'])

In [42]:
# Evaluate every fitted model with rmse_calc; the returned table is the
# cell's last expression, so the notebook displays it.
models = [
    linreg_model,
    treemodel_dep2,
    treemodel_dep5,
    randforest_dep2,
    randforest_dep5,
]
rmse_calc(models)

LinearRegression()
DecisionTreeRegressor(max_depth=2)
DecisionTreeRegressor(max_depth=5)
RandomForestRegressor(max_depth=2, random_state=10)
RandomForestRegressor(max_depth=5, random_state=10)


Unnamed: 0,model,rmse_val
0,LinearRegression(),348.197715
1,DecisionTreeRegressor(max_depth=2),268.510693
2,DecisionTreeRegressor(max_depth=5),125.532571
3,"RandomForestRegressor(max_depth=2, random_stat...",266.45845
4,"RandomForestRegressor(max_depth=5, random_stat...",115.201406
