In [449]:
## Import necessary libraries
import numpy as np
import pandas as pd

In [450]:
## Read the data
FILENAME = "expenses.csv"
df = pd.read_csv(FILENAME)

In [451]:
## Print data
df.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [452]:
## Remove outliers
from pandas.core.dtypes.common import is_numeric_dtype


def remove_outlier(df_in):
    df_out = pd.DataFrame()
    for col in list(df_in.columns):
        if is_numeric_dtype(df_in[col]):
            q1 = df_in[col].quantile(0.25)
            q3 = df_in[col].quantile(0.75)

            iqr = q3 - q1
            up = q3 + (iqr * 1.5)
            down = q1 - (iqr * 1.5)

            df_out = df_in.loc[(df_in[col] > down) & (df_in[col] < up)]
    return df_out


df = remove_outlier(df)

In [453]:
## One Hot Encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

transformer = make_column_transformer((OneHotEncoder(), ['sex', 'smoker', 'region']), remainder='passthrough')
df = pd.DataFrame(transformer.fit_transform(df))

df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,19.0,27.9,0.0,16884.924
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,18.0,33.77,1.0,1725.5523
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,28.0,33.0,3.0,4449.462
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,33.0,22.705,0.0,21984.47061
4,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,32.0,28.88,0.0,3866.8552


In [454]:
## Split into train and test sets
from sklearn.model_selection import train_test_split

X = df.drop(columns=[len(df.columns) - 1], axis=1)
y = df[len(df.columns) - 1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [455]:
## RandomForestRegressor (random_state=2, max_depth=5) [Original Data]
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

model = RandomForestRegressor(random_state=85, max_depth=11)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))
print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))
print("Accuracy: ", model.score(X_test, y_test))


MAE:  2643.8782936376388
MSE:  26093460.180981763
RMSE:  5108.175817352195
Accuracy:  0.5592121616701766


In [462]:
## RandomForestRegressor (random_state=2, max_depth=5) + MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = RandomForestRegressor(random_state=85, max_depth=11)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))
print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))
print("Accuracy: ", model.score(X_test, y_test))

MAE:  2648.5302330509476
MSE:  26132893.639073182
RMSE:  5112.034197760534
Accuracy:  0.5585460258403776


In [457]:
## Lasso Regression + MinMaxScaler
from sklearn.linear_model import Lasso

model = Lasso(alpha=0.1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))
print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))
print("Accuracy: ", model.score(X_test, y_test))

MAE:  2733.740019567269
MSE:  26473893.954177614
RMSE:  5145.278802375788
Accuracy:  0.5527856249306423


In [458]:
## Ridge Regression + MinMaxScaler
from sklearn.linear_model import Ridge

model = Ridge(alpha=0.1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))
print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))
print("Accuracy: ", model.score(X_test, y_test))

MAE:  2733.765429921369
MSE:  26468414.17038367
RMSE:  5144.746268805068
Accuracy:  0.5528781930314727
