In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import seaborn as sns

In [73]:
# 경고문 처리
import warnings
warnings.filterwarnings('ignore')


In [10]:
raw = pd.read_csv('data/insurance/insurance.csv')
raw.shape

(1338, 7)

In [11]:
raw.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [12]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [13]:
raw['children'] = raw['children'].astype('object')
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   object 
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(1), object(4)
memory usage: 73.3+ KB


----

In [14]:
df = raw.copy()
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [19]:
df['HavingChild'] = raw['children'] == 0
df = df.drop('children', axis=1)
df

Unnamed: 0,age,sex,bmi,smoker,region,charges,HavingChild
0,19,female,27.900,yes,southwest,16884.92400,True
1,18,male,33.770,no,southeast,1725.55230,False
2,28,male,33.000,no,southeast,4449.46200,False
3,33,male,22.705,no,northwest,21984.47061,True
4,32,male,28.880,no,northwest,3866.85520,True
...,...,...,...,...,...,...,...
1333,50,male,30.970,no,northwest,10600.54830,False
1334,18,female,31.920,no,northeast,2205.98080,True
1335,18,female,36.850,no,southeast,1629.83350,True
1336,21,female,25.800,no,southwest,2007.94500,True


In [21]:
df['s/n'] = df['region'].str.contains('south')
df

Unnamed: 0,age,sex,bmi,smoker,region,charges,HavingChild,s/n
0,19,female,27.900,yes,southwest,16884.92400,True,True
1,18,male,33.770,no,southeast,1725.55230,False,True
2,28,male,33.000,no,southeast,4449.46200,False,True
3,33,male,22.705,no,northwest,21984.47061,True,False
4,32,male,28.880,no,northwest,3866.85520,True,False
...,...,...,...,...,...,...,...,...
1333,50,male,30.970,no,northwest,10600.54830,False,False
1334,18,female,31.920,no,northeast,2205.98080,True,False
1335,18,female,36.850,no,southeast,1629.83350,True,True
1336,21,female,25.800,no,southwest,2007.94500,True,True


In [22]:
df['e/w'] = df['region'].str.contains('east')
df

Unnamed: 0,age,sex,bmi,smoker,region,charges,HavingChild,s/n,e/w
0,19,female,27.900,yes,southwest,16884.92400,True,True,False
1,18,male,33.770,no,southeast,1725.55230,False,True,True
2,28,male,33.000,no,southeast,4449.46200,False,True,True
3,33,male,22.705,no,northwest,21984.47061,True,False,False
4,32,male,28.880,no,northwest,3866.85520,True,False,False
...,...,...,...,...,...,...,...,...,...
1333,50,male,30.970,no,northwest,10600.54830,False,False,False
1334,18,female,31.920,no,northeast,2205.98080,True,False,True
1335,18,female,36.850,no,southeast,1629.83350,True,True,True
1336,21,female,25.800,no,southwest,2007.94500,True,True,False


In [23]:
df = df.drop('region', axis=1)
df

Unnamed: 0,age,sex,bmi,smoker,charges,HavingChild,s/n,e/w
0,19,female,27.900,yes,16884.92400,True,True,False
1,18,male,33.770,no,1725.55230,False,True,True
2,28,male,33.000,no,4449.46200,False,True,True
3,33,male,22.705,no,21984.47061,True,False,False
4,32,male,28.880,no,3866.85520,True,False,False
...,...,...,...,...,...,...,...,...
1333,50,male,30.970,no,10600.54830,False,False,False
1334,18,female,31.920,no,2205.98080,True,False,True
1335,18,female,36.850,no,1629.83350,True,True,True
1336,21,female,25.800,no,2007.94500,True,True,False


----

In [158]:
df.columns

Index(['age', 'sex', 'bmi', 'smoker', 'charges', 'HavingChild', 's/n', 'e/w'], dtype='object')

In [159]:
feature_names = [
    'age', 'sex', 'bmi', 'smoker',
    'HavingChild', 's/n', 'e/w'
]
feature_names


['age', 'sex', 'bmi', 'smoker', 'HavingChild', 's/n', 'e/w']

In [160]:
label_name = 'charges'
label_name

'charges'

----

In [161]:
X = df[feature_names]
X.shape

(1338, 7)

In [162]:
y = df[label_name]
y.shape

(1338,)

In [163]:
X.head()

Unnamed: 0,age,sex,bmi,smoker,HavingChild,s/n,e/w
0,19,female,27.9,yes,True,True,False
1,18,male,33.77,no,False,True,True
2,28,male,33.0,no,False,True,True
3,33,male,22.705,no,True,False,False
4,32,male,28.88,no,True,False,False


In [164]:
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

In [165]:
X_ohe = pd.get_dummies(X)
X_ohe.head()

Unnamed: 0,age,bmi,HavingChild,s/n,e/w,sex_female,sex_male,smoker_no,smoker_yes
0,19,27.9,True,True,False,1,0,0,1
1,18,33.77,False,True,True,0,1,1,0
2,28,33.0,False,True,True,0,1,1,0
3,33,22.705,True,False,False,0,1,1,0
4,32,28.88,True,False,False,0,1,1,0


In [166]:
X_ohe.isnull().sum()

age            0
bmi            0
HavingChild    0
s/n            0
e/w            0
sex_female     0
sex_male       0
smoker_no      0
smoker_yes     0
dtype: int64

----

In [96]:
# from sklearn.preprocessing import StandardScaler

# ss = StandardScaler()
# ss

StandardScaler()

In [97]:
# X_scaled = ss.fit_transform(X_ohe)
# X_scaled.shape


(1338, 7)

----

In [167]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_ohe, y, test_size=0.2, random_state=42, shuffle=True
)
# X_train, X_valid, y_train, y_valid = train_test_split(
#     X_train, y_train, shuffle=True, test_size=0.2, random_state=42)


In [168]:
display(X_train.shape)
display(X_test.shape)
display(y_train.shape)
display(y_test.shape)


(1070, 9)

(268, 9)

(1070,)

(268,)

---

In [180]:
import lightgbm as lgb

In [183]:
train_dataset = lgb.Dataset(X_train, y_train)
train_dataset

<lightgbm.basic.Dataset at 0x7fced458a5b0>

In [184]:
test_dataset = lgb.Dataset(X_test, y_test)
test_dataset

<lightgbm.basic.Dataset at 0x7fced458a3a0>

In [185]:
params = {
    'learning_rate': 0.3,
    'objective': 'regression',
    'metric': 'rmse',
    'seed': 42,
    'max_depth': 5,
    'n_estimators': 300,
    'subsample': 0.7
}


In [186]:
lgb_model = lgb.train(
    params, train_dataset, 10000, test_dataset,
    verbose_eval=500, early_stopping_rounds=100
)
lgb_model


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 317
[LightGBM] [Info] Number of data points in the train set: 1070, number of used features: 9
[LightGBM] [Info] Start training from score 13346.089733
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[17]	valid_0's rmse: 4326.17


<lightgbm.basic.Booster at 0x7fced458aa30>

In [188]:
y_predict = lgb_model.predict(X_test)
y_predict[:5]

array([10196.77340677,  6505.15488301, 27026.6256674 ,  8973.05288092,
       34420.21187405])

----

In [189]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_predict)
mse

18715787.619491134

In [190]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_test, y_predict)

2511.855183004684

In [191]:
# rmse
mse ** 0.5

4326.174709774345

In [192]:
from sklearn.metrics import mean_squared_log_error

rmsle = mean_squared_log_error(y_test, y_predict)**0.5
rmsle


0.42496745054250434