In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
obj_df = df.select_dtypes(include=['object']).copy()
obj_df.head()

Unnamed: 0,sex,smoker,region
0,female,yes,southwest
1,male,no,southeast
2,male,no,southeast
3,male,no,northwest
4,male,no,northwest


In [5]:
obj_df['smoker'].value_counts()

no     1064
yes     274
Name: smoker, dtype: int64

In [6]:
obj_df['region'].value_counts()

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64

In [7]:
obj_df['sex'].value_counts()

male      676
female    662
Name: sex, dtype: int64

In [8]:
obj_df["sex"] = obj_df["sex"].astype('category')
obj_df["region"] = obj_df["region"].astype('category')
obj_df["smoker"] = obj_df["smoker"].astype('category')
obj_df.dtypes

sex       category
smoker    category
region    category
dtype: object

In [9]:
obj_df["sex_cat"] = obj_df["sex"].cat.codes
obj_df["region_cat"] = obj_df["region"].cat.codes
obj_df["smoker_cat"] = obj_df["smoker"].cat.codes
obj_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   sex         1338 non-null   category
 1   smoker      1338 non-null   category
 2   region      1338 non-null   category
 3   sex_cat     1338 non-null   int8    
 4   region_cat  1338 non-null   int8    
 5   smoker_cat  1338 non-null   int8    
dtypes: category(3), int8(3)
memory usage: 8.4 KB


In [10]:
df['sex'] = obj_df['sex_cat']
df['smoker'] = obj_df['smoker_cat']
df['region'] = obj_df['region_cat']
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,3,16884.92400
1,18,1,33.770,1,0,2,1725.55230
2,28,1,33.000,3,0,2,4449.46200
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.880,0,0,1,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,10600.54830
1334,18,0,31.920,0,0,0,2205.98080
1335,18,0,36.850,0,0,2,1629.83350
1336,21,0,25.800,0,0,3,2007.94500


## 前處理結束

In [12]:
X = df.drop(['charges'], axis =1).values
y = df['charges'].values

print(X.shape)
print(y.shape)

(1338, 6)
(1338,)


In [13]:
# 測試&訓練集使用50%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=12)

In [14]:
# linear regression

from sklearn.linear_model import LinearRegression
model1 = LinearRegression()
model1.fit(X_train, y_train)
y_pred1 = model1.predict(X_test)
score1 = model1.score(X_test, y_test)
score1

0.736659746260978

In [15]:
# SVM

from sklearn.svm import SVR
model2 = SVR(kernel='poly', degree=2, gamma='auto')
model2.fit(X_train, y_train)
y_pred2 = model2.predict(X_test)
score2 = model2.score(X_test, y_test)
score2

0.5455772349908068

In [16]:
# decision tree

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [17]:
# float 64沒辦法跑decision tree故全換成int64

X_train_int = X_train.astype(np.int64)
y_train_int = y_train.astype(np.int64)
X_test_int = X_test.astype(np.int64)
y_test_int = y_test.astype(np.int64)


model3 = tree.DecisionTreeClassifier()
model3.fit(X_train_int, y_train_int)
y_pred3 = model3.predict(X_test_int)
score3 = model3.score(X_test_int, y_test_int)
score3

0.01046337817638266

In [18]:
#linear regression 模型評估

from scipy.stats.stats import pearsonr
pearsonr(y_pred1, y_test)

(0.8582908516935335, 1.9790600679135826e-195)

In [19]:
error1 = y_pred1 - y_test
rmse1 = (error1**2).mean()**.5
mae1 = abs(error1).mean()
print('rmse: ', str(rmse1))
print('mae: ', str(mae1))

rmse:  6230.661198550438
mae:  4286.283858436203


In [20]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
print('rmse: ', str(mean_squared_error(y_pred1, y_test)**0.5))
print('mae: ', str(mean_absolute_error(y_pred1, y_test)))

rmse:  6230.661198550438
mae:  4286.283858436203


In [21]:
r2_score(y_pred1, y_test)

0.643612637224138

In [22]:
score1

0.736659746260978

In [23]:
# SVM模型評估

pearsonr(y_pred2, y_test)

(0.8336046507664776, 4.280612854255268e-174)

In [24]:
error2 = y_pred2 - y_test
rmse2 = (error2**2).mean()**.5
mae2 = abs(error2).mean()
print('rmse: ', str(rmse2))
print('mae: ', str(mae2))

rmse:  8184.75250653519
mae:  3580.0382429221017


In [25]:
print('rmse: ', str(mean_squared_error(y_pred2, y_test)**0.5))
print('mae: ', str(mean_absolute_error(y_pred2, y_test)))

rmse:  8184.75250653519
mae:  3580.0382429221017


In [26]:
r2_score(y_pred2, y_test)

-0.4237295653630737

In [27]:
score2

0.5455772349908068

In [28]:
# decision tree 模型評估

pearsonr(y_pred3, y_test)

(0.681347334315325, 1.856781618329966e-92)

In [29]:
error3 = y_pred3 - y_test
rmse3 = (error3**2).mean()**.5
mae3 = abs(error3).mean()
print('rmse: ', str(rmse3))
print('mae: ', str(mae3))

rmse:  9450.55335995986
mae:  5110.275712494768


In [30]:
print('rmse: ', str(mean_squared_error(y_pred3, y_test)**0.5))
print('mae: ', str(mean_absolute_error(y_pred3, y_test)))

rmse:  9450.55335995986
mae:  5110.275712494768


In [31]:
r2_score(y_pred3, y_test)

0.3166439765900889

In [32]:
score3

0.01046337817638266