# U.S. Medical Insurance Costs

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('insurance.csv')
print(df.head())

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


In [7]:
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [4]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
print(X[:5])
print(y[:5])

[[19 'female' 27.9 0 'yes' 'southwest']
 [18 'male' 33.77 1 'no' 'southeast']
 [28 'male' 33.0 3 'no' 'southeast']
 [33 'male' 22.705 0 'no' 'northwest']
 [32 'male' 28.88 0 'no' 'northwest']]
[16884.924    1725.5523   4449.462   21984.47061  3866.8552 ]


In [12]:
le_sex = LabelEncoder()
X[:, 1] = le_sex.fit_transform(X[:, -1])

In [15]:
le_smoker = LabelEncoder()
X[:, 4] = le_smoker.fit_transform(X[:, 5])

In [16]:
le_region = LabelEncoder()
X[:, 5] = le_region.fit_transform(X[:, 5])

In [17]:
X[:5]

array([[19, 3, 27.9, 0, 3, 3],
       [18, 2, 33.77, 1, 2, 2],
       [28, 2, 33.0, 3, 2, 2],
       [33, 1, 22.705, 0, 1, 1],
       [32, 1, 28.88, 0, 1, 1]], dtype=object)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [20]:
sc = StandardScaler()
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [22]:
X_train[:, 5]

array([-0.462710167305497, 1.3343855823020596, 0.43583770749828127, ...,
       -1.3612580421092753, 0.43583770749828127, 0.43583770749828127],
      dtype=object)

In [31]:
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf', C=0.5, gamma = 0.05)
regressor.fit(X_train, y_train)

SVR(C=0.5, gamma=0.05)

In [30]:
from sklearn.model_selection import GridSearchCV
parameters = [{'C': np.arange(0,1,0.1), 'kernel': ['linear']}, 
              {'C': np.arange(0,1,0.1), 'kernel': ['rbf'], 'gamma': np.arange(0,1,0.050)}]
grid_search = GridSearchCV(estimator = regressor, 
                           param_grid = parameters, 
                           scoring = 'neg_mean_squared_error', 
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(X = X_train, y = y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: -16200953378.12 %
Best Parameters: {'C': 0.5, 'gamma': 0.05, 'kernel': 'rbf'}


 -1.67801641e+08 -1.67799855e+08 -1.67845514e+08 -1.67837469e+08
 -1.67828663e+08 -1.67822881e+08             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan -1.62016349e+08 -1.62015289e+08
 -1.62016429e+08 -1.62016481e+08 -1.62016444e+08 -1.62016406e+08
 -1.62016379e+08 -1.62016360e+08 -1.62016349e+08 -1.62016342e+08
 -1.62016338e+08 -1.62016337e+08 -1.62016338e+08 -1.62016340e+08
 -1.62016342e+08 -1.62016345e+08 -1.62016349e+08 -1.62016352e+08
 -1.62016356e+08 -1.62016360e+08 -1.62016349e+08 -1.62013729e+08
 -1.62016500e+08 -1.62016614e+08 -1.62016539e+08 -1.62016463e+08
 -1.62016408e+08 -1.62016372e+08 -1.62016348e+08 -1.62016334e+08
 -1.62016327e+08 -1.62016325e+08 -1.62016326e+08 -1.62016330e+08
 -1.62016335e+08 -1.62016

In [36]:
y_pred = regressor.predict(X_test)

In [37]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

-0.1164761001927983