In [1]:
import numpy as np 
import pandas as pd

In [2]:
df = pd.read_csv("insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [4]:
cat_cols = list(df.select_dtypes(exclude = np.number).columns)
num_cols = list(df.select_dtypes(include = np.number).columns)

target_cols = ['charges']
num_cols.remove(target_cols[0])

print(cat_cols, num_cols, target_cols)

['sex', 'smoker', 'region'] ['age', 'bmi', 'children'] ['charges']


In [5]:
df['region'].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

## Data Processing Pipeline

In [109]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

class LabelEncoding(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoders = {}

    def fit(self, X, y=None):
        self.encoders = {
            col: LabelEncoder().fit(X[col]) for col in X.columns
        }
        return self

    def transform(self, X, y=None):
        X_copy = X.copy()
        for col, encoder in self.encoders.items():
            X_copy[col] = encoder.transform(X_copy[col])
        return X_copy



column_transformer = ColumnTransformer(
    transformers=[
        ('cat', LabelEncoding(), cat_cols),
        ('num', MinMaxScaler(), num_cols)
    ], 
    remainder='passthrough' 
)


pipeline = Pipeline(steps=[
    ('processor', column_transformer)
])


X = pipeline.fit_transform(df.drop(target_cols, axis='columns'), df[target_cols])

processed_cols = num_cols + cat_cols 
X = pd.DataFrame(X, columns=processed_cols)



target_scaling = MinMaxScaler()
y = target_scaling.fit_transform(df[target_cols]).reshape(1, -1)[0]
y = pd.Series(y)

print(X.shape, y.shape)

(1338, 6) (1338,)


## Split Data into Training and Testing

In [110]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    shuffle=True)
X_train.shape, y_test.shape

((1070, 6), (268,))

## Model Training

In [111]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

models = [
    LinearRegression(),
    RandomForestRegressor(),
    SVR()
]

for algo in models:
    print("*" * 20,algo, "*" * 20)
    model = algo.fit(X_train, y_train)
    print("Training Score ", model.score(X_train, y_train))
    print("Testing Score ", model.score(X_test, y_test))

******************** LinearRegression() ********************
Training Score  0.7602568495644851
Testing Score  0.7036949357505351
******************** RandomForestRegressor() ********************
Training Score  0.9777413744011719
Testing Score  0.7922902899433336
******************** SVR() ********************
Training Score  0.7144323107530048
Testing Score  0.6620780321839483


In [112]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'kernel': ['linear', 'rbf', 'poly'],   
    'C': [0.1, 1, 10, 100],               
    'gamma': ['scale', 'auto'],            
    'epsilon': [0.01, 0.1, 0.5, 1]         
}

grid_search = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)  

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV] END ....C=0.1, epsilon=0.01, gamma=scale, kernel=linear; total time=   0.1s
[CV] END .......C=0.1, epsilon=0.01, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .......C=0.1, epsilon=0.01, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ......C=0.1, epsilon=0.01, gamma=scale, kernel=poly; total time=   0.1s
[CV] END .....C=0.1, epsilon=0.01, gamma=auto, kernel=linear; total time=   0.1s
[CV] END ........C=0.1, epsilon=0.01, gamma=auto, kernel=rbf; total time=   0.1s
[CV] END .......C=0.1, epsilon=0.01, gamma=auto, kernel=poly; total time=   0.1s
[CV] END .......C=0.1, epsilon=0.01, gamma=auto, kernel=poly; total time=   0.1s
[CV] END ........C=0.1, epsilon=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ........C=0.1, epsilon=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .......C=0.1, epsilon=0.1, gamma=scale, kernel=poly; total time=   0.1s
[CV] END .......C=0.1, epsilon=0.1, gamma=scale

In [113]:
print("Best parameters:", grid_search.best_params_)
print("Best R2 score:", grid_search.best_score_)

Best parameters: {'C': 10, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}
Best R2 score: 0.829096211333779


In [114]:
model = SVR(
**grid_search.best_params_
)
model

In [115]:
model.fit(X_train, y_train)
model.score(X_train, y_train), model.score(X_test, y_test)

(0.8552906364542058, 0.7879987651372451)

## training pipeline

In [119]:
training_pipeline = Pipeline(steps=[
    ('processor', column_transformer),
    ('regressor', SVR(
        **grid_search.best_params_
    ))
])

X_t = df.drop(['charges'], axis='columns')
y_t = target_scaling.fit_transform(df[target_cols])

X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(X_t, y_t)

training_pipeline.fit(X_train_t, y_train_t)

  y = column_or_1d(y, warn=True)


In [120]:
y_pred = training_pipeline.predict(X_test_t)
y_pred [:10]

array([0.23500313, 0.18203212, 0.07782355, 0.1720321 , 0.52348388,
       0.06639038, 0.06581319, 0.10045132, 0.76727893, 0.10477256])

In [121]:
training_pipeline.score(X_test_t, y_test_t)

0.8614658812566003