In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

In [75]:
df=pd.read_csv("insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [20]:
df['age'].value_counts()

age
18    69
19    68
46    29
52    29
50    29
47    29
48    29
51    29
45    29
20    29
24    28
27    28
28    28
25    28
23    28
49    28
54    28
53    28
22    28
21    28
26    28
31    27
41    27
44    27
43    27
42    27
29    27
30    27
40    27
32    26
33    26
57    26
34    26
55    26
56    26
35    25
58    25
37    25
59    25
39    25
36    25
38    25
62    23
60    23
63    23
61    23
64    22
Name: count, dtype: int64

In [56]:
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [77]:
X=df[['age', 'sex', 'bmi', 'children', 'smoker', 'region']]
y=df['charges']

In [78]:
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.900,0,yes,southwest
1,18,male,33.770,1,no,southeast
2,28,male,33.000,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.880,0,no,northwest
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest
1334,18,female,31.920,0,no,northeast
1335,18,female,36.850,0,no,southeast
1336,21,female,25.800,0,no,southwest


In [79]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)

In [80]:
X_train

Unnamed: 0,age,sex,bmi,children,smoker,region
693,24,male,23.655,0,no,northwest
1297,28,female,26.510,2,no,southeast
634,51,male,39.700,1,no,southwest
1022,47,male,36.080,1,yes,southeast
178,46,female,28.900,2,no,southwest
...,...,...,...,...,...,...
1095,18,female,31.350,4,no,northeast
1130,39,female,23.870,5,no,southeast
1294,58,male,25.175,0,no,northeast
860,37,female,47.600,2,yes,southwest


In [81]:
from sklearn.preprocessing import LabelEncoder
sEncoder=LabelEncoder()
smokerEncoder=LabelEncoder()

In [82]:
X_train['sex']=sEncoder.fit_transform(X_train['sex'])
X_test['sex']=sEncoder.transform(X_test['sex'])
X_train['smoker']=smokerEncoder.fit_transform(X_train['smoker'])
X_test['smoker']=smokerEncoder.transform(X_test['smoker'])

In [83]:
X_train

Unnamed: 0,age,sex,bmi,children,smoker,region
693,24,1,23.655,0,0,northwest
1297,28,0,26.510,2,0,southeast
634,51,1,39.700,1,0,southwest
1022,47,1,36.080,1,1,southeast
178,46,0,28.900,2,0,southwest
...,...,...,...,...,...,...
1095,18,0,31.350,4,0,northeast
1130,39,0,23.870,5,0,southeast
1294,58,1,25.175,0,0,northeast
860,37,0,47.600,2,1,southwest


In [84]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')  
X_train_region = ohe.fit_transform(X_train[['region']])
X_test_region = ohe.transform(X_test[['region']])

In [85]:
X_train_region_df = pd.DataFrame(X_train_region, columns=ohe.get_feature_names_out(), index=X_train.index)
X_test_region_df = pd.DataFrame(X_test_region, columns=ohe.get_feature_names_out(), index=X_test.index)

In [86]:
X_train_region_df

Unnamed: 0,region_northwest,region_southeast,region_southwest
693,1.0,0.0,0.0
1297,0.0,1.0,0.0
634,0.0,0.0,1.0
1022,0.0,1.0,0.0
178,0.0,0.0,1.0
...,...,...,...
1095,0.0,0.0,0.0
1130,0.0,1.0,0.0
1294,0.0,0.0,0.0
860,0.0,0.0,1.0


In [87]:
X_train

Unnamed: 0,age,sex,bmi,children,smoker,region
693,24,1,23.655,0,0,northwest
1297,28,0,26.510,2,0,southeast
634,51,1,39.700,1,0,southwest
1022,47,1,36.080,1,1,southeast
178,46,0,28.900,2,0,southwest
...,...,...,...,...,...,...
1095,18,0,31.350,4,0,northeast
1130,39,0,23.870,5,0,southeast
1294,58,1,25.175,0,0,northeast
860,37,0,47.600,2,1,southwest


In [88]:
X_train = X_train.drop(columns=['region'])
X_test = X_test.drop(columns=['region'])

In [89]:
X_train

Unnamed: 0,age,sex,bmi,children,smoker
693,24,1,23.655,0,0
1297,28,0,26.510,2,0
634,51,1,39.700,1,0
1022,47,1,36.080,1,1
178,46,0,28.900,2,0
...,...,...,...,...,...
1095,18,0,31.350,4,0
1130,39,0,23.870,5,0
1294,58,1,25.175,0,0
860,37,0,47.600,2,1


In [90]:
X_train = pd.concat([X_train, X_train_region_df], axis=1)
X_test = pd.concat([X_test, X_test_region_df], axis=1)

In [91]:
X_train

Unnamed: 0,age,sex,bmi,children,smoker,region_northwest,region_southeast,region_southwest
693,24,1,23.655,0,0,1.0,0.0,0.0
1297,28,0,26.510,2,0,0.0,1.0,0.0
634,51,1,39.700,1,0,0.0,0.0,1.0
1022,47,1,36.080,1,1,0.0,1.0,0.0
178,46,0,28.900,2,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
1095,18,0,31.350,4,0,0.0,0.0,0.0
1130,39,0,23.870,5,0,0.0,1.0,0.0
1294,58,1,25.175,0,0,0.0,0.0,0.0
860,37,0,47.600,2,1,0.0,0.0,1.0


In [92]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  
X_test_scaled = scaler.transform(X_test)

In [93]:
from sklearn.svm import SVR
svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
svr_rbf.fit(X_train_scaled, y_train)

In [94]:
y_pred = svr_rbf.predict(X_test_scaled)

In [97]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error (SVR with RBF kernel): {mse:.2f}")
print(rmse)
print(f"R-squared Score: {r2:.2f}")

Mean Squared Error (SVR with RBF kernel): 118450952.70
10883.517478374155
R-squared Score: 0.21


In [107]:
from sklearn.pipeline import Pipeline
svr = SVR(kernel='rbf')
param_grid = {
    'C': [1, 10, 100, 1000],       
    'gamma': ['scale', 0.01, 0.1, 1],  
    'epsilon': [0.01, 0.1, 0.5, 1]    
}
grid_search = GridSearchCV(svr, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(best_model)

SVR(C=1000, epsilon=0.01, gamma=1)


In [108]:
y_pred = best_model.predict(X_test)

In [109]:
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

In [106]:
print("Best Parameters:", grid_search.best_params_)
print(f"R² Score: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")

Best Parameters: {'C': 100, 'epsilon': 1, 'gamma': 'auto'}
R² Score: -0.0976
RMSE: 12869.35
