In [2]:
import pandas as pd

In [3]:
# Load the insurance dataset; the path is relative to the current working directory.
raw_df=pd.read_csv('insurance.csv')

In [4]:
raw_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
def unique_values(df):
    """Map every column name of ``df`` to its number of distinct values.

    Counting is delegated to ``nunique()`` with its default arguments, and the
    keys appear in the same order as ``df.columns``.
    """
    return {name: df[name].nunique() for name in df.columns}

In [6]:
unique_values(raw_df)

{'age': 47,
 'sex': 2,
 'bmi': 548,
 'children': 6,
 'smoker': 2,
 'region': 4,
 'charges': 1337}

In [7]:
### We treat age, bmi and children as numerical features and sex, smoker and region as categorical features; charges is the prediction target

In [8]:
raw_df.region.value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [9]:
raw_df.sex.value_counts()

sex
male      676
female    662
Name: count, dtype: int64

In [10]:
raw_df.smoker.value_counts()

smoker
no     1064
yes     274
Name: count, dtype: int64

In [11]:
# Separate the six predictor columns from the regression target (charges).
inputs = raw_df.loc[:, ['age', 'sex', 'bmi', 'children', 'smoker', 'region']]
target = raw_df.loc[:, 'charges']

In [12]:
# Columns that get min-max scaled; the remaining inputs (sex, smoker, region) are encoded via lookup dicts.
numerical_cols=['age','bmi','children']

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state keeps the split repeatable.
inputs_train, input_test, target_train, target_test = train_test_split(
    inputs,
    target,
    test_size=0.25,
    random_state=42,
)

In [14]:
from sklearn.preprocessing import MinMaxScaler

In [15]:
# Learn the per-column min/max from the training rows only. Fitting on the
# full raw_df would let the held-out test rows influence the scaling, which
# leaks test information into preprocessing.
min_max = MinMaxScaler()
min_max.fit(inputs_train[numerical_cols])

# Apply the same training-derived scaling to both splits. Test values can fall
# slightly outside [0, 1] when they exceed the range seen in training.
inputs_train[numerical_cols] = min_max.transform(inputs_train[numerical_cols])
input_test[numerical_cols] = min_max.transform(input_test[numerical_cols])

ENCODING

In [16]:
# Lookup used to encode the smoker column: non-smoker -> 0, smoker -> 1.
dict1 = dict(no=0, yes=1)

In [17]:
# Lookup used to encode the sex column: male -> 1, female -> 0.
dict2 = dict(male=1, female=0)

In [18]:
# Lookup used to encode the region column as integer codes 1-4.
# NOTE(review): region has no natural order, so these codes impose an arbitrary
# ranking on a linear model; one-hot encoding is worth considering.
dict3 = dict(southeast=1, southwest=2, northeast=3, northwest=4)

In [19]:
raw_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [20]:
# Encode the yes/no smoker flag as 1/0 in both splits using dict1.
inputs_train['smoker'] = inputs_train['smoker'].map(dict1)
input_test['smoker'] = input_test['smoker'].map(dict1)

# Series.map yields NaN for any value missing from dict1 (this also happens if
# the cell is run a second time on already-encoded data), so fail loudly here
# instead of passing NaNs on to the model.
assert inputs_train['smoker'].notna().all() and input_test['smoker'].notna().all(), \
    "unmapped values in 'smoker'"

In [21]:
# Encode sex as male -> 1 / female -> 0 in both splits using dict2.
inputs_train['sex'] = inputs_train['sex'].map(dict2)
input_test['sex'] = input_test['sex'].map(dict2)

# Series.map yields NaN for any value missing from dict2 (this also happens if
# the cell is run a second time on already-encoded data), so fail loudly here
# instead of passing NaNs on to the model.
assert inputs_train['sex'].notna().all() and input_test['sex'].notna().all(), \
    "unmapped values in 'sex'"

In [22]:
# Replace region names with the integer codes from dict3 in both splits.
# NOTE(review): region is a nominal category; integer codes give it an
# artificial ordering that a linear model will treat as meaningful.
inputs_train['region'] = inputs_train['region'].map(dict3)
input_test['region'] = input_test['region'].map(dict3)

# Series.map yields NaN for any value missing from dict3 (this also happens if
# the cell is run a second time on already-encoded data), so fail loudly here
# instead of passing NaNs on to the model.
assert inputs_train['region'].notna().all() and input_test['region'].notna().all(), \
    "unmapped values in 'region'"

In [23]:
# Divide the region codes by the largest code in dict3 (4) so the column tops
# out at 1.0, like the min-max-scaled numerical columns. Deriving the divisor
# from dict3 keeps this cell correct if the mapping is ever changed.
region_scale = max(dict3.values())
inputs_train['region'] = inputs_train['region'] / region_scale
input_test['region'] = input_test['region'] / region_scale

In [24]:
raw_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [25]:
input_test

Unnamed: 0,age,sex,bmi,children,smoker,region
764,0.586957,0,0.247915,0.4,0,0.75
887,0.391304,0,0.378262,0.0,0,1.00
890,1.000000,0,0.293920,0.0,1,1.00
1293,0.608696,1,0.263250,0.6,0,1.00
259,0.021739,1,0.429379,0.0,1,1.00
...,...,...,...,...,...,...
342,0.913043,0,0.311811,0.0,0,0.75
308,0.869565,1,0.508609,0.0,0,0.75
1128,0.347826,1,0.453054,0.2,0,0.50
503,0.021739,1,0.384450,0.0,1,0.25


In [26]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [27]:
model.fit(inputs_train, target_train)

In [28]:
# Predict on the training inputs themselves; these in-sample predictions feed the training scores.
predictions = model.predict(inputs_train)

In [29]:
import numpy as np
def rmse(targets, predictions):
    """Return the root-mean-squared error between ``targets`` and ``predictions``.

    Both arguments are converted to float NumPy arrays before subtracting, so
    values are always paired by position. This avoids pandas index alignment
    silently mis-pairing two Series with different indexes, and it lets plain
    Python lists be passed as well.

    Parameters
    ----------
    targets : array-like
        Ground-truth values.
    predictions : array-like
        Predicted values, broadcast-compatible with ``targets``.

    Returns
    -------
    numpy.float64
        Square root of the mean squared difference.
    """
    errors = np.asarray(targets, dtype=float) - np.asarray(predictions, dtype=float)
    return np.sqrt(np.mean(np.square(errors)))

In [30]:
from sklearn.metrics import r2_score

In [31]:
# NOTE: despite its name, `loss` holds the training R^2 score (higher is better), not an error to minimise.
loss = r2_score(target_train, predictions)

In [32]:
loss

0.7445275825163911

In [33]:
pred=model.predict(input_test)

In [34]:
# NOTE: despite its name, `loss1` holds the test-set R^2 score (higher is better), not an error to minimise.
loss1=r2_score(target_test,pred)

In [35]:
loss1

0.7667469908213232

In [36]:
len(target_test)

335

In [37]:
model.coef_

array([11930.18187569,    46.59026438, 12527.86477304,  2131.51750546,
       23650.66168697,   844.06837719])

In [38]:
model.intercept_

-3024.9738313557173

In [39]:
# Tabulate the fitted coefficients next to their column names; the intercept is
# appended as a final row whose feature label is the constant 1.
weights_df = pd.DataFrame({
    'feature': [*inputs.columns, 1],
    'weight': [*model.coef_, model.intercept_],
})
weights_df

Unnamed: 0,feature,weight
0,age,11930.181876
1,sex,46.590264
2,bmi,12527.864773
3,children,2131.517505
4,smoker,23650.661687
5,region,844.068377
6,1,-3024.973831


In [42]:
# Root-mean-squared error on each split, expressed in the same units as `charges`.
train_rmse=rmse(target_train,predictions)
test_rmse=rmse(target_test, pred)

In [43]:
train_rmse

6088.23403480799

In [44]:
test_rmse

5932.605868797438