In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mat
import seaborn as sns
from tabulate import tabulate

In [2]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn import tree

In [3]:
# Load the raw insurance dataset from a CSV in the working directory.
df=pd.read_csv("insurance.csv")

In [4]:
df

Unnamed: 0,age,sex,bmi,classif,children,smoker,region,charges
0,19,female,27.900,PREO,0,yes,southwest,16884.92400
1,18,male,33.770,OB1,1,no,southeast,1725.55230
2,28,male,33.000,OB1,3,no,southeast,4449.46200
3,0,male,22.705,N,0,no,northwest,21984.47061
4,32,male,28.880,PREO,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...,...
1333,50,male,30.970,OB1,3,no,northwest,10600.54830
1334,18,female,31.920,OB1,0,no,northeast,2205.98080
1335,18,female,36.850,OB2,0,no,southeast,1629.83350
1336,21,female,25.800,PREO,0,no,southwest,2007.94500


BMI	Nutritional status
Below 18.5	Underweight
18.5–24.9	Normal weight
25.0–29.9	Pre-obesity
30.0–34.9	Obesity class I
35.0–39.9	Obesity class II
40.0 and above	Obesity class III

In [5]:
df.dtypes

age           int64
sex          object
bmi         float64
classif      object
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [6]:
df.shape

(1338, 8)

In [7]:
df.columns

Index(['age', 'sex', 'bmi', 'classif', 'children', 'smoker', 'region',
       'charges'],
      dtype='object')

In [8]:
df.describe([0.5,0.8,0.99])

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.133782,30.663397,1.094918,13270.422265
std,14.958658,6.098187,1.205493,12110.011237
min,-58.0,15.96,0.0,1121.8739
50%,39.0,30.4,1.0,9382.033
80%,53.6,35.86,2.0,20260.626406
99%,64.0,46.4079,5.0,48537.480726
max,190.0,53.13,5.0,63770.42801


In [9]:
df[df.duplicated()]

Unnamed: 0,age,sex,bmi,classif,children,smoker,region,charges
581,19,male,30.59,OB1,0,no,northwest,1639.5631


In [10]:
# Remove exact duplicate rows in place and report the frame size before/after.
# NOTE: the index is not reset here, so the row labels keep a gap where the
# duplicate (label 581 in the output above) was removed. From that row on,
# positional (.iloc) and label-based (.loc/.at) lookups refer to different rows.
print('Size of dataframe before drop_duplicates', df.shape)

df.drop_duplicates(inplace= True)

print('Size of dataframe after drop_duplicates', df.shape)

Size of dataframe before drop_duplicates (1338, 8)
Size of dataframe after drop_duplicates (1337, 8)


In [11]:
df.isnull().sum()

age         0
sex         0
bmi         0
classif     8
children    0
smoker      0
region      0
charges     0
dtype: int64

In [12]:
# Per-column missing-value summary: absolute count and fraction of rows,
# each sorted from most to least missing.
null_flags = df.isnull()
null_counts = null_flags.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_flags.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
classif,8,0.005984
age,0,0.0
sex,0,0.0
bmi,0,0.0
children,0,0.0
smoker,0,0.0
region,0,0.0
charges,0,0.0


In [13]:
# Index labels of every row that contains at least one missing value.
rows = df.index[df.isnull().any(axis=1)].tolist()

print(rows)

[15, 74, 128, 173, 559, 671, 839, 1331]


In [14]:
# NOTE(review): hard-coded copy of the labels printed by the previous cell.
# No later visible cell reads `index`, so this looks like leftover scratch
# state -- confirm before relying on or removing it.
index=[15, 74, 128, 173, 559, 671, 839, 1331]

In [15]:
# Recompute the index labels of the rows that still contain a missing value.
has_missing = df.isnull().any(axis=1)
rows = df.index[has_missing].tolist()

print(rows)

[15, 74, 128, 173, 559, 671, 839, 1331]


In [16]:
def _bmi_to_classif(bmi):
    """Map a BMI value to the nutritional-status code used in 'classif'.

    Cut-offs follow the BMI table shown earlier in the notebook, applied to
    the exact (non-truncated) BMI value:
    < 18.5 -> "UW", < 25 -> "N", < 30 -> "PREO", < 35 -> "OB1",
    < 40 -> "OB2", otherwise "OB3".

    Returns None for a NaN BMI so the caller can leave the cell untouched.
    """
    if bmi != bmi:  # NaN is the only value that is not equal to itself
        return None
    if bmi < 18.5:
        return "UW"
    if bmi < 25:
        return "N"
    if bmi < 30:
        return "PREO"
    if bmi < 35:
        return "OB1"
    if bmi < 40:
        return "OB2"
    return "OB3"


# `rows` holds index *labels*, so both the read and the write must be
# label-based (.at). Reading with .iloc[i] picks the wrong row once labels and
# positions diverge (the index has a gap after duplicates were dropped).
for i in rows:
    bmi_class = _bmi_to_classif(df.at[i, 'bmi'])
    if bmi_class is not None:
        df.at[i, 'classif'] = bmi_class

In [17]:
# Show the class now stored for each previously-incomplete row.
# `rows` contains index labels, so look the value up by label (.at); a
# positional .iloc[i] lookup reads a neighbouring row wherever the index has
# a gap (e.g. after duplicate rows were dropped without resetting the index).
for i in rows:
    print(df.at[i, 'classif'])

N
PREO
UW
OB1
OB2
PREO
OB1
OB3


## Performing EDA

### Separating categorical and numerical columns

In [18]:
def separate_data_types(df, max_unique=100):
    """Split the columns of ``df`` into categorical and continuous by cardinality.

    A column is treated as categorical when it has fewer than ``max_unique``
    distinct non-null values, and as continuous otherwise. This is a heuristic
    on cardinality only; dtypes are not inspected.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be classified.
    max_unique : int, default 100
        Exclusive upper bound on distinct values for a categorical column.

    Returns
    -------
    tuple[list, list]
        ``(categorical, continuous)`` column names, each in ``df.columns`` order.
    """
    categorical = []
    continuous = []
    for column in df.columns:
        bucket = categorical if df[column].nunique() < max_unique else continuous
        bucket.append(column)
    return categorical, continuous
# Classify the columns by cardinality and print the initial split side by side.
from tabulate import tabulate

categorical, continuous = separate_data_types(df)
table = [categorical, continuous]
column_groups = {"categorical": categorical, "continuous": continuous}
print(tabulate(column_groups, headers=["categorical", "continuous"]))

# 'age' lands in the categorical group under the cardinality rule (see the
# table printed above); move it to the continuous group by hand.
categorical.remove('age')
continuous.append('age')

print("continuous:-", continuous)
print("categorical:-", categorical)

categorical    continuous
-------------  ------------
age            bmi
sex            charges
classif
children
smoker
region
continuous:- ['bmi', 'charges', 'age']
categorical:- ['sex', 'classif', 'children', 'smoker', 'region']


In [19]:
def myOutliers(df,col):
    """Report the IQR-based outliers of ``df[col]`` and publish them as globals.

    Prints the lower and upper bounds (Q1 - 1.5*IQR and Q3 + 1.5*IQR) and the
    number of values lying strictly outside them. Nothing is returned; the
    results are stored in two module-level names:

    * ``mylist``        -- the outlying values, in row order
    * ``my_outlier_df`` -- the rows of ``df`` that hold those values
    """
    global mylist, my_outlier_df

    values = df[col]
    first_quartile = np.quantile(values, 0.25)
    third_quartile = np.quantile(values, 0.75)
    spread = third_quartile - first_quartile

    lower_range = first_quartile - 1.5*spread
    upper_range = third_quartile + 1.5*spread

    print("The lower range of",col,"is",lower_range)
    print("The upper range of",col,"is",upper_range)

    # Strict comparisons: values sitting exactly on a bound are not outliers.
    outside = (values < lower_range) | (values > upper_range)
    mylist = values[outside].tolist()
    print("total outlier in",col,"are",len(mylist))
    my_outlier_df = df.loc[outside]
# Numeric columns to screen for outliers.
k=['age','bmi','children','charges']

In [20]:
# Print the IQR bounds and outlier count for each column listed in `k`,
# with a separator line between columns.
for i in k:
    print('i is',i)
    myOutliers(df,i)
    print('*****************')

i is age
The lower range of age is -11.5
The upper range of age is 88.5
total outlier in age are 2
*****************
i is bmi
The lower range of bmi is 13.674999999999994
The upper range of bmi is 47.31500000000001
total outlier in bmi are 9
*****************
i is children
The lower range of children is -3.0
The upper range of children is 5.0
total outlier in children are 0
*****************
i is charges
The lower range of charges is -13120.716174999998
The upper range of charges is 34524.777625
total outlier in charges are 139
*****************


In [21]:
# Keep only rows whose age and bmi fall inside the accepted ranges.
# The age upper bound and both bmi bounds come from the IQR report above; the
# age lower bound is 0 rather than the (negative) IQR bound.
# NOTE(review): rows with age == 0 pass this filter (one is visible in the
# displayed frame) -- confirm that is intended.
age_ok = (df['age'] >= 0) & (df['age'] <= 88.5)
bmi_ok = (df['bmi'] >= 13.674) & (df['bmi'] <= 47.315)

# .copy() makes the result an independent frame, so the in-place edits in later
# cells do not operate on a slice of the original (SettingWithCopyWarning).
df = df[age_ok & bmi_ok].copy()
df.shape

(1326, 8)

In [22]:
# Integer codes for each categorical column.
# Note the smoker coding: 'yes' -> 0 and 'no' -> 1.
encodings = {
    'sex': {'male': 0, 'female': 1},
    'smoker': {'yes': 0, 'no': 1},
    'region': {'southeast': 0, 'southwest': 1, 'northeast': 2, 'northwest': 3},
    'classif': {'UW': 0, 'N': 1, 'PREO': 2, 'OB1': 3, 'OB2': 4, 'OB3': 5},
}

# Apply the codes in place, one column at a time; values without an entry in
# the mapping are left as they are.
for col_name, codes in encodings.items():
    df.replace({col_name: codes}, inplace=True)

In [23]:
df

Unnamed: 0,age,sex,bmi,classif,children,smoker,region,charges
0,19,1,27.900,2,0,0,1,16884.92400
1,18,0,33.770,3,1,1,0,1725.55230
2,28,0,33.000,3,3,1,0,4449.46200
3,0,0,22.705,1,0,1,3,21984.47061
4,32,0,28.880,2,0,1,3,3866.85520
...,...,...,...,...,...,...,...,...
1332,52,1,44.700,5,3,1,1,11411.68500
1333,50,0,30.970,3,3,1,3,10600.54830
1334,18,1,31.920,3,0,1,2,2205.98080
1335,18,1,36.850,4,0,1,0,1629.83350


In [27]:
#df.to_csv("data_after_preprocessing.csv",index=False)

In [28]:
# Reload the preprocessed data from disk (this also yields a fresh 0..n-1 index,
# as the displayed frame below shows).
# NOTE(review): the only visible cell that would write this file (the
# `df.to_csv(..., index=False)` line above) is commented out, so on a fresh
# kernel this read depends on a CSV left over from an earlier run -- confirm
# the file on disk matches the preprocessing steps above.
df=pd.read_csv("data_after_preprocessing.csv")

In [29]:
df

Unnamed: 0,age,sex,bmi,classif,children,smoker,region,charges
0,19,1,27.900,2,0,0,1,16884.92400
1,18,0,33.770,3,1,1,0,1725.55230
2,28,0,33.000,3,3,1,0,4449.46200
3,0,0,22.705,1,0,1,3,21984.47061
4,32,0,28.880,2,0,1,3,3866.85520
...,...,...,...,...,...,...,...,...
1321,52,1,44.700,5,3,1,1,11411.68500
1322,50,0,30.970,3,3,1,3,10600.54830
1323,18,1,31.920,3,0,1,2,2205.98080
1324,18,1,36.850,4,0,1,0,1629.83350


In [31]:
# Target is the 'charges' column; every remaining column is a feature.
Y = df['charges']
X = df.drop(columns='charges')

In [32]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [33]:
print(X.shape, X_train.shape, X_test.shape)

(1326, 7) (1060, 7) (266, 7)


### Random Forest

In [34]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the training split. The estimator is seeded (same
# value as the train/test split) so the fitted trees, and therefore every
# score reported below, are reproducible across runs.
model = RandomForestRegressor(random_state=2)
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)

# R^2 (coefficient of determination) on the held-out split -- a regression
# goodness-of-fit measure, not a classification accuracy.
score = r2_score(Y_test, y_pred)
print("r2 socre is ", score)

r2 socre is  0.827396027266667


In [35]:
# Test-set metrics for the random forest.
# model.score(X_test, Y_test) yields the same value as the r2_score call in the
# previous cell (compare the two printed outputs).
mse_RF=mean_squared_error(Y_test,y_pred)
r2_RF=model.score(X_test,Y_test)

In [36]:
print(mse_RF)
print("...........")
print(r2_RF)

24942245.31196714
...........
0.827396027266667


In [37]:
# One hand-written record, in the same column order as X:
# age, sex, bmi, classif, children, smoker, region
input_data=(40,0,22.705,1,2,1,2)
input_data1=np.asarray(input_data)

# Shape the record as a single row and label it with X's column names, so the
# model receives the same named layout it was fitted on instead of a bare
# array (which scikit-learn flags with a feature-name warning).
input_data_reshaped=pd.DataFrame(input_data1.reshape(1,-1), columns=X.columns)
predi=model.predict(input_data_reshaped)

In [38]:
# Predict for every row of X (this overwrites the single-record `predi`).
# NOTE(review): X includes the rows the model was trained on, so these
# predictions are not an out-of-sample check.
predi=model.predict(X)

In [39]:
predi

array([16980.4516058 ,  3029.2993405 ,  7631.231044  , ...,
        2267.47369938,  3263.58449004,  2462.3734927 ])

In [40]:
# Wrap the predictions in a one-column frame that carries X's index, so that a
# later column-wise concat with the source frame lines rows up by label even
# if that frame's index is not the default 0..n-1 range.
a=pd.DataFrame({'Prediction':predi},index=X.index)

In [41]:
# Append the predictions as a new column next to the original data.
# concat(axis=1) aligns rows by index label, not by position.
final=pd.concat([df,a],axis=1)

In [42]:
final

Unnamed: 0,age,sex,bmi,classif,children,smoker,region,charges,Prediction
0,19,1,27.900,2,0,0,1,16884.92400,16980.451606
1,18,0,33.770,3,1,1,0,1725.55230,3029.299340
2,28,0,33.000,3,3,1,0,4449.46200,7631.231044
3,0,0,22.705,1,0,1,3,21984.47061,16423.389857
4,32,0,28.880,2,0,1,3,3866.85520,4137.052079
...,...,...,...,...,...,...,...,...,...
1321,52,1,44.700,5,3,1,1,11411.68500,11455.496965
1322,50,0,30.970,3,3,1,3,10600.54830,13232.272216
1323,18,1,31.920,3,0,1,2,2205.98080,2267.473699
1324,18,1,36.850,4,0,1,0,1629.83350,3263.584490


In [44]:
#final.to_csv("final.csv")