### Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Reading CSV file

In [2]:
df=pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Data Exploration

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
df.duplicated().sum()

1

In [5]:
df.drop_duplicates(inplace=True)

In [6]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1337.0,1337.0,1337.0,1337.0
mean,39.222139,30.663452,1.095737,13279.121487
std,14.044333,6.100468,1.205571,12110.359656
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29,0.0,4746.344
50%,39.0,30.4,1.0,9386.1613
75%,51.0,34.7,2.0,16657.71745
max,64.0,53.13,5.0,63770.42801


### Data Preprocessing

In [7]:
df1=df.drop('sex',axis=1)

In [8]:
df1.head()

Unnamed: 0,age,bmi,children,smoker,region,charges
0,19,27.9,0,yes,southwest,16884.924
1,18,33.77,1,no,southeast,1725.5523
2,28,33.0,3,no,southeast,4449.462
3,33,22.705,0,no,northwest,21984.47061
4,32,28.88,0,no,northwest,3866.8552


#### Using get_dummies method for encoding categorical values

In [9]:
df2=pd.get_dummies(df1,columns=['smoker','region'])
df2.head()

Unnamed: 0,age,bmi,children,charges,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,1,0,0,0,1
1,18,33.77,1,1725.5523,1,0,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,0,1,0
3,33,22.705,0,21984.47061,1,0,0,1,0,0
4,32,28.88,0,3866.8552,1,0,0,1,0,0


In [10]:
X=df2.drop('charges',axis=1)
y=df2['charges']

In [11]:
X

Unnamed: 0,age,bmi,children,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.900,0,0,1,0,0,0,1
1,18,33.770,1,1,0,0,0,1,0
2,28,33.000,3,1,0,0,0,1,0
3,33,22.705,0,1,0,0,1,0,0
4,32,28.880,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,1,0,0,1,0,0
1334,18,31.920,0,1,0,1,0,0,0
1335,18,36.850,0,1,0,0,0,1,0
1336,21,25.800,0,1,0,0,0,0,1


In [12]:
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1337, dtype: float64

In [13]:
from sklearn.preprocessing import MinMaxScaler

scale=MinMaxScaler()
col=['age','bmi']

X[col]=scale.fit_transform(X[col])

In [14]:
X.head()

Unnamed: 0,age,bmi,children,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.021739,0.321227,0,0,1,0,0,0,1
1,0.0,0.47915,1,1,0,0,0,1,0
2,0.217391,0.458434,3,1,0,0,0,1,0
3,0.326087,0.181464,0,1,0,0,1,0,0
4,0.304348,0.347592,0,1,0,0,1,0,0


In [15]:
# Drop one binary variable for each categorical variable to avoid multicollinearity
df_encoded = X.drop(columns=['smoker_no', 'region_northeast', 'region_southeast'],axis=1)

# Now, the dataset should contain:
# age, bmi, children, smoker_yes, region_northwest, region_southwest

# Use df_encoded for further modeling

df_encoded.head()

Unnamed: 0,age,bmi,children,smoker_yes,region_northwest,region_southwest
0,0.021739,0.321227,0,1,0,1
1,0.0,0.47915,1,0,0,0
2,0.217391,0.458434,3,0,0,0
3,0.326087,0.181464,0,0,1,0
4,0.304348,0.347592,0,0,1,0


### Train_test_split Dataset

In [16]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(df_encoded,y,test_size=0.2,random_state=0)

### Model Building and Training

In [17]:
from sklearn.ensemble import RandomForestRegressor

model=RandomForestRegressor()
model.fit(x_train,y_train)

In [18]:
model.score(x_test,y_test)

0.835367116219148

### Finetuning

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor


In [20]:
model_params={
    'linear_regression': {
        'model': LinearRegression(),
        'params': {}
        },
    'decision_tree_regression': {
        'model': DecisionTreeRegressor(),
        'params': {
            'max_depth': [None, 5, 10, 15, 20, 30, 50]
        }
    },
    'random_forest_regression': {
        'model': RandomForestRegressor(),
        'params': {
            'n_estimators': [5, 10, 15, 20, 30, 40, 50],
            
        }
    },
    'gradient_boosting_regression': {
        'model': GradientBoostingRegressor(),
        'params': {
            'n_estimators': [5, 10, 15, 20, 30, 40, 50],
            'learning_rate': [0.01, 0.1, 0.5, 1],
          
        }
    }
}


In [21]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(x_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
        'Accuracy'   : clf.score(x_test,y_test)
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params','Accuracy'])
df

Unnamed: 0,model,best_score,best_params,Accuracy
0,linear_regression,0.738531,{},0.75225
1,decision_tree_regression,0.834899,{'max_depth': 5},0.811973
2,random_forest_regression,0.829717,{'n_estimators': 50},0.834516
3,gradient_boosting_regression,0.856685,"{'learning_rate': 0.5, 'n_estimators': 5}",0.847972


### Making the Model Considering Gender feature

In [22]:
data=pd.read_csv('insurance.csv')
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [23]:
data.drop_duplicates(inplace=True)

In [24]:
data2=pd.get_dummies(data,columns=['sex','smoker','region'])
data2.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1,0,0,1,0,0,0,1
1,18,33.77,1,1725.5523,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.462,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.88,0,3866.8552,0,1,1,0,0,1,0,0


In [25]:
X=data2.drop('charges',axis=1)
y=data2['charges']

In [26]:
X.head()

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,1,0,0,1,0,0,0,1
1,18,33.77,1,0,1,1,0,0,0,1,0
2,28,33.0,3,0,1,1,0,0,0,1,0
3,33,22.705,0,0,1,1,0,0,1,0,0
4,32,28.88,0,0,1,1,0,0,1,0,0


In [27]:
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

In [28]:
from sklearn.preprocessing import MinMaxScaler

scale=MinMaxScaler()
col=['age','bmi']

X[col]=scale.fit_transform(X[col])

In [29]:
X.head()

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.021739,0.321227,0,1,0,0,1,0,0,0,1
1,0.0,0.47915,1,0,1,1,0,0,0,1,0
2,0.217391,0.458434,3,0,1,1,0,0,0,1,0
3,0.326087,0.181464,0,0,1,1,0,0,1,0,0
4,0.304348,0.347592,0,0,1,1,0,0,1,0,0


In [30]:
# Drop one binary variable for each categorical variable to avoid multicollinearity
df_encoded = X.drop(columns=['smoker_no', 'region_northeast', 'region_southeast'],axis=1)

# Now, the dataset should contain:
# age, bmi, children, smoker_yes, region_northwest, region_southwest

# Use df_encoded for further modeling
df_encoded.head()

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_yes,region_northwest,region_southwest
0,0.021739,0.321227,0,1,0,1,0,1
1,0.0,0.47915,1,0,1,0,0,0
2,0.217391,0.458434,3,0,1,0,0,0
3,0.326087,0.181464,0,0,1,0,1,0
4,0.304348,0.347592,0,0,1,0,1,0


In [31]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(df_encoded,y,test_size=.2,random_state=0)

In [32]:
from sklearn.ensemble import RandomForestRegressor

clf=RandomForestRegressor()
clf.fit(x_train,y_train)

In [33]:
clf.score(x_test,y_test)

0.835258060472372

In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor


In [35]:
model_params={
    'linear_regression': {
        'model': LinearRegression(),
        'params': {}
        },
    'decision_tree_regression': {
        'model': DecisionTreeRegressor(),
        'params': {
            'max_depth': [None, 5, 10, 15, 20, 30, 50]
        }
    },
    'random_forest_regression': {
        'model': RandomForestRegressor(),
        'params': {
            'n_estimators': [5, 10, 15, 20, 30, 40, 50],
            
        }
    },
    'gradient_boosting_regression': {
        'model': GradientBoostingRegressor(),
        'params': {
            'n_estimators': [5, 10, 15, 20, 30, 40, 50],
            'learning_rate': [0.01, 0.1, 0.5, 1],
          
        }
    }
}


In [36]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(x_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
        'Accuracy'   : clf.score(x_test,y_test)
    })
    
df_ = pd.DataFrame(scores,columns=['model','best_score','best_params','Accuracy'])
df_

Unnamed: 0,model,best_score,best_params,Accuracy
0,linear_regression,0.738129,{},0.752058
1,decision_tree_regression,0.834879,{'max_depth': 5},0.811958
2,random_forest_regression,0.82685,{'n_estimators': 50},0.836847
3,gradient_boosting_regression,0.856834,"{'learning_rate': 0.1, 'n_estimators': 50}",0.850124


### There is no difference  in model accuracy if the Gender feature present or absent