In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [50]:
df = pd.read_csv("medical_cost.csv")

In [51]:
df.shape

(1338, 7)

In [52]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [53]:
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
medical charges,0


In [54]:
df.duplicated().sum()

1

In [55]:
df.drop_duplicates(inplace=True)

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              1337 non-null   int64  
 1   sex              1337 non-null   object 
 2   bmi              1337 non-null   float64
 3   children         1337 non-null   int64  
 4   smoker           1337 non-null   object 
 5   region           1337 non-null   object 
 6   medical charges  1337 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 83.6+ KB


In [58]:
cat_col = [col for col in df.columns if df[col].dtype == 'object']
cat_col

['sex', 'smoker', 'region']

In [59]:
for col in cat_col:
    print(f"{col}: {df[col].unique()}")

sex: ['female' 'male']
smoker: ['yes' 'no']
region: ['southwest' 'southeast' 'northwest' 'northeast']


In [60]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in cat_col:
    df[col] = le.fit_transform(df[col])

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              1337 non-null   int64  
 1   sex              1337 non-null   int64  
 2   bmi              1337 non-null   float64
 3   children         1337 non-null   int64  
 4   smoker           1337 non-null   int64  
 5   region           1337 non-null   int64  
 6   medical charges  1337 non-null   float64
dtypes: float64(2), int64(5)
memory usage: 83.6 KB


In [62]:
X, y = df.drop('medical charges', axis=1), df['medical charges']

In [63]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [64]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train, y_train)

In [65]:
y_pred = rf.predict(X_test)

In [66]:
y_pred

array([16477.5413279 , 12579.2016847 , 10096.7193823 ,  9241.8195371 ,
        9699.9661994 ,  2112.03390983, 11380.5194845 , 23326.2921361 ,
       27799.171898  ,  1901.19251369, 13177.3471771 , 39715.2617007 ,
       19846.7918481 , 11552.4182667 , 10780.3662179 , 47431.5361197 ,
       12825.9467658 , 12075.8580485 , 28927.8015291 ,  1307.807651  ,
        1817.796857  , 14115.9965589 ,  5987.9422674 ,  5317.7806166 ,
       10220.838909  , 21890.0303885 , 27531.853542  ,  6561.451471  ,
       39392.1973802 , 15259.7284917 ,  2272.849194  ,  1300.320436  ,
       13952.9964042 ,  8194.8110853 ,  7577.1935594 ,  2936.8115443 ,
       14528.5486287 ,  9867.0645301 ,  5258.267377  ,  1861.17086   ,
       11975.3153499 , 16050.8974597 ,  5719.3256922 , 21667.6533844 ,
       15439.0069475 , 10057.0203252 , 23518.1657735 ,  3066.950395  ,
        9929.7415135 , 12314.5547986 ,  5339.9679125 ,  3806.7270525 ,
       18406.678799  ,  5239.7857167 ,  2144.4829835 ,  9734.0100979 ,
      

In [67]:
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("R2:", r2)

MSE: 19343965.20645149
R2: 0.8459643704924885


In [71]:
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30, 50],
}

In [72]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

In [73]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


In [75]:
grid.best_params_

{'max_depth': 10, 'max_features': 'log2', 'n_estimators': 300}

In [76]:
grid.best_score_

0.8402204332748354

In [77]:
y_pred_grid = grid.predict(X_test)

In [78]:
print(r2_score(y_test, y_pred_grid))

0.8501212869478133
