In [11]:
import pandas as pd
import numpy as np

# Load the raw insurance dataset from its CSV file on disk.
df = pd.read_csv("D:/Project/Guvi_Project/Medical Insurance Cost Prediction/medical_insurance.csv")

# Display basic info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appends a stray "None" line.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2772 entries, 0 to 2771
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       2772 non-null   int64  
 1   sex       2772 non-null   object 
 2   bmi       2772 non-null   float64
 3   children  2772 non-null   int64  
 4   smoker    2772 non-null   object 
 5   region    2772 non-null   object 
 6   charges   2772 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 151.7+ KB
None
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


In [13]:
# Count the null entries in every column and show the per-column totals.
missing_per_column = df.isnull().sum()
print(missing_per_column)

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


In [15]:
# Report rows whose 'bmi' falls outside the 10-60 range used as the sanity band.
bmi_outliers = df[(df['bmi'] < 10) | (df['bmi'] > 60)]
print(f"BMI outliers:\n {bmi_outliers}")

# Cap 'bmi' into the same 10-60 band (a no-op when the report above is empty).
df['bmi'] = df['bmi'].clip(lower=10, upper=60)

# Drop the rows whose 'charges' exceed the 99th percentile.
# NOTE: the threshold is computed from the current `df`, so re-running this
# cell trims a further 1% each time.
upper_limit = df['charges'].quantile(0.99)
# Later cells assign new columns on `df`; take an explicit copy of the filtered
# rows so those assignments never depend on pandas' view-vs-copy tracking
# (the source of SettingWithCopyWarning).
df = df.loc[df['charges'] <= upper_limit].copy()

BMI outliers:
 Empty DataFrame
Columns: [age, sex, bmi, children, smoker, region, charges]
Index: []


In [17]:
# Binary encoding for 'sex' and 'smoker'.
# Series.map() turns every value missing from the mapping into NaN without
# complaint, so an unexpected label -- or simply running this cell a second
# time, when the columns already hold 0/1 -- would silently wipe the column.
# Validate the mapped result before assigning it and fail loudly instead.
BINARY_CODES = {
    'sex': {'male': 0, 'female': 1},
    'smoker': {'no': 0, 'yes': 1},
}
for column, codes in BINARY_CODES.items():
    encoded = df[column].map(codes)
    unmapped = df.loc[encoded.isnull(), column]
    if not unmapped.empty:
        raise ValueError(
            f"Cannot encode column '{column}': unexpected values "
            f"{sorted(unmapped.astype(str).unique())} "
            f"(expected one of {list(codes)}; was this cell already run?)"
        )
    df[column] = encoded

In [19]:
# Preview the first five rows after the binary encoding step.
preview_rows = df.head(5)
print(preview_rows)

   age  sex     bmi  children  smoker     region      charges
0   19    1  27.900         0       1  southwest  16884.92400
1   18    0  33.770         1       0  southeast   1725.55230
2   28    0  33.000         3       0  southeast   4449.46200
3   33    0  22.705         0       0  northwest  21984.47061
4   32    0  28.880         0       0  northwest   3866.85520


In [21]:
# Add 'log_charges' = log(1 + charges). The transform is applied
# unconditionally; cost data is commonly right-skewed, which is the motivation.
charges_log = np.log1p(df['charges'])
df['log_charges'] = charges_log

In [23]:
# Replace 'region' with indicator columns named 'region_<value>'.
# drop_first=True omits the first category so the indicators are not redundant.
categorical_columns = ['region']
df = pd.get_dummies(
    df,
    columns=categorical_columns,
    prefix='region',
    drop_first=True,
)

In [25]:
# Sanity-check the cleaned frame: first ten rows, then summary statistics.
for report in (df.head(10), df.describe()):
    print(report)

   age  sex     bmi  children  smoker      charges  log_charges  \
0   19    1  27.900         0       1  16884.92400     9.734236   
1   18    0  33.770         1       0   1725.55230     7.453882   
2   28    0  33.000         3       0   4449.46200     8.400763   
3   33    0  22.705         0       0  21984.47061     9.998137   
4   32    0  28.880         0       0   3866.85520     8.260455   
5   31    1  25.740         0       0   3756.62160     8.231541   
6   46    1  33.440         1       0   8240.58960     9.016949   
7   37    1  27.740         3       0   7281.50560     8.893230   
8   37    0  29.830         2       0   6406.41070     8.765211   
9   60    1  25.840         0       0  28923.13692    10.272432   

   region_northwest  region_southeast  region_southwest  
0             False             False              True  
1             False              True             False  
2             False              True             False  
3              True           

In [27]:
# Save the cleaned dataset to its own file.
# The original target was "medical_insurance.csv" -- the very file this notebook
# loads as raw input. Overwriting it destroys the source data and makes the
# notebook fail on a fresh re-run (the encoded/dropped columns no longer match
# what the earlier cells expect). Downstream steps should read the
# "_cleaned" file instead.
df.to_csv("D:/Project/Guvi_Project/Medical Insurance Cost Prediction/medical_insurance_cleaned.csv", index=False)