In [1]:
#Import Libraries for data manipulation and visualization if necessary
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
#Load the dataset into a Pnadas dataFrame
df = pd.read_csv(r"C:\Users\wmypr\insurance.csv")

In [5]:
#Display the first 5 rows to get an overview of the data
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19.0,female,27.9,0.0,yes,southwest,16884.924
1,18.0,male,33.77,1.0,no,Southeast,1725.5523
2,28.0,male,33.0,3.0,no,southeast,$4449.462
3,33.0,male,22.705,0.0,no,northwest,$21984.47061
4,32.0,male,28.88,0.0,no,northwest,$3866.8552


In [7]:
#Get the dimensions of the DataFrame (rows, columns)
df.shape

(1338, 7)

In [9]:
#Generate summary statistics for numerical columns in the DataFrame
df.describe()

Unnamed: 0,age,bmi,children
count,1272.0,1272.0,1272.0
mean,35.214623,30.56055,0.948899
std,22.478251,6.095573,1.303532
min,-64.0,15.96,-4.0
25%,24.75,26.18,0.0
50%,38.0,30.21,1.0
75%,51.0,34.485,2.0
max,64.0,53.13,5.0


In [11]:
#Show basic info about the DataFrame (column types, non-null counts, and memory usage)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1272 non-null   float64
 1   sex       1272 non-null   object 
 2   bmi       1272 non-null   float64
 3   children  1272 non-null   float64
 4   smoker    1272 non-null   object 
 5   region    1272 non-null   object 
 6   charges   1284 non-null   object 
dtypes: float64(3), object(4)
memory usage: 73.3+ KB


In [13]:
#Count the number of missing values in each column
df.isnull().sum()

age         66
sex         66
bmi         66
children    66
smoker      66
region      66
charges     54
dtype: int64

In [15]:
#Count the remaining duplicate rows in the DataFrame
df.duplicated().sum()

1

In [17]:
# Remove duplicate rows from the DataFrame
df = df.drop_duplicates()

In [19]:
# Remove dollar signs from the 'charges' column
df["charges"] = df["charges"].replace(r"[\$]", "", regex=True)

# Convert the 'charges' column to numeric values, as its dtype is still 'object'; invalid values become NaN
df["charges"] = pd.to_numeric(df["charges"], errors="coerce")

In [27]:
#Check the first few rows to ensure the 'charges' column has been cleaned '$' and converted
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1272 non-null   float64
 1   sex       1272 non-null   object 
 2   bmi       1272 non-null   float64
 3   children  1272 non-null   float64
 4   smoker    1272 non-null   object 
 5   region    1272 non-null   object 
 6   charges   1272 non-null   float64
dtypes: float64(4), object(3)
memory usage: 83.6+ KB


In [29]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19.0,female,27.9,0.0,yes,southwest,16884.924
1,18.0,male,33.77,1.0,no,Southeast,1725.5523
2,28.0,male,33.0,3.0,no,southeast,4449.462
3,33.0,male,22.705,0.0,no,northwest,21984.47061
4,32.0,male,28.88,0.0,no,northwest,3866.8552


In [31]:
# Convert any negative values in the 'age' column to positive, as age cannot be negative
df["age"] = df["age"].abs()

In [33]:
# Convert any negative values in the 'children' column to positive, as the number of children cannot be negative
df["children"] = df["children"].abs()

In [35]:
# Generate summary statistics to confirm no negative values in 'age' and 'children' columns
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1272.0,1272.0,1272.0,1272.0
mean,39.335692,30.56055,1.079403,13286.594477
std,14.064247,6.095573,1.197618,12142.505233
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.18,0.0,4733.582163
50%,39.0,30.21,1.0,9382.033
75%,51.0,34.485,2.0,16579.959053
max,64.0,53.13,5.0,63770.42801


In [37]:
# Replace missing values with mean or median, depending on the column distribution
df["age"] = df["age"].fillna(df["age"].mean())
df["bmi"] = df["bmi"].fillna(df["bmi"].median())
df["children"] = df["children"].fillna(df["children"].median())
df["charges"] = df["charges"].fillna(df["charges"].median())

In [39]:
# Replace missing values in the 'sex' column with the most common value (mode)
df["sex"] = df["sex"].fillna(df["sex"].mode()[0])

# Replace missing values in the 'smoker' column with the most common value (mode)
df["smoker"] = df["smoker"].fillna(df["smoker"].mode()[0])

# Replace missing values in the 'region' column with the most common value (mode)
df["region"] = df["region"].fillna(df["region"].mode()[0])

In [43]:
# Check if there are any remaining null values in the dataset after imputation
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [45]:
# Display the unique values in the 'sex' column to check the distinct categories (e.g., 'male', 'female')
df["sex"].unique()

array(['female', 'male', 'woman', 'F', 'man', 'M'], dtype=object)

In [47]:
# Display the count of each unique value in the 'sex' column to see the distribution of categories (e.g., 'male', 'female')
df["sex"].value_counts()

sex
male      582
female    503
man        64
M          64
woman      62
F          62
Name: count, dtype: int64

In [49]:
# Standardize the values in the 'sex' column by replacing variations like 'M' and 'man' with 'male', 
# and 'F' and 'woman' with 'female' to ensure consistent representation of gender categories
df["sex"] = df["sex"].replace({"M" : "male", "man" : "male", "F" : "female", "woman" : "female"})

In [51]:
# Display the updated count of each unique value in the 'sex' column after standardizing the categories (e.g., 'male', 'female')
df["sex"].value_counts()

sex
male      710
female    627
Name: count, dtype: int64

In [53]:
# Display the unique values in the 'region' column to check the distinct categories
df["region"].unique()

array(['southwest', 'Southeast', 'southeast', 'northwest', 'Northwest',
       'Northeast', 'northeast', 'Southwest'], dtype=object)

In [55]:
# Convert all region names to lowercase for consistent representation
df["region"] = df["region"].str.lower()

In [57]:
# Display the unique values in the 'region' column to check the distinct regions after standardization (e.g., 'southeast', 'southwest')
df["region"].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [61]:
# Display the unique values in the 'smoker' column to identify any unexpected values beyond 'yes' and 'no'
df["smoker"].unique()

array(['yes', 'no'], dtype=object)

In [63]:
# Round the values in the 'age' and 'children' columns to the nearest integer and convert them to integer type
# This ensures that both columns have integer values, as age and number of children should be whole numbers
df[["age", "children"]] = df[["age", "children"]].round(0).astype(int)

In [65]:
# Round the values in the 'bmi' and 'charges' columns to 2 decimal places for consistency and better readability
df[["bmi", "charges"]] = df[["bmi", "charges"]].round(2)

In [67]:
# Replace the 'yes' and 'no' values in the 'smoker' column with Boolean values True and False for easier analysis
df["smoker"] = df["smoker"].replace({"yes" : True, "no" : False}).astype(bool)

  df["smoker"] = df["smoker"].replace({"yes" : True, "no" : False}).astype(bool)


In [69]:
#Display the first 5 rows of the dataset to check the updated values and overall structure of the DataFrame
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,True,southwest,16884.92
1,18,male,33.77,1,False,southeast,1725.55
2,28,male,33.0,3,False,southeast,4449.46
3,33,male,22.7,0,False,northwest,21984.47
4,32,male,28.88,0,False,northwest,3866.86


In [71]:
# Display a summary of the DataFrame after data cleaning and preprocessing 
# to verify data types, confirm there are no missing values, and check the final structure
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1337 non-null   int32  
 1   sex       1337 non-null   object 
 2   bmi       1337 non-null   float64
 3   children  1337 non-null   int32  
 4   smoker    1337 non-null   bool   
 5   region    1337 non-null   object 
 6   charges   1337 non-null   float64
dtypes: bool(1), float64(2), int32(2), object(2)
memory usage: 64.0+ KB


In [73]:
# Generate descriptive statistics for the numeric columns after preprocessing 
# to confirm that values are within expected ranges and outliers or anomalies have been handled
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1337.0,1337.0,1337.0,1337.0
mean,39.319372,30.543695,1.075542,13096.768975
std,13.718039,5.945665,1.168246,11873.194137
min,18.0,15.96,0.0,1121.87
25%,27.0,26.41,0.0,4922.92
50%,39.0,30.21,1.0,9382.03
75%,51.0,34.21,2.0,15230.32
max,64.0,53.13,5.0,63770.43


In [75]:
# Save the cleaned DataFrame to a CSV file
df.to_csv("insurance_data_cleaned.csv", index=False)

In [77]:
import sys

In [79]:
print(sys.version)

3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]
