In [3]:
# 3.1 Loading the Dataset with Missing Values
import pandas as pd

# Read the raw CSV into a DataFrame; later cells refer to it as `data`.
# NOTE(review): the "Unnamed: 0" column in the printed preview looks like a row
# index that was written into the CSV -- presumably `index_col=0` would absorb
# it; confirm nothing relies on that column before changing the call.
data = pd.read_csv('datawithmissing.csv')

# Emit, in order: a five-row preview, a divider line, and the number of
# missing entries found in each column.
print(
    data.head(5),
    "=============================================================================",
    data.isna().sum(),
    sep="\n",
)


   Unnamed: 0  Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin  \
0           0            6    148.0           72.0           35.0      NaN   
1           1            1     85.0           66.0           29.0      NaN   
2           2            8    183.0           64.0            NaN      NaN   
3           3            1     89.0           66.0           23.0     94.0   
4           4            0    137.0           40.0           35.0    168.0   

    BMI  DiabetesPedigreeFunction  Age  Outcome  
0  33.6                     0.627   50        1  
1  26.6                     0.351   31        0  
2  23.3                     0.672   32        1  
3  28.1                     0.167   21        0  
4  43.1                     2.288   33        1  
Unnamed: 0                    0
Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [5]:
# 3.2 Handling Missing Data
# Two common strategies for rows with gaps:
#   * drop them  -> data.dropna(); simple, but it would discard at least 374
#     rows here, because the Insulin column alone has that many gaps.
#   * impute them -> replace each gap with a summary statistic of its column.
# This cell imputes with the column mean and leaves `data` itself untouched.

# `numeric_only=True` restricts the means to numeric columns. Without it,
# pandas >= 2.0 raises a TypeError as soon as the frame contains a text column;
# with it, such columns are simply left unfilled.
column_means = data.mean(numeric_only=True)

# fillna aligns the Series of means on column names, so every column is
# filled with its own mean.
data_filled = data.fillna(column_means)

# Check if missing values are filled (every count should be 0)
print(data_filled.isnull().sum())


Unnamed: 0                  0
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [5]:
# 3.3 Handling Missing Values in Specific Columns
# Two targeted steps, expressed as one chain:
#   1. fill gaps in 'Glucose' only, using that column's mean;
#   2. discard any row whose 'Outcome' value is missing.
# Every other column keeps its missing values.
data = (
    data
    .fillna({'Glucose': data['Glucose'].mean()})
    .dropna(subset=['Outcome'])
)

# Per-column count of the gaps that remain after the two steps
print(data.isna().sum())


Unnamed: 0                    0
Pregnancies                   0
Glucose                       0
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64


In [6]:
# 3.4 Feature Scaling (Optional)
# Features measured on very different ranges (e.g., age vs glucose levels) can
# be put on a common scale, which can help the performance of machine learning models.

from sklearn.preprocessing import StandardScaler

# Columns to standardize
numeric_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Learn each column's mean and standard deviation, then write the standardized
# values back over the same columns of `data`.
# Missing entries are not filled by this step: they still show up as NaN in
# the preview printed below.
scaler = StandardScaler()
data[numeric_columns] = scaler.fit(data[numeric_columns]).transform(data[numeric_columns])

# Preview the first five rows after scaling
print(data.head(5))


   Unnamed: 0  Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin  \
0           0            6  0.865108      -0.032746       0.558557       NaN   
1           1            1 -1.206162      -0.517645      -0.014657       NaN   
2           2            8  2.015813      -0.679278            NaN       NaN   
3           3            1 -1.074652      -0.517645      -0.587871 -0.518847   
4           4            0  0.503458      -2.618874       0.558557  0.104968   

        BMI  DiabetesPedigreeFunction  Age  Outcome  
0  0.165097                     0.627   50        1  
1 -0.846404                     0.351   31        0  
2 -1.323254                     0.672   32        1  
3 -0.629654                     0.167   21        0  
4  1.537847                     2.288   33        1  
