In [1]:
#What Are Missing Values?
#...Missing values are data entries that are blank, unavailable, or null.
#...In Python (especially in Pandas), missing values are represented as NaN (Not a Number); None is treated as missing too.
#...Handling missing data is very important because most machine learning models can’t handle NaN values directly.


In [2]:
#Import Library and Create DataFrame

# ->We’ll use the Pandas library for handling missing data.

In [3]:
import pandas as pd


# Build a small sample dataset that deliberately contains gaps:
# None marks a missing entry in both the numeric and the text columns.
data = dict(
    Name=['Riya', 'Aman', 'Meena', 'Sohan', 'Neha'],
    Age=[20, None, 25, None, 22],
    Marks=[80, 90, None, 70, 85],
    City=['Delhi', 'Mumbai', None, 'Pune', 'Mumbai'],
)

# One column per key, one row per list position.
df = pd.DataFrame(data)
print("Original DataFrame:\n", df)


Original DataFrame:
     Name   Age  Marks    City
0   Riya  20.0   80.0   Delhi
1   Aman   NaN   90.0  Mumbai
2  Meena  25.0    NaN    None
3  Sohan   NaN   70.0    Pune
4   Neha  22.0   85.0  Mumbai


In [17]:
import warnings
warnings.filterwarnings('ignore')

In [18]:
#Checking for Missing Values

#We first check how many missing values are present in each column.

In [19]:
# Count the null entries column by column (isna is the pandas alias of isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

Name     0
Age      2
Marks    1
City     1
dtype: int64


### Handling Missing Values (Filling Techniques)

In [20]:
#Filling Missing Values with Mean

# Use when: The column contains numerical data (like Age, Marks, Salary).
#           Use the mean only when the data has no outliers, since extreme values distort it.
# Formula: Mean = (Sum of all values) / (Number of values)

In [21]:
# Fill the gaps in 'Age' with the column mean (mean() skips NaN by default).
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: the in-place form works on an intermediate Series, raises a
# FutureWarning on pandas 2.x and leaves df unchanged under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].mean())
print(df)


    Name        Age  Marks    City
0   Riya  20.000000   80.0   Delhi
1   Aman  22.333333   90.0  Mumbai
2  Meena  25.000000    NaN    None
3  Sohan  22.333333   70.0    Pune
4   Neha  22.000000   85.0  Mumbai


In [22]:
#Filling Missing Values with Median
#Use when: The column has outliers (extreme values) that can affect the mean.
#Definition: Median = Middle value after sorting all values (the average of the two middle values when the count is even).

In [23]:
# Fill the gaps in 'Marks' with the column median (median() skips NaN by default).
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: the in-place form works on an intermediate Series, raises a
# FutureWarning on pandas 2.x and leaves df unchanged under Copy-on-Write.
df['Marks'] = df['Marks'].fillna(df['Marks'].median())
print(df)


    Name        Age  Marks    City
0   Riya  20.000000   80.0   Delhi
1   Aman  22.333333   90.0  Mumbai
2  Meena  25.000000   82.5    None
3  Sohan  22.333333   70.0    Pune
4   Neha  22.000000   85.0  Mumbai


In [24]:
#Filling Missing Values with Mode
#Use when: The column contains categorical data (like City, Gender, Department).
#Definition: Mode = The most frequently occurring value in a column.

In [25]:
# Fill the gaps in 'City' with the most frequent value.
# mode() returns a Series (there can be ties), so [0] picks its first entry.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: the in-place form works on an intermediate Series, raises a
# FutureWarning on pandas 2.x and leaves df unchanged under Copy-on-Write.
df['City'] = df['City'].fillna(df['City'].mode()[0])
print(df)

    Name        Age  Marks    City
0   Riya  20.000000   80.0   Delhi
1   Aman  22.333333   90.0  Mumbai
2  Meena  25.000000   82.5  Mumbai
3  Sohan  22.333333   70.0    Pune
4   Neha  22.000000   85.0  Mumbai


In [26]:
#Missing Values Handling – bfill & ffill

#1. ffill (Forward Fill):
#It fills each missing value with the previous non-missing value in the same column, carrying that value forward.

#2. bfill (Backward Fill):
#It fills each missing value with the next non-missing value in the same column, carrying that value backward.

In [27]:
# Second sample dataset for the ffill/bfill demo: 'Age' has isolated gaps,
# 'City' has two consecutive gaps.
data = dict(
    Name=['Riya', 'Aman', 'Meena', 'Sohan', 'Neha'],
    Age=[20, None, 25, None, 22],
    City=['Delhi', 'Mumbai', None, None, 'Pune'],
)

# One column per key, one row per list position.
ndf = pd.DataFrame(data)
print("Original DataFrame:\n", ndf)


Original DataFrame:
     Name   Age    City
0   Riya  20.0   Delhi
1   Aman   NaN  Mumbai
2  Meena  25.0    None
3  Sohan   NaN    None
4   Neha  22.0    Pune


In [28]:
# Forward fill: each missing 'Age' takes the last non-missing value above it.
# Series.ffill() replaces the deprecated fillna(method='ffill'), and the result
# is assigned back because in-place filling on a selected column raises a
# FutureWarning on pandas 2.x and leaves ndf unchanged under Copy-on-Write.
ndf['Age'] = ndf['Age'].ffill()
print(ndf)

    Name   Age    City
0   Riya  20.0   Delhi
1   Aman  20.0  Mumbai
2  Meena  25.0    None
3  Sohan  25.0    None
4   Neha  22.0    Pune


In [29]:
# Backward fill: each missing 'City' takes the next non-missing value below it.
# Series.bfill() replaces the deprecated fillna(method='bfill'), and the result
# is assigned back because in-place filling on a selected column raises a
# FutureWarning on pandas 2.x and leaves ndf unchanged under Copy-on-Write.
ndf['City'] = ndf['City'].bfill()
print(ndf)

    Name   Age    City
0   Riya  20.0   Delhi
1   Aman  20.0  Mumbai
2  Meena  25.0    Pune
3  Sohan  25.0    Pune
4   Neha  22.0    Pune
