In [None]:
# Working with Missing Data in Pandas
# Missing data occurs when no information is provided for one or more
# items, or for a whole unit. Missing data is a very big problem in real-life scenarios.
# Missing data is also referred to as NA (Not Available) values in pandas.
# Many datasets simply arrive with missing data,
# either because it exists and was not collected or because it never existed.
# For example, some users being surveyed may choose not to share their income,
# and others may choose not to share their address; this is how values in many datasets go missing.

# Dataset : https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/ufo.csv


import pandas as pd
import warnings as wrn
wrn.filterwarnings('ignore')

# Read the UFO sightings CSV directly from GitHub, then preview the first rows.
UFO_CSV_URL = 'https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/ufo.csv'
ufo = pd.read_csv(UFO_CSV_URL)
ufo.head()





Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [None]:
# Print pandas' summary of the frame (columns, non-null counts, dtypes).
ufo.info()


In [None]:
# Count the missing (NA/NaN) entries in every column.
ufo.isna().sum()

Unnamed: 0,0
City,26
Colors Reported,15359
Shape Reported,2644
State,0
Time,0


In [None]:
# fillna()
# Replace NA/NaN values with a placeholder.
# The fill is applied to the DataFrame itself (column -> value mapping) and the
# result is assigned back, instead of calling fillna(inplace=True) on a column
# selection: that chained form is deprecated in pandas 2.2 and does not modify
# the frame once Copy-on-Write is active (the default from pandas 3.0).
ufo = ufo.fillna({
    'City': 'Unknown',
    'Shape Reported': 'Unknown',
    'Colors Reported': 'Unknown',
})

# The three columns that had missing values no longer contain any NaN.



In [None]:
# Re-count the missing entries per column to confirm the fill.
ufo.isna().sum()

Unnamed: 0,0
City,0
Colors Reported,0
Shape Reported,0
State,0
Time,0


In [None]:
# Load a second, smaller dataset (marks / percentage sample).
SAMPLE_CSV_URL = 'https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/sample11.csv'
data = pd.read_csv(SAMPLE_CSV_URL)
data

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,
2,3,Alex,,67.0
3,4,Alex,12.0,
4,5,Alex,,
5,6,Alex,54.0,
6,7,Alex,65.0,66.0


In [None]:
# ffill --> forward fill: each missing value takes the last valid value above it.
# Series.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated since pandas 2.1. Assigning the result back to the column avoids
# the chained inplace=True call, which does not update `data` under
# Copy-on-Write.
data['Marks'] = data['Marks'].ffill()
data

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,
2,3,Alex,23.0,67.0
3,4,Alex,12.0,
4,5,Alex,12.0,
5,6,Alex,54.0,
6,7,Alex,65.0,66.0


In [None]:
# bfill --> backward fill: each missing value takes the next valid value below it.
# Series.bfill() replaces the deprecated fillna(method='bfill'); the result is
# assigned back because a chained inplace=True fill on a column selection does
# not update `data` under Copy-on-Write.
data['Percentage'] = data['Percentage'].bfill()
data

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,67.0
2,3,Alex,23.0,67.0
3,4,Alex,12.0,66.0
4,5,Alex,12.0,66.0
5,6,Alex,54.0,66.0
6,7,Alex,65.0,66.0


In [None]:
# Read the sample CSV again so `data` holds the unfilled values once more.
data = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/sample11.csv'
)
data

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,
2,3,Alex,,67.0
3,4,Alex,12.0,
4,5,Alex,,
5,6,Alex,54.0,
6,7,Alex,65.0,66.0


In [None]:
# Fill missing values with the column mean, rounded to one decimal place.
# mean() skips NaN by default, so only the known percentages contribute.
mean_val = round(data['Percentage'].mean(), 1)
# Assign the filled column back instead of using a chained inplace=True fill,
# which is deprecated and leaves `data` untouched under Copy-on-Write.
data['Percentage'] = data['Percentage'].fillna(mean_val)
data



Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,70.3
2,3,Alex,,67.0
3,4,Alex,12.0,70.3
4,5,Alex,,70.3
5,6,Alex,54.0,70.3
6,7,Alex,65.0,66.0


In [None]:
# The pandas interpolate() method fills NaN values in a DataFrame or Series
# using an interpolation technique instead of a hard-coded value.
# Interpolation estimates unknown data points that lie between two known
# data points (linear interpolation is the default).
#
# Only the numeric columns are interpolated: calling DataFrame.interpolate()
# on a frame that also holds text columns (here `Name`) is deprecated for
# object dtype in pandas 2.1+. The result is built on a copy, so `data`
# itself is left unchanged, as with the plain data.interpolate() call.
numeric_cols = data.select_dtypes(include='number').columns
interpolated = data.copy()
interpolated[numeric_cols] = interpolated[numeric_cols].interpolate()
interpolated




Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,70.3
2,3,Alex,17.5,67.0
3,4,Alex,12.0,70.3
4,5,Alex,33.0,70.3
5,6,Alex,54.0,70.3
6,7,Alex,65.0,66.0
