# Topic: Data Manipulation
This Python notebook demonstrates common pandas functions and methods for data manipulation.

### HANDLING MISSING VALUES

Methods:
• dropna(): Removes missing values (entire row/column if NaN is present)

• fillna(): Fills missing values with a specified value (mean, median, mode, forward fill, backward fill)

• isnull().sum(): Counts missing values in each column

• interpolate(): Estimates missing values using interpolation techniques

• replace(): Replaces missing or specific values with defined values

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Raw CSV hosted on GitHub; read_csv fetches it over HTTP, so this cell needs network access.
url = "https://raw.githubusercontent.com/drishtij3/Datasets/refs/heads/main/data_with_more_features.csv"
df = pd.read_csv(url)

# Bare last expression: renders the frame (pandas truncates long frames in the display).
df

Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,,married,sometimes,yes,0.0,,5 - 10,69,90,88
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
3,3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,,5 - 10,45,56,42
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30636,816,female,group D,high school,standard,none,single,sometimes,no,2.0,school_bus,5 - 10,59,61,65
30637,890,male,group E,high school,standard,none,single,regularly,no,1.0,private,5 - 10,58,53,51
30638,911,female,,high school,free/reduced,completed,married,sometimes,no,1.0,private,5 - 10,61,70,67
30639,934,female,group D,associate's degree,standard,completed,married,regularly,no,3.0,school_bus,5 - 10,82,90,93


In [3]:
# Most frequent value(s) of every column. Row 0 holds the first mode of each column;
# the extra rows exist only because a column has many tied modes, and the remaining
# columns are padded with NaN on those rows.
df.mode()

Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,group C,some college,standard,none,married,sometimes,yes,1.0,school_bus,5 - 10,64.0,65.0,67.0
1,1,,,,,,,,,,,,,,
2,2,,,,,,,,,,,,,,
3,4,,,,,,,,,,,,,,
4,6,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
690,992,,,,,,,,,,,,,,
691,993,,,,,,,,,,,,,,
692,996,,,,,,,,,,,,,,
693,998,,,,,,,,,,,,,,


In [4]:
# Show the frame again: mode() returned a new object, so df itself is unchanged.
df

Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,,married,sometimes,yes,0.0,,5 - 10,69,90,88
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
3,3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,,5 - 10,45,56,42
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30636,816,female,group D,high school,standard,none,single,sometimes,no,2.0,school_bus,5 - 10,59,61,65
30637,890,male,group E,high school,standard,none,single,regularly,no,1.0,private,5 - 10,58,53,51
30638,911,female,,high school,free/reduced,completed,married,sometimes,no,1.0,private,5 - 10,61,70,67
30639,934,female,group D,associate's degree,standard,completed,married,regularly,no,3.0,school_bus,5 - 10,82,90,93


In [5]:
# Column dtypes and non-null counts; a non-null count below the number of entries
# means that column contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30641 entries, 0 to 30640
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           30641 non-null  int64  
 1   Gender               30641 non-null  object 
 2   EthnicGroup          28801 non-null  object 
 3   ParentEduc           28796 non-null  object 
 4   LunchType            30641 non-null  object 
 5   TestPrep             28811 non-null  object 
 6   ParentMaritalStatus  29451 non-null  object 
 7   PracticeSport        30010 non-null  object 
 8   IsFirstChild         29737 non-null  object 
 9   NrSiblings           29069 non-null  float64
 10  TransportMeans       27507 non-null  object 
 11  WklyStudyHours       29686 non-null  object 
 12  MathScore            30641 non-null  int64  
 13  ReadingScore         30641 non-null  int64  
 14  WritingScore         30641 non-null  int64  
dtypes: float64(1), int64(4), object(10)


In [6]:
# Preview the first five rows.
df.head()

Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,,married,sometimes,yes,0.0,,5 - 10,69,90,88
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
3,3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,,5 - 10,45,56,42
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75


In [7]:
# Display the element-wise boolean mask of missing values (True where a cell is NaN).
# This is not a count: per-column counts come from df.isnull().sum().
display(df.isnull())


Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30636,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
30637,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
30638,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
30639,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


# dropna(): Remove missing values


In [9]:
# Keep only the rows that are complete: a row is dropped as soon as any
# of its columns is NaN. The original df is left untouched.
df_dropna = df.dropna(axis=0, how="any")
display(df_dropna)

Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75
5,5,female,group B,associate's degree,standard,none,married,regularly,yes,1.0,school_bus,5 - 10,73,84,79
6,6,female,group B,some college,standard,completed,widowed,never,no,1.0,private,5 - 10,85,93,89
7,7,male,group B,some college,free/reduced,none,married,sometimes,yes,1.0,private,> 10,41,43,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30635,794,male,group C,some college,standard,none,married,regularly,no,2.0,school_bus,5 - 10,58,53,49
30636,816,female,group D,high school,standard,none,single,sometimes,no,2.0,school_bus,5 - 10,59,61,65
30637,890,male,group E,high school,standard,none,single,regularly,no,1.0,private,5 - 10,58,53,51
30639,934,female,group D,associate's degree,standard,completed,married,regularly,no,3.0,school_bus,5 - 10,82,90,93


In [10]:
# Column-wise variant: discard every column that contains at least one NaN,
# keeping only the fully populated columns.
df_dropna_cols = df.dropna(axis="columns")
display(df_dropna_cols.head())

Unnamed: 0.1,Unnamed: 0,Gender,LunchType,MathScore,ReadingScore,WritingScore
0,0,female,standard,71,71,74
1,1,female,standard,69,90,88
2,2,female,standard,87,93,91
3,3,male,free/reduced,45,56,42
4,4,male,standard,76,78,75


# fillna(): Fill missing values with a specified value


In [11]:
# Mean imputation for numeric columns. The means Series is indexed by numeric
# column names only, so fillna leaves the non-numeric columns untouched.
numeric_means = df.mean(numeric_only=True)
df_filled_mean = df.fillna(value=numeric_means)
display(df_filled_mean.head())

Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,,married,sometimes,yes,0.0,,5 - 10,69,90,88
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
3,3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,,5 - 10,45,56,42
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75


In [12]:
# Fill categorical (non-numeric) columns with their most frequent value.
# The mode is computed on the categorical columns only: passing df.mode().iloc[0]
# for the whole frame would also overwrite gaps in numeric columns such as
# NrSiblings, which is not what "categorical" imputation should do.
categorical_cols = df.select_dtypes(exclude="number").columns
# mode() can return several rows when values tie; .iloc[0] takes the first mode per column.
categorical_modes = df[categorical_cols].mode().iloc[0]
# fillna with a Series fills each column named in its index and skips all others.
df_filled_mode = df.fillna(categorical_modes)
display(df_filled_mode.head())

Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,group C,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,69,90,88
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
3,3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,school_bus,5 - 10,45,56,42
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75


In [13]:
# Forward fill: propagate the last valid value of each column down into the gaps.
# A NaN in the first row has no earlier value to copy, so it stays missing.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated and
# emits a FutureWarning.
df_ffill = df.ffill()
display(df_ffill.head())

  df_ffill = df.fillna(method='ffill')


Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,69,90,88
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
3,3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,school_bus,5 - 10,45,56,42
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75


In [14]:
# Backward fill: propagate the next valid value of each column up into the gaps.
# A NaN in the last row has no later value to copy, so it would stay missing.
# DataFrame.bfill() is used because fillna(method='bfill') is deprecated and
# emits a FutureWarning.
df_bfill = df.bfill()
display(df_bfill)

  df_bfill = df.fillna(method='bfill')


Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,group C,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,69,90,88
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
3,3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,school_bus,5 - 10,45,56,42
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30636,816,female,group D,high school,standard,none,single,sometimes,no,2.0,school_bus,5 - 10,59,61,65
30637,890,male,group E,high school,standard,none,single,regularly,no,1.0,private,5 - 10,58,53,51
30638,911,female,group D,high school,free/reduced,completed,married,sometimes,no,1.0,private,5 - 10,61,70,67
30639,934,female,group D,associate's degree,standard,completed,married,regularly,no,3.0,school_bus,5 - 10,82,90,93


In [15]:
# 3. isnull().sum(): Count missing values
# Summing the boolean NaN mask column by column gives the number of gaps per column.
missing_per_column = df.isna().sum()
display(missing_per_column)

Unnamed: 0                0
Gender                    0
EthnicGroup            1840
ParentEduc             1845
LunchType                 0
TestPrep               1830
ParentMaritalStatus    1190
PracticeSport           631
IsFirstChild            904
NrSiblings             1572
TransportMeans         3134
WklyStudyHours          955
MathScore                 0
ReadingScore              0
WritingScore              0
dtype: int64

# interpolate(): Estimate missing values using interpolation techniques

In [16]:

# 4. interpolate(): Estimate missing values
# Interpolation is only defined for numeric data. Calling interpolate() on a frame
# that still contains object (string) columns is deprecated and emits a FutureWarning,
# so interpolate the numeric columns only and leave every other column as it was.
numeric_cols = df.select_dtypes(include="number").columns
df_interpolated = df.copy()
df_interpolated[numeric_cols] = df[numeric_cols].interpolate()
display(df_interpolated.head())

  df_interpolated = df.interpolate()


Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,,married,sometimes,yes,0.0,,5 - 10,69,90,88
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
3,3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,,5 - 10,45,56,42
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75


# replace(): Replace missing or specific values with defined values
# isnull().sum(): Count missing values in each column

In [17]:
# Value-for-value substitution: missing entries are shown as "Unknown" and the
# literal string "none" is shown as "Missing" in the result.
replacements = {None: "Unknown", "none": "Missing"}
df_replaced = df.replace(to_replace=replacements)
display(df_replaced.head())


Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,Unknown,bachelor's degree,standard,Missing,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,Unknown,married,sometimes,yes,0.0,Unknown,5 - 10,69,90,88
2,2,female,group B,master's degree,standard,Missing,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
3,3,male,group A,associate's degree,free/reduced,Missing,married,never,no,1.0,Unknown,5 - 10,45,56,42
4,4,male,group C,some college,standard,Missing,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75


### Separate DataFrame for ffill, bfill, and interpolate Demonstration ###

In [18]:
# Small all-numeric frame for comparing the fill strategies. It has gaps in the
# first row (C), in consecutive rows (A and B), and in the middle of a column.
df_demo = pd.DataFrame(
    dict(
        A=[1, np.nan, np.nan, 4, 5],
        B=[10, 20, np.nan, np.nan, 50],
        C=[np.nan, 200, 300, np.nan, 500],
    )
)




In [19]:

# Show the unfilled demo frame as the baseline for the fill results that follow.
print("Original DataFrame:")
display(df_demo)

Original DataFrame:


Unnamed: 0,A,B,C
0,1.0,10.0,
1,,20.0,200.0
2,,,300.0
3,4.0,,
4,5.0,50.0,500.0


In [20]:
 
# Forward Fill: each NaN takes the closest valid value above it in the same column.
# A NaN in the first row has nothing above it and stays NaN.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated and
# emits a FutureWarning.
df_ffill_demo = df_demo.ffill()
print("Forward Fill:")
display(df_ffill_demo)


Forward Fill:


  df_ffill_demo = df_demo.fillna(method='ffill')


Unnamed: 0,A,B,C
0,1.0,10.0,
1,1.0,20.0,200.0
2,1.0,20.0,300.0
3,4.0,20.0,300.0
4,5.0,50.0,500.0


In [21]:

# Backward Fill: each NaN takes the closest valid value below it in the same column.
# DataFrame.bfill() is used because fillna(method='bfill') is deprecated and
# emits a FutureWarning.
df_bfill_demo = df_demo.bfill()
print("Backward Fill:")
display(df_bfill_demo)


Backward Fill:


  df_bfill_demo = df_demo.fillna(method='bfill')


Unnamed: 0,A,B,C
0,1.0,10.0,200.0
1,4.0,20.0,200.0
2,4.0,50.0,300.0
3,4.0,50.0,500.0
4,5.0,50.0,500.0


In [22]:

# Interpolation: gaps are filled with evenly spaced values between the
# surrounding valid points (linear is the default method). A NaN in the
# first row has no earlier point to interpolate from, so it stays NaN.
print("Interpolation:")
df_interpolate_demo = df_demo.interpolate(method="linear")
display(df_interpolate_demo)

Interpolation:


Unnamed: 0,A,B,C
0,1.0,10.0,
1,2.0,20.0,200.0
2,3.0,30.0,300.0
3,4.0,40.0,400.0
4,5.0,50.0,500.0
