# Topic: Data Filtering
This file contains a collection of code snippets that demonstrate how to filter data in various ways using Python and its libraries. The examples cover filtering data based on conditions, using boolean indexing, and applying functions to filter data.


In [1]:
import pandas as pd
import numpy as np

In [2]:

# Load the student-scores dataset from the hosted CSV
# (point `url` at a different path or URL to use another copy of the data)
url = "https://raw.githubusercontent.com/drishtij3/Datasets/refs/heads/main/data_with_more_features.csv"
df = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Show the loaded DataFrame to inspect its columns and a sample of rows
df

Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,,married,sometimes,yes,0.0,,5 - 10,69,90,88
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
3,3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,,5 - 10,45,56,42
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30636,816,female,group D,high school,standard,none,single,sometimes,no,2.0,school_bus,5 - 10,59,61,65
30637,890,male,group E,high school,standard,none,single,regularly,no,1.0,private,5 - 10,58,53,51
30638,911,female,,high school,free/reduced,completed,married,sometimes,no,1.0,private,5 - 10,61,70,67
30639,934,female,group D,associate's degree,standard,completed,married,regularly,no,3.0,school_bus,5 - 10,82,90,93



### Filtering Data ###


# loc[]: Select rows based on conditions

In [4]:

# Example 1: keep only the rows whose MathScore is strictly greater than 80
df_loc_1 = df.loc[df['MathScore'].gt(80)]
display(df_loc_1)
 

Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
6,6,female,group B,some college,standard,completed,widowed,never,no,1.0,private,5 - 10,85,93,89
16,16,male,group C,high school,standard,,married,sometimes,yes,0.0,school_bus,5 - 10,88,89,86
33,34,male,group E,some college,standard,none,divorced,sometimes,yes,0.0,school_bus,,97,87,82
34,35,male,group E,associate's degree,standard,completed,single,sometimes,no,,school_bus,5 - 10,81,81,79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30614,495,male,group D,bachelor's degree,standard,none,divorced,sometimes,no,,school_bus,5 - 10,82,73,80
30620,622,male,group C,bachelor's degree,standard,none,married,,no,4.0,private,5 - 10,91,85,77
30632,778,female,group D,some college,standard,none,married,regularly,no,3.0,private,5 - 10,82,88,97
30633,783,female,group C,master's degree,standard,completed,married,never,no,2.0,school_bus,5 - 10,84,99,99


In [5]:

# Example 2: combine two boolean masks with & to keep female students
# who practice sports regularly, then preview the first rows
df_loc_2 = df.loc[df['Gender'].eq('female') & df['PracticeSport'].eq('regularly')]
display(df_loc_2.head())

Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
5,5,female,group B,associate's degree,standard,none,married,regularly,yes,1.0,school_bus,5 - 10,73,84,79
9,9,female,group B,high school,free/reduced,none,married,regularly,yes,,private,< 5,37,59,50
12,12,female,group B,high school,standard,none,married,regularly,no,1.0,private,5 - 10,66,82,74
35,36,female,group D,associate's degree,standard,none,married,regularly,no,2.0,private,5 - 10,76,82,85


# iloc[]: Select rows based on index position

In [6]:


# Example 1: take the first 5 rows by position (all columns)
df_iloc_1 = df.iloc[0:5, :]
display(df_iloc_1)


Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,,married,sometimes,yes,0.0,,5 - 10,69,90,88
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
3,3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,,5 - 10,45,56,42
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75


In [7]:

# Example 2: Select rows 10 to 14 and columns 2 to 5 by position.
# iloc slices exclude the end position, so 10:15 yields rows 10-14 and 2:6 yields columns 2-5.
df_iloc_2 = df.iloc[10:15, 2:6]
display(df_iloc_2)


Unnamed: 0,EthnicGroup,ParentEduc,LunchType,TestPrep
10,group C,associate's degree,standard,none
11,group D,associate's degree,standard,none
12,group B,high school,standard,none
13,group A,some college,standard,completed
14,group A,master's degree,standard,none



# query(): Filters rows using a boolean expression string (similar to a SQL WHERE clause)

In [8]:

# Example 1: students with more than 2 siblings, expressed as a query string
df_query_1 = df.query(expr="NrSiblings > 2")
display(df_query_1.head())


Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
8,8,male,group D,high school,free/reduced,completed,single,sometimes,no,3.0,private,> 10,65,64,68
19,19,female,group C,associate's degree,free/reduced,none,married,never,yes,3.0,private,< 5,50,56,58
20,20,male,group D,high school,standard,none,married,sometimes,yes,5.0,school_bus,5 - 10,66,69,63


In [9]:

# Example 2: students whose ParentEduc equals "bachelor's degree"
# (the apostrophe is escaped because the query string itself is single-quoted)
df_query_2 = df.query(expr='ParentEduc == "bachelor\'s degree"')
display(df_query_2.head())


Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
27,27,female,group C,bachelor's degree,standard,none,married,sometimes,yes,4.0,school_bus,> 10,69,70,76
90,99,female,group D,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,64,66,62
92,101,male,group D,bachelor's degree,standard,completed,single,sometimes,no,1.0,private,5 - 10,68,74,74
105,114,female,group E,bachelor's degree,standard,completed,married,regularly,yes,3.0,school_bus,< 5,98,99,100



# between(): Filters values within a range (both endpoints are included by default)

In [10]:

# Example 1: rows whose MathScore lies in the range 60 to 80
df_between_1 = df.loc[df['MathScore'].between(left=60, right=80)]
display(df_between_1)


Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,,married,sometimes,yes,0.0,,5 - 10,69,90,88
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75
5,5,female,group B,associate's degree,standard,none,married,regularly,yes,1.0,school_bus,5 - 10,73,84,79
8,8,male,group D,high school,free/reduced,completed,single,sometimes,no,3.0,private,> 10,65,64,68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30629,748,female,group D,associate's degree,standard,completed,married,sometimes,no,2.0,school_bus,,67,63,72
30631,765,male,group E,some high school,standard,none,married,sometimes,no,3.0,school_bus,< 5,80,65,66
30634,785,male,group A,associate's degree,free/reduced,completed,,sometimes,no,2.0,school_bus,5 - 10,65,60,60
30638,911,female,,high school,free/reduced,completed,married,sometimes,no,1.0,private,5 - 10,61,70,67


In [11]:

# Example 2: rows whose ReadingScore lies in the range 50 to 90
df_between_2 = df.loc[df['ReadingScore'].between(left=50, right=90)]
display(df_between_2.head())


Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,,married,sometimes,yes,0.0,,5 - 10,69,90,88
3,3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,,5 - 10,45,56,42
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75
5,5,female,group B,associate's degree,standard,none,married,regularly,yes,1.0,school_bus,5 - 10,73,84,79



# mask(): Replaces values where conditions hold true

In [12]:

# Example 1: replace every MathScore below 50 with the label 'Low'
# Work on a copy so the original df is left untouched.
df_mask_1 = df.copy()
df_mask_1['MathScore'] = df_mask_1['MathScore'].mask(
    df_mask_1['MathScore'].lt(50),
    other='Low',
)
display(df_mask_1.head(10))


Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,,married,sometimes,yes,0.0,,5 - 10,69,90,88
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
3,3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,,5 - 10,Low,56,42
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75
5,5,female,group B,associate's degree,standard,none,married,regularly,yes,1.0,school_bus,5 - 10,73,84,79
6,6,female,group B,some college,standard,completed,widowed,never,no,1.0,private,5 - 10,85,93,89
7,7,male,group B,some college,free/reduced,none,married,sometimes,yes,1.0,private,> 10,Low,43,39
8,8,male,group D,high school,free/reduced,completed,single,sometimes,no,3.0,private,> 10,65,64,68
9,9,female,group B,high school,free/reduced,none,married,regularly,yes,,private,< 5,Low,59,50


In [13]:

# Example 2: replace missing EthnicGroup entries with 'Unknown'
# Work on a copy so the original df is left untouched.
df_mask_2 = df.copy()
df_mask_2['EthnicGroup'] = df_mask_2['EthnicGroup'].mask(
    df_mask_2['EthnicGroup'].isna(),
    other='Unknown',
)
display(df_mask_2.head())

Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,Unknown,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,,married,sometimes,yes,0.0,,5 - 10,69,90,88
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
3,3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,,5 - 10,45,56,42
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75
