# DataFrames Filtering

In [1]:
import pandas as pd


In [4]:
df = pd.read_csv("data/employees.csv")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance


## reduce size to improve performance of filtering

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [9]:
# missing values will get today's date, so probably best to fill with some value before converting
df["Start Date"] = pd.to_datetime(df["Start Date"])
df["Last Login Time"] = pd.to_datetime(df["Last Login Time"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    object        
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  933 non-null    object        
 7   Team               957 non-null    object        
dtypes: datetime64[ns](2), float64(1), int64(1), object(4)
memory usage: 62.6+ KB


In [12]:
df["Senior Management"] = df["Senior Management"].astype("bool")


In [10]:
df["Gender"] = df["Gender"].astype("category")

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    object        
dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)
memory usage: 49.0+ KB


In [14]:
df

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2020-04-29 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2020-04-29 06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,2020-04-29 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2020-04-29 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2020-04-29 16:47:00,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
995,Henry,,2014-11-23,2020-04-29 06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,2020-04-29 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2020-04-29 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2020-04-29 16:45:00,60500,11.985,False,Business Development


## user parse_dates argument

In [15]:
# automatically covert string to date
df = pd.read_csv("data/employees.csv", parse_dates=["Start Date", "Last Login Time"])
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    object        
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  933 non-null    object        
 7   Team               957 non-null    object        
dtypes: datetime64[ns](2), float64(1), int64(1), object(4)
memory usage: 62.6+ KB


## filtering


In [18]:
# filter by gender
df[df["Gender"] == "Male"].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2020-04-29 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2020-04-29 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2020-04-29 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2020-04-29 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2020-04-29 01:35:00,115163,10.125,False,Legal


In [20]:
# filter with not equals
df[df["Team"] != "Finance"].head()


Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2020-04-29 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2020-04-29 06:53:00,61933,4.17,True,
4,Larry,Male,1998-01-24,2020-04-29 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2020-04-29 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2020-04-29 16:20:00,65476,10.012,True,Product


In [27]:
# filtering with multiple series
# use & (and) | (or) ~ (not)
filter_gender = df["Gender"] == "Male"
filter_team = df["Team"] == "Marketing"

df[filter_gender & filter_team].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2020-04-29 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2020-04-29 06:53:00,61933,4.17,True,Marketing
3,Jerry,Male,2005-03-04,2020-04-29 13:00:00,138705,9.34,True,Marketing
4,Larry,Male,1998-01-24,2020-04-29 16:47:00,101004,1.389,True,Marketing
5,Dennis,Male,1987-04-18,2020-04-29 01:35:00,115163,10.125,False,Marketing


## using .isin() method

In [33]:
# filter rows where team is Legal, Sales, Product
df = pd.read_csv("data/employees.csv", parse_dates=["Start Date", "Last Login Time"])

# use | condition or
filter_team = df["Team"].isin(["Legal", "Sales", "Product"])
df[filter_team].head(12)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2020-04-29 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2020-04-29 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2020-04-29 15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,2020-04-29 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2020-04-29 06:09:00,59414,1.256,False,Product
17,Shawn,Male,1986-12-07,2020-04-29 19:45:00,111737,6.414,False,Product
19,Donna,Female,2010-07-22,2020-04-29 03:48:00,81014,1.894,False,Product
20,Lois,,1995-04-22,2020-04-29 19:18:00,64714,4.934,True,Legal
27,Scott,,1991-07-11,2020-04-29 18:58:00,122367,5.218,False,Legal
29,Benjamin,Male,2005-01-26,2020-04-29 22:06:00,79529,7.008,True,Legal


## .isnull() and .notnull()


In [36]:
# filter where Gender is null
df[df["Gender"].isnull()].head()


Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
20,Lois,,1995-04-22,2020-04-29 19:18:00,64714,4.934,True,Legal
22,Joshua,,2012-03-08,2020-04-29 01:58:00,90816,18.816,True,Client Services
27,Scott,,1991-07-11,2020-04-29 18:58:00,122367,5.218,False,Legal
31,Joyce,,2005-02-20,2020-04-29 14:40:00,88657,12.752,False,Product
41,Christine,,2015-06-28,2020-04-29 01:08:00,66582,11.308,True,Business Development


In [39]:
# filter where Gender is not null
df[df["Gender"].notnull()].head()


Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2020-04-29 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2020-04-29 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2020-04-29 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2020-04-29 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2020-04-29 16:47:00,101004,1.389,True,Client Services


## between() method

In [41]:
df[df["Salary"].between(60000, 90000, inclusive=True)].head()


Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2020-04-29 06:53:00,61933,4.17,True,
6,Ruby,Female,1987-08-17,2020-04-29 16:20:00,65476,10.012,True,Product
10,Louise,Female,1980-08-12,2020-04-29 09:01:00,63241,15.132,True,
19,Donna,Female,2010-07-22,2020-04-29 03:48:00,81014,1.894,False,Product
20,Lois,,1995-04-22,2020-04-29 19:18:00,64714,4.934,True,Legal


In [42]:
df[df["Last Login Time"].between("08:30AM", "01:00PM", inclusive=True)].head()


Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2020-04-29 12:42:00,97308,6.945,True,Marketing
2,Maria,Female,1993-04-23,2020-04-29 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2020-04-29 13:00:00,138705,9.34,True,Finance
7,,Female,2015-07-20,2020-04-29 10:43:00,45906,11.598,,Finance
10,Louise,Female,1980-08-12,2020-04-29 09:01:00,63241,15.132,True,


## duplicated() method

In [50]:
# returns first instance of rows that have duplicated First Name values
df[df["First Name"].duplicated(keep="first")].head(10)


Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
23,,Male,2012-06-14,2020-04-29 16:19:00,125792,5.042,,
25,,Male,2012-10-08,2020-04-29 01:12:00,37076,18.576,,Client Services
32,,Male,1998-08-21,2020-04-29 14:27:00,122340,6.417,,
34,Jerry,Male,2004-01-10,2020-04-29 12:56:00,95734,19.096,False,Client Services
39,,Male,2016-01-29,2020-04-29 02:33:00,122173,7.797,,Client Services
51,,,2011-12-17,2020-04-29 08:29:00,41126,14.009,,Sales
58,Theresa,Female,2010-04-11,2020-04-29 07:18:00,72670,1.481,True,Engineering
62,,Female,2007-06-12,2020-04-29 17:25:00,58112,19.414,,Marketing
63,Matthew,Male,2013-01-02,2020-04-29 22:33:00,35203,18.04,False,Human Resources
66,Nancy,Female,2012-12-15,2020-04-29 23:57:00,125250,2.672,True,Business Development


In [49]:
# returns all rows that have duplicated First Name values
df[df["First Name"].duplicated(keep=False)].sort_values("First Name").head(10)


Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
440,Aaron,Male,1990-07-22,2020-04-29 14:53:00,52119,11.343,True,Client Services
327,Aaron,Male,1994-01-29,2020-04-29 18:48:00,58755,5.097,True,Marketing
101,Aaron,Male,2012-02-17,2020-04-29 10:20:00,61602,11.849,True,Marketing
937,Aaron,,1986-01-22,2020-04-29 19:39:00,63126,18.424,False,Client Services
137,Adam,Male,2011-05-21,2020-04-29 01:45:00,95327,15.12,False,Distribution
141,Adam,Male,1990-12-24,2020-04-29 20:57:00,110194,14.727,True,Product
302,Adam,Male,2007-07-05,2020-04-29 11:59:00,71276,5.027,True,Human Resources
538,Adam,Male,2010-10-08,2020-04-29 21:53:00,45181,3.491,False,Human Resources
610,Alan,Male,2012-02-17,2020-04-29 00:26:00,41453,10.084,False,Product
53,Alan,,2014-03-03,2020-04-29 13:28:00,40341,17.578,True,Finance


In [52]:
# use tilda to negate the results
# returns rows with unique First Name
df[~df["First Name"].duplicated(keep=False)].sort_values("First Name").head(10)


Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2020-04-29 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2020-04-29 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2020-04-29 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2020-04-29 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2020-04-29 01:35:00,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,2020-04-29 10:54:00,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,2020-04-29 09:07:00,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,2020-04-29 15:02:00,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,2020-04-29 10:30:00,132839,17.463,True,Client Services


## drop_duplicates() method


In [58]:
# need to specify subset, else will look for dups across the row
# will keep the first row
df = df.drop_duplicates(subset=["First Name"], keep= "first")


In [61]:
df = pd.read_csv("data/employees.csv", parse_dates=["Start Date", "Last Login Time"])
# where First Name is repeated in same Team
df.drop_duplicates(subset=["First Name", "Team"], keep= "first", inplace=True)
df

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2020-04-29 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2020-04-29 06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,2020-04-29 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2020-04-29 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2020-04-29 16:47:00,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2020-04-29 17:47:00,98874,4.479,True,Marketing
995,Henry,,2014-11-23,2020-04-29 06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,2020-04-29 06:30:00,42392,19.675,False,Finance
998,Larry,Male,2013-04-20,2020-04-29 16:45:00,60500,11.985,False,Business Development


## unique() and nunique()

In [62]:
# returns an array of all unique values within Gender
df["Gender"].unique()


array(['Male', 'Female', nan], dtype=object)

In [63]:
# returns the count of non-null unique values
# set dropna parameter if null values should count
df["Gender"].nunique()


2