In [2]:
import pandas as pd

# First thing first - memory optimization on import

In [3]:
df = pd.read_csv("employees.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [4]:
df["Gender"].nunique()

2

In [5]:
df["Team"].nunique()

10

In [6]:
df["Gender"] = df["Gender"].astype("category")
df["Team"] = df["Team"].astype("category")
df["Senior Management"] = df["Senior Management"].astype(bool)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   First Name         933 non-null    object  
 1   Gender             855 non-null    category
 2   Start Date         1000 non-null   object  
 3   Last Login Time    1000 non-null   object  
 4   Salary             1000 non-null   int64   
 5   Bonus %            1000 non-null   float64 
 6   Senior Management  1000 non-null   bool    
 7   Team               957 non-null    category
dtypes: bool(1), category(2), float64(1), int64(1), object(3)
memory usage: 42.6+ KB


## Lets go ahead and convert dates and times to datetimes too

In [7]:
# Note that times w/o dates are assumed to be datetime.today()
df["Start Date"] = pd.to_datetime(df["Start Date"])
df["Last Login Time"] = pd.to_datetime(df["Last Login Time"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    category      
dtypes: bool(1), category(2), datetime64[ns](2), float64(1), int64(1), object(1)
memory usage: 42.6+ KB


# Filtering a dataframe based on a single condition

In [8]:
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-07-13 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-07-13 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-07-13 11:17:00,130590,11.858,False,Finance


### Fetch all Male employees

In [9]:
df["Gender"] == "Male"

0       True
1       True
2      False
3       True
4       True
       ...  
995    False
996     True
997     True
998     True
999     True
Name: Gender, Length: 1000, dtype: bool

In [10]:
# You can do operations inside of the brackets apparently, cool
df[df["Gender"] == "Male"]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-07-13 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-07-13 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-07-13 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-07-13 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-07-13 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2022-07-13 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2022-07-13 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-07-13 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-07-13 16:45:00,60500,11.985,False,Business Development


In [11]:
# This is called a mask, and you should make it separate variable for clarity's sake
# Holy shit, python lets you do evaluations inside of assingments? Or is this just a polymorphed method?
mask = df["Team"] == "Finance"
df[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2022-07-13 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-07-13 13:00:00,138705,9.340,True,Finance
7,,Female,2015-07-20,2022-07-13 10:43:00,45906,11.598,True,Finance
14,Kimberly,Female,1999-01-14,2022-07-13 07:13:00,41426,14.543,True,Finance
46,Bruce,Male,2009-11-28,2022-07-13 22:47:00,114796,6.796,False,Finance
...,...,...,...,...,...,...,...,...
907,Elizabeth,Female,1998-07-27,2022-07-13 11:12:00,137144,10.081,False,Finance
954,Joe,Male,1980-01-19,2022-07-13 16:06:00,119667,1.148,True,Finance
987,Gloria,Female,2014-12-08,2022-07-13 05:08:00,136709,10.331,True,Finance
992,Anthony,Male,2011-10-16,2022-07-13 08:35:00,112769,11.625,True,Finance


In [12]:
mask_sm = df["Senior Management"]
df[mask_sm]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-07-13 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-07-13 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-07-13 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-07-13 16:47:00,101004,1.389,True,Client Services
6,Ruby,Female,1987-08-17,2022-07-13 16:20:00,65476,10.012,True,Product
...,...,...,...,...,...,...,...,...
991,Rose,Female,2002-08-25,2022-07-13 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2022-07-13 08:35:00,112769,11.625,True,Finance
993,Tina,Female,1997-05-15,2022-07-13 15:53:00,56450,19.040,True,Engineering
994,George,Male,2013-06-21,2022-07-13 17:47:00,98874,4.479,True,Marketing


In [13]:
mask_nm = df["Team"] != "Marketing"
df[mask_nm]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-07-13 06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,2022-07-13 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-07-13 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-07-13 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-07-13 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
995,Henry,,2014-11-23,2022-07-13 06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,2022-07-13 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-07-13 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-07-13 16:45:00,60500,11.985,False,Business Development


In [14]:
df[df["Salary"] > 110000]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2022-07-13 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-07-13 13:00:00,138705,9.340,True,Finance
5,Dennis,Male,1987-04-18,2022-07-13 01:35:00,115163,10.125,False,Legal
9,Frances,Female,2002-08-08,2022-07-13 06:51:00,139852,7.524,True,Business Development
12,Brandon,Male,1980-12-01,2022-07-13 01:08:00,112807,17.492,True,Human Resources
...,...,...,...,...,...,...,...,...
987,Gloria,Female,2014-12-08,2022-07-13 05:08:00,136709,10.331,True,Finance
991,Rose,Female,2002-08-25,2022-07-13 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2022-07-13 08:35:00,112769,11.625,True,Finance
995,Henry,,2014-11-23,2022-07-13 06:09:00,132483,16.655,False,Distribution


In [15]:
# Dude wtf, dates are so much easier... although we need to consider timezones. I'm sure he will get into it...
date_mask = df["Start Date"] <= "1985-01-01"
df[date_mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,2022-07-13 09:01:00,63241,15.132,True,
12,Brandon,Male,1980-12-01,2022-07-13 01:08:00,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,2022-07-13 10:27:00,132940,19.082,False,Client Services
28,Terry,Male,1981-11-27,2022-07-13 18:30:00,124008,13.464,True,Client Services
37,Linda,Female,1981-10-19,2022-07-13 20:49:00,57427,9.557,True,Client Services
...,...,...,...,...,...,...,...,...
982,Rose,Female,1982-04-06,2022-07-13 10:43:00,91411,8.639,True,Human Resources
983,John,Male,1982-12-23,2022-07-13 22:35:00,146907,11.738,False,Engineering
985,Stephen,,1983-07-10,2022-07-13 20:10:00,85668,1.909,False,Legal
986,Donna,Female,1982-11-26,2022-07-13 07:04:00,82871,17.999,False,Marketing


# Filter with multiple conditions

In [16]:

office = pd.read_csv("the_office.csv", parse_dates=["Airdate"])
office = office.drop_duplicates(subset=["Season", "Name", "Director", "Writer", "Airdate", "Viewership"])
office

Unnamed: 0,Season,Episode,Name,Director,Writer,Airdate,Viewership
0,1,1,Pilot,Ken Kwapis,Ricky Gervais & Stephen Merchant and Greg Daniels,2005-03-24,11.20
1,1,2,Diversity Day,Ken Kwapis,B. J. Novak,2005-03-29,6.00
2,1,3,Health Care,Ken Whittingham,Paul Lieberstein,2005-04-05,5.80
3,1,4,The Alliance,Bryan Gordon,Michael Schur,2005-04-12,5.40
4,1,5,Basketball,Greg Daniels,Greg Daniels,2005-04-19,5.00
...,...,...,...,...,...,...,...
194,9,19,Stairmageddon,Matt Sohn,Dan Sterling,2013-04-11,3.83
195,9,20,Paper Airplane,Jesse Peretz,Halsted Sullivan & Warren Lieberstein,2013-04-25,3.25
196,9,21,Livin' the Dream,Jeffrey Blitz,Niki Schwartz-Wright,2013-05-02,3.51
197,9,22,A.A.R.M.,David Rogers,Brent Forrester,2013-05-09,4.56


### Using the AND (&) Operator

In [17]:
mask1 = office["Writer"] == "Greg Daniels"
mask2 = office["Season"] >= 6
written_by_greg = office[mask1 & mask2]
written_by_greg

Unnamed: 0,Season,Episode,Name,Director,Writer,Airdate,Viewership
147,7,22,"Goodbye, Michael†",Paul Feig,Greg Daniels,2011-04-28,8.42
176,9,1,New Guys,Greg Daniels,Greg Daniels,2012-09-20,4.28
199,9,24,Finale,Ken Kwapis,Greg Daniels,2013-05-16,5.69


### Using the OR ( | ) Operator

In [22]:
mask1 = office["Season"] == 8
mask2 = office["Episode"] == 1
good_episodes = office[mask1 | mask2]
good_episodes.head(3)

Unnamed: 0,Season,Episode,Name,Director,Writer,Airdate,Viewership
0,1,1,Pilot,Ken Kwapis,Ricky Gervais & Stephen Merchant and Greg Daniels,2005-03-24,11.2
6,2,1,The Dundies,Greg Daniels,Mindy Kaling,2005-09-20,9.0
28,3,1,Gay Witch Hunt,Ken Kwapis,Greg Daniels,2006-09-21,9.11


### Using multiple conditions at once

In [23]:
mask1 = office["Airdate"] < "2012-01-01"
mask2 = office["Writer"] == "Greg Daniels"
mask3 = office["Director"] == "Jeffrey Blitz"
early_episodes = office[mask1 & (mask2 | mask3)]
early_episodes.head(3)

Unnamed: 0,Season,Episode,Name,Director,Writer,Airdate,Viewership
4,1,5,Basketball,Greg Daniels,Greg Daniels,2005-04-19,5.0
10,2,5,Halloween,Paul Feig,Greg Daniels,2005-10-18,8.0
16,2,11,Booze Cruise,Ken Kwapis,Greg Daniels,2006-01-05,8.7


## Checking for stuff with the .isin() method

In [41]:
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype(bool)
df["Gender"] = df["Gender"].astype("category")
df["Team"] = df["Team"].fillna("None")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-07-13 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-07-13 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-07-13 11:17:00,130590,11.858,False,Finance


In [42]:
legal = df["Team"] == "Legal"
sales = df["Team"] == "Sales"
product = df["Team"] == "Product"
df[legal | sales | product]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2022-07-13 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2022-07-13 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2022-07-13 15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,2022-07-13 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2022-07-13 06:09:00,59414,1.256,False,Product
...,...,...,...,...,...,...,...,...
981,James,Male,1993-01-15,2022-07-13 17:19:00,148985,19.280,False,Legal
985,Stephen,,1983-07-10,2022-07-13 20:10:00,85668,1.909,False,Legal
989,Justin,,1991-02-10,2022-07-13 16:58:00,38344,3.794,False,Legal
997,Russell,Male,2013-05-20,2022-07-13 12:39:00,96914,1.421,False,Product


In [43]:
mask = df["Team"].isin(["Legal", "Sales", "Product"])
df[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2022-07-13 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2022-07-13 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2022-07-13 15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,2022-07-13 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2022-07-13 06:09:00,59414,1.256,False,Product
...,...,...,...,...,...,...,...,...
981,James,Male,1993-01-15,2022-07-13 17:19:00,148985,19.280,False,Legal
985,Stephen,,1983-07-10,2022-07-13 20:10:00,85668,1.909,False,Legal
989,Justin,,1991-02-10,2022-07-13 16:58:00,38344,3.794,False,Legal
997,Russell,Male,2013-05-20,2022-07-13 12:39:00,96914,1.421,False,Product


## Checking with str.contains

In [44]:
Team = "Sale"
mask = df["Team"].str.contains(Team)
df[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
13,Gary,Male,2008-01-27,2022-07-13 23:40:00,109831,5.831,False,Sales
35,Theresa,Female,2006-10-10,2022-07-13 01:12:00,85182,16.675,False,Sales
45,Roger,Male,1980-04-17,2022-07-13 11:32:00,88010,13.886,True,Sales
49,Chris,,1980-01-24,2022-07-13 12:13:00,113590,3.055,False,Sales
51,,,2011-12-17,2022-07-13 08:29:00,41126,14.009,True,Sales
...,...,...,...,...,...,...,...,...
963,Ann,Female,1994-09-23,2022-07-13 11:15:00,89443,17.940,True,Sales
964,Bruce,Male,1980-05-07,2022-07-13 20:00:00,35802,12.391,True,Sales
972,Victor,,2006-07-28,2022-07-13 14:49:00,76381,11.159,True,Sales
975,Susan,Female,1995-04-07,2022-07-13 22:05:00,92436,12.467,False,Sales


## Using the .isnull() and .notnull() Methods

In [47]:
mask = df["Gender"].isnull()
df[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
20,Lois,,1995-04-22,2022-07-13 19:18:00,64714,4.934,True,Legal
22,Joshua,,2012-03-08,2022-07-13 01:58:00,90816,18.816,True,Client Services
27,Scott,,1991-07-11,2022-07-13 18:58:00,122367,5.218,False,Legal
31,Joyce,,2005-02-20,2022-07-13 14:40:00,88657,12.752,False,Product
41,Christine,,2015-06-28,2022-07-13 01:08:00,66582,11.308,True,Business Development
...,...,...,...,...,...,...,...,...
961,Antonio,,1989-06-18,2022-07-13 21:37:00,103050,3.050,False,Legal
972,Victor,,2006-07-28,2022-07-13 14:49:00,76381,11.159,True,Sales
985,Stephen,,1983-07-10,2022-07-13 20:10:00,85668,1.909,False,Legal
989,Justin,,1991-02-10,2022-07-13 16:58:00,38344,3.794,False,Legal


In [49]:
mask = df["Gender"].notnull()
df[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-07-13 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-07-13 06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,2022-07-13 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-07-13 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-07-13 16:47:00,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2022-07-13 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2022-07-13 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-07-13 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-07-13 16:45:00,60500,11.985,False,Business Development


## Using the .between() Method

In [50]:
mask = df["Salary"].between(60000, 70000)
df[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-07-13 06:53:00,61933,4.170,True,
6,Ruby,Female,1987-08-17,2022-07-13 16:20:00,65476,10.012,True,Product
10,Louise,Female,1980-08-12,2022-07-13 09:01:00,63241,15.132,True,
20,Lois,,1995-04-22,2022-07-13 19:18:00,64714,4.934,True,Legal
41,Christine,,2015-06-28,2022-07-13 01:08:00,66582,11.308,True,Business Development
...,...,...,...,...,...,...,...,...
965,Catherine,Female,1989-09-25,2022-07-13 01:31:00,68164,18.393,False,Client Services
970,Alice,Female,1988-09-03,2022-07-13 20:54:00,63571,15.397,True,Product
974,Harry,Male,2011-08-30,2022-07-13 18:31:00,67656,16.455,True,Client Services
978,Sean,Male,1983-01-17,2022-07-13 14:23:00,66146,11.178,False,Human Resources


In [51]:
df[df["Bonus %"].between(2.0, 5.0)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-07-13 06:53:00,61933,4.170,True,
20,Lois,,1995-04-22,2022-07-13 19:18:00,64714,4.934,True,Legal
40,Michael,Male,2008-10-10,2022-07-13 11:25:00,99283,2.665,True,Distribution
49,Chris,,1980-01-24,2022-07-13 12:13:00,113590,3.055,False,Sales
60,Paula,,2005-11-23,2022-07-13 14:01:00,48866,4.271,False,Distribution
...,...,...,...,...,...,...,...,...
943,Wayne,Male,2006-09-08,2022-07-13 11:09:00,67471,2.728,False,Engineering
961,Antonio,,1989-06-18,2022-07-13 21:37:00,103050,3.050,False,Legal
976,Denise,Female,1992-10-19,2022-07-13 05:42:00,137954,4.195,True,Legal
989,Justin,,1991-02-10,2022-07-13 16:58:00,38344,3.794,False,Legal


## .between() works on datetimes

In [53]:
datetime_mask = df["Start Date"].between("1991-01-01", "1992-01-01")
df[datetime_mask].head(5)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
27,Scott,,1991-07-11,2022-07-13 18:58:00,122367,5.218,False,Legal
75,Bonnie,Female,1991-07-02,2022-07-13 01:27:00,104897,5.118,True,Human Resources
88,Donna,Female,1991-11-27,2022-07-13 13:59:00,64088,6.155,True,Legal
116,,Male,1991-06-22,2022-07-13 20:58:00,76189,18.988,True,Legal
148,Patrick,,1991-07-14,2022-07-13 02:24:00,124488,14.837,True,Sales


In [56]:
# Pandas is good at parsing times apparently lol
df[df["Last Login Time"].between("2022-07-12 08:30", "2022-07-13 12:00")]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-07-13 06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,2022-07-13 11:17:00,130590,11.858,False,Finance
5,Dennis,Male,1987-04-18,2022-07-13 01:35:00,115163,10.125,False,Legal
7,,Female,2015-07-20,2022-07-13 10:43:00,45906,11.598,True,Finance
8,Angela,Female,2005-11-22,2022-07-13 06:29:00,95570,18.523,True,Engineering
...,...,...,...,...,...,...,...,...
988,Alice,Female,2004-10-05,2022-07-13 09:34:00,47638,11.209,False,Human Resources
991,Rose,Female,2002-08-25,2022-07-13 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2022-07-13 08:35:00,112769,11.625,True,Finance
995,Henry,,2014-11-23,2022-07-13 06:09:00,132483,16.655,False,Distribution


# Check for duplicate rows with .duplicated() method

In [58]:
df = df.sort_values("First Name")
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2022-07-13 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2022-07-13 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2022-07-13 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2022-07-13 19:39:00,63126,18.424,False,Client Services
137,Adam,Male,2011-05-21,2022-07-13 01:45:00,95327,15.12,False,Distribution


In [60]:
df["First Name"].duplicated(keep="first")

101    False
327     True
440     True
937     True
137    False
       ...  
902     True
925     True
946     True
947     True
951     True
Name: First Name, Length: 1000, dtype: bool

In [61]:
df["First Name"].duplicated(keep="last")

101     True
327     True
440     True
937    False
137     True
       ...  
902     True
925     True
946     True
947     True
951    False
Name: First Name, Length: 1000, dtype: bool

In [63]:
# Throw out anything thats duplicated. Even the first and last. Only totally unique values allowed here.
mask = ~df["First Name"].duplicated(keep=False)
df[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2022-07-13 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2022-07-13 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2022-07-13 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2022-07-13 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2022-07-13 01:35:00,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,2022-07-13 10:54:00,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,2022-07-13 09:07:00,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,2022-07-13 15:02:00,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,2022-07-13 10:30:00,132839,17.463,True,Client Services


## Drop Duplicates w/ .drop_duplicates() method

#### Default behavior drops only wholey duplicate rows

In [64]:
len(df)

1000

In [66]:
df = df.drop_duplicates(subset=["First Name"], keep="first")
df

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2022-07-13 10:20:00,61602,11.849,True,Marketing
137,Adam,Male,2011-05-21,2022-07-13 01:45:00,95327,15.120,False,Distribution
300,Alan,Male,1988-06-26,2022-07-13 03:54:00,111786,3.592,True,Engineering
959,Albert,Male,1992-09-19,2022-07-13 02:35:00,45094,5.850,True,Business Development
988,Alice,Female,2004-10-05,2022-07-13 09:34:00,47638,11.209,False,Human Resources
...,...,...,...,...,...,...,...,...
512,Wanda,Female,1993-04-06,2022-07-13 03:11:00,78883,19.695,False,
943,Wayne,Male,2006-09-08,2022-07-13 11:09:00,67471,2.728,False,Engineering
941,William,Male,1997-06-26,2022-07-13 08:33:00,104840,15.653,True,Engineering
175,Willie,Male,1998-02-17,2022-07-13 20:20:00,146651,1.451,True,Engineering
