In [1]:
import pandas as pd

In [23]:
chicago = pd.read_csv("chicago.csv").dropna(how = "all")
chicago["Department"] = chicago["Department"].astype("category")
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [24]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 1.0+ MB


# Common String Methods - `.lower()`, `.upper()`, `.title()`, `.len()`

In [25]:
chicago["Name"].str.title()

0            Aaron,  Elvia J
1          Aaron,  Jeffery M
2             Aaron,  Karina
3        Aaron,  Kimberlei R
4        Abad Jr,  Vicente M
                ...         
32057    Zygadlo,  Michael J
32058     Zygowicz,  Peter J
32059      Zymantas,  Mark E
32060    Zyrkowski,  Carlo E
32061    Zyskowski,  Dariusz
Name: Name, Length: 32062, dtype: object

In [26]:
chicago["Position Title"].str.lower().str.strip()

0                      water rate taker
1                        police officer
2                        police officer
3              chief contract expediter
4                     civil engineer iv
                      ...              
32057    frm of machinists - automotive
32058                    police officer
32059                    police officer
32060                    police officer
32061           chief data base analyst
Name: Position Title, Length: 32062, dtype: object

In [27]:
chicago["Department"].str.len()

0        11
1         6
2         6
3        16
4        11
         ..
32057    16
32058     6
32059     6
32060     6
32061     4
Name: Department, Length: 32062, dtype: int64

# `str.replace()` Method

In [28]:
"Hello World".replace("l", "!")

'He!!o Wor!d'

In [29]:
chicago["Department"].str.replace("P", "B")

0             WATER MGMNT
1                  BOLICE
2                  BOLICE
3        GENERAL SERVICES
4             WATER MGMNT
               ...       
32057    GENERAL SERVICES
32058              BOLICE
32059              BOLICE
32060              BOLICE
32061                DoIT
Name: Department, Length: 32062, dtype: object

In [31]:
chicago["Employee Annual Salary"] = chicago["Employee Annual Salary"].str.replace("$", "").astype(float)

  chicago["Employee Annual Salary"] = chicago["Employee Annual Salary"].str.replace("$", "").astype(float)


In [33]:
chicago["Employee Annual Salary"].nlargest(10)

8184     300000.0
7954     216210.0
25532    202728.0
8924     197736.0
8042     197724.0
19208    195000.0
3706     187680.0
18556    187680.0
29466    187680.0
13754    185364.0
Name: Employee Annual Salary, dtype: float64

# Filter rows with string methods

In [37]:
mask = chicago["Position Title"].str.lower().str.contains("water")
chicago[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,90744.0
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,102440.0
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,82044.0
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,109272.0
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,102440.0
...,...,...,...,...
29669,"VERMA, ANUPAM",MANAGING ENGINEER - WATER MANAGEMENT,WATER MGMNT,111192.0
30239,"WASHINGTON, JOSEPH",WATER CHEMIST III,WATER MGMNT,89676.0
30544,"WEST, THOMAS R",GEN SUPT OF WATER MANAGEMENT,WATER MGMNT,115704.0
30991,"WILLIAMS, MATTHEW",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,102440.0


In [41]:
mask2 = chicago["Position Title"].str.lower().str.startswith("water")
chicago[mask2]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,90744.0
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,82044.0
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,82044.0
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,82044.0
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,53172.0
...,...,...,...,...
28574,"THREATT, DENISE R",WATER QUALITY INSPECTOR,WATER MGMNT,62004.0
28602,"TIGNOR, DARRYL B",WATER RATE TAKER,WATER MGMNT,78948.0
28955,"TRAVIS COOK, LESLIE R",WATER RATE TAKER,WATER MGMNT,78948.0
29584,"VELAZQUEZ, JOHN",WATER RATE TAKER,WATER MGMNT,78948.0


In [43]:
mask3 = chicago["Position Title"].str.lower().str.endswith("ist")
chicago[mask3]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,99840.0
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,81948.0
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,89880.0
472,"ALLEN, ROBERT",MACHINIST,WATER MGMNT,94328.0
705,"ANDERSON, EDWARD M",SR PROCUREMENT SPECIALIST,PROCUREMENT,91476.0
...,...,...,...,...
31667,"YODER, TERESA G",ARCHIVAL SPECIALIST,PUBLIC LIBRARY,74304.0
31688,"YOUNGBLOOM, LAURENCE G",CRIMES SURVEILLANCE SPECIALIST,OEMC,19676.8
31717,"YOUNG, KIMBERLY M",SR PROCUREMENT SPECIALIST,PROCUREMENT,68556.0
31837,"ZAPATA, HUGO",SR PROCUREMENT SPECIALIST,PROCUREMENT,87324.0


# String Methods on Index Labels and Column Labels

In [44]:
chicago = pd.read_csv("chicago.csv", index_col="Name").dropna(how = "all")
chicago["Department"] = chicago["Department"].astype("category")
chicago.head(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [47]:
chicago.index.str.strip().str.title()

Index(['Aaron,  Elvia J', 'Aaron,  Jeffery M', 'Aaron,  Karina',
       'Aaron,  Kimberlei R', 'Abad Jr,  Vicente M', 'Abarca,  Anabel',
       'Abarca,  Emmanuel', 'Abascal,  Reece E', 'Abbasi,  Christopher',
       'Abbatacola,  Robert J',
       ...
       'Zwit,  Jeffrey J', 'Zwolfer,  Matthew W', 'Zych,  Mateusz',
       'Zydek,  Bryan', 'Zygadlo,  John P', 'Zygadlo,  Michael J',
       'Zygowicz,  Peter J', 'Zymantas,  Mark E', 'Zyrkowski,  Carlo E',
       'Zyskowski,  Dariusz'],
      dtype='object', name='Name', length=32062)

In [48]:
chicago.columns.str.strip().str.upper()

Index(['POSITION TITLE', 'DEPARTMENT', 'EMPLOYEE ANNUAL SALARY'], dtype='object')

# Split strings

In [51]:
chicago.columns

Index(['Position Title', 'Department', 'Employee Annual Salary'], dtype='object')

In [54]:
chicago = chicago.reset_index()

In [59]:
# Split string into list and get value at index position for each row, then count values
chicago["Name"].str.split(",").str.get(0).value_counts()

WILLIAMS     293
JOHNSON      244
SMITH        241
BROWN        185
JONES        183
            ... 
HORN           1
HORNE JR       1
HORNER         1
HORNIK         1
ZYSKOWSKI      1
Name: Name, Length: 13830, dtype: int64

In [72]:
# Look at all of these chained methods baby
chicago["Name"].str.split(",").str.get(1).str.strip().str.split(" ").str.get(0).value_counts().head()

MICHAEL    1153
JOHN        899
JAMES       676
ROBERT      622
JOSEPH      537
Name: Name, dtype: int64

# The `expand` and `n` Params of the `str.split()` Method

In [23]:
chicago = pd.read_csv("chicago.csv").dropna(how = "all")
chicago["Department"] = chicago["Department"].astype("category")
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [74]:
# Expand will create columns for the list, rather than a list within the original column
chicago["Name"].str.split(",", expand=True)

Unnamed: 0,0,1
0,AARON,ELVIA J
1,AARON,JEFFERY M
2,AARON,KARINA
3,AARON,KIMBERLEI R
4,ABAD JR,VICENTE M
...,...,...
32057,ZYGADLO,MICHAEL J
32058,ZYGOWICZ,PETER J
32059,ZYMANTAS,MARK E
32060,ZYRKOWSKI,CARLO E


In [76]:
# Place it back into the DataFrame
chicago[["First Name", "Last Name"]] = chicago["Name"].str.split(",", expand=True)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M


In [77]:
# Lists with variable length
chicago["Position Title"].str.split(" ", expand=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,WATER,RATE,TAKER,,,,,,
1,POLICE,OFFICER,,,,,,,
2,POLICE,OFFICER,,,,,,,
3,CHIEF,CONTRACT,EXPEDITER,,,,,,
4,CIVIL,ENGINEER,IV,,,,,,
...,...,...,...,...,...,...,...,...,...
32057,FRM,OF,MACHINISTS,-,AUTOMOTIVE,,,,
32058,POLICE,OFFICER,,,,,,,
32059,POLICE,OFFICER,,,,,,,
32060,POLICE,OFFICER,,,,,,,


In [79]:
# Limit the above to n columns, with all after going into the nth column
chicago["Position Title"].str.split(" ", expand=True, n=1)

Unnamed: 0,0,1
0,WATER,RATE TAKER
1,POLICE,OFFICER
2,POLICE,OFFICER
3,CHIEF,CONTRACT EXPEDITER
4,CIVIL,ENGINEER IV
...,...,...
32057,FRM,OF MACHINISTS - AUTOMOTIVE
32058,POLICE,OFFICER
32059,POLICE,OFFICER
32060,POLICE,OFFICER


In [81]:
chicago[["TITLE_1", "TITLE_2"]] = chicago["Position Title"].str.split(" ", expand=True, n=1)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name,TITLE_1,TITLE_2
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA,POLICE,OFFICER
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R,CHIEF,CONTRACT EXPEDITER
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M,CIVIL,ENGINEER IV
