# Working with Text Data

In [95]:
import pandas as pd

## This Module's Dataset
- This module's dataset (`chicago.csv`) is a collection of public sector employees in the city of Chicago.
- Each row inclues the employee's name, position, department, and salary.

In [96]:
chicago = pd.read_csv('chicago.csv').dropna(how='all')
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [97]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    32062 non-null  object
 1   Position Title          32062 non-null  object
 2   Department              32062 non-null  object
 3   Employee Annual Salary  32062 non-null  object
dtypes: object(4)
memory usage: 1.2+ MB


In [98]:
chicago.nunique()

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

In [99]:
# 2 Department 32062 non-null >>> category
chicago['Department'] = chicago['Department'].astype('category')
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 1.0+ MB


## Common String Methods
- A **Series** has a special `str` attribute that exposes an object with string methods.
- Access the `str` attribute, then invoke the string method on the nested object.
- Most method names will match their Python method equivalents (`upper`, `lower`, `title`, etc).

In [100]:
chicago['Position Title'].str.lower().head(10)

0               water rate taker
1                 police officer
2                 police officer
3       chief contract expediter
4              civil engineer iv
5           asst to the alderman
6          general laborer - dss
7    traffic control aide-hourly
8     staff asst to the alderman
9            electrical mechanic
Name: Position Title, dtype: object

In [101]:
chicago['Position Title'].str.title().head(10)

0               Water Rate Taker
1                 Police Officer
2                 Police Officer
3       Chief Contract Expediter
4              Civil Engineer Iv
5           Asst To The Alderman
6          General Laborer - Dss
7    Traffic Control Aide-Hourly
8     Staff Asst To The Alderman
9            Electrical Mechanic
Name: Position Title, dtype: object

In [102]:
chicago['Position Title'].str.len().head(10)

0    16
1    14
2    14
3    24
4    17
5    20
6    21
7    27
8    26
9    19
Name: Position Title, dtype: int64

In [103]:
chicago['Position Title'].str.lower().str.len().head()

0    16
1    14
2    14
3    24
4    17
Name: Position Title, dtype: int64

In [104]:
# remover espaços em branco (r - direita, l - esquerda, strip() - ambos):
chicago['Position Title'].str.lstrip().str.rstrip().head()
chicago['Position Title'].str.strip().head()

0            WATER RATE TAKER
1              POLICE OFFICER
2              POLICE OFFICER
3    CHIEF CONTRACT EXPEDITER
4           CIVIL ENGINEER IV
Name: Position Title, dtype: object

In [105]:
chicago['Department'].str.replace('MGMNT','MANAGEMENT').head()

0    WATER MANAGEMENT
1              POLICE
2              POLICE
3    GENERAL SERVICES
4    WATER MANAGEMENT
Name: Department, dtype: object

## Filtering with String Methods
- The `str.contains` method checks whether a substring exists anywhere in the string.
- The `str.startswith` method checks whether a substring exists at the start of the string.
- The `str.endswith` method checks whether a substring exists at the end of the string.

In [106]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 1.0+ MB


In [107]:
water_workers = chicago['Position Title'].str.lower().str.contains('water')
chicago[water_workers]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
...,...,...,...,...
29669,"VERMA, ANUPAM",MANAGING ENGINEER - WATER MANAGEMENT,WATER MGMNT,$111192.00
30239,"WASHINGTON, JOSEPH",WATER CHEMIST III,WATER MGMNT,$89676.00
30544,"WEST, THOMAS R",GEN SUPT OF WATER MANAGEMENT,WATER MGMNT,$115704.00
30991,"WILLIAMS, MATTHEW",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00


In [108]:
starts_with_civil = chicago['Position Title'].str.lower().str.startswith('civil')
chicago[starts_with_civil]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00
25,"ABDULSATTAR, MUDHAR",CIVIL ENGINEER II,WATER MGMNT,$58536.00
34,"ABRAHAM, GIRLEY T",CIVIL ENGINEER IV,WATER MGMNT,$106836.00
55,"ABUTALEB, AHMAD H",CIVIL ENGINEER II,WATER MGMNT,$89676.00
147,"ADAMS, TANERA C",CIVIL ENGINEER IV,TRANSPORTN,$106836.00
...,...,...,...,...
31623,"YANG, LUYANG",CIVIL ENGINEER V,TRANSPORTN,$116784.00
31656,"YEPEZ, JESUS",CIVIL ENGINEER IV,TRANSPORTN,$106836.00
31662,"YESUFU, STEPHANIE A",CIVIL ENGINEER III,TRANSPORTN,$92784.00
31797,"ZAKE, JOSHUA S",CIVIL ENGINEER IV,TRANSPORTN,$106836.00


In [109]:
ends_with_civil = chicago['Position Title'].str.lower().str.endswith('iv')
chicago[starts_with_civil & ends_with_civil]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00
34,"ABRAHAM, GIRLEY T",CIVIL ENGINEER IV,WATER MGMNT,$106836.00
147,"ADAMS, TANERA C",CIVIL ENGINEER IV,TRANSPORTN,$106836.00
2394,"BOLLAM, ALFRED R",CIVIL ENGINEER IV,TRANSPORTN,$106836.00
4272,"CASTILLA, FERNANDO",CIVIL ENGINEER IV,TRANSPORTN,$106836.00
4536,"CHAVES, OSWALDO E",CIVIL ENGINEER IV,TRANSPORTN,$106836.00
5935,"CRUZAT, ERNESTO M",CIVIL ENGINEER IV,BUILDINGS,$106836.00
8025,"ERRERA, JOHN S",CIVIL ENGINEER IV,TRANSPORTN,$106836.00
10810,"GRIFFIN, KALI R",CIVIL ENGINEER IV,TRANSPORTN,$106836.00
11528,"HARRIS, ABABI",CIVIL ENGINEER IV,TRANSPORTN,$106836.00


## String Methods on Index and Columns
- Use the `index` and `columns` attributes to access the **DataFrame** index/column labels.
- These objects support string methods via their own `str` attribute.

In [110]:
chicago = pd.read_csv('chicago.csv', index_col='Name').dropna(how='all').sort_index()
chicago['Department'] = chicago['Department'].astype('category')
chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [111]:
chicago.index = chicago.index.str.strip().str.title()
chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [112]:
chicago.columns = chicago.columns.str.upper()
chicago.head()

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


## The split Method
- The `str.split` method splits a string by the occurrence of a delimiter. Pandas returns a **Series** of lists.
- Use the `str.get` method to access a nested list element by its index position.

In [113]:
chicago = pd.read_csv('chicago.csv').dropna(how='all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [114]:
chicago['Position Title'].str.split(' ').str.get(0).value_counts()

Position Title
POLICE             10856
FIREFIGHTER-EMT     1509
SERGEANT            1186
POOL                 918
FIREFIGHTER          810
                   ...  
PORTFOLIO              1
SUPERVISOR             1
SUPT                   1
STRUCTURAL             1
CORPORATION            1
Name: count, Length: 320, dtype: int64

## More Practice with Splits

In [115]:
chicago['Name'].str.title().str.split(', ').str.get(1).str.strip().str.split(' ').str.get(0).value_counts()

Name
Michael     1153
John         899
James        676
Robert       622
Joseph       537
            ... 
Russ           1
Fabiola        1
Jurdon         1
Nateesha       1
Lilya          1
Name: count, Length: 5091, dtype: int64

In [116]:
def split_function(row):
    nome = row['Name'].split(',  ')
    nome = nome[1].split(' ')
    return nome[0].title()

chicago.apply(split_function, axis='columns').value_counts()

Michael     1153
John         899
James        676
Robert       622
Joseph       537
            ... 
Russ           1
Fabiola        1
Jurdon         1
Nateesha       1
Lilya          1
Name: count, Length: 5091, dtype: int64

## The expand and n Parameters of the split Method
- The `expand` parameter returns a **DataFrame** instead of a **Series** of lists.
- The `n` parameter limits the number of splits.

In [117]:
chicago[['Last Name','First Name']] = chicago['Name'].str.split(',', expand=True)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,Last Name,First Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M


In [123]:
# reorganizar a ordem das colunas:
colunas_reordenadas = ['Name', 'Last Name', 'First Name', 'Position Title','Department','Employee Annual Salary','Last Name']
chicago = chicago[colunas_reordenadas]
chicago.head()

Unnamed: 0,Name,Last Name,First Name,Position Title,Department,Employee Annual Salary,Last Name.1
0,"AARON, ELVIA J",AARON,ELVIA J,WATER RATE TAKER,WATER MGMNT,$90744.00,AARON
1,"AARON, JEFFERY M",AARON,JEFFERY M,POLICE OFFICER,POLICE,$84450.00,AARON
2,"AARON, KARINA",AARON,KARINA,POLICE OFFICER,POLICE,$84450.00,AARON
3,"AARON, KIMBERLEI R",AARON,KIMBERLEI R,CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON
4,"ABAD JR, VICENTE M",ABAD JR,VICENTE M,CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR


In [124]:
chicago['Position Title'].str.split(' ', expand=True, n=1).head()

Unnamed: 0,0,1
0,WATER,RATE TAKER
1,POLICE,OFFICER
2,POLICE,OFFICER
3,CHIEF,CONTRACT EXPEDITER
4,CIVIL,ENGINEER IV
