# Manipulando Dados Textuais com Pandas

In [2]:
# Começamos importando a biblioteca pandas com a abreviação 'pd'
import pandas as pd

In [3]:
# Carregamos os dados sobre salários de pessoas da cidade de Chicago
chicago = pd.read_csv('http://dadosdatascience.netlify.com/chicago.csv')
# Transformamos a coluna Department para o tipo category
chicago['Department'] = chicago['Department'].astype('category')
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


### Coletando Informações

In [48]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null category
Employee Annual Salary    32062 non-null object
dtypes: category(1), object(3)
memory usage: 784.4+ KB


In [49]:
# Número de departamentos únicos
chicago['Department'].nunique()

35

In [50]:
# Número de amostras da coluna Departamento
chicago['Department'].count()

32062

### Métodos Comuns de Strings

In [51]:
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


### Aplicando Transformações

In [52]:
# Transformando todos os nomes em uppercase
chicago['Name'].str.lower().str.upper()

0                 AARON,  ELVIA J
1               AARON,  JEFFERY M
2                  AARON,  KARINA
3             AARON,  KIMBERLEI R
4             ABAD JR,  VICENTE M
5                 ABARCA,  ANABEL
6               ABARCA,  EMMANUEL
7               ABASCAL,  REECE E
8            ABBASI,  CHRISTOPHER
9           ABBATACOLA,  ROBERT J
10          ABBATEMARCO,  JAMES J
11               ABBATE,  TERRY M
12               ABBOTT,  BETTY L
13              ABBOTT,  LYNISE M
14         ABBRUZZESE,  WILLIAM J
15                ABDALLAH,  ZAID
16          ABDELHADI,  ABDALMAHD
17            ABDELLATIF,  AREF R
18             ABDELMAJEID,  AZIZ
19            ABDOLLAHZADEH,  ALI
20       ABDUL-KARIM,  MUHAMMAD A
21            ABDULLAH,  DANIEL N
22               ABDULLAH,  KEVIN
23           ABDULLAH,  LAKENYA N
24            ABDULLAH,  RASHAD J
25           ABDULSATTAR,  MUDHAR
26           ABDUL-SHAKUR,  TAHIR
27         ABDULWAHAB,  ABUUBAIDA
28              ABEJERO,  JASON V
29        ABER

In [53]:
# Transformando todos os nomes em Title
chicago['Name'].str.title()

0                 Aaron,  Elvia J
1               Aaron,  Jeffery M
2                  Aaron,  Karina
3             Aaron,  Kimberlei R
4             Abad Jr,  Vicente M
5                 Abarca,  Anabel
6               Abarca,  Emmanuel
7               Abascal,  Reece E
8            Abbasi,  Christopher
9           Abbatacola,  Robert J
10          Abbatemarco,  James J
11               Abbate,  Terry M
12               Abbott,  Betty L
13              Abbott,  Lynise M
14         Abbruzzese,  William J
15                Abdallah,  Zaid
16          Abdelhadi,  Abdalmahd
17            Abdellatif,  Aref R
18             Abdelmajeid,  Aziz
19            Abdollahzadeh,  Ali
20       Abdul-Karim,  Muhammad A
21            Abdullah,  Daniel N
22               Abdullah,  Kevin
23           Abdullah,  Lakenya N
24            Abdullah,  Rashad J
25           Abdulsattar,  Mudhar
26           Abdul-Shakur,  Tahir
27         Abdulwahab,  Abuubaida
28              Abejero,  Jason V
29        Aber

In [54]:
# Altera cada string da coluna Position Title para lowercase
chicago['Position Title'] = chicago['Position Title'].str.lower()

In [55]:
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",water rate taker,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",police officer,POLICE,$84450.00
2,"AARON, KARINA",police officer,POLICE,$84450.00
3,"AARON, KIMBERLEI R",chief contract expediter,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",civil engineer iv,WATER MGMNT,$106836.00


In [15]:
# Tamanho de cada string da coluna Department
chicago['Department'].str.len()

0        11.0
1         6.0
2         6.0
3        16.0
4        11.0
5        12.0
6        13.0
7         4.0
8        12.0
9         8.0
10        4.0
11        6.0
12       16.0
13        6.0
14        4.0
15        6.0
16        6.0
17        4.0
18        6.0
19        4.0
20       11.0
21        4.0
22        4.0
23        4.0
24       16.0
25       11.0
26       13.0
27       16.0
28        6.0
29        4.0
         ... 
32033     6.0
32034     6.0
32035     6.0
32036    13.0
32037     4.0
32038    11.0
32039     4.0
32040     6.0
32041     4.0
32042    16.0
32043    13.0
32044     6.0
32045     4.0
32046     7.0
32047     6.0
32048     3.0
32049     4.0
32050    11.0
32051     8.0
32052     6.0
32053     4.0
32054     6.0
32055     6.0
32056    16.0
32057    16.0
32058     6.0
32059     6.0
32060     6.0
32061     4.0
32062     NaN
Name: Department, Length: 32063, dtype: float64

In [4]:
# Veja que aqui estamos dropando todos os valores nulos
chicago = pd.read_csv('http://dadosdatascience.netlify.com/chicago.csv').dropna(how='all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.tail()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32057,"ZYGADLO, MICHAEL J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,$99528.00
32058,"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,$87384.00
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [104]:
# Substitui todas as strings 'MGMNT' para 'MANAGMEMENT' (Coluna Department)
chicago['Department'].str.replace('MGMNT', 'MANAGMEMENT').head()

0    WATER MANAGMEMENT
1               POLICE
2               POLICE
3     GENERAL SERVICES
4    WATER MANAGMEMENT
Name: Department, dtype: object

In [105]:
# Removemos o símbolo $ e convertemos os valores para float (Coluna Employee Annual Salary)
chicago['Employee Annual Salary'] = chicago['Employee Annual Salary'].str.replace('$','').astype(float).head()

In [108]:
# Obtendo a soma da coluna Employee Annual Salary
chicago['Employee Annual Salary'].sum()

456360.0

In [109]:
# Obtendo a média da coluna Employee Annual Salary
chicago['Employee Annual Salary'].mean()

91272.0

In [110]:
# Obtendo os 10 maiores valores da coluna Employee Annual Salary
chicago['Employee Annual Salary'].nlargest(10)

4    106836.0
0     90744.0
3     89880.0
1     84450.0
2     84450.0
Name: Employee Annual Salary, dtype: float64

### Filtrando com Métodos de String

In [5]:
chicago = pd.read_csv('http://dadosdatascience.netlify.com/chicago.csv').dropna(how='all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.tail()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32057,"ZYGADLO, MICHAEL J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,$99528.00
32058,"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,$87384.00
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [63]:
# Criamos uma máscara para selecionar todos os elementos da coluna Position Title que contenham a palavra water
mask = chicago['Position Title'].str.lower().str.contains('water')
chicago[mask].head(8)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00


In [64]:
# Criamos uma máscara para selecionar todos os elementos da coluna Position Title que comecem com a palavra water
mask_2 = chicago['Position Title'].str.lower().str.startswith('water')
chicago[mask_2].head(8)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00
1285,"BAJIC, JOHN A",WATER METER MACHINIST,WATER MGMNT,$82576.00
2400,"BOLTON, BRIAN E",WATER RATE TAKER,WATER MGMNT,$78948.00
2586,"BOYCE, ADNER L",WATER CHEMIST II,WATER MGMNT,$82044.00


In [65]:
# Criamos uma máscara para selecionar todos os elementos da coluna Position Title que terminem com 'ist'
mask_3 = chicago['Position Title'].str.lower().str.endswith('ist')
chicago[mask_3].head(8)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,$99840.00
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,$81948.00
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,$89880.00
472,"ALLEN, ROBERT",MACHINIST,WATER MGMNT,$94328.00
705,"ANDERSON, EDWARD M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$91476.00
1022,"ARTEAGA, PAUL",MACHINIST,TRANSPORTN,$94328.00
1163,"AYALA JR, JUAN",FIELD SANITATION SPECIALIST,STREETS & SAN,$78948.00
1285,"BAJIC, JOHN A",WATER METER MACHINIST,WATER MGMNT,$82576.00


### Os Métodos strip(), lstrip() e rstrip()

In [26]:
chicago['Name'].str.rstrip().str.lstrip()

0                 AARON,  ELVIA J
1               AARON,  JEFFERY M
2                  AARON,  KARINA
3             AARON,  KIMBERLEI R
4             ABAD JR,  VICENTE M
5                 ABARCA,  ANABEL
6               ABARCA,  EMMANUEL
7               ABASCAL,  REECE E
8            ABBASI,  CHRISTOPHER
9           ABBATACOLA,  ROBERT J
10          ABBATEMARCO,  JAMES J
11               ABBATE,  TERRY M
12               ABBOTT,  BETTY L
13              ABBOTT,  LYNISE M
14         ABBRUZZESE,  WILLIAM J
15                ABDALLAH,  ZAID
16          ABDELHADI,  ABDALMAHD
17            ABDELLATIF,  AREF R
18             ABDELMAJEID,  AZIZ
19            ABDOLLAHZADEH,  ALI
20       ABDUL-KARIM,  MUHAMMAD A
21            ABDULLAH,  DANIEL N
22               ABDULLAH,  KEVIN
23           ABDULLAH,  LAKENYA N
24            ABDULLAH,  RASHAD J
25           ABDULSATTAR,  MUDHAR
26           ABDUL-SHAKUR,  TAHIR
27         ABDULWAHAB,  ABUUBAIDA
28              ABEJERO,  JASON V
29        ABER

In [27]:
chicago['Position Title'].str.strip()

0                                     WATER RATE TAKER
1                                       POLICE OFFICER
2                                       POLICE OFFICER
3                             CHIEF CONTRACT EXPEDITER
4                                    CIVIL ENGINEER IV
5                                 ASST TO THE ALDERMAN
6                                GENERAL LABORER - DSS
7                          TRAFFIC CONTROL AIDE-HOURLY
8                           STAFF ASST TO THE ALDERMAN
9                                  ELECTRICAL MECHANIC
10                                   FIRE ENGINEER-EMT
11                                      POLICE OFFICER
12                                  FOSTER GRANDPARENT
13                                           CLERK III
14                              INVESTIGATOR - IPRA II
15                                      POLICE OFFICER
16                                      POLICE OFFICER
17       FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC
18        

### Métodos de Strings em Índices e Colunas

In [6]:
# Veja que estamos carregando a coluna 'Name' como nosso índice
chicago = pd.read_csv('http://dadosdatascience.netlify.com/chicago.csv', index_col='Name').dropna(how='all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.tail()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"ZYGADLO, MICHAEL J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,$99528.00
"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,$87384.00
"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [82]:
# Transformando nosso índice para o modelo Title
chicago.index = chicago.index.str.strip().str.title()

In [83]:
chicago.tail()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Zygadlo, Michael J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,$99528.00
"Zygowicz, Peter J",POLICE OFFICER,POLICE,$87384.00
"Zymantas, Mark E",POLICE OFFICER,POLICE,$84450.00
"Zyrkowski, Carlo E",POLICE OFFICER,POLICE,$87384.00
"Zyskowski, Dariusz",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [84]:
# Transformando nossas colunas para uppercase
chicago.columns = chicago.columns.str.upper()

In [85]:
chicago.head()

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


### O Método split()

In [86]:
chicago = pd.read_csv('dados/chicago.csv').dropna(how='all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.tail()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32057,"ZYGADLO, MICHAEL J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,$99528.00
32058,"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,$87384.00
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [93]:
# Visualizando os 10 nomes que mais ocorrem 
chicago['Name'].str.split(',').str.get(0).str.title().value_counts().head(10)

Williams     293
Johnson      244
Smith        241
Brown        185
Jones        183
Rodriguez    171
Jackson      136
Garcia       130
Davis        127
Hernandez    110
Name: Name, dtype: int64

In [95]:
# Selecionando somente a primeira palavra dos elementos da coluna Position Title
chicago['Position Title'].str.split(' ').str.get(0).head(10)

0         WATER
1        POLICE
2        POLICE
3         CHIEF
4         CIVIL
5          ASST
6       GENERAL
7       TRAFFIC
8         STAFF
9    ELECTRICAL
Name: Position Title, dtype: object

In [96]:
# Selecionando apenas o segundo nome dos elementos da coluna Name
chicago['Name'].str.split(',').str.get(1).str.strip().str.split(' ').str.get(0).head()

0        ELVIA
1      JEFFERY
2       KARINA
3    KIMBERLEI
4      VICENTE
Name: Name, dtype: object

In [99]:
# Criando duas colunas a partir da coluna Position TItle com o método split()
chicago[['First Title Word', 'Second Title Word']] = chicago['Position Title'].str.split(' ', expand=True, n=1).head()

In [100]:
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Title Word,Second Title Word
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,POLICE,OFFICER
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,CHIEF,CONTRACT EXPEDITER
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,CIVIL,ENGINEER IV
