In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample Series mixing upper- and lower-case strings with one missing value (NaN).
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])

In [3]:
# Display the Series.
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [4]:
s.str.lower() # lower-case every string; the NaN entry is left as NaN

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [5]:
s.str.upper() # upper-case every string; the NaN entry is left as NaN

0       A
1       B
2       C
3    AABA
4    BACA
5     NaN
6    CABA
7     DOG
8     CAT
dtype: object

In [6]:
s.str.len() # number of characters in each string; the NaN entry stays NaN, so the result is float64

0    1.0
1    1.0
2    1.0
3    4.0
4    4.0
5    NaN
6    4.0
7    3.0
8    3.0
dtype: float64

In [7]:
# Index whose labels carry leading and/or trailing spaces.
idx = pd.Index([' jack', 'jill ', ' jesse ', 'frank'])

In [8]:
# Display the Index.
idx

Index([' jack', 'jill ', ' jesse ', 'frank'], dtype='object')

In [9]:
idx.str.strip() # strips whitespace from both ends of each label (not from the middle)

Index(['jack', 'jill', 'jesse', 'frank'], dtype='object')

In [10]:
idx.str.lstrip() # strips leading (left-side) whitespace only

Index(['jack', 'jill ', 'jesse ', 'frank'], dtype='object')

In [11]:
idx.str.rstrip() # strips trailing (right-side) whitespace only

Index([' jack', 'jill', ' jesse', 'frank'], dtype='object')

In [13]:
# Build a 3x2 frame of random normals whose column labels carry leading and
# trailing spaces, so the column-label cleanup below has something to strip.
# NOTE(review): no random seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.randn(3, 2), columns=[' Column A ', ' Column B '],
                 index=range(3))
df

Unnamed: 0,Column A,Column B
0,0.26233,-0.056734
1,-0.090801,1.142442
2,0.470581,-0.921572


In [14]:
df.columns.str.strip() # strips the surrounding spaces from the column labels; the result is a new Index, df itself is not reassigned here

Index(['Column A', 'Column B'], dtype='object')

In [16]:
# String methods can be chained, but every step has to go through the
# `.str` accessor again.
df.columns.str.strip().str.lower().str.replace(' ', '_')

Index(['column_a', 'column_b'], dtype='object')

# 1 Splitting and Replacing Strings

In [19]:
# Underscore-delimited strings plus one missing value, used by the split examples.
s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
s2

0    a_b_c
1    c_d_e
2      NaN
3    f_g_h
dtype: object

In [20]:
# Split each string on '_' into a list of pieces; the NaN entry stays NaN.
s2.str.split('_')

0    [a, b, c]
1    [c, d, e]
2          NaN
3    [f, g, h]
dtype: object

In [23]:
# Series.get(1) is a lookup on the Series itself: it returns the whole list
# stored at label 1, not the second piece of every row.
s2.str.split('_').get(1)

['c', 'd', 'e']

In [24]:
# .str.get(1) works element-wise: it takes item 1 from each row's list.
s2.str.split('_').str.get(1)

0      b
1      d
2    NaN
3      g
dtype: object

In [26]:
s2.str.split('_').str[1] # same result as .str.get(1)

0      b
1      d
2    NaN
3      g
dtype: object

In [28]:
s2.str.split('_', expand=True) # expand=True returns a DataFrame with one column per split piece

Unnamed: 0,0,1,2
0,a,b,c
1,c,d,e
2,,,
3,f,g,h


In [32]:
s2.str.split('_', expand=True, n=1) # n=1 caps the number of splits at one, so the rest of the string stays together

Unnamed: 0,0,1
0,a,b_c
1,c,d_e
2,,
3,f,g_h


In [34]:
s2.str.rsplit('_', expand=True, n=1)  # rsplit starts splitting from the right-hand end of the string

Unnamed: 0,0,1
0,a_b,c
1,c_d,e
2,,
3,f_g,h


### 1.1 Utilizando expressão regular

In [35]:
# Sample strings for the regex examples: includes an empty string as well as a NaN.
s3 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', '', np.nan, 'CABA', 'dog', 'cat'])
s3

0       A
1       B
2       C
3    Aaba
4    Baca
5        
6     NaN
7    CABA
8     dog
9     cat
dtype: object

In [39]:
# Replace, ignoring case, either "any character followed by 'a'" at the start
# of the string, or 'dog' anywhere in it.
# regex=True is spelled out because pandas >= 2.0 treats the pattern as a
# literal string by default, so the anchor and alternation would not be
# interpreted as regex syntax.
s3.str.replace(r'^.a|dog', 'XX-XX ', case=False, regex=True)

0           A
1           B
2           C
3    XX-XX ba
4    XX-XX ca
5            
6         NaN
7    XX-XX BA
8      XX-XX 
9     XX-XX t
dtype: object

In [42]:
# Currency-like strings used to show how '$' behaves in str.replace.
dollars = pd.Series(data=['12', '-$10', '$10,000'])
dollars

0         12
1       -$10
2    $10,000
dtype: object

In [43]:
# Remove the literal dollar sign. regex=False states that intent explicitly:
# read as a regular expression, '$' is the end-of-string anchor, not a character.
dollars.str.replace('$', '', regex=False)

0        12
1       -10
2    10,000
dtype: object

In [44]:
# Pitfall demo: as a regex, '$' is the end-of-string anchor, so '-$' only
# matches a '-' that ends the string and nothing in this Series is replaced.
# regex=True is spelled out because pandas >= 2.0 defaults to literal matching,
# which would replace the literal text '-$' instead.
dollars.str.replace('-$', '-', regex=True)

0         12
1       -$10
2    $10,000
dtype: object

In [46]:
# Escaping the dollar sign makes the regex match the literal text '-$'.
# The replacement is empty, so the minus sign is removed together with the '$'.
# regex=True is spelled out because pandas >= 2.0 defaults to literal matching,
# where the backslash would be taken as part of the text to find.
dollars.str.replace(r'-\$', '', regex=True)

0         12
1         10
2    $10,000
dtype: object

In [47]:
pat = r'[a-z]+' # one or more consecutive lowercase ASCII letters

In [50]:
def repl(m):
    """Return the full matched text reversed.

    Intended as a callable replacement: it receives a regex match object
    and returns the string that should replace the match.
    """
    return m.group(0)[::-1]

In [51]:
# Reverse every run of lowercase letters by passing a callable as the replacement.
# A callable replacement is only accepted in regex mode, and pandas >= 2.0
# defaults to regex=False, so regex=True must be explicit.
pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(pat, repl, regex=True)

0    oof 123
1    rab zab
2        NaN
dtype: object

In [52]:
# Three space-separated words captured as the named groups 'one', 'two' and 'three'.
pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"

In [53]:
def repl(m):
    """Return the text of the named group 'two' with its letter case swapped.

    Intended as a callable replacement: it receives a regex match object
    and returns the string that should replace the match.
    """
    return m.group('two').swapcase()

In [54]:
# Replace the whole three-word match with whatever the callable returns.
# A callable replacement is only accepted in regex mode, and pandas >= 2.0
# defaults to regex=False, so regex=True must be explicit.
pd.Series(['Foo Bar Baz', np.nan]).str.replace(pat, repl, regex=True)

0    bAR
1    NaN
dtype: object

In [63]:
myStr = pd.Series(['WALmir'])
myStr.str.swapcase() # swaps case: upper-case letters become lower-case and vice versa

0    walMIR
dtype: object

In [64]:
import re

In [65]:
# Pre-compiled pattern with case-insensitivity baked into the compiled object.
regex_pat = re.compile(r'^.a|dog', re.IGNORECASE)

In [66]:
# Re-display s3 for reference.
s3

0       A
1       B
2       C
3    Aaba
4    Baca
5        
6     NaN
7    CABA
8     dog
9     cat
dtype: object

In [68]:
# Same replacement as with the string pattern, but using the compiled regex;
# case-insensitivity comes from the flag stored in the compiled object.
# A compiled pattern is only accepted in regex mode, and pandas >= 2.0
# defaults to regex=False, so regex=True must be explicit.
s3.str.replace(regex_pat, 'XX-XX ', regex=True)

0           A
1           B
2           C
3    XX-XX ba
4    XX-XX ca
5            
6         NaN
7    XX-XX BA
8      XX-XX 
9     XX-XX t
dtype: object

# 2 Indexing with .str

In [69]:
# Sample strings (with one NaN) for the .str indexing examples.
s = pd.Series(data=['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [70]:
# Positional indexing through .str: first character of each string; NaN stays NaN.
s.str[0]

0      A
1      B
2      C
3      A
4      B
5    NaN
6      C
7      d
8      c
dtype: object

In [72]:
s.str[1] # when a string is too short to have this position, the result for that row is NaN

0    NaN
1    NaN
2    NaN
3      a
4      a
5    NaN
6      A
7      o
8      a
dtype: object

# 3 Extracting Substrings

## 3.1 Extract first match in each subject (extract)

In [73]:
# Extract two capture groups (a letter a/b and a digit) from the first match in
# each string; rows that do not match yield NaN in every column.
# The pattern is a raw string so that \d reaches the regex engine instead of
# being an invalid escape sequence in the Python string literal.
pd.Series(['a1', 'b2', 'c3']).str.extract(r'([ab])(\d)', expand=False)

Unnamed: 0,0,1
0,a,1.0
1,b,2.0
2,,


In [77]:
# Named groups become the column names of the extracted result.
# The pattern is a raw string so that \d reaches the regex engine instead of
# being an invalid escape sequence in the Python string literal.
pd.Series(['a1', 'b2', 'c3']).str.extract(r'(?P<letter>[ab])(?P<digit>\d)', expand=False)

Unnamed: 0,letter,digit
0,a,1.0
1,b,2.0
2,,


In [78]:
# Optional group: the letter group may be absent, so 'c3' still matches and
# yields NaN for the letter and '3' for the digit.
# The pattern is a raw string so that \d reaches the regex engine instead of
# being an invalid escape sequence in the Python string literal.
pd.Series(['a1', 'b2', 'c3']).str.extract(r'([ab])?(\d)', expand=False)

Unnamed: 0,0,1
0,a,1
1,b,2
2,,3


In [82]:
# Series with string labels, so the .str methods can be shown on the index too.
s = pd.Series(['a1', 'b2', 'c3'], index=['A11', 'B22', 'C33'])
s

A11    a1
B22    b2
C33    c3
dtype: object

In [83]:
# Display the string index of s.
s.index

Index(['A11', 'B22', 'C33'], dtype='object')

In [84]:
# With expand=True the result is a DataFrame, with the group name as column label.
s.index.str.extract("(?P<letter>[a-zA-Z])", expand=True)

Unnamed: 0,letter
0,A
1,B
2,C


In [85]:
# With a single group and expand=False the result is an Index named after the group.
s.index.str.extract("(?P<letter>[a-zA-Z])", expand=False)

Index(['A', 'B', 'C'], dtype='object', name='letter')

In [89]:
s.index.str.extract("(?P<letter>[a-zA-Z])([0-9]+)", expand=True) # with more than one group on an Index, expand=False raises an error, so expand=True is used

Unnamed: 0,letter,1
0,A,11
1,B,22
2,C,33


## 3.2 Extract all matches in each subject (extractall)

In [90]:
# 'a1a2' contains two letter+digit matches, which is what separates
# extract from extractall in the examples that follow.
s = pd.Series(data=['a1a2', 'b1', 'c1'], index=['A', 'B', 'C'])
s

A    a1a2
B      b1
C      c1
dtype: object

In [91]:
# One lowercase letter followed by one digit, captured as 'letter' and 'digit'.
two_groups = '(?P<letter>[a-z])(?P<digit>[0-9])'

In [92]:
# extract keeps only the first match in each string.
s.str.extract(two_groups, expand=True)

Unnamed: 0,letter,digit
A,a,1
B,b,1
C,c,1


In [93]:
# extractall returns every match as its own row; the extra 'match' index level
# numbers the matches found within each original entry.
s.str.extractall(two_groups)

Unnamed: 0_level_0,Unnamed: 1_level_0,letter,digit
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0,a,1
A,1,a,2
B,0,b,1
C,0,c,1


## 3.3 Testing for Strings that Match or Contain a Pattern

In [94]:
pattern = r'[0-9][a-z]' # one digit immediately followed by one lowercase letter

In [95]:
# contains is True when the pattern is found anywhere in the string ('03c' qualifies).
pd.Series(['1', '2', '3a', '3b', '03c']).str.contains(pattern)

0    False
1    False
2     True
3     True
4     True
dtype: bool

In [98]:
# match requires the pattern to match at the start of the string, so '03c' is False.
pd.Series(['1', '2', '3a', '3b', '03c']).str.match(pattern) 

0    False
1    False
2     True
3     True
4    False
dtype: bool

In [99]:
# Sample strings with one NaN, used to show how contains() handles missing values.
s4 = pd.Series(data=['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s4

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [100]:
# na=False makes the missing entry come out as False, giving a plain boolean
# result; the search for 'A' is case-sensitive.
s4.str.contains('A', na=False)

0     True
1    False
2    False
3     True
4    False
5    False
6     True
7    False
8    False
dtype: bool

# 4 Creating Indicator Variables

In [102]:
# Pipe-separated category labels, with one missing value.
s = pd.Series(data=['a', 'a|b', np.nan, 'a|c'])
s

0      a
1    a|b
2    NaN
3    a|c
dtype: object

In [103]:
# Split each entry on '|' and build one 0/1 indicator column per distinct
# label; the NaN row is all zeros.
s.str.get_dummies(sep='|')

Unnamed: 0,a,b,c
0,1,0,0
1,1,1,0
2,0,0,0
3,1,0,1


In [104]:
# The same pipe-separated labels, this time held in an Index.
idx = pd.Index(data=['a', 'a|b', np.nan, 'a|c'])
idx

Index(['a', 'a|b', nan, 'a|c'], dtype='object')

In [107]:
idx.str.get_dummies(sep='|') # on an Index the indicators come back as a MultiIndex, one level per label

MultiIndex(levels=[[0, 1], [0, 1], [0, 1]],
           labels=[[1, 1, 0, 1], [0, 1, 0, 0], [0, 0, 0, 1]],
           names=['a', 'b', 'c'])