In [7]:
import re

import numpy as np
import pandas as pd

# Working with text data
## Text data types

There are two ways to store text data in pandas:

- object-dtype NumPy array.

- StringDtype extension type.

We recommend using StringDtype to store text data.

Prior to pandas 1.0, object dtype was the only option. This was unfortunate for many reasons:

1. You can accidentally store a mixture of strings and non-strings in an object dtype array. It’s better to have a dedicated dtype.

2. object dtype breaks dtype-specific operations like DataFrame.select_dtypes(). There isn’t a clear way to select just text while excluding non-text but still object-dtype columns.

3. When reading code, the contents of an object dtype array are less clear than 'string'.

Currently, the performance of object dtype arrays of strings and arrays.StringArray is about the same. We expect future enhancements to significantly increase the performance and lower the memory overhead of StringArray.

In [2]:
pd.Series(["a", "b", "c"])


0    a
1    b
2    c
dtype: object

In [3]:
pd.Series(["a", "b", "c"], dtype='string')


0    a
1    b
2    c
dtype: string

In [4]:
pd.Series(["a", "b", "c"], dtype=pd.StringDtype())

0    a
1    b
2    c
dtype: string

In [5]:
s = pd.Series(["a", "b", "c"])
s.astype('string')

0    a
1    b
2    c
dtype: string

In [8]:
s1 = pd.Series([1, 2, np.nan], dtype="Int64")

s1

0       1
1       2
2    <NA>
dtype: Int64

## Behavior differences

In [9]:
s = pd.Series(["a", None, "b"], dtype="string")

s

0       a
1    <NA>
2       b
dtype: string

In [10]:
s.str.count("a")


0       1
1    <NA>
2       0
dtype: Int64

In [11]:
s.dropna().str.count("a")


0    1
2    0
dtype: Int64

In [12]:
s2 = pd.Series(["a", None, "b"], dtype="object")

s2.str.count("a")

0    1.0
1    NaN
2    0.0
dtype: float64

In [13]:
s2.dropna().str.count("a")


0    1
2    0
dtype: int64

In [14]:
s.str.isdigit()

0    False
1     <NA>
2    False
dtype: boolean

# String methods

In [15]:
s = pd.Series(
    ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"], dtype="string"
)

In [16]:
s.str.lower()


0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: string

In [17]:
s.str.upper()


0       A
1       B
2       C
3    AABA
4    BACA
5    <NA>
6    CABA
7     DOG
8     CAT
dtype: string

In [18]:
s.str.len()


0       1
1       1
2       1
3       4
4       4
5    <NA>
6       4
7       3
8       3
dtype: Int64

In [19]:
idx = pd.Index([" jack", "jill ", " jesse ", "frank"])


In [21]:
idx = pd.Index([" jack", "jill ", " jesse ", "frank"])

print(idx.str.strip())

print(idx.str.lstrip())

print(idx.str.rstrip())

Index(['jack', 'jill', 'jesse', 'frank'], dtype='object')
Index(['jack', 'jill ', 'jesse ', 'frank'], dtype='object')
Index([' jack', 'jill', ' jesse', 'frank'], dtype='object')


In [32]:
# Demo frame whose column labels carry leading/trailing spaces, used to show
# string clean-up on df.columns in the following cells.
# NOTE: np.random.randn is called without a seed, so the numeric values
# displayed for this frame change every time the cell is re-run.
df = pd.DataFrame(
    np.random.randn(3, 2), columns=[" Column A ", " Column B "], index=range(3)
)

df

Unnamed: 0,Column A,Column B
0,-0.06419,1.128095
1,0.453684,-0.275639
2,0.339591,1.681056


In [23]:
type(df.columns)

pandas.core.indexes.base.Index

Note: since df.columns is an Index object, we can use the .str accessor on it directly.

In [25]:
df.columns.str.strip()

Index(['Column A', 'Column B'], dtype='object')

In [26]:
df.columns.str.lower()


Index([' column a ', ' column b '], dtype='object')

In [33]:
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
df

Unnamed: 0,column_a,column_b
0,-0.06419,1.128095
1,0.453684,-0.275639
2,0.339591,1.681056


In [30]:
df.columns = [" Column A", " Column B "]
df.columns = df.columns.str.lower().str.replace(" ", "_")
df

Unnamed: 0,_column_a,_column_b_
0,-1.013046,0.583679
1,-2.220471,-0.048091
2,0.121998,0.197507


# Splitting and replacing strings

In [34]:
s2 = pd.Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype="string")

s2.str.split("_")

0    [a, b, c]
1    [c, d, e]
2         <NA>
3    [f, g, h]
dtype: object

In [35]:
s2.str.split("_").str.get(1)


0       b
1       d
2    <NA>
3       g
dtype: object

In [37]:
s2.str.split("_").str[1]

0       b
1       d
2    <NA>
3       g
dtype: object

In [38]:
s2.str.split("_", expand=True)


Unnamed: 0,0,1,2
0,a,b,c
1,c,d,e
2,,,
3,f,g,h


In [39]:
# The n argument limits the number of splits performed, so with expand=True
# the result has at most n + 1 columns.
s2.str.split("_", expand=True, n=1)


Unnamed: 0,0,1
0,a,b_c
1,c,d_e
2,,
3,f,g_h


In [40]:
s2.str.split("_", expand=True, n=2)


Unnamed: 0,0,1,2
0,a,b,c
1,c,d,e
2,,,
3,f,g,h


In [42]:
s3 = pd.Series(
    ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"],
    dtype="string",
)

s3

0       A
1       B
2       C
3    Aaba
4    Baca
5        
6    <NA>
7    CABA
8     dog
9     cat
dtype: string

In [43]:
s3.str.replace("^.a|dog", "XX-XX ", case=False, regex=True)


0           A
1           B
2           C
3    XX-XX ba
4    XX-XX ca
5            
6        <NA>
7    XX-XX BA
8      XX-XX 
9     XX-XX t
dtype: string

In [44]:
"^.a|dog"

'^.a|dog'

In [45]:
dollars = pd.Series(["12", "-$10", "$10,000"], dtype="string")

In [46]:
dollars.str.replace(r"-\$", "-", regex=True)


0         12
1        -10
2    $10,000
dtype: string

In [47]:
dollars.str.replace("-$", "-", regex=False)


0         12
1        -10
2    $10,000
dtype: string

In [48]:
pat = r"[a-z]+"
pat

'[a-z]+'

In [49]:
def repl(m):
    """Return the full text of regex match ``m`` with its characters reversed."""
    matched_text = m.group(0)
    return matched_text[::-1]

In [50]:
pd.Series(["foo 123", "bar baz", np.nan], dtype="string").str.replace(
    pat, repl, regex=True
)

0    oof 123
1    rab zab
2       <NA>
dtype: string

In [52]:
pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
pat

'(?P<one>\\w+) (?P<two>\\w+) (?P<three>\\w+)'

In [53]:
def repl(m):
    """Return the text captured by the named group ``two`` with its case swapped."""
    second_word = m.group("two")
    return second_word.swapcase()

In [54]:
pd.Series(["Foo Bar Baz", np.nan], dtype="string").str.replace(
    pat, repl, regex=True
)

0     bAR
1    <NA>
dtype: string

In [55]:
help(pd.Series.str.replace)

Help on function replace in module pandas.core.strings.accessor:

replace(self, pat: 'str | re.Pattern', repl: 'str | Callable', n: 'int' = -1, case: 'bool | None' = None, flags: 'int' = 0, regex: 'bool' = False)
    Replace each occurrence of pattern/regex in the Series/Index.
    
    Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on
    the regex value.
    
    Parameters
    ----------
    pat : str or compiled regex
        String can be a character sequence or regular expression.
    repl : str or callable
        Replacement string or a callable. The callable is passed the regex
        match object and must return a replacement string to be used.
        See :func:`re.sub`.
    n : int, default -1 (all)
        Number of replacements to make from start.
    case : bool, default None
        Determines if replace is case sensitive:
    
        - If True, case sensitive (the default if `pat` is a string)
        - Set to False for case insensitive
        - Cann

In [56]:
# `re` is imported with the other imports at the top of the notebook.
# Case-insensitivity is baked into the compiled pattern: str.replace does not
# accept `case`/`flags` together with a compiled regex, so they are set here.
regex_pat = re.compile(r"^.a|dog", flags=re.IGNORECASE)

s3.str.replace(regex_pat, "XX-XX ", regex=True)

0           A
1           B
2           C
3    XX-XX ba
4    XX-XX ca
5            
6        <NA>
7    XX-XX BA
8      XX-XX 
9     XX-XX t
dtype: string

In [57]:
s = pd.Series(["str_foo", "str_bar", "no_prefix"])

s.str.removeprefix("str_")

0          foo
1          bar
2    no_prefix
dtype: object

In [59]:
s = pd.Series(["foo_str", "bar_str", "no_suffix"])
s.str.removesuffix("_str")


0          foo
1          bar
2    no_suffix
dtype: object

# Concatenation

## Concatenating a single Series into a string

In [60]:
s = pd.Series(["a", "b", "c", "d"], dtype="string")
s.str.cat(sep=",")


'a,b,c,d'

In [61]:
t = pd.Series(["a", "b", np.nan, "d"], dtype="string")

t.str.cat(sep=",")

'a,b,d'

## Concatenating a Series and something list-like into a Series

In [62]:
s

0    a
1    b
2    c
3    d
dtype: string

In [63]:
s.str.cat(["A", "B", "C", "D"])


0    aA
1    bB
2    cC
3    dD
dtype: string

In [64]:
s.str.cat(t)


0      aa
1      bb
2    <NA>
3      dd
dtype: string

In [65]:
s.str.cat(t, na_rep="-")


0    aa
1    bb
2    c-
3    dd
dtype: string

## Concatenating a Series and something array-like into a Series

In [81]:
d = pd.concat([t, s], axis=1)

d

Unnamed: 0,0,1
0,a,a
1,b,b
2,,c
3,d,d


In [78]:
s.str.cat(d, na_rep="-")


0    aaa
1    bbb
2    c-c
3    ddd
dtype: string

## Concatenating a Series and an indexed object into a Series, with alignment

In [66]:
u = pd.Series(["b", "d", "a", "c"], index=[1, 3, 0, 2], dtype="string")
u

1    b
3    d
0    a
2    c
dtype: string

In [67]:
s

0    a
1    b
2    c
3    d
dtype: string

In [68]:
s.str.cat(u)


0    aa
1    bb
2    cc
3    dd
dtype: string

In [69]:
s.str.cat(u, join="left")


0    aa
1    bb
2    cc
3    dd
dtype: string

In [71]:
v = pd.Series(["z", "a", "b", "d", "e"], index=[-1, 0, 1, 3, 4], dtype="string")
v

-1    z
 0    a
 1    b
 3    d
 4    e
dtype: string

In [72]:
s

0    a
1    b
2    c
3    d
dtype: string

In [73]:
v

-1    z
 0    a
 1    b
 3    d
 4    e
dtype: string

In [74]:
s.str.cat(v, join="left", na_rep="-")


0    aa
1    bb
2    c-
3    dd
dtype: string

In [75]:
s.str.cat(v, join="outer", na_rep="-")


-1    -z
 0    aa
 1    bb
 2    c-
 3    dd
 4    -e
dtype: string

In [79]:
f = d.loc[[3, 2, 1, 0], :]
f

Unnamed: 0,0,1
3,d,d
2,,c
1,b,b
0,a,a


In [80]:
d

Unnamed: 0,0,1
0,a,a
1,b,b
2,,c
3,d,d


## Concatenating a Series and many objects into a Series

In [82]:
s

0    a
1    b
2    c
3    d
dtype: string

In [83]:
u

1    b
3    d
0    a
2    c
dtype: string

In [84]:
s.str.cat([u, u.to_numpy()], join="left")


0    aab
1    bbd
2    cca
3    ddc
dtype: string

All elements without an index (e.g. np.ndarray) within the passed list-like must match in length to the calling Series (or Index), but Series and Index may have arbitrary length (as long as alignment is not disabled with join=None):



In [85]:
v

-1    z
 0    a
 1    b
 3    d
 4    e
dtype: string

In [86]:
s.str.cat([v, u, u.to_numpy()], join="outer", na_rep="-")


-1    -z--
0     aaab
1     bbbd
2     c-ca
3     dddc
4     -e--
dtype: string

If using join='right' on a list-like of others that contains different indexes, the union of these indexes will be used as the basis for the final concatenation:

In [87]:
u.loc[[3]]


3    d
dtype: string

In [88]:
v.loc[[-1, 0]]


-1    z
 0    a
dtype: string

In [89]:
s.str.cat([u.loc[[3]], v.loc[[-1, 0]]], join="right", na_rep="-")


 3    dd-
-1    --z
 0    a-a
dtype: string

# Indexing with .str

In [90]:
s = pd.Series(
    ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"], dtype="string"
)

In [94]:
s.str[0]

0       A
1       B
2       C
3       A
4       B
5    <NA>
6       C
7       d
8       c
dtype: string

In [97]:
s.str[0:9]


0       A
1       B
2       C
3    Aaba
4    Baca
5    <NA>
6    CABA
7     dog
8     cat
dtype: string

# Extracting substrings

## Extract first match in each subject (extract)

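Note on r"your string": the r prefix marks a raw string literal, so backslashes (e.g. \d) are passed to the regex engine unchanged instead of being interpreted as Python escape sequences.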

In [98]:
pd.Series(
    ["a1", "b2", "c3"],
    dtype="string",
).str.extract(r"([ab])(\d)", expand=False)


Unnamed: 0,0,1
0,a,1.0
1,b,2.0
2,,


In [99]:
pd.Series(["a1", "b2", "c3"], dtype="string").str.extract(
    r"(?P<letter>[ab])(?P<digit>\d)", expand=False
)

Unnamed: 0,letter,digit
0,a,1.0
1,b,2.0
2,,


In [100]:
pd.Series(["a1", "b2", "c3"], dtype="string").str.extract(r"[ab](\d)", expand=True)


Unnamed: 0,0
0,1.0
1,2.0
2,


In [101]:
pd.Series(
    ["a1", "b2", "3"],
    dtype="string",
).str.extract(r"([ab])?(\d)", expand=True)


Unnamed: 0,0,1
0,a,1
1,b,2
2,,3


In [102]:
pd.Series(["a1", "b2", "c3"], dtype="string").str.extract(r"[ab](\d)", expand=True)


Unnamed: 0,0
0,1.0
1,2.0
2,


In [103]:
s = pd.Series(["a1", "b2", "c3"], ["A11", "B22", "C33"], dtype="string")
s.index.str.extract("(?P<letter>[a-zA-Z])", expand=True)


Unnamed: 0,letter
0,A
1,B
2,C


## Extract all matches in each subject (extractall)

In [104]:
s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"], dtype="string")

# One named capture group per output column.
two_groups = r"(?P<letter>[a-z])(?P<digit>[0-9])"

# extract() keeps only the first match in each element, so "a1a2" yields
# just ("a", "1") here.
s.str.extract(two_groups, expand=True)


Unnamed: 0,letter,digit
A,a,1
B,b,1
C,c,1


In [105]:
s.str.extractall(two_groups)


Unnamed: 0_level_0,Unnamed: 1_level_0,letter,digit
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0,a,1
A,1,a,2
B,0,b,1
C,0,c,1


In [107]:
s = pd.Series(["a3", "b3", "c2"], dtype="string")
s

0    a3
1    b3
2    c2
dtype: string

# Testing for strings that match or contain a pattern

In [108]:
pattern = r"[0-9][a-z]"

pd.Series(
    ["1", "2", "3a", "3b", "03c", "4dx"],
    dtype="string",
).str.contains(pattern)

0    False
1    False
2     True
3     True
4     True
5     True
dtype: boolean

In [109]:
pd.Series(
    ["1", "2", "3a", "3b", "03c", "4dx"],
    dtype="string",
).str.match(pattern)


0    False
1    False
2     True
3     True
4    False
5     True
dtype: boolean

In [110]:
pd.Series(
    ["1", "2", "3a", "3b", "03c", "4dx"],
    dtype="string",
).str.fullmatch(pattern)

0    False
1    False
2     True
3     True
4    False
5    False
dtype: boolean

# Method summary
Method

Description

cat()

Concatenate strings

split()

Split strings on delimiter

rsplit()

Split strings on delimiter working from the end of the string

get()

Index into each element (retrieve i-th element)

join()

Join strings in each element of the Series with passed separator

get_dummies()

Split strings on the delimiter returning DataFrame of dummy variables

contains()

Return boolean array if each string contains pattern/regex

replace()

Replace occurrences of pattern/regex/string with some other string or the return value of a callable given the occurrence

removeprefix()

Remove prefix from string, i.e. only remove if string starts with prefix.

removesuffix()

Remove suffix from string, i.e. only remove if string ends with suffix.

repeat()

Duplicate values (s.str.repeat(3) is equivalent to x * 3 for each element x)

pad()

Add whitespace to left, right, or both sides of strings

center()

Equivalent to str.center

ljust()

Equivalent to str.ljust

rjust()

Equivalent to str.rjust

zfill()

Equivalent to str.zfill

wrap()

Split long strings into lines with length less than a given width

slice()

Slice each string in the Series

slice_replace()

Replace slice in each string with passed value

count()

Count occurrences of pattern

startswith()

Equivalent to str.startswith(pat) for each element

endswith()

Equivalent to str.endswith(pat) for each element

findall()

Compute list of all occurrences of pattern/regex for each string

match()

Call re.match on each element, returning a boolean

extract()

Call re.search on each element, returning DataFrame with one row for each element and one column for each regex capture group

extractall()

Call re.findall on each element, returning DataFrame with one row for each match and one column for each regex capture group

len()

Compute string lengths

strip()

Equivalent to str.strip

rstrip()

Equivalent to str.rstrip

lstrip()

Equivalent to str.lstrip

partition()

Equivalent to str.partition

rpartition()

Equivalent to str.rpartition

lower()

Equivalent to str.lower

casefold()

Equivalent to str.casefold

upper()

Equivalent to str.upper

find()

Equivalent to str.find

rfind()

Equivalent to str.rfind

index()

Equivalent to str.index

rindex()

Equivalent to str.rindex

capitalize()

Equivalent to str.capitalize

swapcase()

Equivalent to str.swapcase

normalize()

Return Unicode normal form. Equivalent to unicodedata.normalize

translate()

Equivalent to str.translate

isalnum()

Equivalent to str.isalnum

isalpha()

Equivalent to str.isalpha

isdigit()

Equivalent to str.isdigit

isspace()

Equivalent to str.isspace

islower()

Equivalent to str.islower

isupper()

Equivalent to str.isupper

istitle()

Equivalent to str.istitle

isnumeric()

Equivalent to str.isnumeric

isdecimal()

Equivalent to str.isdecimal