Source: pandas user guide, "Working with text data" — https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html

`Series` and `Index` are equipped with a set of string processing methods that make it easy to operate on each element of the array. Perhaps most importantly, these methods exclude missing/NA values automatically. These are accessed via the `str` attribute and generally have names matching the equivalent (scalar) built-in string methods:

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Sample strings in mixed case, with one missing value (np.nan) at position 5.
s = pd.Series(
    [
        "A", "B", "C",
        "Aaba", "Baca",
        np.nan,
        "CABA", "dog", "cat",
    ]
)

In [6]:
# Bare expression as the cell's last line: the notebook displays the Series.
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [16]:
# Walk the lower-cased, upper-cased and length views of `s` in lockstep and
# print one tab-separated line per non-missing element.
lowered = s.str.lower()
uppered = s.str.upper()
lengths = s.str.len()
for low, up, n_chars in zip(lowered, uppered, lengths):
    # A missing lower-cased value marks the NaN row of `s`; skip it.
    if pd.isnull(low):
        continue
    # int() renders the length as a whole number.
    print(f"{low} \t {up} \t {int(n_chars)}")

a 	 A 	 1
b 	 B 	 1
c 	 C 	 1
aaba 	 AABA 	 4
baca 	 BACA 	 4
caba 	 CABA 	 4
dog 	 DOG 	 3
cat 	 CAT 	 3


In [17]:
# Index labels padded with leading and/or trailing spaces.
idx = pd.Index(
    [
        " jack",
        "jill ",
        " jesse ",
        "frank",
    ]
)

In [18]:
# Strip leading/trailing whitespace from every label. The result is only
# displayed, not assigned back to idx.
idx.str.strip()

Index(['jack', 'jill', 'jesse', 'frank'], dtype='object')

In [19]:
# Demo frame whose column labels carry stray leading/trailing spaces; a later
# cell cleans them up. A fixed-seed RandomState makes the values identical on
# every Restart & Run All without touching NumPy's global random state.
df = pd.DataFrame(
    np.random.RandomState(0).randn(3, 2),
    columns=[' Column A ', ' Column B '],
    index=range(3),
)

In [20]:
# Display the frame; its column labels still include the surrounding spaces.
df

Unnamed: 0,Column A,Column B
0,-0.419324,1.08795
1,0.507848,1.213927
2,0.986737,0.697656


In [21]:
# Normalise the column labels: trim surrounding whitespace, lower-case,
# then turn the remaining inner spaces into underscores.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)

In [22]:
# Display the frame again to show the renamed columns.
df

Unnamed: 0,column_a,column_b
0,-0.419324,1.08795
1,0.507848,1.213927
2,0.986737,0.697656


In [23]:
# Underscore-delimited strings plus one missing value.
s2 = pd.Series(
    ["a_b_c", "c_d_e", np.nan, "f_g_h"]
)
s2

0    a_b_c
1    c_d_e
2      NaN
3    f_g_h
dtype: object

In [24]:
# Split each string on '_' into a list of pieces; the missing entry stays NaN.
s2.str.split('_')

0    [a, b, c]
1    [c, d, e]
2          NaN
3    [f, g, h]
dtype: object

In [25]:
# Drop the missing entry. The result is only displayed; s2 is not reassigned.
s2.dropna()

0    a_b_c
1    c_d_e
3    f_g_h
dtype: object

In [26]:
# Pick element 1 of each split list via .str.get; the missing entry stays NaN.
s2.str.split('_').str.get(1)

0      b
1      d
2    NaN
3      g
dtype: object

In [27]:
# Same selection written with [] indexing on the .str accessor.
s2.str.split('_').str[1]

0      b
1      d
2    NaN
3      g
dtype: object

In [33]:
# expand=True spreads the split pieces across columns, giving a DataFrame
# rather than a Series of lists.
s2a = s2.str.split(pat="_", expand=True)
s2a

Unnamed: 0,0,1,2
0,a,b,c
1,c,d,e
2,,,
3,f,g,h


In [35]:
# Replace the integer column labels with descriptive names.
s2a.columns = [
    "Col_A",
    "Col_B",
    "Col_C",
]
s2a

Unnamed: 0,Col_A,Col_B,Col_C
0,a,b,c
1,c,d,e
2,,,
3,f,g,h


In [36]:
# Drop rows that contain missing values. The result is only displayed;
# s2a is not reassigned.
s2a.dropna()

Unnamed: 0,Col_A,Col_B,Col_C
0,a,b,c
1,c,d,e
3,f,g,h


In [37]:
# Mixed-case sample strings including both an empty string and a missing value.
s3 = pd.Series(
    [
        "A", "B", "C",
        "Aaba", "Baca",
        "",       # empty string: a real, zero-length value
        np.nan,   # missing value
        "CABA", "dog", "cat",
    ]
)
s3

0       A
1       B
2       C
3    Aaba
4    Baca
5        
6     NaN
7    CABA
8     dog
9     cat
dtype: object

In [38]:
# '^.a|dog' is a regular expression: any character followed by "a" at the
# start of the string, or "dog" anywhere, matched case-insensitively.
# regex=True is passed explicitly because pandas 2.0 changed the default to
# regex=False, under which the pattern would be matched as literal text and
# nothing would be replaced.
s3.str.replace('^.a|dog', 'XX-XX ', case=False, regex=True)

0           A
1           B
2           C
3    XX-XX ba
4    XX-XX ca
5            
6         NaN
7    XX-XX BA
8      XX-XX 
9     XX-XX t
dtype: object

In [40]:
# Rebind `s` to a short Series of single letters (replaces the earlier `s`).
s = pd.Series(list("abcd"))
s

0    a
1    b
2    c
3    d
dtype: object

In [41]:
# With no arguments, str.cat concatenates all elements into a single string.
s.str.cat()

'abcd'