In [54]:
import pandas as pd
import numpy as np

# Small sample roster reused by every cell below: mixed-case names, ages,
# inconsistent level labels and e-mail addresses (with one missing value in
# each of the last two columns).
people = {
    'name': ['jordon', 'MIKE', 'Kelvin', 'xiaoLi', 'qiqi', 'Amei'],
    'Age': [18, 30, 45, 23, 45, 62],
    'level': ['high', 'Low', 'M', 'L', 'middle', np.nan],
    'Email': [
        'jordon@sohu.com',
        'Mike@126.cn',
        'KelvinChai@gmail.com',
        'xiaoli@163.com',
        np.nan,
        'amei@qq.com',
    ],
}
df = pd.DataFrame(people)
df

Unnamed: 0,name,Age,level,Email
0,jordon,18,high,jordon@sohu.com
1,MIKE,30,Low,Mike@126.cn
2,Kelvin,45,M,KelvinChai@gmail.com
3,xiaoLi,23,L,xiaoli@163.com
4,qiqi,45,middle,
5,Amei,62,,amei@qq.com


## 1、文本格式

In [16]:
# Convert every value in the "name" column to lower case.
df['name'].str.lower()

0    jordon
1      mike
2    kelvin
3    xiaoli
4      qiqi
5      amei
Name: name, dtype: object

In [19]:
# The .str accessor also works on the column Index: lower-case every label.
df.columns.str.lower()

Index(['name', 'age', 'level', 'email'], dtype='object')

In [None]:
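# Alignment
# # Center, width 8, pad the remainder with '*'
# s.str.center(8, fillchar='*')
# # Left-justify, width 8, pad the remainder with '*'
# s.str.ljust(8, fillchar='*')
# # Right-justify, width 8, pad the remainder with '*'
# s.str.rjust(8, fillchar='*')
# # Custom alignment: width, side and fill character are all adjustable
# s.str.pad(width=8, side='both',fillchar='*')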

In [26]:
# Example: centre each name in a field of width 8, filling with '*'.
df['name'].str.center(width=8, fillchar='*')

0    *jordon*
1    **MIKE**
2    *Kelvin*
3    *xiaoLi*
4    **qiqi**
5    **Amei**
Name: name, dtype: object

In [25]:
# Same result as str.center: pad on both sides up to width 8 with '*'.
df['name'].str.pad(8, side='both', fillchar='*')

0    *jordon*
1    **MIKE**
2    *Kelvin*
3    *xiaoLi*
4    **qiqi**
5    **Amei**
Name: name, dtype: object

In [42]:
# Split each address at '@'; every row becomes a list of parts (NaN stays NaN).
df['Email'].str.split(pat='@')

0         [jordon, sohu.com]
1             [Mike, 126.cn]
2    [KelvinChai, gmail.com]
3          [xiaoli, 163.com]
4                        NaN
5             [amei, qq.com]
Name: Email, dtype: object

## 2、文本拆分

In [55]:
# expand=True spreads the split parts across DataFrame columns.
df['Email'].str.split(pat='@', expand=True)

Unnamed: 0,0,1
0,jordon,sohu.com
1,Mike,126.cn
2,KelvinChai,gmail.com
3,xiaoli,163.com
4,,
5,amei,qq.com


In [56]:
# n=1 limits the operation to a single split (at most two columns).
df['Email'].str.split(pat='@', n=1, expand=True)

Unnamed: 0,0,1
0,jordon,sohu.com
1,Mike,126.cn
2,KelvinChai,gmail.com
3,xiaoli,163.com
4,,
5,amei,qq.com


In [112]:
# Split on either '@' or a literal '.'. The raw string keeps '\.' as a regex
# escape rather than an invalid Python string escape (which triggers a warning).
df.Email.str.split(r'@|\.', expand=True)

Unnamed: 0,0,1,2
0,jordon,sohu,com
1,Mike,126,cn
2,KelvinChai,gmail,com
3,xiaoli,163,com
4,,,
5,amei,qq,com


## 3、文本替换

In [116]:
# Replace the text 'com' with 'cn' in every address.
df['Email'].str.replace(pat='com', repl='cn')

0         jordon@sohu.cn
1            Mike@126.cn
2    KelvinChai@gmail.cn
3          xiaoli@163.cn
4                    NaN
5             amei@qq.cn
Name: Email, dtype: object

In [117]:
# Regular-expression replacement: mask everything up to the '@' with 'xxx'.
# regex=True is spelled out because pandas 2.0 switched the default of
# str.replace to literal matching, which would leave the addresses unchanged.
df.Email.str.replace(r'(.*?)@', 'xxx@', regex=True)

0     xxx@sohu.com
1       xxx@126.cn
2    xxx@gmail.com
3      xxx@163.com
4              NaN
5       xxx@qq.com
Name: Email, dtype: object

In [144]:
# Upper-case the part matched up to and including '@' via a callable
# replacement. pandas only accepts a callable repl when regex=True, so the
# flag is passed explicitly instead of relying on the version-dependent default.
df.Email.str.replace(r'(.*?)@', lambda m: m.group().upper(), regex=True)

0         JORDON@sohu.com
1             MIKE@126.cn
2    KELVINCHAI@gmail.com
3          XIAOLI@163.com
4                     NaN
5             AMEI@qq.com
Name: Email, dtype: object

slice_replace

In [149]:
# Replace the slice [1:2] (the second character) of each address with 'XX'.
df['Email'].str.slice_replace(1, 2, 'XX')

0         jXXrdon@sohu.com
1             MXXke@126.cn
2    KXXlvinChai@gmail.com
3          xXXaoli@163.com
4                      NaN
5             aXXei@qq.com
Name: Email, dtype: object

repeat

In [152]:
# Duplicate each name so it appears twice back to back.
df['name'].str.repeat(2)

0    jordonjordon
1        MIKEMIKE
2    KelvinKelvin
3    xiaoLixiaoLi
4        qiqiqiqi
5        AmeiAmei
Name: name, dtype: object

## 4、文本拼接

In [153]:
# With no arguments, str.cat joins the whole column into one string.
df['name'].str.cat()

'jordonMIKEKelvinxiaoLiqiqiAmei'

In [156]:
# Join the column with '-' between values, writing '*' for missing entries.
df['level'].str.cat(sep='-', na_rep='*')

'high-Low-M-L-middle-*'

In [161]:
# Row-wise concatenation "name*level". Passing the separator through `sep`
# avoids the hard-coded ['*'] * 6 helper list, so the cell keeps working when
# the number of rows changes. Rows with a missing level still yield NaN.
df['name'].str.cat(df['level'], sep='*')

0    jordon*high
1       MIKE*Low
2       Kelvin*M
3       xiaoLi*L
4    qiqi*middle
5            NaN
Name: name, dtype: object

In [170]:
# Concatenate name, level and Email row by row; missing values become '*'.
df['name'].str.cat(others=[df['level'], df['Email']], na_rep='*')

0      jordonhighjordon@sohu.com
1             MIKELowMike@126.cn
2    KelvinMKelvinChai@gmail.com
3          xiaoLiLxiaoli@163.com
4                    qiqimiddle*
5               Amei*amei@qq.com
Name: name, dtype: object

## 5、文本提取

In [181]:
# Capture the local part and the domain name of '.com' addresses; rows that
# do not match produce NaN. The dot is escaped so it matches a literal '.'
# instead of any character.
df.Email.str.extract(pat=r'(.*?)@(.*)\.com')

Unnamed: 0,0,1
0,jordon,sohu
1,,
2,KelvinChai,gmail
3,xiaoli,163
4,,
5,amei,qq


In [186]:
# Like extract, but returns one row per match (indexed by row and match
# number) and drops non-matching rows. The dot is escaped so that only a
# literal '.com' qualifies.
df.Email.str.extractall(pat=r'(.*?)@(.*)\.com')

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,jordon,sohu
2,0,KelvinChai,gmail
3,0,xiaoli,163
5,0,amei,qq


## 6、文本查询

In [210]:
# Record the index of '@' in each address as a new column (NaN where the
# address is missing), then show it next to the source column.
df['@position'] = df['Email'].str.find('@')
df.loc[:, ['Email', '@position']]

Unnamed: 0,Email,@position
0,jordon@sohu.com,6.0
1,Mike@126.cn,4.0
2,KelvinChai@gmail.com,10.0
3,xiaoli@163.com,6.0
4,,
5,amei@qq.com,4.0


In [203]:
# List every (local part, domain name) match per row; non-matching rows give
# an empty list. The dot is escaped so it matches a literal '.' only.
df.Email.str.findall(r'(.*?)@(.*)\.com')

0         [(jordon, sohu)]
1                       []
2    [(KelvinChai, gmail)]
3          [(xiaoli, 163)]
4                      NaN
5             [(amei, qq)]
Name: Email, dtype: object

## 7、文本包含

In [212]:
# Flag addresses containing 'jordon' or 'com'; missing addresses are shown as '*'.
df['Email'].str.contains(pat='jordon|com', na='*')

0     True
1    False
2     True
3     True
4        *
5     True
Name: Email, dtype: object

In [217]:
# Keep only the rows whose address contains 'jordon' or 'com'; na=False makes
# missing addresses count as non-matching so the mask is purely boolean.
has_jordon_or_com = df['Email'].str.contains('jordon|com', na=False)
df[has_jordon_or_com]

Unnamed: 0,name,Age,level,Email,@position
0,jordon,18,high,jordon@sohu.com,6.0
2,Kelvin,45,M,KelvinChai@gmail.com,10.0
3,xiaoLi,23,L,xiaoli@163.com,6.0
5,Amei,62,,amei@qq.com,4.0


## 8、文本的虚拟变量

In [218]:
# One indicator (0/1) column per distinct name.
df['name'].str.get_dummies()

Unnamed: 0,Amei,Kelvin,MIKE,jordon,qiqi,xiaoLi
0,0,0,0,1,0,0
1,0,0,1,0,0,0
2,0,1,0,0,0,0
3,0,0,0,0,0,1
4,0,0,0,0,1,0
5,1,0,0,0,0,0
