In [15]:
import pandas as pd
import numpy as np

## series with text data 

In [2]:
# Four-row Series of single-character strings.
ser = pd.Series(['a', 'b', 'c', 'd'])

In [7]:
# dtype of the Series; the text values are reported as object ('O').
ser.dtypes

dtype('O')

In [16]:
ser

0    a
1    b
2    c
3    d
dtype: object

### Count the occurrences of 'a' in each element of the series

In [18]:
# Number of times 'a' occurs in each element; the result is an int64 Series.
ser.str.count('a')

0    1
1    0
2    0
3    0
dtype: int64

# string methods 

In [23]:
# Sample strings in mixed case, including two digit-only entries.
a = pd.Series(
    [
        'A', 'b', 'mama', 'BLACK', 'hello',
        'HI', '2020', 'september', '03', 'instagram',
    ],
    dtype=object,
)

In [24]:
a

0            A
1            b
2         mama
3        BLACK
4        hello
5           HI
6         2020
7    september
8           03
9    instagram
dtype: object

###### to lower case 

In [25]:

# Lower-case every element; digit-only strings such as '2020' come back unchanged.
a.str.lower()

0            a
1            b
2         mama
3        black
4        hello
5           hi
6         2020
7    september
8           03
9    instagram
dtype: object

##### to upper case 

In [26]:
# Upper-case every element; digit-only strings such as '03' come back unchanged.
a.str.upper()

0            A
1            B
2         MAMA
3        BLACK
4        HELLO
5           HI
6         2020
7    SEPTEMBER
8           03
9    INSTAGRAM
dtype: object

##### Count the number of characters in each element

In [27]:
# Length (number of characters) of each string, returned as an int64 Series.
a.str.len()

0    1
1    1
2    4
3    5
4    5
5    2
6    4
7    9
8    2
9    9
dtype: int64

In [29]:
# 3x3 frame holding 1..9 row by row; the column labels use mixed casing so the
# .str methods on data.columns have something to normalise.
data = pd.DataFrame(
    np.arange(1, 10).reshape(3, -1),
    columns=['Column a', 'Column B', 'column c'],
)

In [105]:
# capitalize: first character upper-cased, the rest lower-cased ('Column B' -> 'Column b').
# Returns a new Index; data.columns itself is not modified.
data.columns.str.capitalize()

Index(['Column a', 'Column b', 'Column c'], dtype='object')

In [30]:
# Preview the first rows; this frame only has three.
data.head()

Unnamed: 0,Column a,Column B,column c
0,1,2,3
1,4,5,6
2,7,8,9


#### Change the column names to lower case

In [35]:
# The .str methods return a new Index, so assign it back to actually rename the columns.
data.columns = data.columns.str.lower()

In [36]:
data

Unnamed: 0,column a,column b,column c
0,1,2,3
1,4,5,6
2,7,8,9


In [37]:
# Strip leading/trailing whitespace from each name; these names have none, so nothing changes.
data.columns.str.strip()

Index(['column a', 'column b', 'column c'], dtype='object')

#### Replace spaces with underscores (`_`)

In [38]:
# Returns a new Index with each space replaced by '_'; the result is not assigned,
# so data.columns keeps its spaces.
data.columns.str.replace(' ','_')

Index(['column_a', 'column_b', 'column_c'], dtype='object')

#### Change the column names to title case

In [39]:
# Title case: the first letter of every word is upper-cased ('column a' -> 'Column A').
data.columns.str.title()

Index(['Column A', 'Column B', 'Column C'], dtype='object')

#### change the column name to upper case 

In [40]:
# Upper-case every column name; returns a new Index without modifying data.columns.
data.columns.str.upper()

Index(['COLUMN A', 'COLUMN B', 'COLUMN C'], dtype='object')

# Splitting and replacing strings

In [44]:
# Underscore-delimited strings plus one missing value, to show how the
# .str methods propagate NaN.
seri = pd.Series(
    ['a_c', 'm_o', np.nan, 'a_B_c'],
    dtype=object,
)

In [45]:
seri

0      a_c
1      m_o
2      NaN
3    a_B_c
dtype: object

In [46]:
# Split each string on '_' into a list of pieces; the missing entry stays NaN.
seri.str.split('_')

0       [a, c]
1       [m, o]
2          NaN
3    [a, B, c]
dtype: object

In [49]:
# Elements of each split list can be accessed with .str[i] or .str.get(i);
# here we take the first piece of every row (NaN stays NaN).
seri.str.split('_').str[0]

0      a
1      m
2    NaN
3      a
dtype: object

In [51]:
# .str.get(0) gives the same result as .str[0]: the first piece of every split list.
seri.str.split('_').str.get(0)

0      a
1      m
2    NaN
3      a
dtype: object

###### The split result can be returned as a DataFrame using expand=True

In [52]:
# expand=True spreads the pieces across columns (one per position);
# rows with fewer pieces are padded with missing values.
seri.str.split('_', expand = True)

Unnamed: 0,0,1,2
0,a,c,
1,m,o,
2,,,
3,a,B,c


In [54]:
# Currency-like strings: plain, negative with a '$' sign, and with a thousands separator.
dollars = pd.Series(
    data=['12', '-$10', '$10,000'],
)

In [55]:
dollars

0         12
1       -$10
2    $10,000
dtype: object

In [60]:
# Remove every '$' by replacing it with the empty string ''.
# regex=False makes '$' a literal character: as a regular expression it is the
# end-of-string anchor, and the default value of `regex` differs between pandas
# versions, so being explicit keeps the result the same everywhere.
dollars.str.replace('$', '', regex=False)

0        12
1       -10
2    10,000
dtype: object

In [67]:
# Replace every '$' with '*'.
# regex=False treats '$' as a literal character rather than the regex
# end-of-string anchor, independent of the pandas version's default for `regex`.
dollars.str.replace('$', '*', regex=False)

0         12
1       -*10
2    *10,000
dtype: object

# Concatenation

In [68]:
# Eight single-letter strings, 'a' through 'h'.
se = pd.Series(list('abcdefgh'))

In [69]:
se

0    a
1    b
2    c
3    d
4    e
5    f
6    g
7    h
dtype: object

In [73]:
# With no other sequence passed, cat joins all elements into one plain string,
# separated by `sep`.
se.str.cat(sep = ',')

'a,b,c,d,e,f,g,h'

In [77]:
# We can also concatenate another list to the series element by element:
# row i becomes se[i] + sep + other[i], and the result is a Series.
se.str.cat(list('hgfedcba'), sep = '-')

0    a-h
1    b-g
2    c-f
3    d-e
4    e-d
5    f-c
6    g-b
7    h-a
dtype: object

# Indexing with .str

In [80]:
# Rebinds `se` (previously the 'a'..'h' series) to three words of different lengths.
se = pd.Series(data=['hello', 'its', 'me'])
se

0    hello
1      its
2       me
dtype: object

In [82]:
# .str[0] indexes into each string: a Series holding the first character of every element.
se.str[0]

0    h
1    i
2    m
dtype: object

In [85]:
# Third character of every element; rows whose string is too short ('me') give NaN
# instead of raising an error.
se.str[2]

0      l
1      s
2    NaN
dtype: object

### Testing for strings that match or contain a pattern

In [86]:
# Rebinds `seri` to short lower-case strings used for the pattern-matching examples.
seri = pd.Series(
    data=['ab', 'cd', 'ef', 'gh', 'ijk', 'lmn'],
)
seri

0     ab
1     cd
2     ef
3     gh
4    ijk
5    lmn
dtype: object

In [92]:
# Boolean Series: True where the element contains 'ab'.
seri.str.contains('ab')

0     True
1    False
2    False
3    False
4    False
5    False
dtype: bool

In [93]:
# Test strings mixing digits and lower-case letters in different orders.
series = pd.Series(
    ['1d', '5ff', 'g7', 'jh', '9k', '10aa', '7dd', '34kdk'],
)
# Regex: one digit immediately followed by one lower-case letter.
pattern = r'[0-9][a-z]'
series

0       1d
1      5ff
2       g7
3       jh
4       9k
5     10aa
6      7dd
7    34kdk
dtype: object

In [94]:
# True where the pattern matches anywhere in the element, not only at the start
# ('10aa' and '34kdk' match via their second character onward).
series.str.contains(pattern)

0     True
1     True
2    False
3    False
4     True
5     True
6     True
7     True
dtype: bool

### Slicing 

In [96]:
# Characters at positions 1 and 2 of each element (the stop index 3 is exclusive);
# shorter strings simply return what is available.
series.str.slice(1,3)

0     d
1    ff
2     7
3     h
4     k
5    0a
6    dd
7    4k
dtype: object

# count

In [104]:
# Number of occurrences of '1d' in each element; only the first element contains it.
series.str.count('1d')

0    1
1    0
2    0
3    0
4    0
5    0
6    0
7    0
dtype: int64

### Checking the case and character type of each string (islower, isupper, istitle, isnumeric, isdecimal, isdigit):

In [114]:
# Rebinds `seri` to a mixed-type series: six strings plus the integer 33.
# The non-string entry makes the .str predicates below return NaN for that row.
seri = pd.Series(
    ['aa', '12', 'AA', 'mn', 'Ab', '45', 33],
)

In [115]:
# True where the letters are all lower-case; digit-only strings give False and the
# non-string entry 33 gives NaN (hence the object dtype of the result).
seri.str.islower()

0     True
1    False
2    False
3     True
4    False
5    False
6      NaN
dtype: object

In [116]:
# True where the letters are all upper-case; digit-only strings give False and the
# non-string entry 33 gives NaN.
seri.str.isupper()

0    False
1    False
2     True
3    False
4    False
5    False
6      NaN
dtype: object

In [117]:
# True only for title-cased strings (here just 'Ab'); the non-string entry 33 gives NaN.
seri.str.istitle()

0    False
1    False
2    False
3    False
4     True
5    False
6      NaN
dtype: object

In [118]:
# True for the digit-only strings '12' and '45'; the integer 33 is not a string and gives NaN.
seri.str.isnumeric()

0    False
1     True
2    False
3    False
4    False
5     True
6      NaN
dtype: object

In [119]:
# On this data the result is identical to isnumeric(): True for '12' and '45', NaN for 33.
seri.str.isdecimal()

0    False
1     True
2    False
3    False
4    False
5     True
6      NaN
dtype: object

In [121]:
# On this data the result is identical to isnumeric() and isdecimal():
# True for '12' and '45', NaN for 33.
seri.str.isdigit()

0    False
1     True
2    False
3    False
4    False
5     True
6      NaN
dtype: object