# Text Data Manipulation
```
Reference : https://pandas.pydata.org/docs/user_guide/text.html 
```

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Small sample Series of strings; the last element is a digit-only string.
names = pd.Series(["andrew", "bobo", "claire", "david", "5"])


In [4]:
# Upper-case every element through the Series .str accessor.
names.str.upper()

0    ANDREW
1      BOBO
2    CLAIRE
3     DAVID
4         5
dtype: object

In [5]:
# Element-wise check: True only where the string is made up of digits ('5' here).
names.str.isdigit()

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [6]:
# Two strings, each packing several comma-separated ticker symbols.
# NOTE(review): 'APPL' looks like a typo for Apple's ticker 'AAPL' — confirm
# before changing, since the outputs recorded below depend on this value.
tech_finance = ["GOOG,APPL,AMZN", "JPM,BAC,GS"]


In [7]:
# Only two elements: each one is a single string, not a list of tickers.
len(tech_finance)

2

In [9]:
# Wrap the list in a Series so the .str accessor methods can be used on it.
tickers = pd.Series(tech_finance)
tickers

0    GOOG,APPL,AMZN
1        JPM,BAC,GS
dtype: object

In [11]:
# Split each string on commas, then use .str[0] to pick the first piece of each row.
tickers.str.split(',').str[0]

0    GOOG
1     JPM
dtype: object

In [13]:
tickers.str.split(',',expand=True)  
# With expand=True the split pieces are spread across columns,
# so the result is a DataFrame instead of a Series.

Unnamed: 0,0,1,2
0,GOOG,APPL,AMZN
1,JPM,BAC,GS


In [14]:
# Chaining several .str calls: start from a deliberately untidy Series
# (stray leading/trailing whitespace and a ':' inside one name).

messay_names = pd.Series(["andrew  ", "bo:bo", "   claire  "])
messay_names

0       andrew  
1          bo:bo
2       claire  
dtype: object

In [15]:
# Look at one raw element: its repr shows the trailing spaces.
messay_names[0]

'andrew  '

In [19]:
# Chain the string methods: drop ':', trim surrounding whitespace, then capitalize.
(
    messay_names
    .str.replace(':', '')
    .str.strip()
    .str.capitalize()
)

0    Andrew
1      Bobo
2    Claire
dtype: object

In [21]:
# using apply 

def cleanup(name):
    """Return ``name`` with colons removed, outer whitespace stripped, and capitalized.

    The steps run in this order: ``replace(":", "")``, ``strip()``,
    ``capitalize()``. ``name`` must support those ``str`` methods.
    """
    return name.replace(":", "").strip().capitalize()

In [22]:
# Run the plain-Python cleanup() function on each element of the Series.
messay_names.apply(cleanup)

0    Andrew
1      Bobo
2    Claire
dtype: object

# Which one is more efficient?

In [28]:
import timeit

# Setup code: timeit executes this once, before the timed loop starts.
# The Series and cleanup() mirror the cells above, and every statement below
# removes the same ':' character, so all three variants do the same work and
# return the same values ('Andrew', 'Bobo', 'Claire').

setup = '''
import pandas as pd
import numpy as np
messay_names = pd.Series(['andrew  ','bo:bo','   claire  '])
def cleanup(name):
    name = name.replace(":","")
    name = name.strip()
    name = name.capitalize()
    return name
'''

# Statements whose execution time is measured (each is run `number` times).

# 1) Chained pandas .str accessor calls. regex=False makes the replacement a
#    literal one, matching the plain str.replace used inside cleanup().
stmt_pandas_str = '''
messay_names.str.replace(":","",regex=False).str.strip().str.capitalize()
'''

# 2) Series.apply with the plain-Python cleanup() function.
stmt_pandas_apply = '''
messay_names.apply(cleanup)
'''

# 3) np.vectorize wrapper around cleanup(); the wrapper is rebuilt on every
#    run, so its construction cost is part of the measurement.
stmt_pandas_vectorize = '''
np.vectorize(cleanup)(messay_names)
'''

In [32]:
# Time the chained .str version: setup runs once, the statement one million times.
# The value displayed is the total elapsed time in seconds.
timeit.timeit(stmt=stmt_pandas_str, setup=setup, number=1_000_000)



74.06074983300641

In [33]:
# Time the Series.apply(cleanup) version: setup runs once, the statement one million times.
# The value displayed is the total elapsed time in seconds.
timeit.timeit(stmt=stmt_pandas_apply, setup=setup, number=1_000_000)

16.710476333042607

In [34]:
# Time the np.vectorize(cleanup) version: setup runs once, the statement one million times.
# The value displayed is the total elapsed time in seconds.
timeit.timeit(stmt=stmt_pandas_vectorize, setup=setup, number=1_000_000)

12.931994499871507

In [35]:
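# Result for 1,000,000 runs on this 3-element Series: np.vectorize was the
# fastest (~12.9 s), apply came next (~16.7 s), and the chained .str calls were
# the slowest (~74.1 s).
# NOTE: with only three elements these timings are presumably dominated by
# per-call overhead; re-run on a larger Series before generalizing.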