In [3]:
import pandas as pd
import numpy as np

In [5]:
# Sample address used to demonstrate built-in Python string methods
email = "jose@gmail.com"


In [6]:
# Split the address at '@'; the result is a list of the pieces on either side
email.split('@')

['jose', 'gmail.com']

In [10]:
# str.isdigit() on the address is False because it contains non-digit characters
email.isdigit()

False

In [11]:
# A string made up only of digits returns True
'5'.isdigit()

True

In [7]:
# Sample Series of strings: four alphabetic names plus one digit-only entry.
# NOTE(review): 'andew' may be a typo for 'andrew'; left unchanged so the
# recorded outputs in this notebook still match.
names = pd.Series(
    ["andew", "bobo", "claire", "david", "5"]
)

In [8]:
# Display the Series (a bare last expression in a cell is rendered as output)
names

0     andew
1      bobo
2    claire
3     david
4         5
dtype: object

In [9]:
# Upper-case every element through the .str accessor; '5' comes back unchanged
names.str.upper()

0     ANDEW
1      BOBO
2    CLAIRE
3     DAVID
4         5
dtype: object

In [12]:
# Element-wise isdigit check; yields a boolean Series (only '5' is True here)
names.str.isdigit()

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [13]:
# Two strings, each holding three comma-separated ticker symbols.
# NOTE(review): 'APPl' looks like a typo for 'AAPL'; left unchanged so the
# recorded outputs in this notebook still match.
tech_finance = [
    "GOOG,APPl,AMZN",
    "JPM,BAC,GS",
]

In [14]:
# The list has two items: each comma-joined string counts as a single element
len(tech_finance)

2

In [15]:
# Wrap the list in a Series so the .str accessor can be used on it
tickers = pd.Series(data=tech_finance)

In [16]:
# Display the Series: one comma-joined string per row
tickers

0    GOOG,APPl,AMZN
1        JPM,BAC,GS
dtype: object

In [27]:
# Split every row on commas; expand=True spreads the pieces across columns
# of a DataFrame instead of returning a list per row.
tickers.str.split(pat=",", expand=True)

Unnamed: 0,0,1,2
0,GOOG,APPl,AMZN
1,JPM,BAC,GS


In [17]:
# Plain Python string with the same comma-separated layout, for comparison
tech = "GOOG,APPl,AMZN"

In [21]:
# Split on commas and keep only the first piece
tech.split(',')[0]

'GOOG'

In [28]:
# Names with typical input noise: padding whitespace and a stray ';'
messy_names = pd.Series(
    [
        'andrew   ',
        'bo;bo',
        '    claire    ',
    ]
)

In [30]:
# Look up the element at index label 0; its repr makes the trailing spaces visible
messy_names[0]

'andrew   '

In [35]:
# Clean every name with chained .str calls:
# drop ';', trim surrounding whitespace, then capitalize.
(
    messy_names.str.replace(';', '')
    .str.strip()
    .str.capitalize()
)

0    Andrew
1      Bobo
2    Claire
dtype: object

In [36]:
def cleanup(name):
    """Return a tidied copy of ``name``.

    Applies three string operations in order: remove every ';',
    strip leading/trailing whitespace, and capitalize the result
    (first character upper-cased, the rest lower-cased).
    """
    return name.replace(";", "").strip().capitalize()

In [37]:
# Call the plain-Python cleanup() once per element via Series.apply
messy_names.apply(cleanup)

0    Andrew
1      Bobo
2    Claire
dtype: object

In [38]:
import timeit 
  
# Setup code: timeit executes this once before the timing loop. It rebuilds the
# imports, a small sample Series and cleanup() inside the string because, by
# default, timeit runs statements in its own namespace rather than in the
# notebook's globals.
# NOTE(review): with only three elements, the timings presumably reflect mostly
# per-call overhead — confirm on a larger Series before drawing conclusions.
setup = '''
import pandas as pd
import numpy as np
messy_names = pd.Series(["andrew  ","bo;bo","  claire  "])
def cleanup(name):
    name = name.replace(";","")
    name = name.strip()
    name = name.capitalize()
    return name
'''

# Statements to be timed; each performs the same cleanup on messy_names.
# 1) chained pandas .str accessor methods
stmt_pandas_str = ''' 
messy_names.str.replace(";","").str.strip().str.capitalize()
'''

# 2) Series.apply calling the plain-Python cleanup() for each element
stmt_pandas_apply = '''
messy_names.apply(cleanup)
'''

# 3) cleanup() wrapped with np.vectorize and called on the Series
stmt_pandas_vectorize='''
np.vectorize(cleanup)(messy_names)
'''

In [39]:
# Total seconds for 10,000 executions of the chained .str statement
timeit.timeit(stmt=stmt_pandas_str, setup=setup, number=10_000)

5.112245299999813

In [40]:
# Total seconds for 10,000 executions of the Series.apply(cleanup) statement
timeit.timeit(stmt=stmt_pandas_apply, setup=setup, number=10_000)

1.0767210000003615

In [41]:
# Total seconds for 10,000 executions of the np.vectorize(cleanup) statement
timeit.timeit(stmt=stmt_pandas_vectorize, setup=setup, number=10_000)


0.9000186000002941