In [1]:
import numpy as np
print("numpy version: {}".format(np.__version__))
import pandas as pd 
print("pandas version: {}".format(pd.__version__))
import matplotlib
import matplotlib.pyplot as plt
print("matplotlib version: {}".format(matplotlib.__version__))
import scipy as sp
print("scipy version: {}".format(sp.__version__))
import sklearn as sl
print("scikit-learn: {}".format(sl.__version__))
import seaborn as sns
print("seaborn: {}".format(sns.__version__))
import statsmodels as sm
print("statsmodels: {}".format(sm.__version__))

numpy version: 1.17.4
pandas version: 0.25.3
matplotlib version: 3.1.2
scipy version: 1.3.3
scikit-learn: 0.21.3
seaborn: 0.9.0
statsmodels: 0.10.2


## Handling Missing Data

The way that missing data is represented in pandas objects is somewhat imperfect,
but it is functional for a lot of users. For numeric data, pandas uses the floating-point
value NaN (Not a Number) to represent missing data. We call this a sentinel value that
can be easily detected:

In [2]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])

In [3]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [4]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
string_data[0] = None

In [6]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

dropna, fillna, isnull, notnull

In [7]:
from numpy import nan as NA

In [8]:
data = pd.Series([1, NA, 3.5, NA, 7])

In [9]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [10]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

With DataFrame objects, things are a bit more complex. You may want to drop rows
or columns that are all NA or only those containing any NAs. dropna by default drops
any row containing a missing value:

In [11]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                    [NA, NA, NA], [NA, 6.5, 3.]])

In [12]:
cleaned = data.dropna()

In [13]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [14]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


Passing ```how='all'``` will only drop rows that are all NA:

In [15]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


To drop columns in the same way, pass ```axis=1``` :

In [16]:
data[4] = NA

In [17]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [18]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


A related way to filter out DataFrame rows tends to concern time series data. Suppose
you want to keep only rows containing a certain number of observations. You can
indicate this with the thresh argument:

In [19]:
df = pd.DataFrame(np.random.randn(7, 3))

In [20]:
df.iloc[:4, 1] = NA

In [21]:
df.iloc[:2, 2] = NA

In [22]:
df

Unnamed: 0,0,1,2
0,0.950494,,
1,-1.010841,,
2,0.260969,,-0.683597
3,0.275752,,-0.780007
4,0.403206,0.027329,1.062815
5,-0.199395,1.129725,0.935074
6,0.356956,1.054678,-0.241467


In [23]:
df.dropna()

Unnamed: 0,0,1,2
4,0.403206,0.027329,1.062815
5,-0.199395,1.129725,0.935074
6,0.356956,1.054678,-0.241467


In [24]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.260969,,-0.683597
3,0.275752,,-0.780007
4,0.403206,0.027329,1.062815
5,-0.199395,1.129725,0.935074
6,0.356956,1.054678,-0.241467


### Filling In Missing Data

Rather than filtering out missing data (and potentially discarding other data along
with it), you may want to fill in the “holes” in any number of ways. For most purposes, the ```fillna``` method is the workhorse function to use. Calling ```fillna``` with a
constant replaces missing values with that value:

In [25]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.950494,0.0,0.0
1,-1.010841,0.0,0.0
2,0.260969,0.0,-0.683597
3,0.275752,0.0,-0.780007
4,0.403206,0.027329,1.062815
5,-0.199395,1.129725,0.935074
6,0.356956,1.054678,-0.241467


Calling ```fillna``` with a dict, you can use a different fill value for each column:

In [26]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.950494,0.5,0.0
1,-1.010841,0.5,0.0
2,0.260969,0.5,-0.683597
3,0.275752,0.5,-0.780007
4,0.403206,0.027329,1.062815
5,-0.199395,1.129725,0.935074
6,0.356956,1.054678,-0.241467


```fillna``` returns a new object, but you can modify the existing object in-place:

In [27]:
_ = df.fillna(0, inplace=True)

In [28]:
df

Unnamed: 0,0,1,2
0,0.950494,0.0,0.0
1,-1.010841,0.0,0.0
2,0.260969,0.0,-0.683597
3,0.275752,0.0,-0.780007
4,0.403206,0.027329,1.062815
5,-0.199395,1.129725,0.935074
6,0.356956,1.054678,-0.241467


The same interpolation methods available for reindexing can be used with ```fillna``` :

In [29]:
df = pd.DataFrame(np.random.randn(6, 3))

In [30]:
df.iloc[2:, 1] = NA

In [31]:
df.iloc[4:, 2] = NA

In [32]:
df

Unnamed: 0,0,1,2
0,-2.18465,-1.667092,-0.155851
1,-0.501096,-1.301399,-0.653307
2,-1.11691,,0.568401
3,0.801663,,0.03242
4,0.433004,,
5,0.649878,,


In [33]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,-2.18465,-1.667092,-0.155851
1,-0.501096,-1.301399,-0.653307
2,-1.11691,-1.301399,0.568401
3,0.801663,-1.301399,0.03242
4,0.433004,-1.301399,0.03242
5,0.649878,-1.301399,0.03242


In [34]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,-2.18465,-1.667092,-0.155851
1,-0.501096,-1.301399,-0.653307
2,-1.11691,-1.301399,0.568401
3,0.801663,-1.301399,0.03242
4,0.433004,,0.03242
5,0.649878,,0.03242


In [35]:
data = pd.Series([1., NA, 3.5, NA, 7])

In [36]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## Data Transformation

### Removing Duplicates

Duplicate rows may be found in a DataFrame for any number of reasons.

In [37]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})

In [38]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [39]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [40]:
print(len(data))
data_droped = data.drop_duplicates()
print(len(data_droped))

7
6


In [41]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [42]:
data['v1'] = range(7)

In [43]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


```duplicated``` and ```drop_duplicates``` by default keep the first observed value combination. Passing ```keep='last'``` will return the last one:

In [44]:
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [45]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                    'Pastrami', 'corned beef', 'Bacon',
                    'pastrami', 'honey ham', 'nova lox'],
                    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [46]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [47]:
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}

In [48]:
lowercased = data['food'].str.lower()

In [49]:
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [50]:
data['animal'] = lowercased.map(meat_to_animal)

In [51]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [52]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [53]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])

In [54]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

To replace these with NA values that pandas understands, we can use replace , producing a new Series (unless you pass ```inplace=True``` ):

In [55]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [56]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [57]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [58]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [59]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

In [60]:
transform = lambda x: x[:4].upper()

In [61]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [62]:
data.index = data.index.map(transform)

In [63]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


If you want to create a transformed version of a dataset without modifying the original, a useful method is rename :

In [64]:
data.rename(index={'OHIO': 'INDIANA'},
           columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [65]:
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)

In [66]:
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### Discretization and Binning

In [67]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

Let’s divide these into bins of 18 to 25, 26 to 35, 36 to 60, and finally 61 and older. To
do so, you have to use ```cut``` , a function in pandas:

In [68]:
bins = [18, 25, 35, 60, 100]

In [69]:
cats = pd.cut(ages, bins)

In [70]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [71]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [72]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

```pd.value_counts(cats)``` are the bin counts for the result of ```pandas.cut``` .

In [73]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [74]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [75]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [76]:
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [77]:
data = np.random.rand(20)

If you pass an integer number of bins to cut instead of explicit bin edges, it will compute equallength bins based on the minimum and maximum values in the data.
Consider the case of some uniformly distributed data chopped into fourths:

In [78]:
pd.cut(data, 4, precision=2)

[(0.28, 0.49], (0.49, 0.69], (0.066, 0.28], (0.28, 0.49], (0.69, 0.9], ..., (0.28, 0.49], (0.28, 0.49], (0.69, 0.9], (0.066, 0.28], (0.066, 0.28]]
Length: 20
Categories (4, interval[float64]): [(0.066, 0.28] < (0.28, 0.49] < (0.49, 0.69] < (0.69, 0.9]]

A closely related function, ```qcut``` , bins the data based on sample quantiles. Depending
on the distribution of the data, using cut will not usually result in each bin having the
same number of data points. Since qcut uses sample quantiles instead, by definition
you will obtain roughly equalsize bins:

In [79]:
data = np.random.randn(1000) # Normally distributed

In [80]:
cats = pd.qcut(data, 4)

In [81]:
cats

[(0.652, 3.531], (0.652, 3.531], (-0.682, -0.0587], (0.652, 3.531], (0.652, 3.531], ..., (-0.0587, 0.652], (0.652, 3.531], (0.652, 3.531], (-0.682, -0.0587], (-0.0587, 0.652]]
Length: 1000
Categories (4, interval[float64]): [(-3.528, -0.682] < (-0.682, -0.0587] < (-0.0587, 0.652] < (0.652, 3.531]]

In [82]:
pd.value_counts(cats)

(0.652, 3.531]       250
(-0.0587, 0.652]     250
(-0.682, -0.0587]    250
(-3.528, -0.682]     250
dtype: int64

In [83]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(1.25, 3.531], (1.25, 3.531], (-1.247, -0.0587], (1.25, 3.531], (1.25, 3.531], ..., (-0.0587, 1.25], (1.25, 3.531], (-0.0587, 1.25], (-1.247, -0.0587], (-0.0587, 1.25]]
Length: 1000
Categories (4, interval[float64]): [(-3.528, -1.247] < (-1.247, -0.0587] < (-0.0587, 1.25] < (1.25, 3.531]]

In [84]:
data = pd.DataFrame(np.random.randn(1000, 4))

In [85]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.018662,-0.015225,0.021592,0.032806
std,1.02311,0.970108,0.998078,1.004348
min,-3.333358,-3.026446,-2.871041,-2.922236
25%,-0.706257,-0.685631,-0.654975,-0.662407
50%,0.003871,-0.047817,0.017004,0.066466
75%,0.696358,0.655247,0.648074,0.704917
max,2.846702,2.973734,3.50928,4.081271


In [86]:
col = data[2]

In [87]:
col[np.abs(col) > 3]

107    3.451776
490    3.127469
593    3.087317
956    3.509280
Name: 2, dtype: float64

In [88]:
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
107,1.975604,0.385564,3.451776,-0.379013
170,-3.042569,0.28869,-1.08663,-0.576197
490,0.058572,0.453886,3.127469,1.955138
494,-3.333358,-0.138649,-1.3803,0.616402
520,0.701904,-3.026446,0.550303,0.753111
593,-1.430815,0.163826,3.087317,0.056226
599,1.330161,-2.00211,-0.595836,4.081271
730,-3.107995,0.930201,0.858763,-0.396256
956,-0.021947,0.694786,3.50928,-0.537829


In [89]:
data[np.abs(data) > 3] = np.sign(data) * 3

In [90]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.018178,-0.015199,0.020416,0.031725
std,1.021636,0.970026,0.994313,1.00056
min,-3.0,-3.0,-2.871041,-2.922236
25%,-0.706257,-0.685631,-0.654975,-0.662407
50%,0.003871,-0.047817,0.017004,0.066466
75%,0.696358,0.655247,0.648074,0.704917
max,2.846702,2.973734,3.0,3.0


In [91]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,1.0,-1.0,1.0
1,1.0,1.0,-1.0,-1.0
2,1.0,-1.0,1.0,-1.0
3,-1.0,-1.0,-1.0,1.0
4,-1.0,1.0,1.0,-1.0


In [92]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))

In [93]:
sampler = np.random.permutation(5)

In [94]:
sampler

array([3, 1, 4, 2, 0])

In [95]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


That array can then be used in iloc-based indexing or the equivalent take function:

In [96]:
df.take(sampler)

Unnamed: 0,0,1,2,3
3,12,13,14,15
1,4,5,6,7
4,16,17,18,19
2,8,9,10,11
0,0,1,2,3


In [97]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
2,8,9,10,11
4,16,17,18,19
0,0,1,2,3


To generate a sample with replacement (to allow repeat choices), pass ```replace=True```
to sample :

In [98]:
choices = pd.Series([5, 7, -1, 6, 4])

In [99]:
draws = choices.sample(n=10, replace=True)

In [100]:
draws

2   -1
3    6
4    4
1    7
0    5
0    5
2   -1
3    6
4    4
3    6
dtype: int64

Another type of transformation for statistical modeling or machine learning applications is converting a categorical variable into a “dummy” or “indicator” matrix. If a
column in a DataFrame has k distinct values, you would derive a matrix or DataFrame with k columns containing all 1s and 0s. pandas has a ```get_dummies``` function
for doing this, though devising one yourself is not difficult. Let’s return to an earlier
example DataFrame:

In [101]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})

In [102]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [103]:
dummies = pd.get_dummies(df['key'], prefix='key')

In [104]:
df_with_dummy = df[['data1']].join(dummies)

In [105]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


#### String Manipulation

##### String Object Methods

In [106]:
val = 'a,b,   guido'

In [107]:
val.split(',')

['a', 'b', '   guido']

```split``` is often combined with ```strip``` to trim whitespace (including line breaks):

In [108]:
pieces = [x.strip() for x in val.split(',')]

In [109]:
pieces

['a', 'b', 'guido']

In [110]:
first, second, third = pieces

In [111]:
first + '::' + second + '::' + third

'a::b::guido'

But this isn’t a practical generic method. A faster and more Pythonic way is to pass a
list or tuple to the ```join``` method on the string '::' :

In [112]:
'::'.join(pieces)

'a::b::guido'

In [113]:
'guido' in val

True

In [114]:
val.index(',')

1

In [115]:
val.find(':')

-1

In [116]:
val.index(':')

ValueError: substring not found

In [117]:
val.count(',')

2

In [118]:
val.replace(',', '::')

'a::b::   guido'

In [119]:
val.replace(',', '')

'ab   guido'

- count
- endswith
- startswith
- join
- index 
- find
- rfind
- replace
- strip
- rstrip
- lstrip
- split
- lower
- upper
- casefold
- ljust
- rjust

### Regular Expressions

__Regular expressions__ provide a flexible way to search or match (often more complex)
string patterns in text. A single expression, commonly called a regex, is a string
formed according to the regular expression language. Python’s built-in re module is
responsible for applying regular expressions to strings; I’ll give a number of examples
of its use here.

The ```re``` module functions fall into three categories: _pattern matching_, _substitution_, and _splitting_. Naturally these are all related; a __regex__ describes a pattern to locate in the text, which can then be used for many purposes. Let’s look at a simple example:

suppose we wanted to split a string with a variable number of whitespace characters
(tabs, spaces, and newlines). The __regex__ describing one or more whitespace characters
is ```\s+``` :

In [120]:
import re

In [122]:
text = 'foo    bar\t baz    \tqux'

In [123]:
re.split('\s+', text)

['foo', 'bar', 'baz', 'qux']

When you call ```re.split('\s+', text)``` , the regular expression is first _compiled_, and
then its ```split``` method is called on the passed text. You can __compile__ the regex yourself with ```re.compile```, forming a reusable regex object:

In [124]:
regex = re.compile('\s+')

In [125]:
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [126]:
regex.findall(text)

['    ', '\t ', '    \t']

Creating a regex object with ```re.compile``` is highly recommended if you intend to
apply the same expression to many strings; doing so will save CPU cycles.

In [127]:
type(regex)

re.Pattern

In [129]:
re.compile?

In [130]:
text = 'foo  bar foo\tbar \tfoo\t bar'

In [131]:
regex.split(text)

['foo', 'bar', 'foo', 'bar', 'foo', 'bar']

In [132]:
regex.findall(text)

['  ', ' ', '\t', ' \t', '\t ']

In [133]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

In [134]:
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

In [135]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [136]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

search returns a special match object for the first email address in the text. For the
preceding regex, the match object can only tell us the start and end position of the
pattern in the string:

In [138]:
m = regex.search(text)

In [139]:
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [140]:
text[m.start():m.end()]

'dave@google.com'

In [142]:
regex.match(text)

In [143]:
text

'Dave dave@google.com\nSteve steve@gmail.com\nRob rob@gmail.com\nRyan ryan@yahoo.com\n'

In [144]:
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



Suppose you wanted to find email addresses and simultaneously segment each
address into its three components: username, domain name, and domain suffix. To
do this, put parentheses around the parts of the pattern to segment:

In [145]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [147]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [148]:
m = regex.match('wesm@bright.net')

In [149]:
m.groups()

('wesm', 'bright', 'net')

In [150]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)'

In [151]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [152]:
m = regex.match('wesm@brigth.net')

In [153]:
m.groups()

('wesm', 'brigth.net')

In [154]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [155]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [156]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

```sub``` also has access to groups in each match using special symbols like \1 and \2 . The symbol \1 corresponds to the first matched group, \2 corresponds to the second, and
so forth:

In [157]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



- findall
- finditer
- match
- search
- split
- sub
- subn

##### Vectorized String Functions in pandas

In [158]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}

In [159]:
data = pd.Series(data)

In [160]:
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [161]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

Series has array-oriented methods for string operations that skip NA values. These are accessed through Series’s str attribute; for example, we could check whether each email address has 'gmail' in it with ```str.contains```:

In [162]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [163]:
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

__Regular expressions__ can be used, too, along with any ```re``` options like ```IGNORECASE```:

In [164]:
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [165]:
matches = data.str.match(pattern, flags=re.IGNORECASE)

In [166]:
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [167]:
matches.str.get(1)

AttributeError: Can only use .str accessor with string values!

In [168]:
matches.str[0]

AttributeError: Can only use .str accessor with string values!

In [169]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object