# Chapter 7

# 7.1 Handling Missing Data

In [1]:
import pandas as pd
from numpy import nan as  NA
import numpy as np


In [2]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])

In [3]:
string_data


0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [4]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:

string_data[0] = None # assign NUll to o index


In [6]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

# Filtering Out Missing Data
There are a few ways to filter out missing data. While you always have the option to do it by hand using pandas.isnull and boolean indexing, the dropna can be helpful. On a Series, it returns the Series with only the non-null data and index values:

In [7]:
data = pd.Series([1, NA, 3.5, NA, 7])

In [8]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [9]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [10]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5, 3.]])

In [11]:


data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
cleaned = data.dropna()  # drop the row which have null values 

In [13]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [14]:
data.dropna(how='all') #Passing how='all' will only drop rows that are all NA:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [15]:
data[4] = NA  #4th column added with NaN
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [16]:

data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
df = pd.DataFrame(np.random.randn(7, 3))

In [18]:


df


Unnamed: 0,0,1,2
0,-0.848863,-0.020192,0.544514
1,-0.304479,-0.840534,-0.851375
2,-0.078974,0.818603,1.346652
3,0.497623,-0.596844,1.645901
4,0.838364,-1.284789,-2.024998
5,0.078451,1.123538,0.169797
6,-0.211087,0.385529,-1.562707


In [19]:
df.iloc[:4, 1] = NA # row index 0,1,2,3 and column index 1 make NA

In [20]:
df

Unnamed: 0,0,1,2
0,-0.848863,,0.544514
1,-0.304479,,-0.851375
2,-0.078974,,1.346652
3,0.497623,,1.645901
4,0.838364,-1.284789,-2.024998
5,0.078451,1.123538,0.169797
6,-0.211087,0.385529,-1.562707


In [21]:
df.iloc[:2, 2] = NA



In [22]:
df


Unnamed: 0,0,1,2
0,-0.848863,,
1,-0.304479,,
2,-0.078974,,1.346652
3,0.497623,,1.645901
4,0.838364,-1.284789,-2.024998
5,0.078451,1.123538,0.169797
6,-0.211087,0.385529,-1.562707


In [23]:
df.dropna() # drop all null values row

Unnamed: 0,0,1,2
4,0.838364,-1.284789,-2.024998
5,0.078451,1.123538,0.169797
6,-0.211087,0.385529,-1.562707


In [24]:
df.dropna(thresh=2) #strart at 2 index row


Unnamed: 0,0,1,2
2,-0.078974,,1.346652
3,0.497623,,1.645901
4,0.838364,-1.284789,-2.024998
5,0.078451,1.123538,0.169797
6,-0.211087,0.385529,-1.562707


# Filling In Missing Data
Rather than filtering out missing data (and potentially discarding other data along with it), you may want to fill in the “holes” in any number of ways. For most pur‐ poses, the fillna method is the workhorse function to use. Calling fillna with a constant replaces missing values with that value:

In [25]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.848863,0.0,0.0
1,-0.304479,0.0,0.0
2,-0.078974,0.0,1.346652
3,0.497623,0.0,1.645901
4,0.838364,-1.284789,-2.024998
5,0.078451,1.123538,0.169797
6,-0.211087,0.385529,-1.562707


In [26]:
df.fillna({1: 0.5, 2: 0})   #where ever there are null in 1st column fill 0.5 and 2nd column 0

Unnamed: 0,0,1,2
0,-0.848863,0.5,0.0
1,-0.304479,0.5,0.0
2,-0.078974,0.5,1.346652
3,0.497623,0.5,1.645901
4,0.838364,-1.284789,-2.024998
5,0.078451,1.123538,0.169797
6,-0.211087,0.385529,-1.562707


In [27]:
df= df.fillna(0, inplace=True)

In [28]:
df

In [29]:
df = pd.DataFrame(np.random.randn(6, 3))

In [30]:
df

Unnamed: 0,0,1,2
0,-0.404391,0.2685,-1.441342
1,-0.294019,1.254682,-1.100288
2,-0.503306,0.435487,0.932307
3,2.115799,0.928921,0.076282
4,-0.334595,0.547185,-0.738721
5,0.409302,-3.138239,-1.238004


df

In [31]:
df.iloc[2:,1] =NA  # from 2 row column 1

In [32]:
df

Unnamed: 0,0,1,2
0,-0.404391,0.2685,-1.441342
1,-0.294019,1.254682,-1.100288
2,-0.503306,,0.932307
3,2.115799,,0.076282
4,-0.334595,,-0.738721
5,0.409302,,-1.238004


In [33]:
df.fillna(method='ffill')
 # fill the same value

Unnamed: 0,0,1,2
0,-0.404391,0.2685,-1.441342
1,-0.294019,1.254682,-1.100288
2,-0.503306,1.254682,0.932307
3,2.115799,1.254682,0.076282
4,-0.334595,1.254682,-0.738721
5,0.409302,1.254682,-1.238004


In [34]:
df.fillna(method='ffill', limit=2)# limit for 2 

Unnamed: 0,0,1,2
0,-0.404391,0.2685,-1.441342
1,-0.294019,1.254682,-1.100288
2,-0.503306,1.254682,0.932307
3,2.115799,1.254682,0.076282
4,-0.334595,,-0.738721
5,0.409302,,-1.238004


In [35]:
data = pd.Series([1., NA, 3.5, NA, 7])



In [36]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

# 7.2 Data Transformation

In [37]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'], 'k2': [1, 1, 2, 3, 3, 4, 4]})


In [38]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [39]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [40]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [41]:
data['v1'] =range(7)# v1 column added 

In [42]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [43]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [44]:
 data.drop_duplicates(['k1','k2'],keep='last')   # in data remove duplicate keep last one

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


# Transforming Data Using a Function or Mapping
r many datasets, you may wish to perform some transformation based on the val‐ ues in an array, Series, or column in a DataFrame. Consider the following hypotheti‐ cal data collected about various kinds of meat:

In [45]:
data=pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                            'Pastrami', 'corned beef', 'Bacon',
                            'pastrami', 'honey ham', 'nova lox'],
                    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [46]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [47]:
  meat_to_animal = {
      'bacon': 'pig',
      'pulled pork': 'pig',
      'pastrami': 'cow',
      'corned beef': 'cow',
      'honey ham': 'pig',
      'nova lox': 'salmon'
}

In [48]:
lowercased=data['food'].str.lower()

In [49]:
lowercased


0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [50]:
data['animal'] = lowercased.map(meat_to_animal)  # map the data from meat_to_animal to food 

In [51]:
data


Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [52]:
data['food'].map(lambda x:meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

# Replacing Values
Filling in missing data with the fillna method is a special case of more general value replacement. As you’ve already seen, map can be used to modify a subset of values in an object but replace provides a simpler and more flexible way to do so. Let’s con‐ sider this Series:

In [53]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])

In [54]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [55]:
data.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [56]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [57]:
 data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

# Renaming Axis Indexes

In [58]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),index=['Ohio', 'Colorado', 'New York'],
             columns=['one', 'two', 'three', 'four'])

In [59]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [60]:
transform = lambda x: x[:4].upper()  # THIS tranform is for 0, 1,2,3 to upper


In [61]:
transform

<function __main__.<lambda>(x)>

In [62]:
 data.index.map(transform)  # transform applied to index
    

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [63]:
data.index=data.index.map(transform)  # applied to index of the data frame

In [64]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [65]:
data.rename(index=str.title,columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [66]:
data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [67]:
 data.rename(index={'OHIO': 'INDIANA'}, inplace=True)#do it in org dataframe

In [68]:
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


# Discretization and Binning
Continuous data is often discretized or otherwise separated into “bins” for analysis. Suppose you have data about a group of people in a study, and you want to group them into discrete age buckets:

In [69]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [70]:
bins = [18, 25, 35, 60, 100]

In [71]:
cats=pd.cut(ages,bins)

In [72]:
cats


[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [73]:
cats.shape

(12,)

In [74]:
cats.codes



array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [75]:
 cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [76]:
pd.value_counts(cats)

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [77]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [78]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']


In [79]:
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [80]:
data = np.random.rand(20)

In [81]:
 pd.cut(data, 4, precision=2)

[(0.75, 1.0], (0.5, 0.75], (0.5, 0.75], (0.009, 0.26], (0.26, 0.5], ..., (0.009, 0.26], (0.5, 0.75], (0.75, 1.0], (0.26, 0.5], (0.5, 0.75]]
Length: 20
Categories (4, interval[float64, right]): [(0.009, 0.26] < (0.26, 0.5] < (0.5, 0.75] < (0.75, 1.0]]

In [82]:
data = np.random.randn(1000) 

In [83]:
cats = pd.qcut(data, 4)

In [84]:
cats


[(0.0454, 0.673], (0.0454, 0.673], (0.673, 2.98], (-3.917, -0.689], (-0.689, 0.0454], ..., (0.0454, 0.673], (-3.917, -0.689], (0.0454, 0.673], (0.0454, 0.673], (0.0454, 0.673]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.917, -0.689] < (-0.689, 0.0454] < (0.0454, 0.673] < (0.673, 2.98]]

In [85]:
pd.value_counts(cats)

(-3.917, -0.689]    250
(-0.689, 0.0454]    250
(0.0454, 0.673]     250
(0.673, 2.98]       250
dtype: int64

In [86]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(0.0454, 1.208], (0.0454, 1.208], (1.208, 2.98], (-1.329, 0.0454], (-1.329, 0.0454], ..., (0.0454, 1.208], (-1.329, 0.0454], (0.0454, 1.208], (0.0454, 1.208], (0.0454, 1.208]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.917, -1.329] < (-1.329, 0.0454] < (0.0454, 1.208] < (1.208, 2.98]]

# Detecting and Filtering Outliers
Filtering or transforming outliers is largely a matter of applying array operations. Consider a DataFrame with some normally distributed data:

In [87]:
 data = pd.DataFrame(np.random.randn(1000, 4))
    

In [88]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.00701,-0.011497,0.009623,0.027162
std,1.009934,0.96817,1.000784,0.945076
min,-2.804316,-2.991232,-3.020163,-3.020715
25%,-0.695179,-0.687406,-0.673608,-0.617952
50%,-0.054321,-0.013522,-0.024343,0.043399
75%,0.696579,0.658672,0.698135,0.670231
max,2.988543,3.22933,2.886194,3.344994


In [89]:
data.shape

(1000, 4)

In [90]:
 col = data[2]

In [91]:
col

0      0.632890
1      1.309435
2      0.938631
3     -1.384984
4      0.577748
         ...   
995   -0.589281
996   -0.463991
997   -0.467439
998   -0.205495
999    1.595961
Name: 2, Length: 1000, dtype: float64

In [92]:
col[np.abs(col)>3]

889   -3.020163
Name: 2, dtype: float64

In [93]:
data[(np.abs(data) > 3).any(1)]


Unnamed: 0,0,1,2,3
17,-0.123876,-0.743306,1.329865,3.344994
62,0.077851,-0.383921,-0.808853,-3.020715
678,0.843541,3.22933,-0.246507,0.394763
889,-0.53209,-0.419354,-3.020163,-0.370554


In [94]:
data.describe()


Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.00701,-0.011497,0.009623,0.027162
std,1.009934,0.96817,1.000784,0.945076
min,-2.804316,-2.991232,-3.020163,-3.020715
25%,-0.695179,-0.687406,-0.673608,-0.617952
50%,-0.054321,-0.013522,-0.024343,0.043399
75%,0.696579,0.658672,0.698135,0.670231
max,2.988543,3.22933,2.886194,3.344994


In [95]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,-1.0,1.0,1.0
1,1.0,1.0,1.0,1.0
2,-1.0,-1.0,1.0,1.0
3,1.0,-1.0,-1.0,-1.0
4,-1.0,1.0,1.0,1.0


# Permutation and Random Sampling
Permuting (randomly reordering) a Series or the rows in a DataFrame is easy to do using the numpy.random.permutation function. Calling permutation with the length of the axis you want to permute produces an array of integers indicating the new ordering:


In [96]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))

In [97]:
df


Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [98]:
sampler = np.random.permutation(5)


In [99]:
sampler


array([1, 4, 3, 0, 2])

In [100]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [101]:
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
3,12,13,14,15
0,0,1,2,3
2,8,9,10,11


In [102]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
3,12,13,14,15
1,4,5,6,7
2,8,9,10,11


In [103]:
df.sample(n=2)

Unnamed: 0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [104]:
choices = pd.Series([5, 7, -1, 6, 4])

In [105]:
choices

0    5
1    7
2   -1
3    6
4    4
dtype: int64

In [106]:
draws = choices.sample(n=10, replace=True) #To generate a sample with replacement (to allow repeat choices), pass replace=True to sample:


In [107]:
draws

2   -1
1    7
0    5
3    6
3    6
2   -1
0    5
2   -1
3    6
1    7
dtype: int64

# Computing Indicator/Dummy Variables
Another type of transformation for statistical modeling or machine learning applica‐ tions is converting a categorical variable into a “dummy” or “indicator” matrix. If a column in a DataFrame has k distinct values, you would derive a matrix or Data‐ Frame with k columns containing all 1s and 0s. pandas has a get_dummies function for doing this, though devising one yourself is not difficult. Let’s return to an earlier example DataFrame:

In [108]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})

In [109]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [110]:
pd.get_dummies(df['key']) # 1 and 0

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [111]:
dummies = pd.get_dummies(df['key'], prefix='key')

In [112]:
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [113]:
df_with_dummy = df[['data1']].join(dummies)

In [114]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [115]:
mnames = ['movie_id', 'title', 'genres']

In [116]:
movies = pd.read_table('datasets/movielens/movies.dat', sep='::', header=None, names=mnames)


  movies = pd.read_table('datasets/movielens/movies.dat', sep='::', header=None, names=mnames)


FileNotFoundError: [Errno 2] No such file or directory: 'datasets/movielens/movies.dat'

In [None]:
#all_genres = []
#for x in movies.genres:
#          all_genres.extend(x.split('|'))
#genres = pd.unique(all_genres)

In [None]:
#genres

In [None]:
#zero_matrix = np.zeros((len(movies), len(genres)))

In [None]:
# dummies = pd.DataFrame(zero_matrix, columns=genres)

In [None]:
#gen = movies.genres[0]

In [None]:
#gen.split('|')

In [None]:
#dummies.columns.get_indexer(gen.split('|'))

In [None]:
#for i, gen in enumerate(movies.genres):
 #          indices = dummies.columns.get_indexer(gen.split('|'))
  #         dummies.iloc[i, indices] = 1

In [None]:
#movies_windic = movies.join(dummies.add_prefix('Genre_'))


In [None]:
#movies_windic.iloc[0]

In [117]:
 np.random.seed(12345)

In [118]:
values=np.random.rand(10)

In [119]:
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [120]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]


In [121]:
pd.get_dummies(pd.cut(values, bins))


Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


# 7.3 String Manipulation
Python has long been a popular raw data manipulation language in part due to its ease of use for string and text processing. Most text operations are made simple with the string object’s built-in methods. For more complex pattern matching and text manipulations, regular expressions may be needed. pandas adds to the mix by ena‐ bling you to apply string and regular expressions concisely on whole arrays of data, additionally handling the annoyance of missing data.

In [122]:
val = 'a,b,  guido'

In [123]:
val.split(',')

['a', 'b', '  guido']

In [124]:
pieces = [x.strip() for x in val.split(',')]

In [125]:
pieces

['a', 'b', 'guido']

In [126]:
first,second,third =pieces


In [127]:
first + '::' + second + '::' + third

'a::b::guido'

In [128]:
'::'.join(pieces)

'a::b::guido'

In [129]:
'guido' in val

True

In [130]:
val.index(',')

1

In [131]:
val.find(',')

1

In [132]:
val.index(':')

ValueError: substring not found

In [133]:
 val.count(',')

2

In [134]:
val.replace(',', '::')

'a::b::  guido'

In [135]:
val.replace(',', '')

'ab  guido'

# Regular Expressions
Regular expressions provide a flexible way to search or match (often more complex) string patterns in text. A single expression, commonly called a regex, is a string formed according to the regular expression language. Python’s built-in re module is responsible for applying regular expressions to strings; I’ll give a number of examples of its use here.


In [136]:
import re

In [137]:
text = "foo bar\t baz \tqux"

In [138]:
re.split('\s+', text)

['foo', 'bar', 'baz', 'qux']

In [139]:
regex = re.compile('\s+')

In [140]:
regex

re.compile(r'\s+', re.UNICODE)

In [141]:
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [142]:
regex.findall(text)

[' ', '\t ', ' \t']

In [143]:
text = """Dave dave@google.com
    Steve steve@gmail.com
    Rob rob@gmail.com
    Ryan ryan@yahoo.com
    """

In [144]:
   pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

In [145]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [146]:
 regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [147]:
m = regex.search(text)

In [148]:
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [149]:
 text[m.start():m.end()]
    

'dave@google.com'

In [150]:
 print(regex.match(text))

None


In [151]:
print(regex.sub('REDACTED', text))

Dave REDACTED
    Steve REDACTED
    Rob REDACTED
    Ryan REDACTED
    


In [152]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [153]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [154]:
 m = regex.match('wesm@bright.net')

In [155]:
m.groups()

('wesm', 'bright', 'net')

In [156]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [157]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
    Steve Username: steve, Domain: gmail, Suffix: com
    Rob Username: rob, Domain: gmail, Suffix: com
    Ryan Username: ryan, Domain: yahoo, Suffix: com
    


# Vectorized String Functions in pandas
Cleaning up a messy dataset for analysis often requires a lot of string munging and regularization. To complicate matters, a column containing strings will sometimes have missing data:

In [158]:
 data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
           'Rob': 'rob@gmail.com', 'Wes': np.nan}


In [159]:
 data = pd.Series(data)

In [160]:
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [161]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [162]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [163]:
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [164]:
 data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [165]:
matches = data.str.match(pattern, flags=re.IGNORECASE)

In [166]:
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [167]:
#matches.str.get(1)

In [None]:
matches.str[0]

In [None]:
 data.str[:5]