In [2]:
import re

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

# Display the value of every bare expression in a cell (not only the last one),
# so results can be shown without wrapping them in print().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## <a id='0'></a> Data Cleaning and Data Preprocessing

Wrangle data into a form that is more suitable for Data Analysis and Machine Learning. <br>
<a href='#1'>1. Handling Anomalies</a><br>
Missing Values, Duplicates and Outliers: Detect, Drop and Replace.<br>
<a href='#2'>2. Data Transformation</a><br>
- Element-wise: Mapping, Replacing, Renaming<br>
- Inter-Conversion between Numerical and Categorical<br>
    - Discretization and Binning<br>
    - Dummy variables<br>

<a href='#3'>3. String Manipulation</a><br>
- Regular expression <br>

Note: this notebook is based on Chapter 7 of Wes McKinney's "Python for Data Analysis", 2nd edition. I simplified and clarified some of the examples that I found confusing or unnecessary while learning them, and added some examples to illustrate the points more precisely. I hope this helps others.<br>

### <a id='1'></a>Missing Values
Detect, Drop and Fill in    
#### Detect

In [23]:
# Two string columns with gaps; pandas treats None as a missing value.
s1 = ['aardvark', 'artichoke', None]
s2 = [None, 'orange', None]

df = pd.DataFrame(dict(col1=s1, col2=s2))
df

Unnamed: 0,col1,col2
0,aardvark,
1,artichoke,orange
2,,


##### print rows with at least one NA

In [7]:
# Flag every missing cell, then keep the rows in which at least one flag is set.
bool_na_df = df.isna()
bool_s = bool_na_df.any(axis='columns')  # one boolean per row
df.loc[bool_s]

Unnamed: 0,col1,col2
0,aardvark,
2,,


##### print columns with at least one NA

In [8]:
# Column-wise version: a column is selected when any of its cells is missing.
bool_s = bool_na_df.any(axis='index')  # one boolean per column
df.loc[:, bool_s]

Unnamed: 0,col1,col2
0,aardvark,
1,artichoke,orange
2,,


##### calculate fraction of missing values for each column

In [12]:
# Share of missing cells per column: NA count divided by the number of rows.
n_rows = len(df)
bool_na_df.sum(axis=0) / n_rows

col1    0.333333
col2    0.666667
dtype: float64

##### list rows with more than one NA

In [16]:
# Count the NAs in each row and keep the rows that have two or more.
na_per_row = bool_na_df.sum(axis=1)
df.loc[na_per_row >= 2]

Unnamed: 0,col1,col2
2,,


#### Drop

In [25]:
# Remove rows in which every value is missing; pass axis='columns' to drop empty columns instead.
df.dropna(how='all', axis='index')

Unnamed: 0,col1,col2
0,aardvark,
1,artichoke,orange


##### Drop columns with fewer than 90% non-missing values

In [27]:
# Build a 5x3 frame of random numbers, then blank out parts of columns 1 and 2.
df = pd.DataFrame(np.random.randn(5, 3))
df.iloc[:4, 1] = np.nan  # column 1 keeps 1 of its 5 values
df.iloc[:2, 2] = np.nan  # column 2 keeps 3 of its 5 values
df

# `thresh` is the minimum number of non-NA values a column needs in order to be kept.
# Round UP so the bar is never below 90%: int() truncated 4.5 to 4, which would
# have accepted columns with only 80% good values.
# `how` is left out because recent pandas versions raise a TypeError when `how`
# and `thresh` are passed together.
min_good = int(np.ceil(0.9 * len(df)))
df.dropna(axis=1, thresh=min_good)

Unnamed: 0,0,1,2
0,-0.09863,,
1,0.874917,,
2,0.009588,,-0.31122
3,-0.617723,,0.863839
4,0.225778,0.805041,0.555138


Unnamed: 0,0
0,-0.09863
1,0.874917
2,0.009588
3,-0.617723
4,0.225778


#### Fill

##### Series: fill NA with a summary statistic (the mean)

In [31]:
s = pd.Series([1., np.nan, 3.5, np.nan, 7])
mean_value = s.mean()  # NaN entries are skipped when the mean is computed
s.fillna(mean_value)

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

##### Dataframe: fill na differently for each column

In [35]:
# Random 5x3 frame with holes: column 'b' at rows 2 and 4, column 'c' at row 4.
df = pd.DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c'])
df.loc[[2, 4], 'b'] = np.nan
df.loc[4:, 'c'] = np.nan
df

Unnamed: 0,a,b,c
0,0.025733,0.750056,-0.648371
1,-0.054266,-1.23106,0.902463
2,0.703757,,-0.178324
3,-0.850762,0.420131,-0.686591
4,-0.283946,,


##### fill NA differently based on the column index

In [36]:
# One replacement value per column; columns that are not listed are left untouched.
fill_values = {'b': 0.5, 'c': 0}
df.fillna(value=fill_values)

Unnamed: 0,a,b,c
0,0.025733,0.750056,-0.648371
1,-0.054266,-1.23106,0.902463
2,0.703757,0.5,-0.178324
3,-0.850762,0.420131,-0.686591
4,-0.283946,0.5,0.0


### Duplicates

In [37]:
# Rows 3 and 4 are identical, and columns k2 and k3 hold the same values.
numbers = [1, 2, 3, 4, 4]
df = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'two'],
    'k2': numbers,
    'k3': list(numbers),
})
df

Unnamed: 0,k1,k2,k3
0,one,1,1
1,two,2,2
2,one,3,3
3,two,4,4
4,two,4,4


#### Detect and Drop: rows or columns

In [41]:
# Detect: boolean mask that is True for every repeat of an earlier row / column.
df.duplicated()    # rows
df.T.duplicated()  # columns (transpose so that columns are compared as rows)

## Drop
df.drop_duplicates()           # rows
# Select the columns with the mask instead of transposing twice: a double
# transpose of a mixed-dtype frame turns every column into object dtype.
df.loc[:, ~df.T.duplicated()]  # columns

0    False
1    False
2    False
dtype: bool

a    False
b    False
c    False
d    False
dtype: bool

Unnamed: 0,a,b,c,d
0,14,1,0,1
1,2,2,3,3
2,15,1,16,2


Unnamed: 0,a,b,c,d
0,14,1,0,1
1,2,2,3,3
2,15,1,16,2


### Outliers

#### Detect

In [44]:
# Small integer frame; 14, -15 and 16 fall outside the [-3, 3] band used below.
df = pd.DataFrame(
    [[14, 1, 0, 1],
     [2, 2, 3, 3],
     [-15, 1, 16, 2]],
    columns=list('abcd'),
)
df

Unnamed: 0,a,b,c,d
0,14,1,0,1
1,2,2,3,3
2,-15,1,16,2


##### find all rows containing at least one outlier, i.e. a value outside [-3, 3]

In [45]:
# True wherever the magnitude exceeds 3, i.e. the value lies outside [-3, 3].
bool_outlier_df = df.abs() > 3
rows_with_outlier = bool_outlier_df.any(axis='columns')
df.loc[rows_with_outlier]

Unnamed: 0,a,b,c,d
0,14,1,0,1
2,-15,1,16,2


#### Cap

In [46]:
# Overwrite each flagged value with +3 or -3, keeping its original sign.
capped = np.sign(df) * 3
df[bool_outlier_df] = capped
df

Unnamed: 0,a,b,c,d
0,3,1,0,1
1,2,2,3,3
2,-3,1,3,2


## <a id='2'></a>Data Transformation
### Mapping & Replacing
- <a href='#0'>Back to TOC</a>

### Mapping in Series
- Element-wise transformation using a function or a dict <br>
- Recommend: operating on a Series instead of the whole dataframe, since the latter is more likely to cause unwanted results.
- Tip: every value needs an entry in the mapping; values without one become NaN.

In [47]:
# Six servings: the kind of meat and its weight in ounces.
df = pd.DataFrame({
    'food': ['bacon', 'pulled pork', 'bacon', 'pastrami', 'corned beef', 'bacon'],
    'ounces': [4, 3, 12, 6, 7.5, 8],
})
df

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0


In [50]:
# Lookup table: each kind of meat -> the animal it comes from.
meat_to_animal = {
    'bacon': 'pig',
    'pastrami': 'cow',
    'pulled pork': 'pig',
    'corned beef': 'cow',
}

food = df['food']
food.map(meat_to_animal)  # element-wise dictionary lookup

0    pig
1    pig
2    pig
3    cow
4    cow
5    pig
Name: food, dtype: object

### Replacing Values
Similar to mapping, but takes values only, not functions. <br>
Note: unlisted values are unchanged.

In [51]:
# series
se = pd.Series([1., -999., 2., -999., -1000., 3.])
sentinels = [-999, -1000]  # the values to be swapped for NaN
se.replace(sentinels, np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

### Renaming Axis Indexes
Rename: a convenience function for mapping row index and column names

In [55]:
# 3x4 grid of the integers 0..11 with lowercase row labels and column names.
df = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['ohio', 'colorado', 'new york'],
    columns=['one', 'two', 'three', 'four'],
)
df

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
colorado,4,5,6,7
new york,8,9,10,11


In [60]:
def upper_func(x):
    """Return ``x`` converted to upper case."""
    return x.upper()

# Mapping each axis by hand ...
df.index.map(str.title)
df.columns.map(upper_func)
# ... yields the same labels as rename(), which returns a relabelled frame.
df.rename(index=str.title, columns=upper_func)

Index(['Ohio', 'Colorado', 'New York'], dtype='object')

Index(['ONE', 'TWO', 'THREE', 'FOUR'], dtype='object')

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


#### Use the first row of the dataframe as the new header

In [61]:
# The real header is stored as the first data row; columns are named '0'..'3'.
rows = [
    ['first_name', 'last_name', 'age', 'preTestScore'],
    ['Molly', 'Jacobson', 52, 24],
    ['Tina', 'Ali', 36, 31],
    ['Jake', 'Milner', 24, 2],
    ['Amy', 'Cooze', 73, 3],
]
raw_data = {str(pos): list(column) for pos, column in enumerate(zip(*rows))}

df = pd.DataFrame(raw_data)
df.head(3)

Unnamed: 0,0,1,2,3
0,first_name,last_name,age,preTestScore
1,Molly,Jacobson,52,24
2,Tina,Ali,36,31


In [62]:
# The first row holds the real column names. iloc[0] returns it as a Series
# indexed by the current column labels, so it works as an old-name -> new-name mapping.
header = df.iloc[0]

# Drop the header row and relabel the columns in one chained expression
# instead of two inplace=True mutations.
df = df.drop(index=0).rename(columns=header)  # or df.iloc[1:].rename(columns=header)
df.head(2)

Unnamed: 0,first_name,last_name,age,preTestScore
1,Molly,Jacobson,52,24
2,Tina,Ali,36,31


### <a id='22'>Discretization and Binning: cut and qcut</a>
Convert numerical columns to categorical.
- <a href='#0'>Back to TOC</a>

#### Cut

In [65]:
# cut a list of values into categories
age_data = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]

pd.cut(age_data, bins).categories

# pass an integer
pd.cut(age_data, 5).categories

# label the categories
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(age_data, bins, labels=group_names).categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

IntervalIndex([(19.959, 28.2], (28.2, 36.4], (36.4, 44.6], (44.6, 52.8], (52.8, 61.0]]
              closed='right',
              dtype='interval[float64]')

Index(['Youth', 'YoungAdult', 'MiddleAged', 'Senior'], dtype='object')

#### Cut vs qcut

In [66]:
# qcut: discretization based on quantiles, not distance
data = np.random.randn(1000)    # normal distribution
pd.cut( data, 4).value_counts()
pd.qcut(data, 4).value_counts()

(-3.41, -1.796]      41
(-1.796, -0.188]    378
(-0.188, 1.419]     487
(1.419, 3.027]       94
dtype: int64

(-3.404, -0.694]    250
(-0.694, 0.0639]    250
(0.0639, 0.756]     250
(0.756, 3.027]      250
dtype: int64

### Computing Dummy Variables
Convert categorical columns to numerical.

In [69]:
# Two categorical (string) columns and one numeric column.
df = pd.DataFrame({
    'key1': list('bbacab'),
    'key2': list('efgeff'),
    'data1': range(6),
})
df.head(3)

Unnamed: 0,key1,key2,data1
0,b,e,0
1,b,f,1
2,a,g,2


In [73]:
# One-Hot-Encoding a categorical column
cat_cols = df.columns[df.dtypes == 'O'] # covert all the categorical columns automatically!
pd.get_dummies(df, prefix=cat_cols)

Unnamed: 0,data1,key1_a,key1_b,key1_c,key2_e,key2_f,key2_g
0,0,0,1,0,1,0,0
1,1,0,1,0,0,1,0
2,2,1,0,0,0,0,1
3,3,0,0,1,1,0,0
4,4,1,0,0,0,1,0
5,5,0,1,0,0,1,0


## <a id='3'></a> String Manipulation
- <a href='#0'>Back to TOC</a>

String manipulation is one of the most frequently applied skills in our daily data science workflows. Problems are often messy and solutions can be laborious, but with the power of regular expressions we can often create elegant solutions.
    
- Python string methods
- Python regular expression (RE)
- Pandas vectorized string methods

#### Split a string into a list of elements

In [105]:
s = 'a,b,    c,  d,  b, e, b'

s.split()    # split on runs of whitespace
s.split(',') # split on commas; surrounding spaces are kept

[x.strip() for x in s.split(',')]  # split + strip
# Regular expression: re.func(pattern, string).
# The pattern is a raw string so that '\s' reaches the regex engine unchanged;
# in a normal string literal '\s' is an invalid escape sequence and triggers a warning.
re.split(r',\s*', s)

['a,b,', 'c,', 'd,', 'b,', 'e,', 'b']

['a', 'b', '    c', '  d', '  b', ' e', ' b']

['a', 'b', 'c', 'd', 'b', 'e', 'b']

['a', 'b', 'c', 'd', 'b', 'e', 'b']

#### Join the elements in a list to a string

In [106]:
# Raw string keeps '\s' intact for the regex engine (no invalid-escape warning).
s_list = re.split(r',\s*', s)
'|'.join(s_list) # list -> string

'a|b|c|d|b|e|b'

#### Search, Find, Count and Replace

In [109]:
# Search
'b' in s

# Find
print(s.index('b')) # raise an exception if not exist
print(s.find('b'))  # return first index, -1 if it doesn't exist

# Count
print(s.count('b'))

# Replace
s.replace(',', '|')

True

2
2
3


'a|b|    c|  d|  b| e| b'

### The Power of RE

#### Find all emails in the text using RE

In [118]:
# findall : all matched substring 
text = """
Dave dave@google.com
Steve steve-li@gmail.com
Rob rob.NG@gmail.com
Ryan ryan.Zh_W@yahoo.com
"""

pattern = r'[A-Z0-9._-]+@[A-Z0-9.-]+\.[A-Z]{2,4}' # raw str inside [], note the escape before '.' outside []

# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)

regex.findall(text)

All emails:


['dave@google.com',
 'steve-li@gmail.com',
 'rob.NG@gmail.com',
 'ryan.Zh_W@yahoo.com']

#### Extract user names from the emails

In [121]:
# Use parentheses () to target a substring in a matched pattern

# The parenthesised group captures the part before '@'; with one group in the
# pattern, findall returns the captured text instead of the whole match.
pattern = r'([A-Z0-9._%+-]+)@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern, re.IGNORECASE)

regex.findall(text) # one captured user name per address

['dave', 'steve-li', 'rob.NG', 'ryan.Zh_W']

### Pandas: Vectorized String Functions
Handling missing values

In [123]:
# Name -> e-mail address; 'Wes' has no address and is stored as NaN.
data = dict(
    Dave='dave@google.com',
    Steve='steve-li@gmail.com',
    Rob='rob.NG@gmail.com',
    Wes=np.nan,
)
se = pd.Series(data)
se

Dave        dave@google.com
Steve    steve-li@gmail.com
Rob        rob.NG@gmail.com
Wes                     NaN
dtype: object

In [124]:
# Vectorized substring test; the missing entry stays missing in the result.
se.str.contains(pat='gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object