# Data Cleaning and Preparation

Data preparation : loading, cleaning, transforming, rearranging

## 7.1 Handling Missing Data


In [2]:
import pandas as pd
import numpy as np

In [None]:
string_data = pd.Series(["aardvark", np.nan, None, "avocado"])

# Python's build in None value also been treated as NA
string_data.isna()

In [None]:
num_data = pd.Series([0, 1, 3, 0])

num_data.isnull()

In [4]:
# Filtering out missing data

data = pd.Series([1, np.nan, 3.5, np.nan, 7])

# dropna by default drops any rows containing a missing value
# use how="all" will drop only rows that are all NA
# use axis="columns" to drop the column that contains NA value
# thresh argument will keep only rows containing at most a certain number of missing observations 
data.dropna(how="all")
# Same as 
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [7]:
# Filling In Missing Data
data.fillna(0)

0    1.0
1    0.0
2    3.5
3    0.0
4    7.0
dtype: float64

In [8]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [18]:
df = pd.DataFrame(np.random.standard_normal((7, 3)))

In [19]:
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan

In [20]:
# Use different fill value for each column
# ffill only works when at least the first row is not NA
# df.fillna({1:0.5, 2:0})

df.fillna(method="ffill")

Unnamed: 0,0,1,2
0,-1.419314,-2.131236,-0.097613
1,-1.03244,-0.211922,-0.42118
2,0.917722,-0.211922,1.996031
3,-0.003102,-0.211922,-2.005211
4,0.173572,-0.211922,-2.005211
5,1.377814,-0.211922,-2.005211
6,0.789744,-0.211922,-2.005211


## 7.2 Data Transformation

### Removing duplicates

In [24]:
data = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"],
"k2": [1,1,2,3,3,4,4]
})

In [27]:
# Indicating whether each row is duplicated
data.duplicated()

# returns a dataframe with rows where the duplicated array is False filtered out
# Subset will remove the duplicates for all the passed in columns
# last will keep the last duplicated index instead first by default
data.drop_duplicates(subset=["k1", "k2"], keep="last")

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [29]:
# Transforming Data Using a Function or Mapping
data = pd.DataFrame({"food": ["bacon", "pulled prok"], "ounces": [3, 4]})

meat_to_animal = {"bacon": "pig", "pulled pork": "pig"}

# Map method will map the key of the Series with the key of the passed in dictionary, and return the value
data["animal"] = data["food"].map(meat_to_animal)

In [30]:
# Replacing Values
data = pd.Series([1, -999, 2, -999, -1000, 3])

data.replace(-999, np.nan)

# Replace multiple data at once
data.replace([-999, -1000], np.nan)

# use different replacement for each value
data.replace([-999, -1000], [np.nan, 0])
# or passin a dictionary
data.replace({{-999:np.nan, -1000: 0}})

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [33]:
# Renaming Axis Indexes
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
index = ["Ohio", "Colorado", "New York"],
columns = ["one", "two", "three", "four"]
)

def transform(x):
	return x[:4].upper()

data.index = data.index.map(transform)

# Create a transformed version of a dataset without modifying the original
data.rename(index=str.title, columns=str.upper)

# Use in conjunction with a dictionary-like object
data.rename(index={"OHIO": "INDIANA"}, columns={"three":"peekabo"})

Unnamed: 0,one,two,peekabo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### Discretization and Binning

Continuous data is ofthen discretized or binned for analysis. 


In [38]:
ages = [20, 22, 25, 27, 21, 23, 27, 37, 31, 61, 45, 41, 32]

bins = [18, 25, 35, 60, 100]

# the bin computed by pandas.cut. 
age_categories = pd.cut(ages, bins)
# Get the index number of the cutted sets in original data
age_categories.codes

age_categories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [41]:
# parenthesis means the side is open and exclusive
# square bracket means the side is closed, inclusive

pd.cut(ages, bins, right=False)

# Define a interval-based bin labeling
group_names = ["Youth", "YoungAdult", "MiddleAgend", "Senior"]

pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAgend', 'MiddleAgend', 'YoungAdult']
Length: 13
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAgend' < 'Senior']

In [43]:
data = np.random.uniform(size = 20)

# If pass an integer of bin, pandas will compute the equal-length bins based on the minimum and max values 
# precision=2 limits the decimal precision to two digits
pd.cut(data, 4, precision=2)

# pd.qcut bins the data based on sample quantiles
pd.qcut(data, 4, precision=2)


[(0.1, 0.35], (0.58, 0.85], (0.35, 0.5], (0.5, 0.58], (0.1, 0.35], ..., (0.35, 0.5], (0.5, 0.58], (0.1, 0.35], (0.35, 0.5], (0.35, 0.5]]
Length: 20
Categories (4, interval[float64, right]): [(0.1, 0.35] < (0.35, 0.5] < (0.5, 0.58] < (0.58, 0.85]]