# Data Cleaning and Preparation

Data preparation: loading, cleaning, transforming, and rearranging.

## 7.1 Handling Missing Data


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Both np.nan and Python's built-in None are treated as NA by pandas
string_data = pd.Series(data=["aardvark", np.nan, None, "avocado"])

# Element-wise boolean mask: True wherever the value is missing
pd.isna(string_data)

In [None]:
# An all-integer Series has no missing entries (0 is a regular value, not NA)
num_data = pd.Series(data=[0, 1, 3, 0])

# isnull is an alias of isna, so every entry is reported as False here
pd.isnull(num_data)

In [None]:
# Filtering out missing data

data = pd.Series([1.0, np.nan, 3.5, np.nan, 7.0])

# On a Series, dropna simply removes the NA entries (how is accepted but has no effect).
# On a DataFrame, dropna by default drops every row that contains a missing value:
#   how="all"       drops only the rows in which every value is NA
#   axis="columns"  drops columns instead of rows
#   thresh=n        keeps only the rows that have at least n non-NA values
data.dropna(how="all")
# Equivalent selection with a boolean mask
data.loc[data.notna()]

In [None]:
# Filling in missing data: every NA entry is replaced by the given constant
data.fillna(value=0)

In [None]:
# fillna above returned a new object and was not assigned back,
# so data itself still contains its NaN entries
data

In [None]:
# 7 x 3 frame of draws from the standard normal distribution
df = pd.DataFrame(np.random.standard_normal(size=(7, 3)))

In [None]:
# Blank out the tail of two columns in place (positional indexing):
# rows from position 2 onward in the column at position 1 ...
df.iloc[2:, 1] = np.nan
# ... and rows from position 4 onward in the column at position 2
df.iloc[4:, 2] = np.nan

In [None]:
# Use a different fill value for each column by passing a dict keyed by column label:
# df.fillna({1: 0.5, 2: 0})

# Forward fill propagates the last valid observation downward, so NA values
# that come before the first valid value of a column are left unfilled.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
df.ffill()

## 7.2 Data Transformation

### Removing duplicates

In [None]:
# Seven rows; the final row ("two", 4) repeats the row just before it
data = pd.DataFrame(
    {
        "k1": ["one", "two", "one", "two", "one", "two", "two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)

In [None]:
# Boolean Series flagging each row that repeats an earlier row
data.duplicated(keep="first")

# drop_duplicates keeps only the rows whose duplicated flag is False and returns them as a new DataFrame
# subset limits the duplicate check to the listed columns
# keep="last" retains the last occurrence of each duplicate group instead of the first (the default)
data.drop_duplicates(keep="last", subset=["k1", "k2"])

In [None]:
# Transforming Data Using a Function or Mapping
data = pd.DataFrame({"food": ["bacon", "pulled pork"], "ounces": [3, 4]})

meat_to_animal = {"bacon": "pig", "pulled pork": "pig"}

# Series.map looks up each value of the Series as a key in the dictionary and
# returns the matching dictionary value. A value with no matching key maps to
# NaN, so the food names must be spelled exactly like the dictionary keys.
data["animal"] = data["food"].map(meat_to_animal)

In [None]:
# Replacing Values
data = pd.Series([1, -999, 2, -999, -1000, 3])

# Replace a single sentinel value; replace returns a new Series
data.replace(-999, np.nan)

# Replace multiple values at once with the same replacement
data.replace([-999, -1000], np.nan)

# Use a different replacement for each value (the lists are matched by position)
data.replace([-999, -1000], [np.nan, 0])
# or pass in a dictionary mapping each old value to its replacement
# (a single pair of braces: doubled braces would build a set containing a
# dict, which raises TypeError because dicts are unhashable)
data.replace({-999: np.nan, -1000: 0})

In [None]:
# Renaming Axis Indexes
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=["Ohio", "Colorado", "New York"],
    columns=["one", "two", "three", "four"],
)


def transform(x):
    """Return the first four characters of ``x`` converted to upper case."""
    prefix = x[:4]
    return prefix.upper()


# Index.map builds a new Index; assigning it back relabels data in place
data.index = data.index.map(transform)

# rename returns a transformed copy and leaves the original untouched
data.rename(index=str.title, columns=str.upper)

# A dictionary-like argument renames only the labels it lists
data.rename(index={"OHIO": "INDIANA"}, columns={"three": "peekabo"})

### Discretization and Binning

Continuous data is often discretized or binned for analysis.


In [None]:
ages = [20, 22, 25, 27, 21, 23, 27, 37, 31, 61, 45, 41, 32]

# Bin edges, giving the intervals (18, 25], (25, 35], (35, 60] and (60, 100]
bins = [18, 25, 35, 60, 100]

# pandas.cut assigns every age to one of the bins and returns a Categorical
age_categories = pd.cut(x=ages, bins=bins)
# Integer code of the bin each age fell into (its position within .categories)
age_categories.codes

# The bins themselves
age_categories.categories

In [None]:
# A parenthesis means that side of the interval is open (exclusive);
# a square bracket means that side is closed (inclusive).
# right=False closes the left edge of each bin instead of the right one.
pd.cut(ages, bins, right=False)

# Name the bins by passing one label per interval
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]

pd.cut(ages, bins, labels=group_names)

In [None]:
data = np.random.uniform(size=20)

# Given an integer instead of explicit edges, pandas.cut computes that many
# equal-length bins spanning the minimum and maximum of the data;
# precision=2 limits the decimal precision of the bin labels to two digits
pd.cut(data, bins=4, precision=2)

# pandas.qcut bins the data based on sample quantiles instead
pd.qcut(data, q=4, precision=2)


### Detecting and Filtering Outliers

In [None]:
# 1000 x 4 frame of draws from the standard normal distribution
data = pd.DataFrame(np.random.standard_normal(size=(1000, 4)))

# Summary statistics for every column
data.describe()

In [None]:
col = data[2]

# Find the values in this one column whose absolute value exceeds 2.5
col[col.abs() > 2.5]

# Select every row that has at least one value above 3 or below -3:
# .any(axis='columns') reduces the boolean DataFrame to one flag per row
data[(data.abs() > 3).any(axis='columns')]

# Cap values to the interval [-2, 2], modifying data in place:
# np.sign(data) is -1 or 1 according to the sign of each value (0 for exact zeros),
# so np.sign(data) * 2 supplies -2 or 2 wherever the mask is True
data[data.abs()>2] = np.sign(data) * 2

### Permutation and Random Sampling
Permuting (randomly reordering) a Series or the rows in a DataFrame


In [None]:
# 5 x 7 frame holding the integers 0..34 in row-major order
data = pd.DataFrame(np.arange(35).reshape(5, 7))

In [None]:
# Draw a random ordering of the five row positions
sampler = np.random.permutation(5)

# Apply the random order to the rows of the DataFrame
data.take(sampler)
# Same as
data.iloc[sampler]

# The same idea for columns: a random ordering of the seven column positions
column_sampler = np.random.permutation(7)

data.take(column_sampler, axis="columns")

# To select a random subset of rows without replacement
data.sample(n=3)

# To generate a sample with replacement (allowing repeated choices),
# which is what lets n exceed the length of the Series
choices = pd.Series([5, 7, -1, 6, 4])
choices.sample(n=10, replace=True)

### Computing Indicator / Dummy Variables
Convert a categorical variable into a dummy or indicator matrix.

In [47]:
df = pd.DataFrame({"key": list("bbacab"), "data1": range(6)})

# One indicator column per distinct key, named key_<value>
dummies = pd.get_dummies(df["key"], prefix="key")

# Attach the indicator columns to the remaining data column
df_with_dymmy = df[["data1"]].join(dummies)

In [49]:
# Column names to apply, since header=None tells pandas the file has no header row
mnames = ["movie_id", "title", "genres"]

# Fields are separated by "::"; a multi-character separator requires the Python parsing engine
movies = pd.read_table(
    "./datasets/movielens/movies.dat",
    names=mnames,
    header=None,
    sep="::",
    engine="python",
)


In [50]:
# Get dummies if a row in a DataFrame belongs to multiple categories
dummies = movies["genres"].str.get_dummies("|")

In [52]:
# Prefix every indicator column with "Genre_" and attach the result to the movies frame
movies_windic = movies.join(other=dummies.add_prefix("Genre_"))

In [54]:
# Inspect the row at position 0 of the joined frame
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Action                                   0
Genre_Adventure                                0
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Crime                                    0
Genre_Documentary                              0
Genre_Drama                                    0
Genre_Fantasy                                  0
Genre_Film-Noir                                0
Genre_Horror                                   0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Romance                                  0
Genre_Sci-Fi                                   0
Genre_Thriller                                 0
Genre_War                                      0
Genre_Western                                  0

In [56]:
# Combine pandas.get_dummies with a discretization function such as pandas.cut
np.random.seed(12345)  # fixed seed so the draws below are reproducible

values = np.random.uniform(size=10)

values

# Edges of five equal-width bins
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

# Returns a DataFrame with one indicator column per bin and one row per value
pd.get_dummies(data=pd.cut(x=values, bins=bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


dummies

In [57]:
# Show which interval each value falls into
pd.cut(x=values, bins=bins)

[(0.8, 1.0], (0.2, 0.4], (0.0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.4, 0.6], (0.8, 1.0], (0.6, 0.8], (0.6, 0.8], (0.6, 0.8]]
Categories (5, interval[float64, right]): [(0.0, 0.2] < (0.2, 0.4] < (0.4, 0.6] < (0.6, 0.8] < (0.8, 1.0]]

In [58]:

# Display the raw values array
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])