# Data Cleaning and Preparation

Data preparation: loading, cleaning, transforming, and rearranging data.

## 7.1 Handling Missing Data


In [1]:
import pandas as pd
import numpy as np

In [None]:
# A string Series holding two kinds of missing markers: np.nan and None
string_data = pd.Series(["aardvark", np.nan, None, "avocado"])

# Python's built-in None value is also treated as NA
string_data.isna()

In [None]:
# An integer Series with no missing entries
num_data = pd.Series([0, 1, 3, 0])

# Check each element for NA; the zeros are ordinary values, not missing data
num_data.isnull()

In [None]:
# Filtering out missing data

data = pd.Series([1, np.nan, 3.5, np.nan, 7])

# Notes on the dropna options (they matter mostly for DataFrames):
# - by default dropna drops any row containing a missing value
# - how="all" drops only rows in which every value is NA
# - axis="columns" drops columns containing NA values instead of rows
# - thresh=n keeps only rows that have at least n non-NA values
# This first result is not displayed because it is not the last expression
data.dropna(how="all")
# Same as filtering with a boolean mask
data[data.notna()]

In [None]:
# Filling in missing data
# The filled result is only displayed here; it is not assigned back to `data`
data.fillna(0)

In [None]:
# Display `data` again to inspect it after the fillna call in the previous cell
data

In [None]:
# 7x3 frame of standard-normal draws (no seed is set in this cell)
df = pd.DataFrame(np.random.standard_normal((7, 3)))

In [None]:
# Introduce gaps: column 1 is NaN from row 2 onward, column 2 from row 4 onward
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan

In [None]:
# A dict passed to fillna gives a different fill value for each column, e.g.:
# df.fillna({1: 0.5, 2: 0})

# Forward fill copies the last valid value downward, so it can only fill a gap
# that has a non-NA value somewhere above it (leading NAs stay NA).
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
df.ffill()

## 7.2 Data Transformation

### Removing duplicates

In [None]:
# Small frame with repeated (k1, k2) pairs for the duplicate-handling examples
data = pd.DataFrame(
    {
        "k1": ["one", "two", "one", "two", "one", "two", "two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)

In [None]:
# Boolean Series indicating whether each row duplicates an earlier row
# (evaluated but not displayed, since it is not the last expression)
data.duplicated()

# drop_duplicates returns a DataFrame that keeps only the rows not flagged as duplicates
# subset restricts the duplicate check to the passed-in columns
# keep="last" retains the last occurrence of each duplicate instead of the first (the default)
data.drop_duplicates(subset=["k1", "k2"], keep="last")

In [None]:
# Transforming Data Using a Function or Mapping
# The food names must match the keys of meat_to_animal exactly; a misspelled
# name has no matching key and would map to NaN.
data = pd.DataFrame({"food": ["bacon", "pulled pork"], "ounces": [3, 4]})

meat_to_animal = {"bacon": "pig", "pulled pork": "pig"}

# Series.map looks each value of the Series up in the passed-in dictionary
# and returns the corresponding dictionary value
data["animal"] = data["food"].map(meat_to_animal)

In [None]:
# Replacing Values
data = pd.Series([1, -999, 2, -999, -1000, 3])

# Replace a single sentinel value
data.replace(-999, np.nan)

# Replace multiple values at once
data.replace([-999, -1000], np.nan)

# Use a different replacement for each value
data.replace([-999, -1000], [np.nan, 0])
# or pass in a single {old: new} dictionary (doubled braces would build a set
# containing a dict, which raises TypeError because a dict is unhashable)
data.replace({-999: np.nan, -1000: 0})

In [None]:
# Renaming Axis Indexes
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=["Ohio", "Colorado", "New York"],
    columns=["one", "two", "three", "four"],
)


def transform(x):
    """Return the first four characters of x, upper-cased."""
    prefix = x[:4]
    return prefix.upper()


# Overwrite the row labels in place with their transformed versions
data.index = data.index.map(transform)

# rename builds a relabelled version of the dataset without modifying the original
data.rename(index=str.title, columns=str.upper)

# A dictionary-like object relabels only the listed index / column labels
data.rename(index={"OHIO": "INDIANA"}, columns={"three": "peekabo"})

### Discretization and Binning

Continuous data is often discretized or binned for analysis.


In [None]:
# Bin edges, and the ages to be grouped by them
bins = [18, 25, 35, 60, 100]
ages = [20, 22, 25, 27, 21, 23, 27, 37, 31, 61, 45, 41, 32]

# pd.cut assigns every age to one of the intervals delimited by `bins`
age_categories = pd.cut(x=ages, bins=bins)
# codes: for each age, the position of its bin within `categories`
age_categories.codes

# categories: the intervals that make up the bins
age_categories.categories

In [None]:
# A parenthesis means that side of the interval is open (exclusive)
# A square bracket means that side is closed (inclusive)

# right=False closes the left edge of each bin instead of the right one
pd.cut(ages, bins, right=False)

# Name the bins instead of using the default interval-based labels
# (label spelling fixed: "MiddleAged", not "MiddleAgend")
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]

pd.cut(ages, bins, labels=group_names)

In [None]:
# 20 samples drawn with np.random.uniform
data = np.random.uniform(size=20)

# With an integer number of bins, pandas computes equal-length bins from the
# minimum and maximum values of the data
# precision=2 limits the decimal precision to two digits
pd.cut(data, bins=4, precision=2)

# pd.qcut bins the data based on sample quantiles instead
pd.qcut(data, q=4, precision=2)


### Detecting and Filtering Outliers

In [None]:
# 1000 rows x 4 columns of standard-normal samples
data = pd.DataFrame(np.random.standard_normal(size=(1000, 4)))

# Summary statistics for each column
data.describe()

In [None]:
col = data[2]

# Values in this one column whose absolute value exceeds 2.5
col.loc[col.abs() > 2.5]

# Rows that contain at least one value above 3 or below -3
# (.any reduces the boolean DataFrame across the columns of each row)
exceeds_three = (data.abs() > 3).any(axis="columns")
data.loc[exceeds_three]

# Cap values lying outside the interval [-2, 2]
# np.sign() yields -1 or 1 according to the sign of each value
# Note: this assignment modifies `data` in place
data[data.abs() > 2] = 2 * np.sign(data)

### Permutation and Random Sampling
Permuting (randomly reordering) a Series or the rows in a DataFrame


In [None]:
# 5x7 frame holding the integers 0..34 in row-major order
data = pd.DataFrame(np.arange(35).reshape(5, 7))

In [None]:
# Random permutation of the five row positions
sampler = np.random.permutation(5)

# Apply the random order to the rows of the DataFrame
data.take(sampler)
# Same as
data.iloc[sampler]

# The same approach for columns: permute the seven column positions
column_sampler = np.random.permutation(7)

data.take(column_sampler, axis="columns")

# To select a random subset without replacement
data.sample(n=3)

# To generate a sample with replacement (to allow repeated choices)
choices = pd.Series([5, 7, -1, 6, 4])
choices.sample(n=10, replace=True)

### Computing Indicator / Dummy Variables
Convert a categorical variable into a dummy or indicator matrix.

In [None]:
df = pd.DataFrame({"key": list("bbacab"), "data1": range(6)})

# One indicator column per distinct key, each named with a "key_" prefix
dummies = pd.get_dummies(df["key"], prefix="key")

# Attach the indicator columns to the remaining data column
data1_only = df[["data1"]]
df_with_dymmy = data1_only.join(dummies)

In [None]:
# Column names to assign, since the file is read with header=None
mnames = ["movie_id", "title", "genres"]

# Fields are separated by "::"; pandas documents that separators longer than
# one character need the Python parsing engine, hence engine="python"
# NOTE(review): the relative path assumes the notebook is run from the
# directory that contains ./datasets — confirm
movies = pd.read_table(
    "./datasets/movielens/movies.dat",
    sep="::",
    header=None,
    names=mnames,
    engine="python",
)


In [None]:
# Get dummies when a row in a DataFrame belongs to multiple categories:
# str.get_dummies splits each genres string on "|" and builds the indicator columns
dummies = movies["genres"].str.get_dummies("|")

In [None]:
# Prefix the indicator columns with "Genre_" and join them onto the movies frame
movies_windic = movies.join(dummies.add_prefix('Genre_'))

In [None]:
# Inspect the first row of the combined frame
movies_windic.iloc[0]

In [None]:
# Combine pandas.get_dummies with a discretization function such as pandas.cut
np.random.seed(12345)  # fixed seed so the sample is reproducible

values = np.random.uniform(size=10)

bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

# Returns a DataFrame with one indicator column per bin and one row per value.
# This is kept as the last expression so the cell displays this result rather
# than the unrelated `dummies` frame left over from the movies example.
pd.get_dummies(pd.cut(values, bins))

In [None]:
# The bin each value falls into, before conversion to indicator columns
pd.cut(values, bins)

In [None]:

# The raw uniform samples, for comparison with their bins
values

## 7.3 Extension Data Types
pandas has an extension type system that allows new data types to be added even if they are not supported natively by NumPy.



In [14]:
# By default, the Series is converted to float64, with np.nan for the missing value
s = pd.Series([1, 2, 3, None])

# Create a Series with the pandas Int64Dtype extension type
s = pd.Series([1, 2, 3, None], dtype=pd.Int64Dtype())
s

# The missing value is represented by the pandas.NA sentinel value
s[3] is pd.NA

# Shorthand for "pd.Int64Dtype()"
s = pd.Series([1, 2, 3, None], dtype="Int64")

# StringDtype is the extension type for string data
# NOTE(review): said to use less memory and be more efficient than object-dtype strings — confirm for your data
s = pd.Series(["one", "two", None, "three"], dtype=pd.StringDtype())


In [18]:
# Extension types can also be requested through astype

df = pd.DataFrame(
    {
        "A": [1, 2, None, 4],
        "B": ["one", "two", "three", None],
        "C": [False, None, False, True],
    }
)

# Convert every column to its nullable extension dtype in a single call
df = df.astype({"A": "Int64", "B": pd.StringDtype(), "C": "boolean"})

In [19]:

# Display the converted frame
df

Unnamed: 0,A,B,C
0,1.0,one,False
1,2.0,two,
2,,three,False
3,4.0,,True


## 7.4 String Manipulation

### Python Built-In String Object Methods



In [21]:
val = "a,b, guido"

# Splitting on the comma leaves the surrounding whitespace in each piece
val.split(",")

# Trim the whitespace from every piece
pieces = list(map(str.strip, val.split(",")))

In [34]:
first, second, third = pieces

# Manual concatenation with a "::" delimiter ...
f"{first}::{second}::{third}"

# ... and the same result by joining the pieces
"::".join(pieces)

# Substring membership test
"guido" in val

# Difference between find() and index():
# find() doesn't raise an error when the substring is absent
val.find("asdf")

# index() will raise an error if the substring does not exist
val.index(",")

# count() returns the number of occurrences of a particular substring
val.count(",")

# replace() substitutes occurrences of one pattern with another
val.replace(",", "::")

# It can also be used to delete patterns by passing an empty string
val.replace(",", "")



'ab guido'

### Regular Expressions

The `re` module functions fall into three categories: pattern matching, substitution, and splitting.


In [35]:
import re

text = "foo   bar\t baz \tqux"

# The regex is first compiled, then the text is split on runs of whitespace characters
re.split(r"\s+", text)

['foo', 'bar', 'baz', 'qux']

In [36]:
# Same process as above, but compiling the regex first gives a reusable pattern object
regex = re.compile(r"\s+")

regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [37]:
# List every substring that matches the pattern (here, the whitespace runs)
regex.findall(text)

['   ', '\t ', ' \t']

Creating a regex object with `re.compile` is highly recommended if you intend to apply the same expression to many strings.

`match` and `search` are closely related to `findall`.

`findall` returns all matches in a string, while `search` returns only the first match.

`match` only matches at the beginning of the string.

In [55]:
text = """ 
Dave dave@google.com
Steve steve@gmail.com
Ryan ryan@yahoo.com
"""

# Email-like pattern: local part, "@", domain, ".", then a 2-4 letter suffix
pattern = r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}"

# re.IGNORECASE makes the regex case insensitive
regex = re.compile(pattern, flags= re.IGNORECASE)

# Every address found in text
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'ryan@yahoo.com']

In [43]:
# search returns a match object for the first match only
m = regex.search(text)
m

<re.Match object; span=(7, 22), match='dave@google.com'>

In [45]:
# match only succeeds at the very beginning of the string; text does not
# start with an address, so this prints None
print(regex.match(text))

# sub returns a new string with occurrences of the pattern replaced by a new string.
regex.sub('Redacted', text)

None


' \nDave Redacted\nSteve Redacted\nRyan Redacted\n'

In [57]:
# Parentheses split the pattern into three groups: the part before "@",
# the domain name, and the domain suffix
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"

regex = re.compile(pattern, flags=re.IGNORECASE)

# match() only succeeds at the very start of the string, and the multi-line
# `text` starts with whitespace, so matching it would always yield None.
# Match a bare address instead, without rebinding `text` (later cells use it).
m = regex.match("wesm@bright.net")

# groups() returns the captured segments as a tuple
m.groups()


In [58]:
# With groups in the pattern, findall returns one tuple of groups per match
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

`sub` also has access to the groups in each match through special symbols such as `\1` and `\2`.

### String functions in pandas
Clean up a messy dataset for analysis 


In [61]:
# Name -> email address, with a missing entry for Wes
data = pd.Series(
    {"Dave": "dave@google.com", "Steve": "steve@gmail.com", "Wes": np.nan}
)

In [62]:
# Flag the entries that have no address
data.isna()

Dave     False
Steve    False
Wes       True
dtype: bool

In [67]:

# Element-wise substring test through the .str accessor
data.str.contains('gmail')


# Convert to the pandas "string" extension dtype
data_as_string = data.astype("string")

# Apply the regular expression to each string value
data_as_string.str.findall(pattern, flags=re.IGNORECASE)

# str.extract() returns a DataFrame with one column per captured group
data_as_string.str.extract(pattern, flags=re.IGNORECASE)

Unnamed: 0,0,1,2
Dave,dave,google,com
Steve,steve,gmail,com
Wes,,,


In [65]:
# Display the string-dtype Series; the missing entry is shown as <NA>
data_as_string

Dave     dave@google.com
Steve    steve@gmail.com
Wes                 <NA>
dtype: string