- Note: This notebook follows Chapter 7 of *Python for Data Analysis: Data Wrangling with Pandas, NumPy, and IPython* (3rd ed., Wes McKinney, 2023).

# 0. Preparation

In [7]:
# Import modules
import pandas as pd
import numpy as np

# 1. Remove Duplicates

- Methods
  - `duplicated()`: Return a Boolean Series -> True for duplicated values
  - `drop_duplicates()`: Drop duplicated rows
      - `drop_duplicates(subset=["COLUMN_NAME"])`: Detect duplicates based only on "COLUMN_NAME"
      - `drop_duplicates(keep="last")`: Keep the last occurrence of each duplicate (the default keeps the first)

In [8]:
# Build a small frame whose last two rows are identical, so there is one duplicate to find
data = pd.DataFrame(
    {
        "Column1": ["Apple", "Banana"] * 3 + ["Banana"],
        "Column2": [1, 1, 2, 3, 3, 4, 4],
    }
)

data

Unnamed: 0,Column1,Column2
0,Apple,1
1,Banana,1
2,Apple,2
3,Banana,3
4,Apple,3
5,Banana,4
6,Banana,4


In [9]:
# Flag every row that repeats an earlier row (the first occurrence stays False)
data.duplicated(keep="first")

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [10]:
# Remove rows that repeat an earlier row, keeping the first occurrence
data.drop_duplicates(keep="first")

Unnamed: 0,Column1,Column2
0,Apple,1
1,Banana,1
2,Apple,2
3,Banana,3
4,Apple,3
5,Banana,4


In [11]:
# Add a running-number column for the next step.
# The length is taken from the frame instead of being hard-coded, so the cell
# keeps working if rows are added to or removed from `data` above.
data["Column3"] = range(len(data))

data

Unnamed: 0,Column1,Column2,Column3
0,Apple,1,0
1,Banana,1,1
2,Apple,2,2
3,Banana,3,3
4,Apple,3,4
5,Banana,4,5
6,Banana,4,6


In [12]:
# Use `drop_duplicates(subset=["COLUMN"])` to compare rows on that column only;
# the first row seen for each value is kept
data.drop_duplicates(subset=["Column1"], keep="first")


Unnamed: 0,Column1,Column2,Column3
0,Apple,1,0
1,Banana,1,1


In [13]:
# Use `drop_duplicates(keep="last")` to keep the LAST occurrence of each
# duplicated (Column1, Column2) pair and drop the earlier ones
data.drop_duplicates(subset=["Column1", "Column2"], keep="last")

Unnamed: 0,Column1,Column2,Column3
0,Apple,1,0
1,Banana,1,1
2,Apple,2,2
3,Banana,3,3
4,Apple,3,4
6,Banana,4,6


# 2. Transform Data using Functions or Mapping

- Use `map()` method
  - Transform data based on values in arrays, Series, or columns in a DataFrame

In [14]:
# Create two identical but independent DataFrames:
# `data1` is used for the dictionary-based mapping, `data2` for the function-based one
data1 = pd.DataFrame(
    {
        "Fruits": ["Apple", "Banana", "Mango", "Pineapple", "Guava", "Grape", "Orange", "Melon"],
        "Weights": [3, 5, 24, 4, 6.2, 31, 6, 1.8],
    }
)

# Deep copy, so adding a column to one frame never touches the other
data2 = data1.copy()

data1

Unnamed: 0,Fruits,Weights
0,Apple,3.0
1,Banana,5.0
2,Mango,24.0
3,Pineapple,4.0
4,Guava,6.2
5,Grape,31.0
6,Orange,6.0
7,Melon,1.8


In [15]:
# Lookup table (fruit name -> color) used below to derive a "Colors" column
colors_of_fruits = {
    "Apple": "Red",
    "Banana": "Yellow",
    "Mango": "Golden Yellow",
    "Pineapple": "Brown",
    "Guava": "Green",
    "Grape": "Purple",
    "Orange": "Orange",
    "Melon": "Green",
}

## 2.1. Map

In [16]:
# Use the `map` method with a dictionary: each value in "Fruits" is looked up
# in `colors_of_fruits` and the result is stored in a new "Colors" column
data1["Colors"] = data1["Fruits"].map(colors_of_fruits)

data1

Unnamed: 0,Fruits,Weights,Colors
0,Apple,3.0,Red
1,Banana,5.0,Yellow
2,Mango,24.0,Golden Yellow
3,Pineapple,4.0,Brown
4,Guava,6.2,Green
5,Grape,31.0,Purple
6,Orange,6.0,Orange
7,Melon,1.8,Green


## 2.2. Function

In [17]:
# Use 'map' method to transform values

# Define a lookup function that can be passed to `map`
def get_colors(x):
    """Return the color stored for fruit name ``x`` in ``colors_of_fruits``.

    Raises ``KeyError`` when ``x`` is not a key of the dictionary.
    """
    color = colors_of_fruits[x]
    return color

In [18]:
# Use the `map` method with a function: `get_colors` is called once per value in "Fruits"
data2["Colors"] = data2["Fruits"].map(get_colors)

data2

Unnamed: 0,Fruits,Weights,Colors
0,Apple,3.0,Red
1,Banana,5.0,Yellow
2,Mango,24.0,Golden Yellow
3,Pineapple,4.0,Brown
4,Guava,6.2,Green
5,Grape,31.0,Purple
6,Orange,6.0,Orange
7,Melon,1.8,Green


# 3. Replace Values

- Use `replace()` method

In [19]:
# Create a Series in which -999 and -1000 act as sentinel values
data = pd.Series([1.0, -999.0, 2.0, -999.0, 3.0, -1000.0])

data

0       1.0
1    -999.0
2       2.0
3    -999.0
4       3.0
5   -1000.0
dtype: float64

In [20]:
# Replace one sentinel value with NaN (returns a new Series; `data` is unchanged)
data1 = data.replace(to_replace=-999, value=np.nan)

data1

0       1.0
1       NaN
2       2.0
3       NaN
4       3.0
5   -1000.0
dtype: float64

In [21]:
# Replace several values with the same new value
data2 = data.replace(to_replace=[-999, -1000], value=np.nan)

data2

0    1.0
1    NaN
2    2.0
3    NaN
4    3.0
5    NaN
dtype: float64

In [22]:
# Replace several values, each with its own new value: two parallel lists
data3 = data.replace(to_replace=[-999, -1000], value=[np.nan, 0.0])

data3

0    1.0
1    NaN
2    2.0
3    NaN
4    3.0
5    0.0
dtype: float64

In [23]:
# Replace several values, each with its own new value: a {old: new} dictionary
data4 = data.replace(to_replace={-999: np.nan, -1000: 0.0})

data4

0    1.0
1    NaN
2    2.0
3    NaN
4    3.0
5    0.0
dtype: float64

# 4. Rename Axis Indexes

In [24]:
# Create a 4 x 6 DataFrame with subjects as row labels and years as column labels
data = pd.DataFrame(
    np.arange(24).reshape(4, 6),
    index=["Math", "Music", "English", "Science"],
    columns=["Year1", "Year2", "Year3", "Year4", "Year5", "Year6"],
)

data

Unnamed: 0,Year1,Year2,Year3,Year4,Year5,Year6
Math,0,1,2,3,4,5
Music,6,7,8,9,10,11
English,12,13,14,15,16,17
Science,18,19,20,21,22,23


In [25]:
# Upper-case the index labels (cut to at most five characters) with `str.upper()` and `Index.map()`

def transform(x):
    """Return the first five characters of ``x`` converted to upper case."""
    head = x[:5]
    return head.upper()

# `Index.map` returns a new Index; the labels of `data` itself are not modified
data2 = data.index.map(transform)
data2

Index(['MATH', 'MUSIC', 'ENGLI', 'SCIEN'], dtype='object')

In [26]:
# Use the `rename` method with functions: it returns a renamed copy and does not modify the original data

data3 = data.rename(index=str.title, columns=str.upper)

data3

Unnamed: 0,YEAR1,YEAR2,YEAR3,YEAR4,YEAR5,YEAR6
Math,0,1,2,3,4,5
Music,6,7,8,9,10,11
English,12,13,14,15,16,17
Science,18,19,20,21,22,23


In [27]:
# Use the `rename` method with dictionaries to modify only a subset of the axis labels
# NOTE(review): "Mathmatics" looks like a misspelling of "Mathematics"; the string is left as-is because it is the label shown in the output
data4 = data.rename(index = {"Math": "Mathmatics"},
                   columns = {"Year6": "Year Six"})

data4

Unnamed: 0,Year1,Year2,Year3,Year4,Year5,Year Six
Mathmatics,0,1,2,3,4,5
Music,6,7,8,9,10,11
English,12,13,14,15,16,17
Science,18,19,20,21,22,23


# 5. Discretization and Binning

- `pd.cut()`
  - Discretize continuous data
  - Separate continuous data into 'bins'

- `pd.value_counts()`   
  : Count the number of values that fall in each bin (interval)

## 5.1. Define the bins manually

In [37]:
# Create a list
studentID = [ 1, 4, 8, 12, 25, 28, 30, 39, 40, 48, 53, 59, 62, 64, 87, 89, 90, 92, 98 ]

studentID

[1, 4, 8, 12, 25, 28, 30, 39, 40, 48, 53, 59, 62, 64, 87, 89, 90, 92, 98]

In [44]:
# Separate numbers into 'bins'

# Define the bin edges
bins = [1, 20, 40, 60, 80, 100]

# Divide the list into five right-closed intervals:
# (1, 20], (20, 40], (40, 60], (60, 80], (80, 100]
# The left edge of the first bin is open, so the value 1 itself falls outside
# every bin and is reported as NaN.
# (The original cell passed an undefined name `items`; the data to cut is `studentID`.)
studentID_categories = pd.cut(studentID, bins)

studentID_categories

[NaN, (1.0, 20.0], (1.0, 20.0], (1.0, 20.0], (20.0, 40.0], ..., (80, 100], (80, 100], (80, 100], (80, 100], (80, 100]]
Length: 19
Categories (5, interval[int64, right]): [(1, 20] < (20, 40] < (40, 60] < (60, 80] < (80, 100]]

In [45]:
# Integer code of the bin each value was assigned to (0 = first bin, ...);
# -1 marks a value that fell outside every bin (shown as NaN above)
studentID_categories.codes

array([-1,  0,  0,  0,  1,  1,  1,  1,  1,  2,  2,  2,  3,  3,  4,  4,  4,
        4,  4], dtype=int8)

In [46]:
# List the interval that defines each bin
studentID_categories.categories

IntervalIndex([(1, 20], (20, 40], (40, 60], (60, 80], (80, 100]], dtype='interval[int64, right]')

In [47]:
# Look at the first interval only
studentID_categories.categories[0]

Interval(1, 20, closed='right')

In [48]:
# Count how many values fall in each bin (interval -> number of values).
# The top-level `pd.value_counts()` helper is deprecated in recent pandas releases,
# so wrap the Categorical in a Series and call its `value_counts()` method instead.
pd.Series(studentID_categories).value_counts()

(20, 40]     5
(80, 100]    5
(1, 20]      3
(40, 60]     3
(60, 80]     2
dtype: int64

In [50]:
# Replace the default interval names with readable labels

# One label per bin, in bin order
classes = ["Beginner", "Pre-Intermediate", "Intermediate", "Pre-Advanced", "Advanced"]

# Cut again, attaching the labels to the bins
pd.cut(studentID, bins=bins, labels=classes)

[NaN, 'Beginner', 'Beginner', 'Beginner', 'Pre-Intermediate', ..., 'Advanced', 'Advanced', 'Advanced', 'Advanced', 'Advanced']
Length: 19
Categories (5, object): ['Beginner' < 'Pre-Intermediate' < 'Intermediate' < 'Pre-Advanced' < 'Advanced']

## 5.2. Compute equal-length bins automatically

In [52]:
# Use `np.random.uniform()`

# Draw 20 random numbers from the half-open interval [0, 1)
# NOTE(review): no random seed is set, so the values differ on every run
data = np.random.uniform(low=0.0, high=1.0, size=20)

data

array([0.94068557, 0.93680735, 0.60040325, 0.16738623, 0.22321768,
       0.95583533, 0.30509643, 0.49743881, 0.37242209, 0.0944555 ,
       0.44323451, 0.22533939, 0.32650323, 0.5182671 , 0.79727504,
       0.95946822, 0.89629643, 0.67685534, 0.1792399 , 0.30689475])

In [54]:
# `pd.cut(DATA, NUMBER_OF_BINS)`: split the value range into that many equal-width bins
# `precision=2`: limit the bin edges shown in the labels to two decimal digits
pd.cut(data, bins=4, precision=2)

[(0.74, 0.96], (0.74, 0.96], (0.53, 0.74], (0.094, 0.31], (0.094, 0.31], ..., (0.74, 0.96], (0.74, 0.96], (0.53, 0.74], (0.094, 0.31], (0.094, 0.31]]
Length: 20
Categories (4, interval[float64, right]): [(0.094, 0.31] < (0.31, 0.53] < (0.53, 0.74] < (0.74, 0.96]]

In [58]:
# Prepare data for `pd.qcut()` (quantile-based binning)

# Draw 100 random numbers from the standard normal distribution
# NOTE(review): no random seed is set, so the values differ on every run
data = np.random.standard_normal(size=100)

data

array([ 0.05388448, -1.58539918,  0.93198417, -0.41546973,  1.86648873,
       -0.53753617,  0.34399919, -0.89997706, -1.05426536,  1.27296306,
       -0.50288341, -0.60052076, -1.25402234,  2.59284806, -0.52516398,
       -0.15472177, -2.36579097, -1.04831983, -1.69890844,  1.12991539,
        0.01905861,  0.34628338,  1.38048476,  0.83721606,  2.69503601,
       -0.00853118, -0.69211777, -0.20892829,  1.36107927,  0.24914032,
        0.25140474, -0.82988386, -0.49006039, -0.31625743,  0.71465875,
       -0.46068381, -1.14094416,  0.03170804, -0.07115081,  1.69713118,
       -2.35980591,  1.00149632, -1.38869168,  0.35137092,  0.5265581 ,
       -0.39871527,  0.48552713,  2.5490247 , -1.11252866, -0.45089502,
        0.57813286, -0.08907831,  1.54521576, -0.72023692, -1.87150139,
       -0.89590164, -0.09362826,  0.7519406 , -0.32684636, -0.05630606,
       -0.48696323,  0.48686714, -1.28441471,  0.11183341, -2.14885838,
        0.42251969,  0.65284983,  0.37740285, -0.83718075, -1.56

In [59]:
# Divide the values into four quantile-based groups holding (roughly) the same number of values
quartiles = pd.qcut(data, q=4, precision=2)

quartiles

[(-0.23, 0.47], (-2.3899999999999997, -0.91], (0.47, 2.7], (-0.91, -0.23], (0.47, 2.7], ..., (-0.23, 0.47], (-0.23, 0.47], (-2.3899999999999997, -0.91], (-0.91, -0.23], (-2.3899999999999997, -0.91]]
Length: 100
Categories (4, interval[float64, right]): [(-2.3899999999999997, -0.91] < (-0.91, -0.23] < (-0.23, 0.47] < (0.47, 2.7]]

In [60]:
# Count the number of values in each quartile.
# The top-level `pd.value_counts()` helper is deprecated in recent pandas releases,
# so wrap the Categorical in a Series and call its `value_counts()` method instead.
pd.Series(quartiles).value_counts()

(-2.3899999999999997, -0.91]    25
(-0.91, -0.23]                  25
(-0.23, 0.47]                   25
(0.47, 2.7]                     25
dtype: int64

# 6. Detect and Filter Outliers

In [67]:
# Create a 500 x 4 DataFrame of standard-normal random numbers and summarize it
# NOTE(review): no random seed is set, so the statistics differ on every run
data = pd.DataFrame(np.random.standard_normal(size=(500, 4)))

data.describe()

Unnamed: 0,0,1,2,3
count,500.0,500.0,500.0,500.0
mean,0.040843,0.076733,0.086647,0.041168
std,1.052834,1.019603,1.010063,1.034483
min,-3.399111,-3.132194,-3.286591,-3.814614
25%,-0.678805,-0.565901,-0.633233,-0.688399
50%,-0.00535,0.026022,0.103408,-0.009361
75%,0.79011,0.761963,0.809247,0.797428
max,3.126225,3.580144,3.068059,3.230067


In [70]:
# Find the values in one specific column whose absolute value exceeds 3

column = data[0]

column.loc[column.abs() > 3]

127   -3.024364
150    3.126225
379   -3.399111
Name: 0, dtype: float64

In [69]:
# Find the rows in which at least one column holds a value whose absolute value exceeds 3
data.loc[(data.abs() > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
127,-3.024364,1.270351,-2.516453,1.101539
149,-0.533766,3.580144,1.234423,-0.268789
150,3.126225,-0.449722,1.952979,-0.246195
158,0.276649,-0.294758,2.14089,-3.814614
214,0.520504,-3.132194,0.332053,0.588424
230,-1.596789,0.918525,-3.286591,-1.876586
274,-1.355351,0.779738,0.995252,3.230067
279,0.522126,0.485177,3.068059,-0.99342
284,2.36528,-0.396193,-2.410846,3.142005
379,-3.399111,-2.17576,-1.427802,0.61304


# 7. Permutation and Random Sampling

In [72]:
# 7.2.7 (p303)