- Note: This notebook follows Chapter 7 of *Python for Data Analysis: Data Wrangling with Pandas, NumPy, and IPython* (3rd ed., 2023) by Wes McKinney.

# 0. Preparation

In [32]:
# Import modules
import pandas as pd
import numpy as np

- Methods
  - 'duplicated': Return a Boolean Series that is True for each row that duplicates an earlier row
  - 'drop_duplicates': Drop duplicated rows, keeping the first occurrence by default
      - 'drop_duplicates(subset=["COLUMN_NAME"])': Detect duplicates based on "COLUMN_NAME" only
      - 'drop_duplicates(keep="last")': Keep the last occurrence of each duplicate instead of the first

# 1. Remove Duplicates

In [5]:
# Build a small DataFrame whose last row repeats the row before it
data = pd.DataFrame(
    {
        "Column1": ["Apple", "Banana"] * 3 + ["Banana"],
        "Column2": [1, 1, 2, 3, 3, 4, 4],
    }
)

data

Unnamed: 0,Column1,Column2
0,Apple,1
1,Banana,1
2,Apple,2
3,Banana,3
4,Apple,3
5,Banana,4
6,Banana,4


In [6]:
# Use the 'duplicated' method to flag rows that repeat an earlier row
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [7]:
# Drop duplicated rows (the first occurrence of each row is kept)
data.drop_duplicates()

Unnamed: 0,Column1,Column2
0,Apple,1
1,Banana,1
2,Apple,2
3,Banana,3
4,Apple,3
5,Banana,4


In [8]:
# Add a running-counter column for the next step.
# The length comes from the frame itself, so this cell keeps working
# if the number of rows in 'data' changes.
data["Column3"] = range(len(data))

data

Unnamed: 0,Column1,Column2,Column3
0,Apple,1,0
1,Banana,1,1
2,Apple,2,2
3,Banana,3,3
4,Apple,3,4
5,Banana,4,5
6,Banana,4,6


In [9]:
# Use 'drop_duplicates(subset=["COLUMN"])' to drop rows whose value in that column has already appeared
data.drop_duplicates(subset=["Column1"])


Unnamed: 0,Column1,Column2,Column3
0,Apple,1,0
1,Banana,1,1


In [10]:
# Use 'drop_duplicates(..., keep="last")' to keep the last occurrence of each duplicate
# instead of the first (duplicates are judged on "Column1" and "Column2" together)
data.drop_duplicates(subset=["Column1", "Column2"], keep="last")

Unnamed: 0,Column1,Column2,Column3
0,Apple,1,0
1,Banana,1,1
2,Apple,2,2
3,Banana,3,3
4,Apple,3,4
6,Banana,4,6


# 3. Transform Data using Functions or Mapping

- Use 'map' method
  - Transform data based on values in arrays, Series, or columns in a DataFrame

In [21]:
# Create a DataFrame of fruit names and their weights
data1 = pd.DataFrame(
    {
        "Fruits": ["Apple", "Banana", "Mango", "Pineapple", "Guava", "Grape", "Orange", "Melon"],
        "Weights": [3, 5, 24, 4, 6.2, 31, 6, 1.8],
    }
)

# Independent copy with the same contents: the two frames can each receive
# their own "Colors" column without affecting one another
data2 = data1.copy()

data1

Unnamed: 0,Fruits,Weights
0,Apple,3.0
1,Banana,5.0
2,Mango,24.0
3,Pineapple,4.0
4,Guava,6.2
5,Grape,31.0
6,Orange,6.0
7,Melon,1.8


In [17]:
# Lookup table: fruit name -> color
colors_of_fruits = {
    "Apple": "Red",
    "Banana": "Yellow",
    "Mango": "Golden Yellow",
    "Pineapple": "Brown",
    "Guava": "Green",
    "Grape": "Purple",
    "Orange": "Orange",
    "Melon": "Green",
}

## 3.1. Map

In [22]:
# Use the 'map' method with a dict: each fruit name is looked up in 'colors_of_fruits'
data1["Colors"] = data1["Fruits"].map(colors_of_fruits)

data1

Unnamed: 0,Fruits,Weights,Colors
0,Apple,3.0,Red
1,Banana,5.0,Yellow
2,Mango,24.0,Golden Yellow
3,Pineapple,4.0,Brown
4,Guava,6.2,Green
5,Grape,31.0,Purple
6,Orange,6.0,Orange
7,Melon,1.8,Green


## 3.2. Function

In [24]:
# Use the 'map' method with a function instead of a dict

# Define the function that 'map' will apply to each value
def get_colors(x):
    """Return the color stored for fruit name ``x`` in ``colors_of_fruits``.

    Raises:
        KeyError: if ``x`` is not a key of ``colors_of_fruits``.
    """
    color = colors_of_fruits[x]
    return color

In [26]:
# Use the 'map' method with a function: 'get_colors' is applied to each fruit name
data2["Colors"] = data2["Fruits"].map(get_colors)

data2

Unnamed: 0,Fruits,Weights,Colors
0,Apple,3.0,Red
1,Banana,5.0,Yellow
2,Mango,24.0,Golden Yellow
3,Pineapple,4.0,Brown
4,Guava,6.2,Green
5,Grape,31.0,Purple
6,Orange,6.0,Orange
7,Melon,1.8,Green


# 4. Replace Values

- Use 'replace' method

In [29]:
# Create a Series containing the placeholder values -999 and -1000
data = pd.Series([1.0, -999.0, 2.0, -999.0, 3.0, -1000.0])

data

0       1.0
1    -999.0
2       2.0
3    -999.0
4       3.0
5   -1000.0
dtype: float64

In [34]:
# Replace a single value: every -999 becomes NaN
# (the result is a new Series; 'data' keeps its original values)
data1 = data.replace(to_replace=-999, value=np.nan)

data1

0       1.0
1       NaN
2       2.0
3       NaN
4       3.0
5   -1000.0
dtype: float64

In [35]:
# Replace several values with one new value: pass the old values as a list
data2 = data.replace(to_replace=[-999, -1000], value=np.nan)

data2

0    1.0
1    NaN
2    2.0
3    NaN
4    3.0
5    NaN
dtype: float64

In [36]:
# Replace several values, each with its own new value: pass two lists matched by position
data3 = data.replace(to_replace=[-999, -1000], value=[np.nan, 0.0])

data3

0    1.0
1    NaN
2    2.0
3    NaN
4    3.0
5    0.0
dtype: float64

In [37]:
# Replace several values, each with its own new value: pass a dict of {old: new}
data4 = data.replace(to_replace={-999: np.nan, -1000: 0.0})

data4

0    1.0
1    NaN
2    2.0
3    NaN
4    3.0
5    0.0
dtype: float64

# 5. Rename Axis Indexes

In [46]:
# Create a 4x6 DataFrame of consecutive integers with labelled rows and columns
data = pd.DataFrame(
    np.arange(24).reshape(4, 6),
    index=["Math", "Music", "English", "Science"],
    columns=[f"Year{n}" for n in range(1, 7)],
)

data

Unnamed: 0,Year1,Year2,Year3,Year4,Year5,Year6
Math,0,1,2,3,4,5
Music,6,7,8,9,10,11
English,12,13,14,15,16,17
Science,18,19,20,21,22,23


In [47]:
# Convert index labels to capital letters: combine 'str.upper()' with the 'map' method

def transform(x):
    """Return the first five characters of ``x`` in upper case."""
    prefix = x[:5]
    return prefix.upper()

# Apply 'transform' to every index label; the result is a new Index object
data2 = data.index.map(transform)
data2

Index(['MATH', 'MUSIC', 'ENGLI', 'SCIEN'], dtype='object')

In [48]:
# Use the 'rename' method with functions: title-case the index labels and upper-case the column labels.
# The result is assigned to a new variable; the original 'data' is not modified.

data3 = data.rename(index=str.title, columns=str.upper)

data3

Unnamed: 0,YEAR1,YEAR2,YEAR3,YEAR4,YEAR5,YEAR6
Math,0,1,2,3,4,5
Music,6,7,8,9,10,11
English,12,13,14,15,16,17
Science,18,19,20,21,22,23


In [52]:
# Use the 'rename' method with dicts to relabel only a subset of the axis labels
data4 = data.rename(
    index={"Math": "Mathmatics"},
    columns={"Year6": "Year Six"},
)

data4

Unnamed: 0,Year1,Year2,Year3,Year4,Year5,Year Six
Mathmatics,0,1,2,3,4,5
Music,6,7,8,9,10,11
English,12,13,14,15,16,17
Science,18,19,20,21,22,23
