# Transformation / Feature Engineering

In [13]:
import pandas as pd
import numpy as np

In [14]:
df = pd.read_csv("csv1.csv") 

In [15]:
df2 = df.copy()

In [16]:
df2

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,,Canada,Female,62000.0
3,3,Alex,,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,
5,5,Li Wei,27.0,China,Male,51000.0
6,6,,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0


In [17]:
# apply a fnx to a row or col
df2["tax"] = df2["income"].apply(lambda x : "10%" if x < 60000 else "20%")
# a function is passed as parameter in apply(). Can be written outside and be input as parameter too.
df2

Unnamed: 0,id,name,age,country,gender,income,tax
0,1,John Doe,29.0,USA,Male,55000.0,10%
1,1,John Doe,29.0,USA,Male,55000.0,10%
2,2,Jane Smith,,Canada,Female,62000.0,20%
3,3,Alex,,USA,Unknown,47000.0,10%
4,4,Maria Garcia,34.0,Spain,Female,,20%
5,5,Li Wei,27.0,China,Male,51000.0,10%
6,6,,45.0,India,Female,73000.0,20%
7,7,Ahmed Khan,38.0,,Male,68000.0,20%
8,8,Rachel Lee,29.0,USA,Female,62000.0,20%
9,9,Carlos Ruiz,,Mexico,Male,45000.0,10%


In [18]:
df2["gender"].unique()

array(['Male', 'Female', 'Unknown', nan], dtype=object)

In [19]:
# map values to another
new_gender_vals = {"Male": "M", "Female": "F", "Unknown": "U"}
df2["gender"] = df2["gender"].map(new_gender_vals)

In [20]:
df2

Unnamed: 0,id,name,age,country,gender,income,tax
0,1,John Doe,29.0,USA,M,55000.0,10%
1,1,John Doe,29.0,USA,M,55000.0,10%
2,2,Jane Smith,,Canada,F,62000.0,20%
3,3,Alex,,USA,U,47000.0,10%
4,4,Maria Garcia,34.0,Spain,F,,20%
5,5,Li Wei,27.0,China,M,51000.0,10%
6,6,,45.0,India,F,73000.0,20%
7,7,Ahmed Khan,38.0,,M,68000.0,20%
8,8,Rachel Lee,29.0,USA,F,62000.0,20%
9,9,Carlos Ruiz,,Mexico,M,45000.0,10%


In [21]:
# assign some new values
df2.assign(new_income = df2["income"] * 1.1)

Unnamed: 0,id,name,age,country,gender,income,tax,new_income
0,1,John Doe,29.0,USA,M,55000.0,10%,60500.0
1,1,John Doe,29.0,USA,M,55000.0,10%,60500.0
2,2,Jane Smith,,Canada,F,62000.0,20%,68200.0
3,3,Alex,,USA,U,47000.0,10%,51700.0
4,4,Maria Garcia,34.0,Spain,F,,20%,
5,5,Li Wei,27.0,China,M,51000.0,10%,56100.0
6,6,,45.0,India,F,73000.0,20%,80300.0
7,7,Ahmed Khan,38.0,,M,68000.0,20%,74800.0
8,8,Rachel Lee,29.0,USA,F,62000.0,20%,68200.0
9,9,Carlos Ruiz,,Mexico,M,45000.0,10%,49500.0


In [22]:
# replace specific values
df2["country"].replace("USA", "United States of America")

0     United States of America
1     United States of America
2                       Canada
3     United States of America
4                        Spain
5                        China
6                        India
7                          NaN
8     United States of America
9                       Mexico
10    United States of America
Name: country, dtype: object

# Transformation - Sorting

In [24]:
df3 = df.copy()

In [26]:
# Renaming
df3.columns = ["Id", "Name", "Age", "Country", "Gender", "Income"]
df3.rename(columns={"Income": "Salary"})
df3.rename(index={0: "first"})

Unnamed: 0,Id,Name,Age,Country,Gender,Income
first,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,,Canada,Female,62000.0
3,3,Alex,,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,
5,5,Li Wei,27.0,China,Male,51000.0
6,6,,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0


In [28]:
# Sorting - values & index
df3.sort_values("Income")                     # sort values in ascending
df3.sort_values("Income", ascending=False)    # sort values in descending
df3.sort_values(["Age", "Income"])            # sorts for age, if age is same then sorts for income 

Unnamed: 0,Id,Name,Age,Country,Gender,Income
5,5,Li Wei,27.0,China,Male,51000.0
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
10,10,Emily Davis,31.0,USA,,58000.0
4,4,Maria Garcia,34.0,Spain,Female,
7,7,Ahmed Khan,38.0,,Male,68000.0
6,6,,45.0,India,Female,73000.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0
3,3,Alex,,USA,Unknown,47000.0


In [33]:
sorted_df3 = df3.sort_values(["Age", "Income"])
sorted_df3

Unnamed: 0,Id,Name,Age,Country,Gender,Income
5,5,Li Wei,27.0,China,Male,51000.0
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
10,10,Emily Davis,31.0,USA,,58000.0
4,4,Maria Garcia,34.0,Spain,Female,
7,7,Ahmed Khan,38.0,,Male,68000.0
6,6,,45.0,India,Female,73000.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0
3,3,Alex,,USA,Unknown,47000.0


In [34]:
sorted_df3.sort_index()

Unnamed: 0,Id,Name,Age,Country,Gender,Income
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,,Canada,Female,62000.0
3,3,Alex,,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,
5,5,Li Wei,27.0,China,Male,51000.0
6,6,,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0


In [30]:
# Reset Index - resets index for the new sorted data
sorted_df3.reset_index()
sorted_df3.reset_index(drop=True)     # to drop original index vals


Unnamed: 0,Id,Name,Age,Country,Gender,Income
0,5,Li Wei,27.0,China,Male,51000.0
1,1,John Doe,29.0,USA,Male,55000.0
2,1,John Doe,29.0,USA,Male,55000.0
3,8,Rachel Lee,29.0,USA,Female,62000.0
4,10,Emily Davis,31.0,USA,,58000.0
5,4,Maria Garcia,34.0,Spain,Female,
6,7,Ahmed Khan,38.0,,Male,68000.0
7,6,,45.0,India,Female,73000.0
8,9,Carlos Ruiz,,Mexico,Male,45000.0
9,3,Alex,,USA,Unknown,47000.0


In [36]:
# Ranking
df3["Ranking"] = df3["Income"].rank(ascending=False, method="dense")

# method - dense helps resolve ties in value.
# options are min, max, dense etc.

In [37]:
df3

Unnamed: 0,Id,Name,Age,Country,Gender,Income,Ranking
0,1,John Doe,29.0,USA,Male,55000.0,5.0
1,1,John Doe,29.0,USA,Male,55000.0,5.0
2,2,Jane Smith,,Canada,Female,62000.0,3.0
3,3,Alex,,USA,Unknown,47000.0,7.0
4,4,Maria Garcia,34.0,Spain,Female,,
5,5,Li Wei,27.0,China,Male,51000.0,6.0
6,6,,45.0,India,Female,73000.0,1.0
7,7,Ahmed Khan,38.0,,Male,68000.0,2.0
8,8,Rachel Lee,29.0,USA,Female,62000.0,3.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0,8.0
