# Data Transformation

## Sorting

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 22],
    'Salary': [50000, 60000, 45000]
})
df

Unnamed: 0,Name,Age,Salary
0,Alice,25,50000
1,Bob,30,60000
2,Charlie,22,45000


In [3]:
df.sort_values(by="Age")

Unnamed: 0,Name,Age,Salary
2,Charlie,22,45000
0,Alice,25,50000
1,Bob,30,60000


In [4]:
df.sort_values(by="Salary", ascending=False)

Unnamed: 0,Name,Age,Salary
1,Bob,30,60000
0,Alice,25,50000
2,Charlie,22,45000


In [5]:
df.sort_values(by="Name", ascending=False)

Unnamed: 0,Name,Age,Salary
2,Charlie,22,45000
1,Bob,30,60000
0,Alice,25,50000


In [6]:
df2=df.sort_values(by="Age")
df2

Unnamed: 0,Name,Age,Salary
2,Charlie,22,45000
0,Alice,25,50000
1,Bob,30,60000


In [7]:
df2.sort_index()

Unnamed: 0,Name,Age,Salary
0,Alice,25,50000
1,Bob,30,60000
2,Charlie,22,45000


## Filtering

In [8]:
data = {
    'Department': ['HR', 'Finance', 'IT', 'HR', 'IT', 'Finance', 'HR'],
    'Employee': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace'],
    'Salary': [5000, 6000, 7000, 5200, 7500, 6400, 5500],
    'Experience': [3, 4, 5, 2, 6, 5, 3]
}

df = pd.DataFrame(data)
df

Unnamed: 0,Department,Employee,Salary,Experience
0,HR,Alice,5000,3
1,Finance,Bob,6000,4
2,IT,Charlie,7000,5
3,HR,David,5200,2
4,IT,Eve,7500,6
5,Finance,Frank,6400,5
6,HR,Grace,5500,3


In [9]:
condition=df['Salary'] > 6000

In [10]:
df[condition]

Unnamed: 0,Department,Employee,Salary,Experience
2,IT,Charlie,7000,5
4,IT,Eve,7500,6
5,Finance,Frank,6400,5


In [11]:
df[df["Salary"]>6000]

Unnamed: 0,Department,Employee,Salary,Experience
2,IT,Charlie,7000,5
4,IT,Eve,7500,6
5,Finance,Frank,6400,5


In [12]:
condition=(df['Salary'] > 6000) &( df["Department"]=="IT")
condition

0    False
1    False
2     True
3    False
4     True
5    False
6    False
dtype: bool

In [13]:
df[condition]

Unnamed: 0,Department,Employee,Salary,Experience
2,IT,Charlie,7000,5
4,IT,Eve,7500,6


In [14]:
#one liner
df[(df['Salary'] > 6000) & (df["Department"]=="IT")]

Unnamed: 0,Department,Employee,Salary,Experience
2,IT,Charlie,7000,5
4,IT,Eve,7500,6


### applying functions (Transformation)
- map() method(Series only)
- apply() method(Both series and dataframe)
- applymap() method(Dataframe only)

In [15]:
df

Unnamed: 0,Department,Employee,Salary,Experience
0,HR,Alice,5000,3
1,Finance,Bob,6000,4
2,IT,Charlie,7000,5
3,HR,David,5200,2
4,IT,Eve,7500,6
5,Finance,Frank,6400,5
6,HR,Grace,5500,3


In [16]:
type(df["Department"])

pandas.core.series.Series

In [17]:
l=[1,2,3,4,5,6,7,8,9,10]
s=pd.Series(l)
s

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
dtype: int64

In [18]:
s=s.map(lambda x:f"The value is {str(x)}")
s

0     The value is 1
1     The value is 2
2     The value is 3
3     The value is 4
4     The value is 5
5     The value is 6
6     The value is 7
7     The value is 8
8     The value is 9
9    The value is 10
dtype: object

In [19]:
l=[1,2,3,4,5,6,7,8,9,10]
s=pd.Series(l)
s

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
dtype: int64

In [20]:
s=s.map(lambda x:x**2+3*x-2)
s

0      2
1      8
2     16
3     26
4     38
5     52
6     68
7     86
8    106
9    128
dtype: int64

In [21]:
l=[1,2,3,4,5,6,7,8,9,10]
s=pd.Series(l)
s

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
dtype: int64

In [22]:
s=s.apply(lambda x:x**2+3*x-2)
s

0      2
1      8
2     16
3     26
4     38
5     52
6     68
7     86
8    106
9    128
dtype: int64

In [23]:
df

Unnamed: 0,Department,Employee,Salary,Experience
0,HR,Alice,5000,3
1,Finance,Bob,6000,4
2,IT,Charlie,7000,5
3,HR,David,5200,2
4,IT,Eve,7500,6
5,Finance,Frank,6400,5
6,HR,Grace,5500,3


In [24]:
df["Salary_transformed"]=df["Salary"].apply(lambda x: (x/1000)+1)
df

Unnamed: 0,Department,Employee,Salary,Experience,Salary_transformed
0,HR,Alice,5000,3,6.0
1,Finance,Bob,6000,4,7.0
2,IT,Charlie,7000,5,8.0
3,HR,David,5200,2,6.2
4,IT,Eve,7500,6,8.5
5,Finance,Frank,6400,5,7.4
6,HR,Grace,5500,3,6.5


In [25]:
df.applymap(lambda x: x**2 if type(x)==int else x)

  df.applymap(lambda x: x**2 if type(x)==int else x)


Unnamed: 0,Department,Employee,Salary,Experience,Salary_transformed
0,HR,Alice,25000000,9,6.0
1,Finance,Bob,36000000,16,7.0
2,IT,Charlie,49000000,25,8.0
3,HR,David,27040000,4,6.2
4,IT,Eve,56250000,36,8.5
5,Finance,Frank,40960000,25,7.4
6,HR,Grace,30250000,9,6.5


In [26]:
df[["a", "b"]]=df[["Salary", "Experience"]].apply(lambda x: x**2)
df

Unnamed: 0,Department,Employee,Salary,Experience,Salary_transformed,a,b
0,HR,Alice,5000,3,6.0,25000000,9
1,Finance,Bob,6000,4,7.0,36000000,16
2,IT,Charlie,7000,5,8.0,49000000,25
3,HR,David,5200,2,6.2,27040000,4
4,IT,Eve,7500,6,8.5,56250000,36
5,Finance,Frank,6400,5,7.4,40960000,25
6,HR,Grace,5500,3,6.5,30250000,9


In [27]:
df

Unnamed: 0,Department,Employee,Salary,Experience,Salary_transformed,a,b
0,HR,Alice,5000,3,6.0,25000000,9
1,Finance,Bob,6000,4,7.0,36000000,16
2,IT,Charlie,7000,5,8.0,49000000,25
3,HR,David,5200,2,6.2,27040000,4
4,IT,Eve,7500,6,8.5,56250000,36
5,Finance,Frank,6400,5,7.4,40960000,25
6,HR,Grace,5500,3,6.5,30250000,9


In [28]:
data = {
    'Department': ['HR', 'Finance', 'IT', 'HR', 'IT', 'Finance', 'HR'],
    'Employee': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace'],
    'Salary': [5000, 6000, 7000, 5200, 7500, 6400, 5500],
    'Experience': [3, 4, 5, 2, 6, 5, 3],
    'Distance_Travelled': [10000,20000, 30000, 40000, 50000, 60000, 70000]
}

df = pd.DataFrame(data)
df

Unnamed: 0,Department,Employee,Salary,Experience,Distance_Travelled
0,HR,Alice,5000,3,10000
1,Finance,Bob,6000,4,20000
2,IT,Charlie,7000,5,30000
3,HR,David,5200,2,40000
4,IT,Eve,7500,6,50000
5,Finance,Frank,6400,5,60000
6,HR,Grace,5500,3,70000


In [29]:
df[["Salary (in thoudan)", "Distanc_Travelled (in kms)"]]=df[["Salary", "Distance_Travelled"]].apply(lambda x: x/1000)
df

Unnamed: 0,Department,Employee,Salary,Experience,Distance_Travelled,Salary (in thoudan),Distanc_Travelled (in kms)
0,HR,Alice,5000,3,10000,5.0,10.0
1,Finance,Bob,6000,4,20000,6.0,20.0
2,IT,Charlie,7000,5,30000,7.0,30.0
3,HR,David,5200,2,40000,5.2,40.0
4,IT,Eve,7500,6,50000,7.5,50.0
5,Finance,Frank,6400,5,60000,6.4,60.0
6,HR,Grace,5500,3,70000,5.5,70.0


In [30]:
df[["Department", "Employee"]]=df[["Department", "Employee"]].applymap(lambda x: x.upper()) 
df

  df[["Department", "Employee"]]=df[["Department", "Employee"]].applymap(lambda x: x.upper())


Unnamed: 0,Department,Employee,Salary,Experience,Distance_Travelled,Salary (in thoudan),Distanc_Travelled (in kms)
0,HR,ALICE,5000,3,10000,5.0,10.0
1,FINANCE,BOB,6000,4,20000,6.0,20.0
2,IT,CHARLIE,7000,5,30000,7.0,30.0
3,HR,DAVID,5200,2,40000,5.2,40.0
4,IT,EVE,7500,6,50000,7.5,50.0
5,FINANCE,FRANK,6400,5,60000,6.4,60.0
6,HR,GRACE,5500,3,70000,5.5,70.0
