In [1]:
import numpy as np
import pandas as pd

In [2]:
data = {
    'Name' : ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age' : [24,np.nan,22,25,30],
    'Salary' : [50000, 54000, 32000, np.nan, 40000],
    'Department' : ['HR','Finance','IT','HR','Finance']
}

df = pd.DataFrame(data)
print(df)

      Name   Age   Salary Department
0    Alice  24.0  50000.0         HR
1      Bob   NaN  54000.0    Finance
2  Charlie  22.0  32000.0         IT
3    David  25.0      NaN         HR
4      Eve  30.0  40000.0    Finance


In [3]:
df.head()

Unnamed: 0,Name,Age,Salary,Department
0,Alice,24.0,50000.0,HR
1,Bob,,54000.0,Finance
2,Charlie,22.0,32000.0,IT
3,David,25.0,,HR
4,Eve,30.0,40000.0,Finance


In [4]:
df.describe()

Unnamed: 0,Age,Salary
count,4.0,4.0
mean,25.25,44000.0
std,3.40343,9933.109617
min,22.0,32000.0
25%,23.5,38000.0
50%,24.5,45000.0
75%,26.25,51000.0
max,30.0,54000.0


In [6]:
print(df.isnull().sum())

Name          0
Age           1
Salary        1
Department    0
dtype: int64


In [9]:
# Filling Missing Age Value with the Mean()
df['Age'].fillna(df['Age'].mean(),inplace=True)

#Filling Missing Salary Value with the Median()
df['Salary'].fillna(df['Salary'].median(),inplace=True)

print(df)

      Name    Age   Salary Department
0    Alice  24.00  50000.0         HR
1      Bob  25.25  54000.0    Finance
2  Charlie  22.00  32000.0         IT
3    David  25.00  45000.0         HR
4      Eve  30.00  40000.0    Finance


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].median(),inplace=True)


In [10]:
# Min-Max Normalization Of Salary:

df['Salary'] = (df['Salary']-df['Salary'].min())/(df['Salary'].max()-df['Salary'].min())
print(df)


      Name    Age    Salary Department
0    Alice  24.00  0.818182         HR
1      Bob  25.25  1.000000    Finance
2  Charlie  22.00  0.000000         IT
3    David  25.00  0.590909         HR
4      Eve  30.00  0.363636    Finance


In [12]:
# One-Hot encoding for the Department Column.
df = pd.get_dummies(df,columns=['Department'])

print(df)

      Name    Age    Salary  Department_Finance  Department_HR  Department_IT
0    Alice  24.00  0.818182               False           True          False
1      Bob  25.25  1.000000                True          False          False
2  Charlie  22.00  0.000000               False          False           True
3    David  25.00  0.590909               False           True          False
4      Eve  30.00  0.363636                True          False          False


In [14]:
# Creating A new feature : Experience Level

conditions = [
    (df['Age']<=25),
    (df['Age']>25)
]

choice = ['Junior','Senior']
df['Experience_level'] = np.select(conditions,choice,default='Unknown')
print(df)

      Name    Age    Salary  Department_Finance  Department_HR  Department_IT  \
0    Alice  24.00  0.818182               False           True          False   
1      Bob  25.25  1.000000                True          False          False   
2  Charlie  22.00  0.000000               False          False           True   
3    David  25.00  0.590909               False           True          False   
4      Eve  30.00  0.363636                True          False          False   

  Experience_level  
0           Junior  
1           Senior  
2           Junior  
3           Junior  
4           Senior  


In [15]:
# Binnings Age into categories:
bins = [20,25,30,35]
labels = ['20-25','25-30','30-35']
df['Age_group'] = pd.cut(df['Age'],bins=bins,labels=labels)
print(df)

      Name    Age    Salary  Department_Finance  Department_HR  Department_IT  \
0    Alice  24.00  0.818182               False           True          False   
1      Bob  25.25  1.000000                True          False          False   
2  Charlie  22.00  0.000000               False          False           True   
3    David  25.00  0.590909               False           True          False   
4      Eve  30.00  0.363636                True          False          False   

  Experience_level Age_group  
0           Junior     20-25  
1           Senior     25-30  
2           Junior     20-25  
3           Junior     20-25  
4           Senior     25-30  


In [17]:
# Group by Experience Level:

agg_df = df.groupby('Experience_level').agg({'Salary':'mean'})
print(agg_df)

                    Salary
Experience_level          
Junior            0.469697
Senior            0.681818


In [18]:
# Apply a custom function to create a salary rank

df['Salary_Rank'] = df['Salary'].rank(method='dense')
print(df)

      Name    Age    Salary  Department_Finance  Department_HR  Department_IT  \
0    Alice  24.00  0.818182               False           True          False   
1      Bob  25.25  1.000000                True          False          False   
2  Charlie  22.00  0.000000               False          False           True   
3    David  25.00  0.590909               False           True          False   
4      Eve  30.00  0.363636                True          False          False   

  Experience_level Age_group  Salary_Rank  
0           Junior     20-25          4.0  
1           Senior     25-30          5.0  
2           Junior     20-25          1.0  
3           Junior     20-25          3.0  
4           Senior     25-30          2.0  


In [19]:
df.to_csv('tranformed_data.csv',index=False)