In [81]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, StandardScaler


In [82]:
# Small in-memory sample dataset used by every later cell.
# Two salary entries are deliberately missing (NaN) and one is an extreme outlier,
# which makes the scaling and imputation steps below more illustrative.
data = {
    'age': [19, 12, 23, 21, 21, 12, 45, 43, 54, 23],
    'salary': [
        10000, 110012021, np.nan, 123121, 344432,
        np.nan, 121212, 424, 534342, 98844,
    ],
    'name': [
        'devansh', 'varad', 'raj', 'rohit', 'devarshi',
        'divyansh', 'rani', 'saksham', 'harshita', 'ayushi',
    ],
    'department': [
        'cse', 'cse', 'mech', 'mech', 'cse',
        'cse', 'mech', 'cse', 'cse', 'mech',
    ],
    'education': [
        'btech', 'mba', 'btech', 'mba', 'btech',
        'btech', 'btech', 'mba', 'mba', 'mba',
    ],
}
df = pd.DataFrame(data)
# Preview the first five rows.
df.head()

Unnamed: 0,age,salary,name,department,education
0,19,10000.0,devansh,cse,btech
1,12,110012021.0,varad,cse,mba
2,23,,raj,mech,btech
3,21,123121.0,rohit,mech,mba
4,21,344432.0,devarshi,cse,btech


In [83]:
# Show the kernel's current working directory.
# NOTE(review): the saved output of this cell is an absolute local path that includes a
# user name; consider clearing the output (or dropping the cell) before sharing the notebook.
os.getcwd()

'C:\\Users\\varad\\Desktop\\py'

## Normalization

In [84]:
# Raw salary values, shown for comparison with the min-max scaled column.
print("data before normalization: ")
df['salary'].head(n=5)

data before normalization: 


0        10000.0
1    110012021.0
2            NaN
3       123121.0
4       344432.0
Name: salary, dtype: float64

In [85]:
# Min-max normalization of the salary column: rescales the observed values to [0, 1].
# Missing salaries stay NaN in the scaled column.
# The named `scaler` instance is the one that gets fitted, so it can be reused later
# (e.g. `scaler.transform` on new data or `scaler.inverse_transform` to recover salaries).
scaler = MinMaxScaler()
df['salary_min_max'] = scaler.fit_transform(df[['salary']])

In [86]:
# Min-max scaled salary values, for comparison with the raw column.
print("data after normalization: ")
df['salary_min_max'].head(n=5)

data after normalization: 


0    0.000087
1    1.000000
2         NaN
3    0.001115
4    0.003127
Name: salary_min_max, dtype: float64

## Standardization

In [87]:
# Raw salary values, shown for comparison with the standardized (z-score) column.
# The label names the step this section performs: standardization.
print("data before standardization: ")
df['salary'].head()

data before serialization : 


0        10000.0
1    110012021.0
2            NaN
3       123121.0
4       344432.0
Name: salary, dtype: float64

In [88]:
# Standardization (z-score) of the salary column: subtract the mean, divide by the
# standard deviation. Missing salaries stay NaN in the result.
# The named `std_scaler` instance is the one that gets fitted, so it can be reused later
# (e.g. `std_scaler.transform` on new data or `std_scaler.inverse_transform`).
std_scaler = StandardScaler()
df['income_zscore'] = std_scaler.fit_transform(df[['salary']])

In [89]:
# Standardized salary values (z-scores), for comparison with the raw column.
# The label names the step this section performs: standardization.
print("data after standardization: ")
df['income_zscore'].head()

data after normalization: 


0   -0.382532
1    2.645723
2         NaN
3   -0.379418
4   -0.373325
Name: income_zscore, dtype: float64

## Data Scaling

In [90]:

# Robust scaling of the salary column. RobustScaler centers on the median and scales by
# the interquartile range, so the extreme salary outlier influences the result less than
# it does with min-max or z-score scaling. Missing salaries stay NaN.
# RobustScaler is imported with the other sklearn preprocessing classes in the top import cell.
# NOTE: this rebinds `scaler`, which held the MinMaxScaler instance from the normalization cell.
scaler = RobustScaler()
scaledData = scaler.fit_transform(df[['salary']])

In [91]:
# Display the robust-scaled salary values: one row per record in a single column,
# with NaN where the salary was missing.
scaledData

array([[-3.55771838e-01],
       [ 3.48550731e+02],
       [            nan],
       [ 3.02750126e-03],
       [ 7.04985941e-01],
       [            nan],
       [-3.02750126e-03],
       [-3.86145177e-01],
       [ 1.30734609e+00],
       [-7.39747491e-02]])

In [92]:
# Label encoding: map each distinct education value to an integer code.
# The encoder is fitted first and then applied, which keeps `label` available
# for decoding the codes later.
label = LabelEncoder()
label.fit(df['education'])
df['encoded'] = label.transform(df['education'])

In [93]:
# Side-by-side preview of the original education values and their integer codes.
education_preview = df['education'].head()
encoded_preview = df['encoded'].head()
print(f"before label encoding on education column: \n{education_preview}\n")
print(f"after label encoding on education column: \n{encoded_preview}")


before label encoding on education column: 
0    btech
1      mba
2    btech
3      mba
4    btech
Name: education, dtype: object

after label encoding on education column: 
0    0
1    1
2    0
3    1
4    0
Name: encoded, dtype: int64


In [94]:
# One-hot encoding: replace the department column with one boolean indicator
# column per department value. `df` itself is left unchanged.
one_hot = pd.get_dummies(data=df, columns=['department'])
one_hot

Unnamed: 0,age,salary,name,education,salary_min_max,income_zscore,encoded,department_cse,department_mech
0,19,10000.0,devansh,btech,8.7e-05,-0.382532,0,True,False
1,12,110012021.0,varad,mba,1.0,2.645723,1,True,False
2,23,,raj,btech,,,0,False,True
3,21,123121.0,rohit,mba,0.001115,-0.379418,1,False,True
4,21,344432.0,devarshi,btech,0.003127,-0.373325,0,True,False
5,12,,divyansh,btech,,,0,True,False
6,45,121212.0,rani,btech,0.001098,-0.37947,0,False,True
7,43,424.0,saksham,mba,0.0,-0.382795,1,True,False
8,54,534342.0,harshita,mba,0.004853,-0.368097,1,True,False
9,23,98844.0,ayushi,mba,0.000895,-0.380086,1,False,True


## Handling missing data

In [95]:

# Count the missing (NaN) entries in every column.
missing_per_column = df.isna().sum()
print("Missing values per column:")
print(missing_per_column)


Missing values per column:
age               0
salary            2
name              0
department        0
education         0
salary_min_max    2
income_zscore     2
encoded           0
dtype: int64


In [96]:
# Mean imputation: replace missing salaries with the mean of the observed salaries.
# The result goes into a new column so the original salary column is preserved.
salary_mean = df['salary'].mean()
df['salary_fill_mean'] = df['salary'].fillna(salary_mean)

print("Salary column after filling with mean:")
print(df['salary_fill_mean'].head())


Salary column after filling with mean:
0        10000.0
1    110012021.0
2     13905549.5
3       123121.0
4       344432.0
Name: salary_fill_mean, dtype: float64


In [97]:

# Mode imputation for a categorical column: fill missing departments with the
# most frequent department value.
most_common_department = df['department'].mode().iloc[0]
df['department_fill_mode'] = df['department'].fillna(most_common_department)
print("Department column after filling with mode:")
print(df['department_fill_mode'].head())

# Alternative strategy: drop every row that still has a missing value in any column.
# The result is stored separately; `df` keeps all rows.
print("Shape before dropping rows:", df.shape)
df_dropna = df.dropna(axis=0, how='any')
print("Shape after dropping rows:", df_dropna.shape)



Department column after filling with mode:
0     cse
1     cse
2    mech
3    mech
4     cse
Name: department_fill_mode, dtype: object
Shape before dropping rows: (10, 10)
Shape after dropping rows: (8, 10)


## Aggregation

In [98]:
# Average imputed salary and average age for each department.
print("Group by Department (mean Salary, avg Age):")
department_means = df.groupby('department')[['salary_fill_mean', 'age']].mean()
print(department_means)

# Cross-tabulated mean of the imputed salary: education level (rows) by department (columns).
pivot = df.pivot_table(
    values='salary_fill_mean',
    index='education',
    columns='department',
    aggfunc='mean',
)
print(pivot)

Group by Department (mean Salary, avg Age):
            salary_fill_mean        age
department                             
cse             2.080113e+07  26.833333
mech            3.562182e+06  28.000000
department           cse        mech
education                           
btech       4.753327e+06  7013380.75
mba         3.684893e+07   110982.50


## Binning

In [99]:
# Fixed-width binning of age into three labelled groups.
# With pandas' default right-inclusive intervals the edges give (0, 25], (25, 40], (40, 100].
bins = [0, 25, 40, 100]
labels = ['Young', 'Middle-aged', 'Old']
age_groups = pd.cut(df['age'], bins=bins, labels=labels)
df['age_bin'] = age_groups
print(df[['age', 'age_bin']].head())

   age age_bin
0   19   Young
1   12   Young
2   23   Young
3   21   Young
4   21   Young


In [100]:
# Quantile binning of the imputed salary into three equal-frequency groups (terciles).
salary_terciles = pd.qcut(
    df['salary_fill_mean'],
    q=3,
    labels=['Low', 'Medium', 'High'],
)
df['salary_bin'] = salary_terciles
print(df[['salary_fill_mean', 'salary_bin']].head())

   salary_fill_mean salary_bin
0           10000.0        Low
1       110012021.0       High
2        13905549.5       High
3          123121.0     Medium
4          344432.0     Medium
