### Handling Missing Values

In [1]:
import pandas as pd
import numpy as np

# Sample Data
data = {'Name': ['John', 'Anna', 'Mike', 'Sarah', 'David'],
        'Age': [28, np.nan, 35, 40, np.nan],
        'Salary': [50000, 55000, 60000, np.nan, 70000]}

In [2]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Name,Age,Salary
0,John,28.0,50000.0
1,Anna,,55000.0
2,Mike,35.0,60000.0
3,Sarah,40.0,
4,David,,70000.0


In [3]:
# filling missing values
df['Age'].fillna(df['Age'].median(), inplace = True)       # median impute
df['Salary'].fillna(df['Salary'].mean(), inplace = True)   # mean impute

df.head()

Unnamed: 0,Name,Age,Salary
0,John,28.0,50000.0
1,Anna,35.0,55000.0
2,Mike,35.0,60000.0
3,Sarah,40.0,58750.0
4,David,35.0,70000.0


### Handling Outliers

In [8]:
from scipy import stats

# Z-Score method
df['Z_score'] = stats.zscore(df['Salary'])
df_outliers = df[abs(df['Z_score']) > 1]
print("\nOutliers Detected Using Z-Score:\n", df_outliers)


Outliers Detected Using Z-Score:
     Name   Age   Salary   Z_score
0   John  28.0  50000.0 -1.322876
4  David  35.0  70000.0  1.700840


In [9]:
# Interquantile Range (IQR) method

Q1 = df['Salary'].quantile(0.25)
Q3 = df['Salary'].quantile(0.75)
IQR = Q3 - Q1

df_no_outliers = df[(df['Salary'] >= (Q1 - 1.5 * IQR)) & (df['Salary'] <= (Q3 + 1.5 * IQR))]
df_no_outliers

Unnamed: 0,Name,Age,Salary,Z_score
0,John,28.0,50000.0,-1.322876
1,Anna,35.0,55000.0,-0.566947
2,Mike,35.0,60000.0,0.188982
3,Sarah,40.0,58750.0,0.0


### Handling Categorical Data

In [12]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

df1 = pd.DataFrame({'Color': ['Red', 'Blue', 'Green', 'Red', 'Blue']})

ohe = OneHotEncoder(sparse = False)
encoded_data = ohe.fit_transform(df1[['Color']])
df_encoded = pd.DataFrame(encoded_data, columns = ohe.get_feature_names_out())
print('Ohe : ', df_encoded)

le = LabelEncoder()
df1['Color_label'] = le.fit_transform(df1['Color'])
print('le', df1)

Ohe :     Color_Blue  Color_Green  Color_Red
0         0.0          0.0        1.0
1         1.0          0.0        0.0
2         0.0          1.0        0.0
3         0.0          0.0        1.0
4         1.0          0.0        0.0
le    Color  Color_label
0    Red            2
1   Blue            0
2  Green            1
3    Red            2
4   Blue            0




### Data Transformation

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df['Salary_Standardized'] = scaler.fit_transform(df[['Salary']])
df.head()

Unnamed: 0,Name,Age,Salary,Z_score,Salary_Standardized
0,John,28.0,50000.0,-1.322876,-1.322876
1,Anna,35.0,55000.0,-0.566947,-0.566947
2,Mike,35.0,60000.0,0.188982,0.188982
3,Sarah,40.0,58750.0,0.0,0.0
4,David,35.0,70000.0,1.70084,1.70084


In [17]:
from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler()
df['Salary_Scaled'] = scaler.fit_transform(df[['Salary']])
df.head()

Unnamed: 0,Name,Age,Salary,Z_score,Salary_Standardized,Salary_Scaled
0,John,28.0,50000.0,-1.322876,-1.322876,-1.322876
1,Anna,35.0,55000.0,-0.566947,-0.566947,-0.566947
2,Mike,35.0,60000.0,0.188982,0.188982,0.188982
3,Sarah,40.0,58750.0,0.0,0.0,0.0
4,David,35.0,70000.0,1.70084,1.70084,1.70084


### Feature Engineering

In [19]:
# feature creation

df['Experience_Level'] = df['Age'].apply(lambda x: 'Senior' if x > 35 else 'Junior')
df.head()


Unnamed: 0,Name,Age,Salary,Z_score,Salary_Standardized,Salary_Scaled,Experience_Level
0,John,28.0,50000.0,-1.322876,-1.322876,-1.322876,Junior
1,Anna,35.0,55000.0,-0.566947,-0.566947,-0.566947,Junior
2,Mike,35.0,60000.0,0.188982,0.188982,0.188982,Junior
3,Sarah,40.0,58750.0,0.0,0.0,0.0,Senior
4,David,35.0,70000.0,1.70084,1.70084,1.70084,Junior


In [20]:
# binning

bins = [0, 30, 50, 100]
labels = ['Young', 'Middle-aged', 'Senior']
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels)
df.head()

Unnamed: 0,Name,Age,Salary,Z_score,Salary_Standardized,Salary_Scaled,Experience_Level,Age_Group
0,John,28.0,50000.0,-1.322876,-1.322876,-1.322876,Junior,Young
1,Anna,35.0,55000.0,-0.566947,-0.566947,-0.566947,Junior,Middle-aged
2,Mike,35.0,60000.0,0.188982,0.188982,0.188982,Junior,Middle-aged
3,Sarah,40.0,58750.0,0.0,0.0,0.0,Senior,Middle-aged
4,David,35.0,70000.0,1.70084,1.70084,1.70084,Junior,Middle-aged


### Handling Imbalanced Data

In [24]:
# SMOTE -> Synthetic Minority Over-sampling Technique
from imblearn.over_sampling import SMOTE
from collections import Counter

X = df[['Age', 'Salary']]
y = [0,1,0,1,0] 

smote = SMOTE(sampling_strategy='auto', k_neighbors=1)
X_resampled, y_resampled = smote.fit_resample(X,y)
print(Counter(y_resampled))

Counter({0: 3, 1: 3})
