In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
def data_preprocessing(data):
    """Impute, de-outlier and standardize a DataFrame.

    Processing steps, in order:

    1. Numeric columns (those selected by ``include=['float', 'int']``):
       missing values are filled with the column mean.
    2. Numeric columns: values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are
       replaced by the column mean. The mean is taken before replacement,
       so it still includes the outlying values.
    3. Numeric columns: standardized with ``StandardScaler``.
    4. Object columns: missing values are filled with the column's mode
       (the first mode when several values tie).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to preprocess. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A processed copy of ``data`` with the same columns and index.
    """
    # Work on a copy so the caller's frame stays intact and re-running the
    # calling cell always starts from the same input.
    data = data.copy()
    if data.empty:
        # Nothing to impute or scale (the scaler would reject zero rows/columns).
        return data

    # Separate the numeric and object dtypes.
    numeric_features = data.select_dtypes(include=['float', 'int']).columns
    categorical_features = data.select_dtypes(include='object').columns

    if len(numeric_features) > 0:
        # Handle missing values in numeric features.
        data[numeric_features] = data[numeric_features].fillna(data[numeric_features].mean())

        # Detect and handle outliers in numeric features using the IQR rule.
        for feature in numeric_features:
            column = data[feature]
            q1 = column.quantile(0.25)
            q3 = column.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            # The upper fence is measured from Q3, not Q1.
            upper_bound = q3 + 1.5 * iqr
            is_outlier = (column < lower_bound) | (column > upper_bound)
            data[feature] = np.where(is_outlier, column.mean(), column)

        # Standardize the numeric features.
        scaler = StandardScaler()
        data[numeric_features] = scaler.fit_transform(data[numeric_features])

    if len(categorical_features) > 0:
        # Handle missing values in categorical features. mode() is empty when
        # every categorical value is missing; there is nothing to fill with then.
        modes = data[categorical_features].mode()
        if not modes.empty:
            data[categorical_features] = data[categorical_features].fillna(modes.iloc[0])

    return data

In [4]:
# Read the raw CSV into a DataFrame, then echo it under a header line.
data = pd.read_csv(r'C:\Users\vyshn\Downloads\data.csv')
print('Original Data:', data, sep='\n')

Original Data:
   NumericFeature1  NumericFeature2 CategoricalFeature
0              1.0                7                  A
1              2.0                8                  B
2              NaN                9                NaN
3              4.0               10                  A
4              5.0               11                  B
5              6.0               50                  C


In [5]:
# Run the preprocessing pipeline on the loaded frame and echo the result
# under a header line.
cleaned_data = data_preprocessing(data)
print('Preprocessd data:', cleaned_data, sep='\n')

Preprocessd data:
   NumericFeature1  NumericFeature2 CategoricalFeature
0         -1.66463        -1.099370                  A
1         -0.90798        -0.749128                  B
2          0.30266        -0.398886                  A
3          0.60532        -0.048645                  A
4          1.36197         0.301597                  B
5          0.30266         1.994431                  C
