
# Data Preprocessing Techniques in Python
This notebook covers the most important data preprocessing techniques used in data science, including:
- Handling missing data
- Encoding categorical variables
- Feature scaling
- Feature selection
- Outlier detection and handling
- Data normalization


In [2]:

# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import IsolationForest


## 1. Handling Missing Data

In [5]:
# Create sample data with missing values
data = {
    'age': [25, 30, np.nan, 35, 40],
    'salary': [50000, 60000, np.nan, 80000, 90000],
    'city': ['New York', 'Los Angeles', 'New York', 'Chicago', np.nan]
}
df = pd.DataFrame(data)
print("Original Data:")
print(df)

# Impute missing numerical values with the mean
imputer = SimpleImputer(strategy='mean')
df['age'] = imputer.fit_transform(df[['age']])
df['salary'] = imputer.fit_transform(df[['salary']])

# Impute missing categorical values with the most frequent value
imputer = SimpleImputer(strategy='most_frequent')
df['city'] = imputer.fit_transform(df[['city']])

print("\nData after handling missing values:")
print(df)


Original Data:
    age   salary         city
0  25.0  50000.0     New York
1  30.0  60000.0  Los Angeles
2   NaN      NaN     New York
3  35.0  80000.0      Chicago
4  40.0  90000.0          NaN


ValueError: 2

## 2. Encoding Categorical Variables

In [None]:



# Label Encoding for ordinal data
label_encoder = LabelEncoder()
df['city_label'] = label_encoder.fit_transform(df['city'])
print("\nData after Label Encoding:")
print(df)

# One-Hot Encoding for nominal data
df_onehot = pd.get_dummies(df, columns=['city'])
print("\nData after One-Hot Encoding:")
print(df_onehot)


## 3. Feature Scaling

In [None]:



# Standardization (Z-score scaling)
scaler = StandardScaler()
df['age_scaled'] = scaler.fit_transform(df[['age']])
df['salary_scaled'] = scaler.fit_transform(df[['salary']])
print("\nData after Standardization:")
print(df)

# Min-Max Scaling (Normalization)
minmax_scaler = MinMaxScaler()
df['age_minmax'] = minmax_scaler.fit_transform(df[['age']])
df['salary_minmax'] = minmax_scaler.fit_transform(df[['salary']])
print("\nData after Min-Max Scaling:")
print(df)


## 4. Feature Selection

In [None]:



# Feature selection using SelectKBest with chi-squared (for categorical target variable)

# Create a sample dataset
X = df[['age', 'salary']]
y = df['city_label']  # Example target

# Apply SelectKBest to select top 1 feature
selector = SelectKBest(chi2, k=1)
X_new = selector.fit_transform(X, y)
print("\nFeatures selected by SelectKBest (top 1):")
print(X_new)


## 5. Outlier Detection

In [3]:



# Create a sample dataset
data_outliers = {'age': [25, 30, 35, 40, 1000], 'salary': [50000, 60000, 70000, 80000, 90000]}
df_outliers = pd.DataFrame(data_outliers)
print("\nData with potential outliers:")
print(df_outliers)

# Using Isolation Forest to detect outliers
iso_forest = IsolationForest(contamination=0.2)
outliers = iso_forest.fit_predict(df_outliers)
df_outliers['outlier'] = outliers
print("\nData after outlier detection (-1 indicates outlier):")
print(df_outliers)



Data with potential outliers:
    age  salary
0    25   50000
1    30   60000
2    35   70000
3    40   80000
4  1000   90000

Data after outlier detection (-1 indicates outlier):
    age  salary  outlier
0    25   50000        1
1    30   60000        1
2    35   70000        1
3    40   80000        1
4  1000   90000       -1


## 6. Data Normalization

In [4]:



# Using Min-Max Scaling to normalize data between 0 and 1
df_normalized = df.copy()
df_normalized[['age', 'salary']] = minmax_scaler.fit_transform(df[['age', 'salary']])
print("\nData after normalization:")
print(df_normalized)


NameError: name 'df' is not defined