## Data Normalization

# 1. Min-Max Normalization
Min-Max normalization scales the data to a fixed range, typically [0, 1].

In [8]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Read iris.csv (expected in the current working directory) into a DataFrame
# and print it.
df = pd.read_csv(filepath_or_buffer="iris.csv")
print(df)

      ID  Sepal.Length  Sepal.Width  Petal.Length  Petal.Width    Species
0      1           5.1          3.5           1.4          0.2     setosa
1      2           4.9          3.0           1.4          0.2     setosa
2      3           4.7          3.2           1.3          0.2     setosa
3      4           4.6          3.1           1.5          0.2     setosa
4      5           5.0          3.6           1.4          0.2     setosa
..   ...           ...          ...           ...          ...        ...
145  146           6.7          3.0           5.2          2.3  virginica
146  147           6.3          2.5           5.0          1.9  virginica
147  148           6.5          3.0           5.2          2.0  virginica
148  149           6.2          3.4           5.4          2.3  virginica
149  150           5.9          3.0           5.1          1.8  virginica

[150 rows x 6 columns]


In [10]:
# Split the frame into the feature matrix (X) and the label column (y).
# NOTE(review): only 'Species' is removed here, so the 'ID' row identifier
# stays in X and is scaled like a measurement by the cells below — confirm
# that this is intended.
X = df.drop(columns=['Species'])  # Features
y = df.loc[:, 'Species']  # Labels

print(X, y, sep="\n")

      ID  Sepal.Length  Sepal.Width  Petal.Length  Petal.Width
0      1           5.1          3.5           1.4          0.2
1      2           4.9          3.0           1.4          0.2
2      3           4.7          3.2           1.3          0.2
3      4           4.6          3.1           1.5          0.2
4      5           5.0          3.6           1.4          0.2
..   ...           ...          ...           ...          ...
145  146           6.7          3.0           5.2          2.3
146  147           6.3          2.5           5.0          1.9
147  148           6.5          3.0           5.2          2.0
148  149           6.2          3.4           5.4          2.3
149  150           5.9          3.0           5.1          1.8

[150 rows x 5 columns]
0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: Species, Length: 150, dtype

In [11]:
# Step 1: Min-Max Normalization
# feature_range=(0, 1) is MinMaxScaler's default; it is spelled out here so
# the target range is visible in the code.
scaler = MinMaxScaler(feature_range=(0, 1))
X_min_max_normalized = scaler.fit_transform(X)  # learn min/max per column, then rescale
print(X_min_max_normalized)

[[0.         0.22222222 0.625      0.06779661 0.04166667]
 [0.00671141 0.16666667 0.41666667 0.06779661 0.04166667]
 [0.01342282 0.11111111 0.5        0.05084746 0.04166667]
 [0.02013423 0.08333333 0.45833333 0.08474576 0.04166667]
 [0.02684564 0.19444444 0.66666667 0.06779661 0.04166667]
 [0.03355705 0.30555556 0.79166667 0.11864407 0.125     ]
 [0.04026846 0.08333333 0.58333333 0.06779661 0.08333333]
 [0.04697987 0.19444444 0.58333333 0.08474576 0.04166667]
 [0.05369128 0.02777778 0.375      0.06779661 0.04166667]
 [0.06040268 0.16666667 0.45833333 0.08474576 0.        ]
 [0.06711409 0.30555556 0.70833333 0.08474576 0.04166667]
 [0.0738255  0.13888889 0.58333333 0.10169492 0.04166667]
 [0.08053691 0.13888889 0.41666667 0.06779661 0.        ]
 [0.08724832 0.         0.41666667 0.01694915 0.        ]
 [0.09395973 0.41666667 0.83333333 0.03389831 0.04166667]
 [0.10067114 0.38888889 1.         0.08474576 0.125     ]
 [0.10738255 0.30555556 0.79166667 0.05084746 0.125     ]
 [0.11409396 0

In [12]:
# Wrap the scaled array in a DataFrame that reuses X's column names, so each
# normalized column can be matched to its source feature.
X_min_max_df = pd.DataFrame(data=X_min_max_normalized, columns=X.columns)

print(X_min_max_df)

           ID  Sepal.Length  Sepal.Width  Petal.Length  Petal.Width
0    0.000000      0.222222     0.625000      0.067797     0.041667
1    0.006711      0.166667     0.416667      0.067797     0.041667
2    0.013423      0.111111     0.500000      0.050847     0.041667
3    0.020134      0.083333     0.458333      0.084746     0.041667
4    0.026846      0.194444     0.666667      0.067797     0.041667
..        ...           ...          ...           ...          ...
145  0.973154      0.666667     0.416667      0.711864     0.916667
146  0.979866      0.555556     0.208333      0.677966     0.750000
147  0.986577      0.611111     0.416667      0.711864     0.791667
148  0.993289      0.527778     0.583333      0.745763     0.916667
149  1.000000      0.444444     0.416667      0.694915     0.708333

[150 rows x 5 columns]


In [13]:
# Show a caption followed by the first five rows of the Min-Max scaled frame
print("Min-Max Normalized Data:", X_min_max_df.head(), sep="\n")

Min-Max Normalized Data:
         ID  Sepal.Length  Sepal.Width  Petal.Length  Petal.Width
0  0.000000      0.222222     0.625000      0.067797     0.041667
1  0.006711      0.166667     0.416667      0.067797     0.041667
2  0.013423      0.111111     0.500000      0.050847     0.041667
3  0.020134      0.083333     0.458333      0.084746     0.041667
4  0.026846      0.194444     0.666667      0.067797     0.041667


# 2. Z-Score Normalization (Standardization)

![2.png](attachment:2.png)

In [14]:
from sklearn.preprocessing import StandardScaler

# Step 2: Z-Score Normalization (Standardization)
# Note: this rebinds `scaler`, replacing the Min-Max scaler created earlier.
scaler = StandardScaler()
X_zscore_normalized = scaler.fit_transform(X)

# Wrap the scaled array in a DataFrame that reuses X's column names
X_zscore_df = pd.DataFrame(data=X_zscore_normalized, columns=X.columns)

# Show a caption (preceded by a blank line) and the first five rows
print("\nZ-Score Normalized Data:", X_zscore_df.head(), sep="\n")



Z-Score Normalized Data:
         ID  Sepal.Length  Sepal.Width  Petal.Length  Petal.Width
0 -1.720542     -0.900681     1.019004     -1.340227    -1.315444
1 -1.697448     -1.143017    -0.131979     -1.340227    -1.315444
2 -1.674353     -1.385353     0.328414     -1.397064    -1.315444
3 -1.651258     -1.506521     0.098217     -1.283389    -1.315444
4 -1.628164     -1.021849     1.249201     -1.340227    -1.315444


# 3. Normalization by Decimal Scaling


In [15]:
import numpy as np

# Step 3: Apply Decimal Scaling Normalization
def decimal_scaling_normalization(X):
    """Normalize each column of a DataFrame by decimal scaling.

    Every column is divided by 10**j, where j is the smallest integer for
    which the column's maximum absolute value becomes strictly less than 1.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric data; each column is scaled independently.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``X``. Columns whose
        maximum absolute value is 0 are returned unchanged (j = 0).
    """
    # Largest magnitude per column decides how far the decimal point moves.
    abs_max = X.abs().max()

    # Mask all-zero columns: log10(0) is -inf, which would turn the divisor
    # into 0 and the whole column into NaN.
    nonzero_max = abs_max.where(abs_max > 0)

    # floor(log10(m)) + 1 is the smallest j with m / 10**j < 1. Using
    # ceil(log10(m)) instead is off by one when m is an exact power of ten
    # (e.g. m = 10 gives j = 1 and a scaled maximum of exactly 1.0).
    exponents = (np.floor(np.log10(nonzero_max)) + 1).fillna(0)

    # Division aligns the per-column divisors with X's columns.
    X_decimal_scaled = X / (10.0 ** exponents)

    return X_decimal_scaled

# Build the DataFrame handed to the scaling function, keeping X's column names
X_df = pd.DataFrame(data=X, columns=X.columns)

# Decimal-scale every feature column
X_decimal_scaled_df = decimal_scaling_normalization(X_df)

# Show a caption (preceded by a blank line) and the first five rows
print("\nDecimal Scaling Normalized Data:", X_decimal_scaled_df.head(), sep="\n")



Decimal Scaling Normalized Data:
      ID  Sepal.Length  Sepal.Width  Petal.Length  Petal.Width
0  0.001          0.51         0.35          0.14         0.02
1  0.002          0.49         0.30          0.14         0.02
2  0.003          0.47         0.32          0.13         0.02
3  0.004          0.46         0.31          0.15         0.02
4  0.005          0.50         0.36          0.14         0.02


Summary of What Each Step Does:
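Min-Max Normalization: Rescales each feature linearly to the range [0, 1] using that feature's minimum and maximum values. It helps when features need to share a common bounded range, but because the range is set by the extreme values it is sensitive to outliers.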

Z-Score Normalization: Transforms the data such that the mean is 0 and the standard deviation is 1. This is useful when the data follows a Gaussian distribution or when you want to remove the effect of different scales.

Decimal Scaling: This technique divides each feature by 10^j, where j is the smallest integer that makes the feature's maximum absolute value less than 1; in effect it shifts the decimal point of the values.