In [22]:
import pandas as pd
df=pd.read_csv(r'C:\Users\Tanvi\Desktop\Datasets\SampleFile_FeatureScaling.csv')
df.head()

Unnamed: 0,LotArea,MSSubClass
0,8450,60
1,9600,20
2,11250,60
3,9550,70
4,14260,60


# Train Test Split

In [50]:
# Before applying any feature Scaling it is very important to always split our data into Train and Test 
from sklearn.model_selection import train_test_split
Xtrain,Xtest,Ytrain,Ytest=train_test_split(df['LotArea'],df['MSSubClass'],test_size=0.2,random_state=10)
# Test szie=0.2 means 20% of the data will be used for testing and the rest 80% will be used for training
# If we run this query multiple time and  we print the Xtrain model , then each time the model will return us new values 
# If we don't want our values to change we put thr random state =10.Here 10 is just a random number we use to fix our split.


In [51]:
len(Xtrain)

1168

In [52]:
len(Xtest)

292

In [53]:
len(Ytrain)

1168

In [54]:
len(Ytest)

292

In [55]:
Xtrain

1216     8930
339     12400
1057    29959
482      2500
529     32668
        ...  
1393    10800
1344    11103
527     14948
1149     9000
1289    11065
Name: LotArea, Length: 1168, dtype: int64

# Applying Absolute Maximum Scaling 

In [7]:
# For this . we first need to evaluate the absolute maximum values of the columns
import numpy as np
maxval=np.max(np.abs(df))
print(maxval)

# Then we need to subtract these values from the data and then divide its results from maxval as well
# SO formula will be like df-maxval/maxval
print((df-maxval)/maxval)

# This will return us a range of values between 0  and 1. 

LotArea       215245
MSSubClass       190
dtype: int64
       LotArea  MSSubClass
0    -0.960742   -0.684211
1    -0.955400   -0.894737
2    -0.947734   -0.684211
3    -0.955632   -0.631579
4    -0.933750   -0.684211
...        ...         ...
1455 -0.963219   -0.684211
1456 -0.938791   -0.894737
1457 -0.957992   -0.631579
1458 -0.954856   -0.894737
1459 -0.953834   -0.894737

[1460 rows x 2 columns]


# Min Max Scaling 

In [12]:
# For this we need to first find the min and max values of a column , then subtract min value from the original
# datapoint X and then divide this result by the difference between min and max
# values. So formula will be like (X-min)/min-max  where X is the original data point
# As we are finding the minimum and maximum values for a column , the method of scaling is prone to outliers though it will 
# return us a result in range of 0 to 1
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaled_data=scaler.fit_transform(df)
print(scaled_data)

[[0.0334198  0.23529412]
 [0.03879502 0.        ]
 [0.04650728 0.23529412]
 ...
 [0.03618687 0.29411765]
 [0.03934189 0.        ]
 [0.04037019 0.        ]]


In [13]:
# Converting this scaled data to a dataframe
Scaled_df=pd.DataFrame(scaled_data,columns=df.columns)
print(Scaled_df)

       LotArea  MSSubClass
0     0.033420    0.235294
1     0.038795    0.000000
2     0.046507    0.235294
3     0.038561    0.294118
4     0.060576    0.235294
...        ...         ...
1455  0.030929    0.235294
1456  0.055505    0.000000
1457  0.036187    0.294118
1458  0.039342    0.000000
1459  0.040370    0.000000

[1460 rows x 2 columns]


# Normalization

In [15]:
# In this method we will be subtracting the mean from X which is original data point and then divide the result by the
# difference between the mix and max values. So the formula will be like (X-mean)/min-max.This will return a range between 0
# and 1.

from sklearn.preprocessing import Normalizer
scaler=Normalizer()
scaled_data=scaler.fit_transform(df)
scaled_df=pd.DataFrame(scaled_data,columns=df.columns)

In [16]:
scaled_df.head()

Unnamed: 0,LotArea,MSSubClass
0,0.999975,0.0071
1,0.999998,0.002083
2,0.999986,0.005333
3,0.999973,0.00733
4,0.999991,0.004208


# Standardization

In [19]:
# In this method we will be subtracting the Mean from X which is the original data value, but this time we will be dividing 
# the result by the standard deviation. This fromula is the Z score formula of a standard Normal Distribution . The difference
# between Normalization and Standardization is the standardization will convert the values to a standard normal form using
# the z-score formula where mean =0 and std=1 while Normalization will just convert the data into a scale which will range 
# between 0 and 1. 
# Standardization is also called Z-score Normalization.

from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
scaled_data=scaler.fit_transform(df)
scaled_df=pd.DataFrame(scaled_data,columns=df.columns)

In [20]:
scaled_df.head()

Unnamed: 0,LotArea,MSSubClass
0,-0.207142,0.073375
1,-0.091886,-0.872563
2,0.07348,0.073375
3,-0.096897,0.309859
4,0.375148,0.073375


# Robust Scaling

In [23]:
# In this scaling we need to first calculate the Median and IQR
# Then we need to subtract Median from X which is our original data point and divide the result by IQR
# So formula will be like (X-Median)/IQR.

from sklearn.preprocessing import RobustScaler

scaler=RobustScaler()
scaled_data=scaler.fit_transform(df)
scaled_df=pd.DataFrame(scaled_data,columns=df.columns)

In [24]:
scaled_df.head()

Unnamed: 0,LotArea,MSSubClass
0,-0.254076,0.2
1,0.030015,-0.6
2,0.437624,0.2
3,0.017663,0.4
4,1.181201,0.2
