### Feature Scaling

##### 1. Standardization
done with z-score

In [2]:
import seaborn as sns
import pandas as pd
df = sns.load_dataset('tips')
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[['total_bill','tip']])
scaled_df = pd.DataFrame(scaled_data, columns = ['total_bill', 'tip'])
scaled_df.head()


Unnamed: 0,total_bill,tip
0,-0.314711,-1.439947
1,-1.063235,-0.969205
2,0.13778,0.363356
3,0.438315,0.225754
4,0.540745,0.44302


##### 2. Min-Max Scaling


In [3]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_data1 = scaler.fit_transform(df[['total_bill','tip']])
scaled_df1 = pd.DataFrame(scaled_data1, columns= ['total_bill', 'tip'])
scaled_df1.head()

Unnamed: 0,total_bill,tip
0,0.291579,0.001111
1,0.152283,0.073333
2,0.375786,0.277778
3,0.431713,0.256667
4,0.450775,0.29


##### 3. Normalization

In [4]:
from sklearn.preprocessing import Normalizer

scaler = Normalizer()
scaled_data = scaler.fit_transform(df[['total_bill','tip']])
scaled_df = pd.DataFrame(scaled_data, columns= ['total_bill', 'tip'])
print(scaled_df.head())

   total_bill       tip
0    0.998238  0.059342
1    0.987357  0.158512
2    0.986407  0.164323
3    0.990372  0.138435
4    0.989395  0.145251


#### 4. Absolute Maximum Scaling

In [5]:
total_bill = df['total_bill']
total_bill = total_bill/max(total_bill)
total_bill.head()


0    0.334383
1    0.203503
2    0.413501
3    0.466050
4    0.483960
Name: total_bill, dtype: float64

#### 5. Robust Scaling

In [6]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
scaled_data = scaler.fit_transform(df[['total_bill','tip']])
scaled_df = pd.DataFrame(scaled_data, columns=['total_bill', 'tip'])
print(scaled_df.head())

   total_bill     tip
0   -0.074675 -1.2096
1   -0.691558 -0.7936
2    0.298237  0.3840
3    0.545918  0.2624
4    0.630334  0.4544


#### 6. Unit Vector

In [10]:
from sklearn.preprocessing import normalize

scaled_df = pd.DataFrame(normalize(df[['total_bill','tip']]),columns= ['total_bill', 'tip'])
# scaled_data = scaler.fit_transform(df[['total_bill','tip']])
# scaled_df = pd.DataFrame(scaled_data, columns= ['total_bill', 'tip'])
print(scaled_df.head())

   total_bill       tip
0    0.998238  0.059342
1    0.987357  0.158512
2    0.986407  0.164323
3    0.990372  0.138435
4    0.989395  0.145251


### Data Encoding

to convert categorical feature into numerical faeture to be used to train ML Model

#### 1. simple encoding for binary features

In [13]:

# data preprocessing
import pandas as pd  
# for linear calculations
import numpy as np    
# Plotting Graphs
import seaborn as sns 
df = pd.read_csv("Encoding Data.csv")
# displaying top 10 results
df.head()   

Unnamed: 0,id,bin_1,bin_2,nom_0,ord_2
0,0,F,N,Red,Hot
1,1,F,Y,Blue,Warm
2,2,F,N,Blue,Cold
3,3,F,N,Green,Warm
4,4,T,N,Red,Cold


In [26]:
# you can always use simple mapping on binary features.
df['bin_1'] = df['bin_1'].apply(
    lambda x: 1 if x == 'T' else (0 if x == 'F' else None))
df['bin_2'] = df['bin_2'].apply(
    lambda x: 1 if x == 'Y' else (0 if x == 'N' else None))
df.head()

Unnamed: 0,id,bin_1,bin_2,nom_0,ord_2
0,0,0,0,Red,1
1,1,0,1,Blue,2
2,2,0,0,Blue,0
3,3,0,0,Green,2
4,4,1,0,Red,0


#### 2. Label Encoding 
Label encoding algorithm is quite simple and it considers an order for encoding, Hence can be used for encoding ordinal data. 

In [29]:
# labelEncoder present in scikitlearn library
from sklearn.preprocessing import LabelEncoder
df = pd.read_csv("Encoding Data.csv")
le = LabelEncoder()
df['ord_2_new'] = le.fit_transform(df['ord_2'])
df.head()

Unnamed: 0,id,bin_1,bin_2,nom_0,ord_2,ord_2_new
0,0,F,N,Red,Hot,1
1,1,F,Y,Blue,Warm,2
2,2,F,N,Blue,Cold,0
3,3,F,N,Green,Warm,2
4,4,T,N,Red,Cold,0


#### 3. One-Hot Encoding
To overcome the Disadvantage of Label Encoding as it considers some hierarchy in the columns which can be misleading to nominal features present in the data. we can use the One-Hot Encoding strategy. 
One-hot encoding is processed in 2 steps:

Splitting of categories into different columns.
Put ‘0 for others and ‘1’ as an indicator for the appropriate column.

Disadvantages:
1. creates sparse matrix which usually leads to over-fitting
2. it will increase the number of features

In [17]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
# transforming the column after fitting
enc = enc.fit_transform(df[['nom_0']]).toarray()
# converting arrays to a dataframe
encoded_colm = pd.DataFrame(enc)
# concatenating dataframes
df = pd.concat([df, encoded_colm], axis=1)
# removing the encoded column.
df = df.drop(['nom_0'], axis=1)
df.head()

Unnamed: 0,id,bin_1,bin_2,ord_2,0,1,2
0,0,F,N,1,0.0,0.0,1.0
1,1,F,Y,2,1.0,0.0,0.0
2,2,F,N,0,1.0,0.0,0.0
3,3,F,N,2,0.0,1.0,0.0
4,4,T,N,0,0.0,0.0,1.0


#### 4. Frequency Encoding
 We can also encode considering the frequency distribution. This method can be effective at times for nominal features.

In [28]:
# grouping by frequency
df = pd.read_csv("Encoding Data.csv")
fq = df.groupby('nom_0').size()/len(df)
# mapping values to dataframe
df.loc[:, "{}_freq_encode".format('nom_0')] = df['nom_0'].map(fq)
# drop original column.
df = df.drop(['nom_0'], axis=1)
df.head()

Unnamed: 0,id,bin_1,bin_2,ord_2,nom_0_freq_encode
0,0,F,N,Hot,0.5
1,1,F,Y,Warm,0.3
2,2,F,N,Cold,0.3
3,3,F,N,Warm,0.2
4,4,T,N,Cold,0.5


#### 5. Ordinal Encoding
We can use Ordinal Encoding provided in Scikit learn class to encode Ordinal features. It ensures that ordinal nature of the variables is sustained. 

In [32]:

from sklearn.preprocessing import OrdinalEncoder
df = pd.read_csv("Encoding Data.csv")
encoder = OrdinalEncoder(categories=[['Hot', 'Warm', 'Cold']])
# fitting and transforming the dataframe
df["ord_2_new"] = encoder.fit_transform(df[['ord_2']]) 
df.head()

Unnamed: 0,id,bin_1,bin_2,nom_0,ord_2,ord_2_new
0,0,F,N,Red,Hot,0.0
1,1,F,Y,Blue,Warm,1.0
2,2,F,N,Blue,Cold,2.0
3,3,F,N,Green,Warm,1.0
4,4,T,N,Red,Cold,2.0


#### 6. Binary Encoding
Initially, categories are encoded as Integer and then converted into binary code, then the digits from that binary string are placed into separate columns. 

In [None]:
from category_encoders import BinaryEncoder
df = pd.read_csv("Encoding Data.csv")
encoder = BinaryEncoder(cols =['ord_2'], return_df=True) 
# transforming the column after fitting
newdata = encoder.fit_transform(df['ord_2'])
# concatenating dataframe
df = pd.concat([df, newdata], axis = 1) 
# dropping old column 
df = df.drop(['ord_2'], axis = 1)
df.head()

#### 7. HashEncoding
Hashing is the process of converting of a string of characters into a unique hash value with applying a hash function. This process is quite useful as it can deal with a higher number of categorical data and its low memory usage. 

In [None]:
from sklearn.feature_extraction import FeatureHasher
df = pd.read_csv("Encoding Data.csv")
# n_features contains the number of bits you want in your hash value.
h = FeatureHasher(n_features = 3, input_type ='string') 
# transforming the column after fitting
hashed_Feature = h.fit_transform(df['nom_0'])
hashed_Feature = hashed_Feature.toarray()
df = pd.concat([df, pd.DataFrame(hashed_Feature)], axis = 1)
df.head()

#### 8. Target Guided Ordinal Encoding

it is a technique used to encode categorical variables based on their relationship with the terget variable. This encoding technique is useful when we have a categorical variable with a large number of unique categories, and we want to use this variable as a feature in our maching learning model. 

In Target Guided Ordinal Encoding, each category from the categorical data will be assigned an numerical value based on the mean or median ot the traget variable of that category. This creates a monotonic relationship between the categorical variable and the target variable, which can improve the predictive power of our model.

In [48]:
df = pd.DataFrame({
    'city': ['New York', 'London', 'Paris', 'Tokyo', 'New York', 'Paris'],
    'price': [200,150,300,250,180,320]
})
# calculate the mean price for each city
mean_price = df.groupby('city')['price'].mean().to_dict()

# replace each city with its mean price
df['city_encoded'] = df['city'].map(mean_price)
df.head()


Unnamed: 0,city,price,city_encoded
0,New York,200,190.0
1,London,150,150.0
2,Paris,300,310.0
3,Tokyo,250,250.0
4,New York,180,190.0


In [None]:
# importing TargetEncoder
from category_encoders import TargetEncoder
Targetenc = TargetEncoder()
# transforming the column after fitting
values = Targetenc.fit_transform(X = df.nom_0, y = df.Target)
# concatenating values with dataframe
df = pd.concat([df, values], axis = 1)
df.head(10)