In [None]:
# importing Libraries
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler



# Toy single-feature sample: five values laid out as one column (shape (5, 1)),
# i.e. the 2-D (n_samples, n_features) layout passed to the scalers below.
data = np.array([18, 22, 30, 45, 80]).reshape(-1, 1)


# Normalization (Min-Max): rescale the feature linearly onto the [0, 1] range.
norm = MinMaxScaler()
data_norm = norm.fit(data).transform(data)


# Scaling (Standardization): subtract the mean and divide by the standard deviation.
scaler = StandardScaler()
data_scaled = scaler.fit(data).transform(data)


# Show both rescaled versions of the same column.
print("Normalized:\n", data_norm)
print("Scaled:\n", data_scaled)

Normalized:
 [[0.        ]
 [0.06451613]
 [0.19354839]
 [0.43548387]
 [1.        ]]
Scaled:
 [[-0.93393309]
 [-0.75604107]
 [-0.40025704]
 [ 0.26683803]
 [ 1.82339317]]


### Feature Engineering 

In [8]:
import pandas as pd


In [9]:
# Read the employee-promotion CSV into a DataFrame and preview its first rows.
# NOTE(review): the path is absolute (filesystem root); confirm it matches the
# environment this notebook is executed in.
df = pd.read_csv('/employee_promotion.csv')
df.head()

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,City,Promotion
0,28,132612,9,2,1,Data Engineer,High School,Seattle,Got Promotion
1,41,116641,8,4,3,Data Scientist,Master's,Chicago,No Promotion
2,36,113811,11,1,6,Web developer,PhD,Chicago,Got Promotion
3,32,102160,2,2,6,Data Engineer,High School,Austin,No Promotion
4,29,101313,7,4,7,Product Manager,Bachelor's,San Francisco,No Promotion


In [12]:
# Print the dataset dimensions as a (rows, columns) tuple.
print(df.shape)

(5000, 9)


In [14]:
# Feature encoding
from sklearn.preprocessing import LabelEncoder

# Label encoding for Promotion (label encoding is mostly used for the target variable).
# LabelEncoder numbers the classes in sorted order, so here
# 'Got Promotion' -> 0 and 'No Promotion' -> 1.
label_Encoder = LabelEncoder()

# Copy the original dataset so that `df` itself stays untouched.
df_encoded = df.copy()
df_encoded['Promotion_Enc'] = label_Encoder.fit_transform(df_encoded['Promotion'])


In [15]:
# Display the frame to inspect the newly added Promotion_Enc column.
df_encoded

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,City,Promotion,Promotion_Enc
0,28,132612,9,2,1,Data Engineer,High School,Seattle,Got Promotion,0
1,41,116641,8,4,3,Data Scientist,Master's,Chicago,No Promotion,1
2,36,113811,11,1,6,Web developer,PhD,Chicago,Got Promotion,0
3,32,102160,2,2,6,Data Engineer,High School,Austin,No Promotion,1
4,29,101313,7,4,7,Product Manager,Bachelor's,San Francisco,No Promotion,1
...,...,...,...,...,...,...,...,...,...,...
4995,38,127658,3,1,10,Data Scientist,High School,Austin,No Promotion,1
4996,39,84840,9,3,4,Data Scientist,Bachelor's,Chicago,Got Promotion,0
4997,34,81007,4,0,4,Web developer,PhD,Austin,Got Promotion,0
4998,41,106947,12,2,10,Data Scientist,Bachelor's,Seattle,No Promotion,1


In [17]:
# Ordinal encoding for Education: the categories have a natural order, so each
# level is mapped to its rank (High School=0, Bachelor's=1, Master's=2, PhD=3).
# Any value missing from the lookup would become NaN in the mapped column.
education_encoded_dict = {
    level: rank
    for rank, level in enumerate(["High School", "Bachelor's", "Master's", "PhD"])
}
df_encoded['Education_Enc'] = df_encoded['Education'].map(education_encoded_dict)
df_encoded


Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,City,Promotion,Promotion_Enc,Education_Enc
0,28,132612,9,2,1,Data Engineer,High School,Seattle,Got Promotion,0,0
1,41,116641,8,4,3,Data Scientist,Master's,Chicago,No Promotion,1,2
2,36,113811,11,1,6,Web developer,PhD,Chicago,Got Promotion,0,3
3,32,102160,2,2,6,Data Engineer,High School,Austin,No Promotion,1,0
4,29,101313,7,4,7,Product Manager,Bachelor's,San Francisco,No Promotion,1,1
...,...,...,...,...,...,...,...,...,...,...,...
4995,38,127658,3,1,10,Data Scientist,High School,Austin,No Promotion,1,0
4996,39,84840,9,3,4,Data Scientist,Bachelor's,Chicago,Got Promotion,0,1
4997,34,81007,4,0,4,Web developer,PhD,Austin,Got Promotion,0,3
4998,41,106947,12,2,10,Data Scientist,Bachelor's,Seattle,No Promotion,1,1


In [18]:
# One-hot encoding for City: replace the single categorical column with one
# indicator column per city, named City_<value>.
#
# pd.get_dummies removes the source column, so running this cell a second time
# used to fail with a KeyError because 'City' no longer existed. The guard makes
# the cell idempotent: the encoding is applied only while the raw column is
# still present, and the frame is displayed either way.
if 'City' in df_encoded.columns:
    df_encoded = pd.get_dummies(df_encoded, columns=['City'], prefix="City")
df_encoded

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,Promotion,Promotion_Enc,Education_Enc,City_Austin,City_Chicago,City_New York,City_San Francisco,City_Seattle
0,28,132612,9,2,1,Data Engineer,High School,Got Promotion,0,0,False,False,False,False,True
1,41,116641,8,4,3,Data Scientist,Master's,No Promotion,1,2,False,True,False,False,False
2,36,113811,11,1,6,Web developer,PhD,Got Promotion,0,3,False,True,False,False,False
3,32,102160,2,2,6,Data Engineer,High School,No Promotion,1,0,True,False,False,False,False
4,29,101313,7,4,7,Product Manager,Bachelor's,No Promotion,1,1,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,38,127658,3,1,10,Data Scientist,High School,No Promotion,1,0,True,False,False,False,False
4996,39,84840,9,3,4,Data Scientist,Bachelor's,Got Promotion,0,1,False,True,False,False,False
4997,34,81007,4,0,4,Web developer,PhD,Got Promotion,0,3,True,False,False,False,False
4998,41,106947,12,2,10,Data Scientist,Bachelor's,No Promotion,1,1,False,False,False,False,True


In [21]:
# Frequency encoding for JobTitle: every title is replaced by the number of rows
# in which it occurs.
job_title_counts = (
    df_encoded['JobTitle']
    .value_counts()
    .to_dict()
)
df_encoded['JobTitle_Freq_Enc'] = df_encoded['JobTitle'].map(job_title_counts)

# Display the title -> count lookup that was used for the encoding.
job_title_counts


{'Product Manager': 750,
 'Web developer': 733,
 'Business Analyst': 730,
 'Data Scientist': 705,
 'Software Engineer': 701,
 'Data Engineer': 699,
 'Data Analyst': 682}

In [22]:
# Display the frame to inspect the newly added JobTitle_Freq_Enc column.
df_encoded


Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,Promotion,Promotion_Enc,Education_Enc,City_Austin,City_Chicago,City_New York,City_San Francisco,City_Seattle,JobTitle_Freq_Enc
0,28,132612,9,2,1,Data Engineer,High School,Got Promotion,0,0,False,False,False,False,True,699
1,41,116641,8,4,3,Data Scientist,Master's,No Promotion,1,2,False,True,False,False,False,705
2,36,113811,11,1,6,Web developer,PhD,Got Promotion,0,3,False,True,False,False,False,733
3,32,102160,2,2,6,Data Engineer,High School,No Promotion,1,0,True,False,False,False,False,699
4,29,101313,7,4,7,Product Manager,Bachelor's,No Promotion,1,1,False,False,False,True,False,750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,38,127658,3,1,10,Data Scientist,High School,No Promotion,1,0,True,False,False,False,False,705
4996,39,84840,9,3,4,Data Scientist,Bachelor's,Got Promotion,0,1,False,True,False,False,False,705
4997,34,81007,4,0,4,Web developer,PhD,Got Promotion,0,3,True,False,False,False,False,733
4998,41,106947,12,2,10,Data Scientist,Bachelor's,No Promotion,1,1,False,False,False,False,True,705


In [24]:
# Target encoding for Education: replace each education level with the mean of
# Promotion_Enc within that level.
# NOTE: Promotion_Enc is 1 for 'No Promotion' and 0 for 'Got Promotion', so these
# means are the share of employees who were NOT promoted, not the promotion rate.
# NOTE(review): the means are computed on the full dataset; if a train/test split
# follows, compute them on the training rows only to avoid target leakage.
education_target_mean = (
    df_encoded
    .groupby('Education')['Promotion_Enc']
    .mean()
    .to_dict()
)
df_encoded['Education_Target_Enc'] = df_encoded['Education'].map(education_target_mean)

# Display the level -> mean lookup that was used for the encoding.
education_target_mean

{"Bachelor's": 0.7185929648241206,
 'High School': 0.6976923076923077,
 "Master's": 0.7188264058679706,
 'PhD': 0.7028928850664582}

In [25]:
# Display the frame to inspect the newly added Education_Target_Enc column.
df_encoded

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,Promotion,Promotion_Enc,Education_Enc,City_Austin,City_Chicago,City_New York,City_San Francisco,City_Seattle,JobTitle_Freq_Enc,Education_Target_Enc
0,28,132612,9,2,1,Data Engineer,High School,Got Promotion,0,0,False,False,False,False,True,699,0.697692
1,41,116641,8,4,3,Data Scientist,Master's,No Promotion,1,2,False,True,False,False,False,705,0.718826
2,36,113811,11,1,6,Web developer,PhD,Got Promotion,0,3,False,True,False,False,False,733,0.702893
3,32,102160,2,2,6,Data Engineer,High School,No Promotion,1,0,True,False,False,False,False,699,0.697692
4,29,101313,7,4,7,Product Manager,Bachelor's,No Promotion,1,1,False,False,False,True,False,750,0.718593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,38,127658,3,1,10,Data Scientist,High School,No Promotion,1,0,True,False,False,False,False,705,0.697692
4996,39,84840,9,3,4,Data Scientist,Bachelor's,Got Promotion,0,1,False,True,False,False,False,705,0.718593
4997,34,81007,4,0,4,Web developer,PhD,Got Promotion,0,3,True,False,False,False,False,733,0.702893
4998,41,106947,12,2,10,Data Scientist,Bachelor's,No Promotion,1,1,False,False,False,False,True,705,0.718593


## Feature Scaling

In [28]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Copy the encoded dataset so that df_encoded itself is not modified.
df_scaled = df_encoded.copy()

# Standard scaler applied to Salary.
# fit_transform learns the scaling parameters and applies them in one step; use
# it on training data only. Test data should go through transform() with the
# already-fitted scaler.
standard_scaler = StandardScaler()
df_scaled['Salary_StandardScaled'] = standard_scaler.fit_transform(df_scaled[["Salary"]])

# Min-max scaler applied to the same Salary column.
min_max_scaler = MinMaxScaler()
df_scaled['Salary_MinMaxScaled'] = min_max_scaler.fit_transform(df_scaled[["Salary"]])

# Compare the raw and the two scaled salary columns on the first few rows.
df_scaled[["Salary", "Salary_StandardScaled", "Salary_MinMaxScaled"]].head()

Unnamed: 0,Salary,Salary_StandardScaled,Salary_MinMaxScaled
0,132612,1.12006,0.826129
1,116641,0.572455,0.666363
2,113811,0.475421,0.638053
3,102160,0.075938,0.521503
4,101313,0.046897,0.51303
