In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
import sklearn.preprocessing
from sklearn.preprocessing import MinMaxScaler

For normalization I used MinMaxScaler, which rescales every feature into the range 0-1.

In [2]:
# Read the train and test CSV files into two separate frames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./5.complete_train.csv', './5.complete_test.csv')
)

In [3]:
# Row/column counts of the train frame, then of the test frame (one per line).
print(df_train.shape, df_test.shape, sep="\n")

(19337, 22)
(6000, 22)


In [4]:
# Stack the training rows on top of the test rows so both pass through the
# same preprocessing steps.
# ignore_index=True gives the merged frame a fresh 0..n-1 index. Without it the
# index labels of df_test repeat those of df_train, which makes label-based
# selection and column-wise (axis=1) alignment on the merged data ambiguous.
df_merge_data = pd.concat([df_train, df_test], ignore_index=True)
print(df_merge_data.shape)

(25337, 22)


In [5]:
# Pull the label column out of the merged frame: `y` receives the label values
# and df_merge_data is left holding only the feature columns.
y = df_merge_data.pop("Best Performance")

In [6]:
# Remember the feature column names; the scaler works on a bare array, so the
# names are needed again when the scaled values are wrapped in a DataFrame.
attr_names = df_merge_data.columns.tolist()
print(attr_names)

['job_level', 'job_duration_in_current_job_level', 'person_level', 'job_duration_in_current_person_level', 'job_duration_in_current_branch', 'Employee_type', 'gender', 'age', 'marital_status_maried(Y/N)', 'number_of_dependences', 'Education_level', 'GPA', 'year_graduated', 'job_duration_from_training', 'branch_rotation', 'job_rotation', 'assign_of_otherposition', 'annual leave', 'sick_leaves', 'Last_achievement_%', 'Achievement_above_100%_during3quartal']


In [7]:
# Preview the first five rows of the un-scaled feature frame.
df_merge_data.head(n=5)

Unnamed: 0,job_level,job_duration_in_current_job_level,person_level,job_duration_in_current_person_level,job_duration_in_current_branch,Employee_type,gender,age,marital_status_maried(Y/N),number_of_dependences,...,GPA,year_graduated,job_duration_from_training,branch_rotation,job_rotation,assign_of_otherposition,annual leave,sick_leaves,Last_achievement_%,Achievement_above_100%_during3quartal
0,5,1.31,6,1.0,1.385641,1,2,54,1,2.0,...,3.16,34,5.0,3.0,6.0,0.0,4.0,0.0,47.42,0.0
1,4,1.118034,3,1.118034,0.412311,2,2,30,0,0.0,...,3.16,12,3.0,2.0,2.0,0.0,3.0,0.0,41.45,0.0
2,5,1.31,6,1.0,1.224745,1,2,54,1,3.0,...,3.16,33,5.0,5.0,6.0,0.0,2.0,0.0,53.71,0.0
3,4,1.224745,3,1.224745,1.153256,2,1,30,0,0.0,...,3.16,12,2.0,2.0,2.0,0.0,2.0,1.0,45.16,0.0
4,4,1.224745,3,1.224745,1.118034,2,2,32,1,1.0,...,3.16,14,2.0,2.0,2.0,0.0,2.0,0.0,97.3,1.0


In [8]:
# Take the feature values as a plain NumPy array, which is what the scaler is fed.
x = df_merge_data.values
print(x)

[[ 5.          1.31        6.         ...  0.         47.42
   0.        ]
 [ 4.          1.11803399  3.         ...  0.         41.45
   0.        ]
 [ 5.          1.31        6.         ...  0.         53.71
   0.        ]
 ...
 [ 4.          1.38564065  3.         ...  0.         13.02
   0.        ]
 [ 5.          1.38564065  5.         ...  0.         82.26
   0.        ]
 [ 4.          1.35277493  3.         ...  0.         66.62
   0.        ]]


In [9]:
# Learn the per-feature min/max from the TRAINING rows only, then apply that
# same transformation to every row. Fitting on the merged train+test matrix
# would leak test-set statistics into the features the model is trained on.
# `x` was built from pd.concat([df_train, df_test]), so its first
# len(df_train) rows are exactly the training rows.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(x[:len(df_train)])
# NOTE: a test row can now land slightly outside [0, 1] when its value lies
# beyond the range observed in the training data.
# The name is kept for the cells below; the array holds train AND test rows.
X_train_minmax = min_max_scaler.transform(x)
print(X_train_minmax)

[[0.66666667 0.48518402 0.71428571 ... 0.         0.31183333 0.        ]
 [0.33333333 0.21970541 0.28571429 ... 0.         0.26208333 0.        ]
 [0.66666667 0.48518402 0.71428571 ... 0.         0.36425    0.        ]
 ...
 [0.33333333 0.58979095 0.28571429 ... 0.         0.02516667 0.        ]
 [0.66666667 0.58979095 0.57142857 ... 0.         0.60216667 0.        ]
 [0.33333333 0.54433943 0.28571429 ... 0.         0.47183333 0.        ]]


In [10]:
# Wrap the scaled array in a DataFrame again, restoring the feature names.
normalised_data = pd.DataFrame(data=X_train_minmax, columns=attr_names)

In [11]:
# Sanity check: (rows, columns) of the scaled feature frame.
normalised_data.shape

(25337, 21)

In [12]:
# Sanity check: the label vector should have as many entries as
# normalised_data has rows.
y.shape

(25337,)

In [13]:
# Preview the first five rows of the scaled feature frame.
normalised_data.head(n=5)

Unnamed: 0,job_level,job_duration_in_current_job_level,person_level,job_duration_in_current_person_level,job_duration_in_current_branch,Employee_type,gender,age,marital_status_maried(Y/N),number_of_dependences,...,GPA,year_graduated,job_duration_from_training,branch_rotation,job_rotation,assign_of_otherposition,annual leave,sick_leaves,Last_achievement_%,Achievement_above_100%_during3quartal
0,0.666667,0.485184,0.714286,0.0,0.69282,0.0,1.0,0.882353,1.0,0.4,...,0.563218,0.868421,0.375,0.333333,0.833333,0.0,0.444444,0.0,0.311833,0.0
1,0.333333,0.219705,0.285714,0.194699,0.206155,0.5,1.0,0.176471,0.0,0.0,...,0.563218,0.289474,0.125,0.166667,0.166667,0.0,0.333333,0.0,0.262083,0.0
2,0.666667,0.485184,0.714286,0.0,0.612372,0.0,1.0,0.882353,1.0,0.6,...,0.563218,0.842105,0.375,0.666667,0.833333,0.0,0.222222,0.0,0.36425,0.0
3,0.333333,0.367281,0.285714,0.370721,0.576628,0.5,0.0,0.176471,0.0,0.0,...,0.563218,0.289474,0.0,0.166667,0.166667,0.0,0.222222,0.5,0.293,0.0
4,0.333333,0.367281,0.285714,0.370721,0.559017,0.5,1.0,0.235294,1.0,0.2,...,0.563218,0.342105,0.0,0.166667,0.166667,0.0,0.222222,0.0,0.7275,0.5


In [14]:
# Preview the first five label values.
y.head(n=5)

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: Best Performance, dtype: float64

In [15]:
# Give the features and the label the same fresh 0..n-1 index (discarding the
# old one) so the column-wise concat below pairs them row by row.
normalised_data = normalised_data.reset_index(drop=True)
y = y.reset_index(drop=True)

In [16]:
# Append the label as the last column of the scaled feature frame.
result = pd.concat(objs=[normalised_data, y], axis="columns")
result.shape

(25337, 22)

In [17]:
# Preview the first five rows of the scaled features together with the label.
result.head(n=5)

Unnamed: 0,job_level,job_duration_in_current_job_level,person_level,job_duration_in_current_person_level,job_duration_in_current_branch,Employee_type,gender,age,marital_status_maried(Y/N),number_of_dependences,...,year_graduated,job_duration_from_training,branch_rotation,job_rotation,assign_of_otherposition,annual leave,sick_leaves,Last_achievement_%,Achievement_above_100%_during3quartal,Best Performance
0,0.666667,0.485184,0.714286,0.0,0.69282,0.0,1.0,0.882353,1.0,0.4,...,0.868421,0.375,0.333333,0.833333,0.0,0.444444,0.0,0.311833,0.0,1.0
1,0.333333,0.219705,0.285714,0.194699,0.206155,0.5,1.0,0.176471,0.0,0.0,...,0.289474,0.125,0.166667,0.166667,0.0,0.333333,0.0,0.262083,0.0,1.0
2,0.666667,0.485184,0.714286,0.0,0.612372,0.0,1.0,0.882353,1.0,0.6,...,0.842105,0.375,0.666667,0.833333,0.0,0.222222,0.0,0.36425,0.0,1.0
3,0.333333,0.367281,0.285714,0.370721,0.576628,0.5,0.0,0.176471,0.0,0.0,...,0.289474,0.0,0.166667,0.166667,0.0,0.222222,0.5,0.293,0.0,1.0
4,0.333333,0.367281,0.285714,0.370721,0.559017,0.5,1.0,0.235294,1.0,0.2,...,0.342105,0.0,0.166667,0.166667,0.0,0.222222,0.0,0.7275,0.5,1.0


In [18]:
# Undo the merge by position: the first df_train.shape[0] rows are the
# training set, everything after them is the test set.
df_train_new = result.iloc[:df_train.shape[0]]
df_test_new = result.iloc[df_train.shape[0]:]

In [19]:
# Shapes of the re-split frames; they should match the shapes of the original
# df_train / df_test.
print(df_train_new.shape, df_test_new.shape, sep="\n")

(19337, 22)
(6000, 22)


In [20]:
# Write both scaled splits to disk; index=False keeps the row index out of the files.
df_train_new.to_csv(path_or_buf="6.normalized_train.csv", index=False)
df_test_new.to_csv(path_or_buf="6.normalized_test.csv", index=False)