In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
from numpy import nan

import matplotlib.pyplot as plt

In [2]:
# Load the training and test CSV files from the notebook's working directory.
train_csv, test_csv = './2.balance_train.csv', './0.test.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [3]:
# Show (rows, columns) for the training frame, then the test frame.
for frame in (df_train, df_test):
    print(frame.shape)

(19337, 22)
(6000, 21)


In [4]:
# Display the dtype of every training column to see which ones are still non-numeric (object).
df_train.dtypes

job_level                                 object
job_duration_in_current_job_level        float64
person_level                              object
job_duration_in_current_person_level     float64
job_duration_in_current_branch           float64
Employee_type                             object
gender                                     int64
age                                        int64
marital_status_maried(Y/N)                object
number_of_dependences                      int64
Education_level                           object
GPA                                      float64
year_graduated                             int64
job_duration_from_training                 int64
branch_rotation                            int64
job_rotation                               int64
assign_of_otherposition                    int64
annual leave                               int64
sick_leaves                                int64
Last_achievement_%                       float64
Achievement_above_10

There are 21 attributes with different value types: int, float, and object.
Only the object-typed columns need to be converted to integer types.

# job_level                                 

In [5]:
# List the distinct raw job_level labels in the training frame, then the test frame.
for frame in (df_train, df_test):
    print(frame['job_level'].unique())

['JG05' 'JG04' 'JG03']
['JG04' 'JG05' 'JG03' 'JG06']


In [6]:
# Map each job-grade label to the integer in its suffix. "JG06" is included
# because it occurs in the test frame even though the training frame lacks it.
job_level_codes = {"JG05": 5, "JG04": 4, "JG03": 3, "JG06": 6}

for frame in (df_train, df_test):
    frame['job_level'] = frame['job_level'].replace(job_level_codes)
    # Echo the encoded values so the result can be checked by eye.
    print(frame['job_level'].unique())

[5 4 3]
[4 5 3 6]


# person_level                              

In [7]:
# List the distinct raw person_level labels in the training frame, then the test frame.
for frame in (df_train, df_test):
    print(frame['person_level'].unique())

['PG06' 'PG03' 'PG04' 'PG02' 'PG05' 'PG01' 'PG08' 'PG07']
['PG03' 'PG04' 'PG06' 'PG07' 'PG05' 'PG01' 'PG02']


In [8]:
# Build the label -> integer table once: 'PG01' -> 1 through 'PG08' -> 8.
person_level_codes = {'PG0' + str(level): level for level in range(1, 9)}

for frame in (df_train, df_test):
    frame['person_level'] = frame['person_level'].replace(person_level_codes)
    # Echo the encoded values so the result can be checked by eye.
    print(frame['person_level'].unique())

[6 3 4 2 5 1 8 7]
[3 4 6 7 5 1 2]


# Employee_type                             

In [9]:
# List the distinct raw Employee_type labels in the training frame, then the test frame.
for frame in (df_train, df_test):
    print(frame['Employee_type'].unique())

['RM_type_A' 'RM_type_B' 'RM_type_C']
['RM_type_A' 'RM_type_B' 'RM_type_C']


In [10]:
# Encode the three employee-type labels as the integers 1, 2 and 3.
employee_type_codes = {"RM_type_A": 1, "RM_type_B": 2, "RM_type_C": 3}

for frame in (df_train, df_test):
    frame['Employee_type'] = frame['Employee_type'].replace(employee_type_codes)
    # Echo the encoded values so the result can be checked by eye.
    print(frame['Employee_type'].unique())

[1 2 3]
[1 2 3]


In [13]:
# Replace the textual gender labels with integer codes in both frames.
# NOTE(review): the dtype listing reports gender as int64 and the recorded
# output below shows the values 1 and 2, so these string replacements appear
# to match nothing -- confirm the intended encoding of this column.
gender_codes = (("Male", 1), ("Female", 0))

for frame in (df_train, df_test):
    for label, code in gender_codes:
        frame['gender'] = frame['gender'].replace(label, code)

for frame in (df_train, df_test):
    print(frame['gender'].unique())

[2 1]
[1 2]


# marital_status_maried(Y/N)

In [14]:
# List the distinct raw marital-status flags in the training frame, then the test frame.
for frame in (df_train, df_test):
    print(frame['marital_status_maried(Y/N)'].unique())

['Y' 'N']
['N' 'Y']


In [15]:
# Turn the Y/N married flag into 1/0.
married_col = 'marital_status_maried(Y/N)'
married_codes = {"Y": 1, "N": 0}

for frame in (df_train, df_test):
    frame[married_col] = frame[married_col].replace(married_codes)
    # Echo the encoded values so the result can be checked by eye.
    print(frame[married_col].unique())

[1 0]
[0 1]


# Education_level                           

In [16]:
# List the distinct raw Education_level labels in the training frame, then the test frame.
for frame in (df_train, df_test):
    print(frame['Education_level'].unique())

['level_1' 'level_3' 'level_4' 'level_5' 'level_0' 'level_2']
['level_3' 'level_4' 'level_1' 'level_5' 'level_0' 'level_2']


In [17]:
def convert_edu_level(x):
    """Convert an education-level label such as ``"level_3"`` to its integer code.

    Parameters
    ----------
    x : object
        A raw ``Education_level`` cell value. The labels ``"level_0"`` through
        ``"level_5"`` are recognized.

    Returns
    -------
    int or float
        The integer 0-5 for a recognized label; ``nan`` for anything else, so
        unknown values surface as missing data instead of ``None``.
    """
    level_codes = {
        "level_0": 0,
        "level_1": 1,
        "level_2": 2,
        "level_3": 3,
        "level_4": 4,
        "level_5": 5,
    }
    try:
        return level_codes.get(x, nan)
    except TypeError:
        # Unhashable values (e.g. a list) cannot be looked up and are never a
        # valid label, so they are reported as missing too.
        return nan

In [18]:
# Run the label -> integer conversion over Education_level in both frames,
# then echo the encoded values so the result can be checked by eye.
for frame in (df_train, df_test):
    frame['Education_level'] = frame['Education_level'].apply(convert_edu_level)

for frame in (df_train, df_test):
    print(frame['Education_level'].unique())

[1 3 4 5 0 2]
[3 4 1 5 0 2]


# year_graduated                            

In [19]:
# List the distinct raw year_graduated values in the training frame, then the test frame.
for frame in (df_train, df_test):
    print(frame['year_graduated'].unique())

[1987 2009 1988 2007 2004 1985 2013 1993 2006 1984 1989 1986 2011 2010
 1994 1991 1992 2018 2014 2008 2001 2002 2017 2012 2005 2015 1996 2003
 1995 1990 2000 1998 1999 2016 1997 2019 1983 1982]
[2009 2014 2011 2007 2004 2012 2015 2008 2010 2017 2002 2013 2006 2005
 2003 1986 2001 2016 1993 1995 1994 1999 2000 1990 1996 1992 1988 1998
 1989 2018 1987 1997 1991 2020 1984 2019 1985]


In [20]:
def filter_year(x, reference_year=2021, min_year=1945):
    """Convert a calendar year into the number of years elapsed since it.

    Parameters
    ----------
    x : object
        Raw year value; anything ``int()`` accepts (int, float, numeric str).
    reference_year : int, optional
        Year the elapsed time is measured against, and the latest year
        accepted as valid. Defaults to 2021.
    min_year : int, optional
        Earliest year accepted as valid (inclusive). Defaults to 1945.

    Returns
    -------
    int or float
        ``reference_year - year`` when ``min_year <= year <= reference_year``;
        ``nan`` when the value is out of range or cannot be read as an integer.
    """
    try:
        year = int(x)
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric text, NaN and infinity cannot be read as a year.
        return nan

    # Both bounds are inclusive so the boundary year itself is handled
    # explicitly rather than falling through and yielding None.
    if min_year <= year <= reference_year:
        return reference_year - year
    return nan  # make invalid data into nan/empty instead

In [21]:
# Replace each graduation year with the value filter_year derives from it,
# then echo the converted values so the result can be checked by eye.
for frame in (df_train, df_test):
    frame['year_graduated'] = frame['year_graduated'].apply(filter_year)

for frame in (df_train, df_test):
    print(frame['year_graduated'].unique())

[34 12 33 14 17 36  8 28 15 37 32 35 10 11 27 30 29  3  7 13 20 19  4  9
 16  6 25 18 26 31 21 23 22  5 24  2 38 39]
[12  7 10 14 17  9  6 13 11  4 19  8 15 16 18 35 20  5 28 26 27 22 21 31
 25 29 33 23 32  3 34 24 30  1 37  2 36]


# age

In [25]:
# List the distinct raw values of the age column in the training frame, then the test frame.
for frame in (df_train, df_test):
    print(frame['age'].unique())

[1967 1991 1989 1986 1966 1974 1988 1968 1965 1969 1993 1976 1973 1963
 1970 1971 1990 1987 1979 1985 1980 1992 1984 1994 1983 1981 1982 1964
 1972 1977 1975 1978 1996 1995 1997]
[1988 1991 1989 1990 1985 1982 1983 1993 1987 1992 1986 1978 1981 1972
 1984 1980 1979 1974 1967 1975 1971 1976 1970 1964 1968 1963 1977 1973
 1965 1969 1966 1994 1995]


In [26]:
def filter_age(x, reference_year=2021, min_year=1945, max_year=2004):
    """Convert a year stored in the ``age`` column into an age in years.

    The raw column holds four-digit years (presumably birth years); the age is
    computed as ``reference_year - year``.

    Parameters
    ----------
    x : object
        Raw year value; anything ``int()`` accepts (int, float, numeric str).
    reference_year : int, optional
        Year the age is measured against. Defaults to 2021.
    min_year : int, optional
        Earliest year accepted as valid (inclusive). Defaults to 1945.
    max_year : int, optional
        Latest year accepted as valid (inclusive). Defaults to 2004.

    Returns
    -------
    int or float
        ``reference_year - year`` when ``min_year <= year <= max_year``;
        ``nan`` when the value is out of range or cannot be read as an integer.
    """
    try:
        year = int(x)
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric text, NaN and infinity cannot be read as a year.
        return nan

    # Both bounds are inclusive so the boundary year itself is handled
    # explicitly rather than falling through and yielding None.
    if min_year <= year <= max_year:
        return reference_year - year
    return nan  # make invalid data into nan/empty instead

In [27]:
# Replace each raw year in the age column with the age filter_age derives from it,
# then echo the converted values so the result can be checked by eye.
for frame in (df_train, df_test):
    frame['age'] = frame['age'].apply(filter_age)

for frame in (df_train, df_test):
    print(frame['age'].unique())

[54 30 32 35 55 47 33 53 56 52 28 45 48 58 51 50 31 34 42 36 41 29 37 27
 38 40 39 57 49 44 46 43 25 26 24]
[33 30 32 31 36 39 38 28 34 29 35 43 40 49 37 41 42 47 54 46 50 45 51 57
 53 58 44 48 56 52 55 27 26]


In [28]:
# Persist both converted frames as CSV, without writing the index column.
outputs = ((df_train, "3.num_train.csv"), (df_test, "3.num_test.csv"))
for frame, out_path in outputs:
    frame.to_csv(out_path, index=False)