In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
# Load the raw dataset from a CSV file (path is relative to the notebook's working directory).
csv_path = 'data/data6.csv'
df = pd.read_csv(csv_path)

In [3]:
# Sanity-check the load by printing the first five rows.
print(df.head(n=5))

   Age    Salary  Gender Department     City   Hire-Date
0   56   32695.0  Female         IT  Chicago  2013-12-16
1   46   78190.0    Male    Finance  Phoenix  2021-07-01
2   32   35258.0    Male    Finance  Houston  2020-09-28
3   60  117538.0  Female         IT  Houston  2010-09-11
4   25   69504.0    Male         IT  Houston  2018-09-19


In [4]:
# Fill missing values in the Salary column with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selection df['Salary']: that chained in-place
# form raises a FutureWarning on pandas 2.x and, with Copy-on-Write enabled
# (the default from pandas 3.0), leaves df unchanged.
df['Salary'] = df['Salary'].fillna(df['Salary'].median())

In [5]:
# Min-Max scaling of Age and Salary; the fitted scaler is kept in
# scaler_minmax and the scaled values are stored in new *_scaled columns.
numeric_features = df[['Age', 'Salary']]
scaler_minmax = MinMaxScaler()
minmax_values = scaler_minmax.fit_transform(numeric_features)
df[['Age_scaled', 'Salary_scaled']] = minmax_values

In [6]:
# Standardization (z-score scaling) of Age and Salary; the fitted scaler is
# kept in scaler_standard and the results are stored in new *_std columns.
numeric_features = df[['Age', 'Salary']]
scaler_standard = StandardScaler()
standardized_values = scaler_standard.fit_transform(numeric_features)
df[['Age_std', 'Salary_std']] = standardized_values

In [7]:
# Results: for each feature, print the raw values next to their Min-Max scaled
# and standardized versions (first five rows).
comparison_views = (
    ['Age', 'Age_scaled', 'Age_std'],
    ['Salary', 'Salary_scaled', 'Salary_std'],
)
for columns in comparison_views:
    print(df[columns].head())

   Age  Age_scaled   Age_std
0   56    0.826087  1.086153
1   46    0.608696  0.367798
2   32    0.304348 -0.637899
3   60    0.913043  1.373495
4   25    0.152174 -1.140748
     Salary  Salary_scaled  Salary_std
0   32695.0       0.020895   -1.660470
1   78190.0       0.402827   -0.330065
2   35258.0       0.042412   -1.585521
3  117538.0       0.733155    0.820585
4   69504.0       0.329908   -0.584069


In [8]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [9]:
# Label-encode Gender into integer codes stored in Gender_Label.
# The fitted encoder is kept in `le` so the code-to-label mapping stays available.
le = LabelEncoder()
gender_codes = le.fit_transform(df['Gender'])
df['Gender_Label'] = gender_codes

In [10]:
# One-hot encode Department into Dept_<value> indicator columns; the original
# Department column is replaced by the indicators.
# dtype=int pins the indicators to 0/1 integers. Without it, pandas >= 2.0
# produces bool (True/False) columns, so the 0/1 output would not reproduce
# across pandas versions.
# NOTE: this cell consumes the Department column, so re-running it without
# reloading df raises a KeyError.
df = pd.get_dummies(df, columns=['Department'], prefix='Dept', dtype=int)

In [11]:
# Hiển thị kết quả
print(df[['Gender', 'Gender_Label']].head())
print(df.filter(like='Dept_').head())

   Gender  Gender_Label
0  Female             0
1    Male             1
2    Male             1
3  Female             0
4    Male             1
   Dept_Finance  Dept_HR  Dept_IT  Dept_Marketing  Dept_Sales
0             0        0        1               0           0
1             1        0        0               0           0
2             1        0        0               0           0
3             0        0        1               0           0
4             0        0        1               0           0


In [12]:
from datetime import datetime

# Convert the hire-date column to datetime dtype so the .dt accessor can be used on it.
hire_dates = pd.to_datetime(df['Hire-Date'])
df['Hire-Date'] = hire_dates

In [14]:
# Compute years of experience as the difference between the current calendar
# year and the hire year.
# NOTE(review): this is a calendar-year difference, so it does not check whether
# the hire anniversary has already passed this year, and the value depends on
# the date the cell is run.
current_year = datetime.now().year
hire_year = df['Hire-Date'].dt.year
df['Experience'] = current_year - hire_year

In [15]:
# Bucket Salary into three levels (Low, Medium, High) using fixed bin edges
# at 0, 50000 and 100000, with no upper bound on the last bin.
salary_bin_edges = [0, 50000, 100000, np.inf]
salary_bin_labels = ['Low', 'Medium', 'High']
df['Salary_Level'] = pd.cut(
    df['Salary'],
    bins=salary_bin_edges,
    labels=salary_bin_labels,
)

In [17]:
# Show the results: hire date with the derived experience, then salary with
# its assigned level (first five rows of each).
result_views = (
    ['Hire-Date', 'Experience'],
    ['Salary', 'Salary_Level'],
)
for columns in result_views:
    print(df[columns].head())

   Hire-Date  Experience
0 2013-12-16          12
1 2021-07-01           4
2 2020-09-28           5
3 2010-09-11          15
4 2018-09-19           7
     Salary Salary_Level
0   32695.0          Low
1   78190.0       Medium
2   35258.0          Low
3  117538.0         High
4   69504.0       Medium
