In [40]:
import pandas as pd 
import numpy as np

In [2]:
df= pd.read_csv(r'C:\Data_for_learning_overall\data_machine_learning\hr data\HR_comma_sep.csv')

In [3]:
df

Unnamed: 0,Department,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_at_company,Work_accident,any_promotion_last_5years,salary,left
0,sales,0.38,0.53,2,157,3,0,0,low,1
1,sales,0.80,0.86,5,262,6,0,0,medium,1
2,sales,0.11,0.88,7,272,4,0,0,medium,1
3,sales,0.72,0.87,5,223,5,0,0,low,1
4,sales,0.37,0.52,2,159,3,0,0,low,1
...,...,...,...,...,...,...,...,...,...,...
14994,support,0.40,0.57,2,151,3,0,0,low,1
14995,support,0.37,0.48,2,160,3,0,0,low,1
14996,support,0.37,0.53,2,143,3,0,0,low,1
14997,support,0.11,0.96,6,280,4,0,0,low,1


In [8]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

# Function to select specific columns from DataFrame
class DataFrameSelector:
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.attribute_names]

# Numerical pipeline
numerical_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector(["satisfaction_level", "last_evaluation", "number_project", "average_montly_hours", "time_spend_at_company"])),
    ("std_scaler", StandardScaler()),
])
# Categorical pipeline with a customizable encoder
def categorical_pipeline():
    cat_columns = ["Department", "salary"]
    cat_encoders = {
        "Department": OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
        "salary": OrdinalEncoder(categories=[['low', 'medium', 'high']])
    }

    transformers = []
    for col in cat_columns:
        transformers.append((col + '_encoder', cat_encoders.get(col, OneHotEncoder()), [col]))

    return Pipeline([
        ("select_cat", DataFrameSelector(cat_columns)),
        ("cat_encoder", ColumnTransformer(transformers))
    ])

# Complete preprocessing pipeline using FeatureUnion
preprocess_pipeline = FeatureUnion(transformer_list=[
    ("numerical_pipeline", numerical_pipeline),
    ("categorical_pipeline", categorical_pipeline()),
])
# Example usage:
# transformed_data = preprocess_pipeline.fit_transform(train_data)
preprocess_pipeline

In [43]:
from sklearn.model_selection import train_test_split

# Assuming 'df' is your DataFrame
X = df.drop(columns=['left'])
y = df['left']

# Split the data into training and testing sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Transform features using the preprocessing pipeline on training set
X_train = preprocess_pipeline.fit_transform(X_train)

# Transform features using the preprocessing pipeline on test set
X_test = preprocess_pipeline.transform(X_test)


In [44]:
# Print the shapes
print('-----------------------TẬP TRAIN-----------------------') 
print('kích thước:', X_train.shape) 
print('-----------------------TẬP TEST-----------------------')
print('kích thước:', X_test.shape)

-----------------------TẬP TRAIN-----------------------
kích thước: (11999, 16)
-----------------------TẬP TEST-----------------------
kích thước: (3000, 16)


In [45]:
import time

from sklearn.decomposition import PCA
import pandas as pd 
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

#Khai báo các thuật toán phân lớp sử dụng
dict_classifiers = {
    "Logistic Regression": LogisticRegression(solver='lbfgs', max_iter=5000),
    "Nearest Neighbors": KNeighborsClassifier(),
    "Linear SVM": SVC(gamma = 'auto'),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=18),
    "Neural Net": MLPClassifier(alpha=1),
    "Naive Bayes": GaussianNB()
}

In [50]:
no_classifiers = len(dict_classifiers.keys())

#Xây dựng hàm huấn luyện theo các thuật toán
#Xác định thời gian chạy khi huấn luyện của mỗi thuật toán
#Cho biết Độ chính xác của mô hình khi chạy huấn luyện
#Thời gian chạy huấn luyện của mô hình
def batch_classify(X_train, Y_train, verbose = True):
    df_results = pd.DataFrame(data=np.zeros(shape=(no_classifiers,3)), columns = ['classifier', 'train_score', 'training_time'])
    count = 0
    for key, classifier in dict_classifiers.items():
        t_start = time.process_time()
        classifier.fit(X_train, Y_train)
        t_end = time.process_time()
        t_diff = t_end - t_start
        train_score = classifier.score(X_train, Y_train)
        df_results.loc[count,'classifier'] = key
        df_results.loc[count,'train_score'] = train_score
        df_results.loc[count,'training_time'] = t_diff
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=key, f=t_diff))
        count+=1
    return df_results
# Function to evaluate models on the test set
def evaluate_on_test_set(X_test, y_test, classifiers):
    df_results_test = pd.DataFrame(data=np.zeros(shape=(len(classifiers), 2)), columns=['classifier', 'test_score'])
    
    count = 0
    for key, classifier in classifiers.items():
        test_score = classifier.score(X_test, y_test)
        df_results_test.loc[count, 'classifier'] = key
        df_results_test.loc[count, 'test_score'] = test_score
        count += 1

    return df_results_test

In [54]:
#Thực hiện việc huấn luyện các mô hình với dữ liệu huấn luyện
df_results = batch_classify(X_train, y_train,verbose=True)
print("----------RESULT ON TRAIN SET----------")
print(df_results.sort_values(by='train_score', ascending=False))
print("----------RESULT ON TEST SET (Accuracy Score)----------")
df_results_test = evaluate_on_test_set(X_test_processed, y_test, dict_classifiers)
print(df_results_test.sort_values(by='test_score', ascending=False))


trained Logistic Regression in 0.16 s
trained Nearest Neighbors in 0.00 s
trained Linear SVM in 1.27 s
trained Gradient Boosting Classifier in 0.73 s
trained Decision Tree in 0.03 s
trained Random Forest in 0.14 s
trained Neural Net in 7.42 s
trained Naive Bayes in 0.00 s
----------RESULT ON TRAIN SET----------
                     classifier  train_score  training_time
4                 Decision Tree     1.000000       0.031250
5                 Random Forest     0.998750       0.140625
3  Gradient Boosting Classifier     0.978498       0.734375
1             Nearest Neighbors     0.964664       0.000000
2                    Linear SVM     0.958913       1.265625
6                    Neural Net     0.956996       7.421875
7                   Naive Bayes     0.814651       0.000000
0           Logistic Regression     0.775315       0.156250
----------RESULT ON TEST SET----------
                     classifier  test_score
5                 Random Forest    0.988333
3  Gradient Boosting

In [55]:
from sklearn.model_selection import cross_val_score

# Logistic Regression
log_reg = LogisticRegression(solver='lbfgs', max_iter=5000)
log_scores = cross_val_score(log_reg, X_train, y_train, cv=5)
log_reg_mean = log_scores.mean()

# SVC
svc_clf = SVC(gamma='auto')
svc_scores = cross_val_score(svc_clf, X_train, y_train, cv=5)
svc_mean = svc_scores.mean()

# KNearestNeighbors
knn_clf = KNeighborsClassifier()
knn_scores = cross_val_score(knn_clf, X_train, y_train, cv=5)
knn_mean = knn_scores.mean()

# Decision Tree
tree_clf = tree.DecisionTreeClassifier()
tree_scores = cross_val_score(tree_clf, X_train, y_train, cv=5)
tree_mean = tree_scores.mean()

# Gradient Boosting Classifier
grad_clf = GradientBoostingClassifier()
grad_scores = cross_val_score(grad_clf, X_train, y_train, cv=5)
grad_mean = grad_scores.mean()

# Random Forest Classifier
rand_clf = RandomForestClassifier(n_estimators=18)
rand_scores = cross_val_score(rand_clf, X_train, y_train, cv=5)
rand_mean = rand_scores.mean()

# NeuralNet Classifier
neural_clf = MLPClassifier(alpha=1)
neural_scores = cross_val_score(neural_clf, X_train, y_train, cv=5)
neural_mean = neural_scores.mean()

# Naives Bayes
nav_clf = GaussianNB()
nav_scores = cross_val_score(nav_clf, X_train, y_train, cv=5)
nav_mean = neural_scores.mean()

# Create a Dataframe with the results.
d = {'Classifiers': ['Logistic Reg.', 'SVC', 'KNN', 'Dec Tree', 'Grad B CLF', 'Rand FC', 'Neural Classifier', 'Naives Bayes'],
    'Crossval Mean Scores': [log_reg_mean, svc_mean, knn_mean, tree_mean, grad_mean, rand_mean, neural_mean, nav_mean]}

result_df = pd.DataFrame(data=d)

In [56]:
#Hiển thị kết quả của các thuật toán với kỹ thuật Cross Validation
result_df = result_df.sort_values(by=['Crossval Mean Scores'], ascending=False)
result_df

Unnamed: 0,Classifiers,Crossval Mean Scores
5,Rand FC,0.988999
3,Dec Tree,0.977832
4,Grad B CLF,0.976998
6,Neural Classifier,0.957163
7,Naives Bayes,0.957163
1,SVC,0.955913
2,KNN,0.950163
0,Logistic Reg.,0.776397
