In [22]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split



讀取CSV

In [23]:
# Load the raw training split, then display summary statistics
# (describe() reports the numeric columns only).
train_csv_path = "HW2_hr-analytics_train.csv"
train_df = pd.read_csv(train_csv_path)
train_df.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,left
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.609706,0.715803,3.8037,200.8838,3.5024,0.1444,0.0213,0.2392
std,0.247832,0.172137,1.238757,49.915824,1.478652,0.351512,0.14439,0.426616
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0
75%,0.81,0.87,5.0,245.0,4.0,0.0,0.0,0.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0


將數字欄位的缺失值變成NaN
經過檢查，發現所有欄位都沒有缺失值，因此不需要額外處理

In [24]:
# Boolean frame with the same shape as train_df: True wherever a cell is missing.
missing_mask = train_df.isna()

# Summing the True flags column by column gives the number of missing cells per column.
nan_col = missing_mask.sum()

# Two equivalent ways to flag which columns contain at least one missing value
# (isnull is an alias of isna, and axis=0 is the default reduction axis of any()).
nan_isnull = train_df.isnull().any()
nan_isna = missing_mask.any(axis=0)

print(nan_col)

print(nan_isna)

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
promotion_last_5years    0
sales                    0
salary                   0
left                     0
dtype: int64
satisfaction_level       False
last_evaluation          False
number_project           False
average_montly_hours     False
time_spend_company       False
Work_accident            False
promotion_last_5years    False
sales                    False
salary                   False
left                     False
dtype: bool


將非數字類型進行編碼(機器學習模型通常只能處理數字。因此，需要將非數字的資料轉換成數字，才能讓模型學習。)

#### 選擇適當的編碼方式：

類別數量少且無序：使用 One-Hot Encoding。

類別數量多：可以考慮 Binary Encoding 或 Frequency Encoding。

類別有自然順序：使用 Ordinal Encoding

類別與目標變數有強相關性：考慮 Target Encoding，但需注意資料洩漏。

In [25]:
# Collect the distinct categories of the two text columns so an encoding
# scheme can be chosen for each of them.
sales_class, salary_class = (
    train_df[column].unique() for column in ('sales', 'salary')
)

print("unique classes in sales:", sales_class)
print('\n')  # leaves two blank lines between the two listings
print("unuque classes in salary:", salary_class)

unique classes in sales: ['sales' 'accounting' 'hr' 'technical' 'support' 'management' 'IT'
 'product_mng' 'marketing' 'RandD']


unuque classes in salary: ['low' 'medium' 'high']


經過上面觀察後，發現sales的各個類別之間沒有順序關係（無序），所以使用one hot encoding
salary 的類別有自然順序（low < medium < high），所以使用ordinal (label) encoding

In [26]:
## One-hot encoding: the categories in 'sales' have no natural order, so each
## one gets its own indicator column named 'sales_<category>'.
sales_encoded = pd.get_dummies(train_df['sales'], prefix='sales')

## Ordinal (label) encoding: salary levels are ordered low < medium < high.
mapping = {'low': 0, 'medium': 1, 'high': 2}
salary_encoded = train_df['salary'].map(mapping)

# Series.map turns every value that is missing from `mapping` into NaN.
# Fail here with a clear message instead of handing NaN features to the model.
unmapped = train_df.loc[salary_encoded.isna(), 'salary'].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Unexpected salary values (no encoding defined): {list(unmapped)}"
    )

train_df['salary_encoded'] = salary_encoded


將encoding結束的資料寫回新的csv

In [27]:
# Append the one-hot columns to the frame and drop the two original text
# columns that have now been replaced by their encoded counterparts.
train_df = pd.concat([train_df, sales_encoded], axis=1).drop(columns=['sales', 'salary'])

# Persist the encoded training table; the row index is not written.
train_df.to_csv('encoded_train_data.csv', index=False)




建立Logistic Regression 模型並進行訓練。請呈現訓練後模型預測的混淆矩陣。

In [28]:
# Reload the encoded training table from disk.
train_df = pd.read_csv("encoded_train_data.csv")

# Separate the features (every column except the target) from the target 'left'.
x = train_df.drop(columns='left')
y = train_df['left']

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

# max_iter is set well above scikit-learn's default of 100
# (presumably so the solver converges on the unscaled features).
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Accuracy on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy:{accuracy}")

# Confusion matrix: rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"confusion matrix:\n {conf_matrix}")

# Per-class precision / recall / F1 summary.
class_report = classification_report(y_test, y_pred)
print(f"Classification Report:\n{class_report}")

Accuracy:0.79
confusion matrix:
 [[700  48]
 [162  90]]
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.94      0.87       748
           1       0.65      0.36      0.46       252

    accuracy                           0.79      1000
   macro avg       0.73      0.65      0.67      1000
weighted avg       0.77      0.79      0.77      1000



### 利用訓練後的模型預測測試資料HW2_hr-analytics_test.csv的離職情況
#### 必須先對test data 做一樣的preprocessing

In [29]:
test_df = pd.read_csv("HW2_hr-analytics_test.csv")

## One-hot encoding of 'sales', same scheme as used for the training data.
sales_encoded = pd.get_dummies(test_df['sales'], prefix='sales')

## Ordinal (label) encoding of 'salary', same mapping as used for the training data.
mapping = {'low': 0, 'medium': 1, 'high': 2}
salary_encoded = test_df['salary'].map(mapping)

# Series.map turns every value that is missing from `mapping` into NaN.
# Stop with a clear message instead of passing NaN features to model.predict.
unmapped = test_df.loc[salary_encoded.isna(), 'salary'].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Unexpected salary values (no encoding defined): {list(unmapped)}"
    )
test_df['salary_encoded'] = salary_encoded

# Merge the one-hot columns into test_df and drop the original text columns.
test_df = pd.concat([test_df, sales_encoded], axis=1).drop(columns=['sales', 'salary'])

# Align the test features with the columns the model was fitted on (X_train):
# same set and same order. A 'sales_*' indicator that is absent because that
# category does not occur in the test file is filled with 0; any other missing
# feature is a real schema problem and is reported.
missing_features = [col for col in X_train.columns if col not in test_df.columns]
unexpected_missing = [col for col in missing_features if not col.startswith('sales_')]
if unexpected_missing:
    raise ValueError(f"Test data is missing feature columns: {unexpected_missing}")
test_df = test_df.reindex(columns=X_train.columns, fill_value=0)

# Write to a dedicated file. Saving under 'encoded_train_data.csv' would
# overwrite the encoded training table that the training step reads back.
test_df.to_csv('encoded_test_data.csv', index=False)

#### 進行預測並輸出結果成HW2_hr-analytics_test_sol.csv

In [33]:
# Predict the 'left' label for every row of the preprocessed test set.
y_pred = model.predict(test_df)

# Wrap the predictions in a single-column frame and echo it for a quick look.
save_df = pd.DataFrame({'left': y_pred})
print(save_df)

# Write the predictions to the solution file; the row index is not written.
save_df.to_csv('HW2_hr-analytics_test_sol.csv', index=False)

      left
0        0
1        0
2        1
3        1
4        0
...    ...
4995     0
4996     0
4997     1
4998     0
4999     0

[5000 rows x 1 columns]
