## Part 1: Preprocessing

In [42]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder,OneHotEncoder
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input,Dense

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [2]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [3]:
# Create y_df with the Attrition and Department columns
y_df=attrition_df[['Attrition','Department']]
y_df


Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development
...,...,...
1465,No,Research & Development
1466,No,Research & Development
1467,No,Research & Development
1468,No,Sales


In [40]:
y_df['Attrition'].value_counts()

Attrition
No     1233
Yes     237
Name: count, dtype: int64

In [41]:
y_df['Department'].value_counts()

Department
Research & Development    961
Sales                     446
Human Resources            63
Name: count, dtype: int64

In [5]:
# Create a list of at least 10 column names to use as X data
X=attrition_df.drop(['Attrition','Department','BusinessTravel','EducationField','JobRole','MaritalStatus'],axis=1)


# Create X_df using your selected columns
X.info()

# Show the data types for X_df



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   DistanceFromHome          1470 non-null   int64 
 2   Education                 1470 non-null   int64 
 3   EnvironmentSatisfaction   1470 non-null   int64 
 4   HourlyRate                1470 non-null   int64 
 5   JobInvolvement            1470 non-null   int64 
 6   JobLevel                  1470 non-null   int64 
 7   JobSatisfaction           1470 non-null   int64 
 8   NumCompaniesWorked        1470 non-null   int64 
 9   OverTime                  1470 non-null   object
 10  PercentSalaryHike         1470 non-null   int64 
 11  PerformanceRating         1470 non-null   int64 
 12  RelationshipSatisfaction  1470 non-null   int64 
 13  StockOptionLevel          1470 non-null   int64 
 14  TotalWorkingYears       

In [6]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
# looking at overtime col in X
X['OverTime'].value_counts()

OverTime
No     1054
Yes     416
Name: count, dtype: int64

In [7]:
# encoding overtime
le=LabelEncoder()
X['OverTime']=le.fit_transform(X['OverTime'])
X['OverTime'].value_counts()

OverTime
0    1054
1     416
Name: count, dtype: int64

In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Age                       1470 non-null   int64
 1   DistanceFromHome          1470 non-null   int64
 2   Education                 1470 non-null   int64
 3   EnvironmentSatisfaction   1470 non-null   int64
 4   HourlyRate                1470 non-null   int64
 5   JobInvolvement            1470 non-null   int64
 6   JobLevel                  1470 non-null   int64
 7   JobSatisfaction           1470 non-null   int64
 8   NumCompaniesWorked        1470 non-null   int64
 9   OverTime                  1470 non-null   int32
 10  PercentSalaryHike         1470 non-null   int64
 11  PerformanceRating         1470 non-null   int64
 12  RelationshipSatisfaction  1470 non-null   int64
 13  StockOptionLevel          1470 non-null   int64
 14  TotalWorkingYears         1470 non-null 

In [9]:
#splitting data
X_train,X_test,y_train,y_test=train_test_split(X,y_df,random_state=2,test_size=0.2)

In [10]:
# Create a StandardScaler
sc=StandardScaler()

# Fit the StandardScaler to the training data
sc.fit(X_train)

# Scale the training and testing data
X_train_scaled=sc.transform(X_train)
X_test_scaled=sc.transform(X_test)


In [11]:
# Create a OneHotEncoder for the Department column
deptencoder=OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
deptencoder.fit(y_train['Department'].values.reshape(-1,1))

# Create two new variables by applying the encoder
# to the training and testing data
y_dept_train=deptencoder.transform(y_train['Department'].values.reshape(-1,1))
y_dept_test=deptencoder.transform(y_test['Department'].values.reshape(-1,1))


In [12]:
print(y_dept_train)

[[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 ...
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


In [13]:
y_dept_train.shape

(1176, 3)

In [14]:
# Create a OneHotEncoder for the Attrition column
attencoder=OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
attencoder.fit(y_train['Attrition'].values.reshape(-1,1))

# Create two new variables by applying the encoder
# to the training and testing data
y_att_train=attencoder.transform(y_train['Attrition'].values.reshape(-1,1))
y_att_test=attencoder.transform(y_test['Attrition'].values.reshape(-1,1))

In [39]:
y_att_test

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.

## Create, Compile, and Train the Model

In [16]:
len(X_train.columns)

21

In [17]:
# Find the number of columns in the X training data


# Create the input layer
input_layer=Input(shape=(len(X_train.columns),), name='input_layer')

# Create at least two shared layers
dense1=Dense(64,activation='relu')(input_layer)
dense2=Dense(64,activation='relu')(dense1)

In [18]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
dense_dept=Dense(64,activation='relu')(dense2)

# Create the output layer
dept_output=Dense(3,activation='softmax',name='dept')(dense_dept)


In [19]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
att_dense=Dense(64,activation='relu')(dense2)

# Create the output layer
att_output=Dense(2,activation='softmax',name='att')(att_dense)


In [20]:
# Create the model
model=Model(inputs=input_layer,outputs=[dept_output,att_output])

# Compile the model
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics={'dept':'accuracy','att':'accuracy'})

# Summarize the model
model.summary()

In [21]:
# Train the model
model.fit(X_train_scaled,[y_dept_train,y_att_train], epochs=100)


Epoch 1/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 969us/step - att_accuracy: 0.7047 - att_loss: 0.5782 - dept_accuracy: 0.5662 - dept_loss: 0.9554 - loss: 1.5338
Epoch 2/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 747us/step - att_accuracy: 0.8379 - att_loss: 0.4161 - dept_accuracy: 0.6496 - dept_loss: 0.7652 - loss: 1.1814
Epoch 3/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 720us/step - att_accuracy: 0.8491 - att_loss: 0.3647 - dept_accuracy: 0.6533 - dept_loss: 0.7393 - loss: 1.1038
Epoch 4/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 720us/step - att_accuracy: 0.8564 - att_loss: 0.3786 - dept_accuracy: 0.6301 - dept_loss: 0.7582 - loss: 1.1367
Epoch 5/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - att_accuracy: 0.8854 - att_loss: 0.3153 - dept_accuracy: 0.6599 - dept_loss: 0.7216 - loss: 1.03698
Epoch 6/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x296d004c650>

In [25]:
# Evaluate the model with the testing data
model.evaluate(X_test_scaled,[y_dept_test,y_att_test])

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - att_accuracy: 0.8216 - att_loss: 2.0166 - dept_accuracy: 0.5940 - dept_loss: 3.0103 - loss: 5.0705 


[5.5063581466674805,
 3.372403621673584,
 1.8944021463394165,
 0.8129251599311829,
 0.5714285969734192]

In [26]:
prediction=model.predict(X_test_scaled)

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


In [33]:
dept_pred=np.round(prediction[0])

In [32]:
att_pred=np.round(prediction[1])

In [35]:
att_pred[0:5]

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [37]:
# Print the accuracy for both department and attrition
print(f'Department predictions accuracy:{accuracy_score(y_dept_test,dept_pred)}')
print(f'Attrition predictions accuracy:{accuracy_score(y_att_test,att_pred)}')

Department predictions accuracy:0.5714285714285714
Attrition predictions accuracy:0.8129251700680272


In [43]:
print(f'dept classification report:{classification_report(dept_pred,y_dept_test)}')

dept classification report:              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.67      0.71      0.69       195
           2       0.38      0.31      0.34        97

   micro avg       0.57      0.57      0.57       294
   macro avg       0.35      0.34      0.34       294
weighted avg       0.57      0.57      0.57       294
 samples avg       0.57      0.57      0.57       294



In [45]:
print(f'att classification report')
print(classification_report(att_pred,y_att_test))

att classification report
              precision    recall  f1-score   support

           0       0.91      0.87      0.89       257
           1       0.31      0.41      0.35        37

   micro avg       0.81      0.81      0.81       294
   macro avg       0.61      0.64      0.62       294
weighted avg       0.84      0.81      0.82       294
 samples avg       0.81      0.81      0.81       294



# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. Not really, it doesn't give enough insight into how the model is performing, a classification report would be more helpful, also validation_accuracy would show if the model is overfitting.
2. 'softmax' because the outputs have more than 1 class, at least department does, attrition could be binary.
3. Using a resampling method because the data is pretty unbalanced. Or more data.