## Part 1: Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [2]:
# Determine the number of unique values in each column
attrition_df.nunique()

Unnamed: 0,0
Age,43
Attrition,2
BusinessTravel,3
Department,3
DistanceFromHome,29
Education,5
EducationField,6
EnvironmentSatisfaction,4
HourlyRate,71
JobInvolvement,4


In [3]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]
y_df.head()

Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [7]:
# Create a list of at least 10 column names to use as X data
selected_columns = ['Age',
           'BusinessTravel',
           'MaritalStatus',
           'DistanceFromHome',
           'Education',
           'EducationField',
           'EnvironmentSatisfaction',
           'JobInvolvement',
           'HourlyRate',
           'NumCompaniesWorked'
]

# Create X_df using your selected columns
X_df = attrition_df[selected_columns]
X_df.head()

# Show the data types for X_df
X_df.dtypes

Unnamed: 0,0
Age,int64
BusinessTravel,object
MaritalStatus,object
DistanceFromHome,int64
Education,int64
EducationField,object
EnvironmentSatisfaction,int64
JobInvolvement,int64
HourlyRate,int64
NumCompaniesWorked,int64


In [9]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_df, y_df,
    random_state=42
    )

In [None]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Identify numeric and categorical columns *after* splitting the data
numeric_cols = X_train.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()


ohe_X = OneHotEncoder(handle_unknown='ignore')

# Fit and transform the categorical data using the identified columns
X_train_cat = ohe_X.fit_transform(X_train[categorical_cols]).toarray() 
X_test_cat  = ohe_X.transform(X_test[categorical_cols]).toarray()   

X_train_num = X_train[numeric_cols].values
X_test_num  = X_test[numeric_cols].values

X_train_enc = np.hstack([X_train_num, X_train_cat])
X_test_enc  = np.hstack([X_test_num,  X_test_cat])

In [17]:
from sklearn.preprocessing import OneHotEncoder

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_enc)
X_test_scaled  = scaler.transform(X_test_enc)

In [19]:
# Remove the 'sparse=False' argument
dept_ohe = OneHotEncoder(handle_unknown='ignore')
y_train_dept = dept_ohe.fit_transform(y_train[['Department']])
y_test_dept  = dept_ohe.transform(y_test[['Department']])

# Remove the 'sparse=False' argument
attr_ohe = OneHotEncoder(handle_unknown='ignore')
y_train_attr = attr_ohe.fit_transform(y_train[['Attrition']])
y_test_attr  = attr_ohe.transform(y_test[['Attrition']])

# Convert to dense arrays for use with the neural network
y_train_dept = y_train_dept.toarray()
y_test_dept = y_test_dept.toarray()
y_train_attr = y_train_attr.toarray()
y_test_attr = y_test_attr.toarray()


## Part 2: Create, Compile, and Train the Model

In [20]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

input_dim = X_train_scaled.shape[1]
print("Input features:", input_dim)


Input features: 7


In [22]:
inputs = Input(shape=(input_dim,), name='input_layer')
shared = Dense(64, activation='relu')(inputs)
shared = Dense(32, activation='relu')(shared)


In [23]:
dept_branch = Dense(16, activation='relu')(shared)
dept_output = Dense(
    y_train_dept.shape[1],
    activation='softmax',
    name='department_output'
)(dept_branch)


attr_branch = Dense(16, activation='relu')(shared)
attr_output = Dense(
    y_train_attr.shape[1],
    activation='softmax',
    name='attrition_output'
)(attr_branch)


In [25]:
# Create the model
model = Model(inputs=inputs, outputs=[dept_output, attr_output])


# Compile the model
model.compile(
    optimizer='adam',
    loss={
        'department_output': 'categorical_crossentropy',
        'attrition_output':  'categorical_crossentropy'
    },
    metrics={
        'department_output': 'accuracy',
        'attrition_output':  'accuracy'
    }
)

# Summarize the model
model.summary()

In [26]:
# Train the model
history = model.fit(
    X_train_scaled,
    [y_train_dept, y_train_attr],
    validation_split=0.2,
    epochs=20,
    batch_size=32
)

Epoch 1/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - attrition_output_accuracy: 0.7108 - attrition_output_loss: 0.6283 - department_output_accuracy: 0.3432 - department_output_loss: 1.0762 - loss: 1.7050 - val_attrition_output_accuracy: 0.7873 - val_attrition_output_loss: 0.5345 - val_department_output_accuracy: 0.5701 - val_department_output_loss: 0.8998 - val_loss: 1.4350
Epoch 2/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - attrition_output_accuracy: 0.8244 - attrition_output_loss: 0.4774 - department_output_accuracy: 0.6608 - department_output_loss: 0.8258 - loss: 1.3032 - val_attrition_output_accuracy: 0.7873 - val_attrition_output_loss: 0.5160 - val_department_output_accuracy: 0.6063 - val_department_output_loss: 0.8428 - val_loss: 1.3612
Epoch 3/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - attrition_output_accuracy: 0.8496 - attrition_output_loss: 0.4164 - department_output_accurac

In [27]:
# Evaluate the model with the testing data
eval_results = model.evaluate(
    X_test_scaled,
    [y_test_dept, y_test_attr]
)

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - attrition_output_accuracy: 0.8610 - attrition_output_loss: 0.4083 - department_output_accuracy: 0.6477 - department_output_loss: 0.8229 - loss: 1.2284 


In [28]:
# Print the accuracy for both department and attrition
print(f"Department Test Accuracy: {eval_results[3]:.4f}")
print(f"Attrition Test Accuracy:  {eval_results[4]:.4f}")

Department Test Accuracy: 0.8696
Attrition Test Accuracy:  0.6522


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?


3. Can you name a few ways that this model might be improved?


YOUR ANSWERS HERE

1. Accuracy would not be the best metric due to the 0.6522 accuracy it has outputted.
2. I used sigmoid as my output layers.
3. Ways to improve the model:

  1. experiment with adding/removing layers
  2. collect more data
  3. Look at recall, F1, ROC-AUC metrics