## Part 1: Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

#save the csv file to your desktop


  from pandas.core import (
2024-05-29 22:26:55.021747: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [2]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [3]:
# Create y_df with the Attrition and Department columns

y_df = attrition_df[['Attrition', 'Department']]

In [4]:
# Create a list of at least 10 column names to use as X data
X = ['Age', 'BusinessTravel', 'TotalWorkingYears', 'DistanceFromHome', 'Education', 'EducationField', 'YearsWithCurrManager', 'TrainingTimesLastYear', 'EnvironmentSatisfaction', 'YearsSinceLastPromotion']


# Create X_df using your selected columns
X_df = attrition_df[X]

# Show the data types for X_df

print(X_df.dtypes)

print(X_df)

Age                         int64
BusinessTravel             object
TotalWorkingYears           int64
DistanceFromHome            int64
Education                   int64
EducationField             object
YearsWithCurrManager        int64
TrainingTimesLastYear       int64
EnvironmentSatisfaction     int64
YearsSinceLastPromotion     int64
dtype: object
      Age     BusinessTravel  TotalWorkingYears  DistanceFromHome  Education  \
0      41      Travel_Rarely                  8                 1          2   
1      49  Travel_Frequently                 10                 8          1   
2      37      Travel_Rarely                  7                 2          2   
3      33  Travel_Frequently                  8                 3          4   
4      27      Travel_Rarely                  6                 2          1   
...   ...                ...                ...               ...        ...   
1465   36  Travel_Frequently                 17                23          2   
1466  

In [5]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=78)

In [6]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
X_train = pd.get_dummies(X_df, columns=['BusinessTravel', 'EducationField'], drop_first=True)

X_train = X_train.astype(int)

print(X_train.head())

print(X_train.shape)

   Age  TotalWorkingYears  DistanceFromHome  Education  YearsWithCurrManager  \
0   41                  8                 1          2                     5   
1   49                 10                 8          1                     7   
2   37                  7                 2          2                     0   
3   33                  8                 3          4                     0   
4   27                  6                 2          1                     2   

   TrainingTimesLastYear  EnvironmentSatisfaction  YearsSinceLastPromotion  \
0                      0                        2                        0   
1                      3                        3                        1   
2                      3                        4                        0   
3                      3                        4                        3   
4                      3                        1                        2   

   BusinessTravel_Travel_Frequently  BusinessTrave

In [7]:
X_test = pd.get_dummies(X_df, columns=['BusinessTravel', 'EducationField'], drop_first=True)

X_test = X_test.astype(int)

print(X_test.head())

print(X_test.shape)

   Age  TotalWorkingYears  DistanceFromHome  Education  YearsWithCurrManager  \
0   41                  8                 1          2                     5   
1   49                 10                 8          1                     7   
2   37                  7                 2          2                     0   
3   33                  8                 3          4                     0   
4   27                  6                 2          1                     2   

   TrainingTimesLastYear  EnvironmentSatisfaction  YearsSinceLastPromotion  \
0                      0                        2                        0   
1                      3                        3                        1   
2                      3                        4                        0   
3                      3                        4                        3   
4                      3                        1                        2   

   BusinessTravel_Travel_Frequently  BusinessTrave

In [8]:
# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
scaler.fit(X_train)

# Scale the training and testing data
scaler.transform(X_train)
scaler.transform(X_test)


array([[ 0.4463504 , -0.42164246, -1.01090934, ..., -0.67914108,
        -0.24305927, -0.31409347],
       [ 1.32236521, -0.1645114 , -0.14714972, ..., -0.67914108,
        -0.24305927, -0.31409347],
       [ 0.008343  , -0.55020799, -0.88751511, ..., -0.67914108,
         4.1142228 , -0.31409347],
       ...,
       [-1.08667552, -0.67877352, -0.64072665, ..., -0.67914108,
        -0.24305927, -0.31409347],
       [ 1.32236521,  0.7354473 , -0.88751511, ...,  1.47244811,
        -0.24305927, -0.31409347],
       [-0.32016256, -0.67877352, -0.14714972, ...,  1.47244811,
        -0.24305927, -0.31409347]])

In [9]:
# Create a OneHotEncoder for the Department column

from sklearn.preprocessing import OneHotEncoder

preprocessordep = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), ['Department'])
    ]
)

# Fit the encoder to the training data



# Create two new variables by applying the encoder
# to the training and testing data

y_train_dep_transformed = preprocessordep.fit_transform(y_train)
y_test_dep_transformed = preprocessordep.transform(y_test)

print(y_train_dep_transformed.shape)
print(y_test_dep_transformed.shape)


(1102, 3)
(368, 3)


In [10]:
# Create a OneHotEncoder for the Attrition column

preprocessoratt = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), ['Attrition'])
    ]
)

# Fit the encoder to the training data


# Create two new variables by applying the encoder
# to the training and testing data

y_train_att_transformed = preprocessoratt.fit_transform(y_train)
y_test_att_transformed = preprocessoratt.transform(y_test)


## Create, Compile, and Train the Model

In [11]:
# Find the number of columns in the X training data

X_train.shape[1]

# Create the input layer

input_layer = layers.Input(shape=(X_train.shape[1],))

# Create at least two shared layers

shared_layer1 = layers.Dense(10, activation='relu')(input_layer)
shared_layer2 = layers.Dense(10, activation='relu')(shared_layer1)

In [12]:
# Create a branch for Department
# with a hidden layer and an output layer

branch_dep = layers.Dense(10, activation='relu')(shared_layer1)

# Create the hidden layer

dep_hidden_layer = layers.Dense(10, activation='relu')(branch_dep)

# Create the output layer

dep_output_layer = layers.Dense(3, activation='softmax')(shared_layer2)


In [21]:
# Create a branch for Attrition
# with a hidden layer and an output layer

branch_att = layers.Dense(10, activation='relu')(shared_layer1)

# Create the hidden layer

att_hidden_layer = layers.Dense(10, activation='relu')(branch_att)

# Create the output layer

att_output_layer = layers.Dense(2, activation='sigmoid')(shared_layer2)


In [22]:
# Create the model

model = Model(inputs=input_layer, outputs=[dep_output_layer, att_output_layer])

# Compile the model

model.compile(optimizer='adam', loss={'dep_output_layer':'categorical_crossentropy', 'att_output_layer':'categorical_crossentropy'}, 
              metrics={'dep_output_layer':'accuracy', 'att_output_layer':'accuracy'})

# Summarize the model

model.summary()

In [15]:
print(X_train.shape)
print(y_train_dep_transformed.shape)
print(y_train_att_transformed.shape)

(1470, 15)
(1102, 3)
(1102, 2)


In [23]:
# Train the model

model.fit(X_train, [y_train_dep_transformed, y_train_att_transformed], epochs=100, batch_size=32, validation_split=0.2)

ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 1176
'y' sizes: 1102, 1102


In [19]:
# Evaluate the model with the testing data

model.evaluate(X_test, [y_test_dep_transformed, y_test_att_transformed])

ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 1470
'y' sizes: 368, 368


In [20]:
# Print the accuracy for both department and attrition

print(f"Department Accuracy: {results[3]*100:.2f}%")
print(f"Attrition Accuracy: {results[4]*100:.2f}%")

NameError: name 'results' is not defined

# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. I'm not sure, I wasn't able to get the model to produce results to do to a consistent 'Data cardinality is ambiguous' error. Despite a lot of troubleshooting, I wasn't able to figure out where the rows amounts for each array deviated. I've never seen that error in the code so I must've made a mistake somewhere in the preprocessing, but I couldn't figure out where splitting the training and testing sets and encoding them created that problem. I think I did identify that my target variable arrays somehow got to the wrong size compared to the features.
2. I went with the relu initially but I likely should've gone with a sigmoid function for the attrition output function since that is a binary outcome. The department has more than one classification so I think Softmax would've been fine.
3. There's a lot of different ways to improve or adjust the model, depending on the results I might've looked into increasing or decreasing the epochs, the batching size, I could've also added or removed different layers and tried different variations of those. 