## Part 1: Preprocessing

In [24]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

from sklearn.preprocessing import LabelEncoder, OneHotEncoder


#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [25]:
# Determine the number of unique values in each column
attrition_df.nunique()
# [Optional] Attition and Department value counts:
display(attrition_df["Attrition"].value_counts())
display(attrition_df["Department"].value_counts())
# [Optional] List columns
display(attrition_df.columns)

Attrition
No     1233
Yes     237
Name: count, dtype: int64

Department
Research & Development    961
Sales                     446
Human Resources            63
Name: count, dtype: int64

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [26]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[["Attrition", "Department"]]

In [27]:
# Create a list of at least 10 column names to use as X data
column_names = [x_column for x_column in attrition_df.columns if x_column not in [
    "Attrition", "Department", "DistanceFromHome","Education",  
    "MaritalStatus", "PercentSalaryHike", "EducationField", "JobRole",
    "PerformanceRating", "RelationshipSatisfaction", "StockOptionLevel", 
    "TotalWorkingYears", "TrainingTimesLastYear", "YearsInCurrentRole", 
    "YearsWithCurrManager", "YearsAtCompany", "YearsSinceLastPromotion"]]

# Create X_df using your selected columns
X_df = attrition_df[column_names]

# Show the data types for X_df
X_df.dtypes

Age                         int64
BusinessTravel             object
EnvironmentSatisfaction     int64
HourlyRate                  int64
JobInvolvement              int64
JobLevel                    int64
JobSatisfaction             int64
NumCompaniesWorked          int64
OverTime                   object
WorkLifeBalance             int64
dtype: object

In [35]:
# Department not is X_df?
print("🚀 Checking columns in X_df before splitting:")
print(X_df.columns)

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
# [Optional] Determine how the object columns need to be split
display(attrition_df["BusinessTravel"].value_counts())
display(attrition_df["OverTime"].value_counts())
# Train-Test Split (Before Encoding!)

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)

# Department is not is X_train?
print("🚀 Checking columns in X_train after splitting:")
print(X_train.columns)

# 1. One-Hot Encode `BusinessTravel`
business_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Fit on training data only and transform both train & test sets
X_train_business_encoded = business_encoder.fit_transform(X_train[["BusinessTravel"]])
X_test_business_encoded = business_encoder.transform(X_test[["BusinessTravel"]])

# 2. Label Encode `OverTime` (Yes=1, No=0)
# NOTE: In general Label Encode is for y dataset, 
# however, since OverTime is binary, there shouldn't 
# be a problem with false ordinal relationships
# 
overtime_encoder = LabelEncoder()

# Fit on training set and transform both train & test sets
X_train = X_train.assign(OverTime_encoded=overtime_encoder.fit_transform(X_train["OverTime"]))
X_test = X_test.assign(OverTime_encoded=overtime_encoder.transform(X_test["OverTime"]))

# 3. Drop Original Categorical Data:
X_train = X_train.drop(columns=["BusinessTravel", "OverTime"])
X_test = X_test.drop(columns=["BusinessTravel", "OverTime"])

🚀 Checking columns in X_df before splitting:
Index(['Age', 'BusinessTravel', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'NumCompaniesWorked',
       'OverTime', 'WorkLifeBalance'],
      dtype='object')


BusinessTravel
Travel_Rarely        1043
Travel_Frequently     277
Non-Travel            150
Name: count, dtype: int64

OverTime
No     1054
Yes     416
Name: count, dtype: int64

🚀 Checking columns in X_train after splitting:
Index(['Age', 'BusinessTravel', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'NumCompaniesWorked',
       'OverTime', 'WorkLifeBalance'],
      dtype='object')


In [36]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
print(X_df.dtypes)
print()
print(X_train.dtypes)

Age                         int64
BusinessTravel             object
EnvironmentSatisfaction     int64
HourlyRate                  int64
JobInvolvement              int64
JobLevel                    int64
JobSatisfaction             int64
NumCompaniesWorked          int64
OverTime                   object
WorkLifeBalance             int64
dtype: object

Age                        int64
EnvironmentSatisfaction    int64
HourlyRate                 int64
JobInvolvement             int64
JobLevel                   int64
JobSatisfaction            int64
NumCompaniesWorked         int64
WorkLifeBalance            int64
OverTime_encoded           int64
dtype: object


In [37]:
# Create a StandardScaler
sc = StandardScaler()

# Fit the StandardScaler to the training data
X_train_scaled = sc.fit(X_train)

# Scale the training and testing data
X_test_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

In [42]:
from sklearn.preprocessing import OneHotEncoder

# Create a OneHotEncoder for the Department column
department_ohe = OneHotEncoder(sparse_output=False )

# Fit the encoder to the training data
department_encoded = department_ohe.fit(y_df[["Department"]])

# Create two new variables by applying the encoder
# to the numpy_array converted training and testing data
y_train_department_converted = department_encoded.transform(np.array(y_train[["Department"]]))
y_test_department_converted = department_encoded.transform(np.array(y_test[["Department"]]))




In [43]:
# Create a OneHotEncoder for the Attrition column
attrition_ohe = OneHotEncoder(sparse_output=False )

# Fit the encoder to the training data
attrition_encoded = attrition_ohe.fit(y_train[["Attrition"]])

# Create two new variables by applying the encoder
# to the numpy_array converted (training and testing data)
y_train_attrition_converted = attrition_ohe.transform(np.array(y_train[["Attrition"]]))
y_test_attriton_converted = attrition_ohe.transform(np.array(y_test[["Attrition"]]))




## Part 2: Create, Compile, and Train the Model

In [None]:
# Find the number of columns in the X training data.


# Create the input layer


# Create at least two shared layers


In [None]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer


# Create the output layer


In [None]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer


# Create the output layer


In [None]:
# Create the model


# Compile the model


# Summarize the model


In [None]:
# Train the model


Epoch 1/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - attrition_output_accuracy: 0.6931 - department_output_accuracy: 0.5956 - loss: 1.4833
Epoch 2/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - attrition_output_accuracy: 0.8335 - department_output_accuracy: 0.6383 - loss: 1.2524
Epoch 3/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - attrition_output_accuracy: 0.8421 - department_output_accuracy: 0.6393 - loss: 1.1859
Epoch 4/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - attrition_output_accuracy: 0.8389 - department_output_accuracy: 0.6692 - loss: 1.1473
Epoch 5/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - attrition_output_accuracy: 0.8714 - department_output_accuracy: 0.6484 - loss: 1.0626
Epoch 6/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - attrition_output_accuracy: 0.8700 - department_

In [None]:
# Evaluate the model with the testing data


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attrition_output_accuracy: 0.7868 - department_output_accuracy: 0.4877 - loss: 4.0711  


[4.038690090179443, 0.7880434989929199, 0.5]

In [None]:
# Print the accuracy for both department and attrition


Attrition predictions accuracy: 0.7880434989929199
Department predictions accuracy: 0.5


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. 
2. 
3. 