## Part 1: Preprocessing

In [511]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers
import re


import matplotlib.pyplot as plt

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [512]:
# Determine the number of unique values in each column
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [513]:
attrition_df.shape

(1470, 27)

In [514]:
attrition_df.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EnvironmentSatisfaction      int64
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
NumCompaniesWorked           int64
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StockOptionLevel             int64
TotalWorkingYears            int64
TrainingTimesLastYear        int64
WorkLifeBalance              int64
YearsAtCompany               int64
YearsInCurrentRole           int64
YearsSinceLastPromotion      int64
YearsWithCurrManager         int64
dtype: object

In [515]:
print(attrition_df.isna().sum())

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EnvironmentSatisfaction     0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
NumCompaniesWorked          0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64


In [516]:
attrition_df.describe()

Unnamed: 0,Age,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,9.192517,2.912925,2.721769,65.891156,2.729932,2.063946,2.728571,2.693197,15.209524,3.153741,2.712245,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,8.106864,1.024165,1.093082,20.329428,0.711561,1.10694,1.102846,2.498009,3.659938,0.360824,1.081209,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,1.0,1.0,1.0,30.0,1.0,1.0,1.0,0.0,11.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,2.0,2.0,2.0,48.0,2.0,1.0,2.0,1.0,12.0,3.0,2.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,7.0,3.0,3.0,66.0,3.0,2.0,3.0,2.0,14.0,3.0,3.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,14.0,4.0,4.0,83.75,3.0,3.0,4.0,4.0,18.0,3.0,4.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,29.0,5.0,4.0,100.0,4.0,5.0,4.0,9.0,25.0,4.0,4.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [517]:
def getStatDf(in_df):
    initStats_df = pd.DataFrame(in_df.dtypes, columns = ["datatype"])
    initStats_df = initStats_df.join(pd.DataFrame(in_df.nunique(), columns=["nunique"]))
    initStats_df = initStats_df.join(pd.DataFrame(in_df.isnull().sum(), columns=["isnull"]))
    initStats_df = initStats_df.join(pd.DataFrame(in_df.describe().transpose()))
    return initStats_df

In [518]:
from IPython.display import display, HTML

In [519]:
def baseDataExploration(in_df):
   """
    Display statistics to help to learn more about the data in the dataframe

    Args:
        in_df: The dataframe to be explored.

    Returns:
        A dataframe that contains some of the statistics that were displayed by this function.

   Raises:
      ValueError
   """
   if not isinstance(in_df, pd.DataFrame):
      raise ValueError("Invalid input: It must be of type DataFrame")
   
   col_st = getStatDf(in_df)
   display(HTML(col_st.to_html()))
   print("\n+++++ Shape: ", in_df.shape)
   print("\n+++++ Data Sample - head: ")
   display(HTML(in_df. head().to_html()))
   print("\n++++ Data Sample - tail: ")
   display(HTML(in_df.tail().to_html()))

   return col_st


In [520]:
baseDataExploration(attrition_df)

Unnamed: 0,datatype,nunique,isnull,count,mean,std,min,25%,50%,75%,max
Age,int64,43,0,1470.0,36.92381,9.135373,18.0,30.0,36.0,43.0,60.0
Attrition,object,2,0,,,,,,,,
BusinessTravel,object,3,0,,,,,,,,
Department,object,3,0,,,,,,,,
DistanceFromHome,int64,29,0,1470.0,9.192517,8.106864,1.0,2.0,7.0,14.0,29.0
Education,int64,5,0,1470.0,2.912925,1.024165,1.0,2.0,3.0,4.0,5.0
EducationField,object,6,0,,,,,,,,
EnvironmentSatisfaction,int64,4,0,1470.0,2.721769,1.093082,1.0,2.0,3.0,4.0,4.0
HourlyRate,int64,71,0,1470.0,65.891156,20.329428,30.0,48.0,66.0,83.75,100.0
JobInvolvement,int64,4,0,1470.0,2.729932,0.711561,1.0,2.0,3.0,3.0,4.0



+++++ Shape:  (1470, 27)

+++++ Data Sample - head: 


Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,2,Sales Executive,4,Single,8,Yes,11,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,2,Research Scientist,2,Married,1,No,23,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,1,Laboratory Technician,3,Single,6,Yes,15,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,1,Research Scientist,3,Married,1,Yes,11,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,1,Laboratory Technician,2,Married,9,No,12,3,4,1,6,3,3,2,2,2,2



++++ Data Sample - tail: 


Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1465,36,No,Travel_Frequently,Research & Development,23,2,Medical,3,41,4,2,Laboratory Technician,4,Married,4,No,17,3,3,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,Research & Development,6,1,Medical,4,42,2,3,Healthcare Representative,1,Married,4,No,15,3,1,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,Research & Development,4,3,Life Sciences,2,87,4,2,Manufacturing Director,2,Married,1,Yes,20,4,2,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,Sales,2,3,Medical,4,63,2,2,Sales Executive,2,Married,2,No,14,3,4,0,17,3,2,9,6,0,8
1469,34,No,Travel_Rarely,Research & Development,8,3,Medical,2,82,4,2,Laboratory Technician,3,Married,2,No,12,3,1,0,6,3,4,4,3,1,2


Unnamed: 0,datatype,nunique,isnull,count,mean,std,min,25%,50%,75%,max
Age,int64,43,0,1470.0,36.92381,9.135373,18.0,30.0,36.0,43.0,60.0
Attrition,object,2,0,,,,,,,,
BusinessTravel,object,3,0,,,,,,,,
Department,object,3,0,,,,,,,,
DistanceFromHome,int64,29,0,1470.0,9.192517,8.106864,1.0,2.0,7.0,14.0,29.0
Education,int64,5,0,1470.0,2.912925,1.024165,1.0,2.0,3.0,4.0,5.0
EducationField,object,6,0,,,,,,,,
EnvironmentSatisfaction,int64,4,0,1470.0,2.721769,1.093082,1.0,2.0,3.0,4.0,4.0
HourlyRate,int64,71,0,1470.0,65.891156,20.329428,30.0,48.0,66.0,83.75,100.0
JobInvolvement,int64,4,0,1470.0,2.729932,0.711561,1.0,2.0,3.0,3.0,4.0


In [521]:
PREDICTION_COLS = ["Attrition", "Department"] 

In [522]:
# Create y_df with the Attrition and Department columns

y_df = attrition_df[PREDICTION_COLS]
y_df

Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development
...,...,...
1465,No,Research & Development
1466,No,Research & Development
1467,No,Research & Development
1468,No,Sales


In [523]:
y_df2 = y_df.copy()
print(y_df2["Department"].value_counts())

y_df2["Department"] = y_df2["Department"].apply(lambda x: re.sub(r'[^a-zA-Z\s ]', '', str(x)))

print(y_df2["Department"].value_counts())

Department
Research & Development    961
Sales                     446
Human Resources            63
Name: count, dtype: int64
Department
Research  Development    961
Sales                    446
Human Resources           63
Name: count, dtype: int64


In [524]:

#Stats for those that left the company 
left_df = attrition_df[attrition_df["Attrition"] == "Yes"]
left_df.describe()

Unnamed: 0,Age,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0
mean,33.607595,10.632911,2.839662,2.464135,65.57384,2.518987,1.637131,2.468354,2.940928,15.097046,3.156118,2.599156,0.527426,8.244726,2.624473,2.658228,5.130802,2.902954,1.945148,2.852321
std,9.68935,8.452525,1.008244,1.169791,20.099958,0.773405,0.940594,1.118058,2.678519,3.770294,0.363735,1.125437,0.856361,7.169204,1.254784,0.816453,5.949984,3.174827,3.153077,3.143349
min,18.0,1.0,1.0,1.0,31.0,1.0,1.0,1.0,0.0,11.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,28.0,3.0,2.0,1.0,50.0,2.0,1.0,1.0,1.0,12.0,3.0,2.0,0.0,3.0,2.0,2.0,1.0,0.0,0.0,0.0
50%,32.0,9.0,3.0,3.0,66.0,3.0,1.0,3.0,1.0,14.0,3.0,3.0,0.0,7.0,2.0,3.0,3.0,2.0,1.0,2.0
75%,39.0,17.0,4.0,4.0,84.0,3.0,2.0,3.0,5.0,17.0,3.0,4.0,1.0,10.0,3.0,3.0,7.0,4.0,2.0,5.0
max,58.0,29.0,5.0,4.0,100.0,4.0,5.0,4.0,9.0,25.0,4.0,4.0,3.0,40.0,6.0,4.0,40.0,15.0,15.0,14.0


In [525]:
target_col = "EducationField"
attrition_df[target_col].value_counts().sort_index()

EducationField
Human Resources      27
Life Sciences       606
Marketing           159
Medical             464
Other                82
Technical Degree    132
Name: count, dtype: int64

In [526]:
left_df[target_col].value_counts().sort_index()

EducationField
Human Resources      7
Life Sciences       89
Marketing           35
Medical             63
Other               11
Technical Degree    32
Name: count, dtype: int64

In [527]:
X_COLS = ["Age", 
          "DistanceFromHome", 
          "EnvironmentSatisfaction", 
          "JobLevel",
          "JobSatisfaction", 
          "NumCompaniesWorked", 
          "TotalWorkingYears", 
          "YearsAtCompany", 
          "YearsInCurrentRole", 
          "YearsWithCurrManager",
          "JobRole",
          "OverTime",
          "EducationField"]

In [528]:
# Create a list of at least 10 column names to use as X data
X_columns = X_COLS

# Create X_df using your selected columns
X_df = attrition_df[X_columns]

# Show the data types for X_df
X_df.dtypes

Age                         int64
DistanceFromHome            int64
EnvironmentSatisfaction     int64
JobLevel                    int64
JobSatisfaction             int64
NumCompaniesWorked          int64
TotalWorkingYears           int64
YearsAtCompany              int64
YearsInCurrentRole          int64
YearsWithCurrManager        int64
JobRole                    object
OverTime                   object
EducationField             object
dtype: object

In [529]:
# NTS: del
def binaryEncode(in_df, in_column_name, in_true_value):

    for i in range(len(in_df)):
        if str(in_df.iloc[i]).lower() == in_true_value.lower():
            in_df.iloc[i] = "1"
        else:
            in_df.iloc[i] = "0"

In [530]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

#X_train, X_test, y_train, y_test = train_test_split(X_df, y_df)




NTS

In [531]:
# Note to self, split y in y_train_dept, y_train_attr and y_test_dept, y_test_attr 
y_dept_df = y_df["Department"].apply(lambda x: re.sub(r'[^a-zA-Z\s ]', '', str(x)))
y_attr_df = y_df["Attrition"].apply(lambda x: re.sub(r'[^a-zA-Z\s ]', '', str(x)))
X_train, X_test, y_dept_train, y_dept_test, y_attr_train, y_attr_test = train_test_split(X_df, y_dept_df, y_attr_df)

In [532]:
X_train[ohe_cols[0]]

589      Laboratory Technician
272         Research Scientist
843      Laboratory Technician
42       Laboratory Technician
427            Sales Executive
                 ...          
962                    Manager
1266     Laboratory Technician
1232    Manufacturing Director
982         Research Scientist
1085        Research Scientist
Name: JobRole, Length: 1102, dtype: object

In [533]:
# Convert your X data to numeric data types however you see fit

ohe_cols = ["JobRole", "EducationField", "OverTime"]

job_role_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
ed_fieldencoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
overtime_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

job_col = X_train[ohe_cols[0]]



In [534]:
X_train_encoded = X_train.copy()

X_train_encoded[job_role_encoder.get_feature_names_out()] = job_role_encoder.fit_transform(pd.DataFrame(X_train[ohe_cols[0]]))
X_train_encoded[ed_fieldencoder.get_feature_names_out()]  = ed_fieldencoder.fit_transform(pd.DataFrame(X_train[ohe_cols[1]]))
X_train_encoded[overtime_encoder.get_feature_names_out()] = overtime_encoder.fit_transform(pd.DataFrame(X_train[ohe_cols[2]]))

X_train_encoded = X_train_encoded.drop(columns=ohe_cols)


In [535]:
X_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1102 entries, 589 to 1085
Data columns (total 27 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                1102 non-null   int64  
 1   DistanceFromHome                   1102 non-null   int64  
 2   EnvironmentSatisfaction            1102 non-null   int64  
 3   JobLevel                           1102 non-null   int64  
 4   JobSatisfaction                    1102 non-null   int64  
 5   NumCompaniesWorked                 1102 non-null   int64  
 6   TotalWorkingYears                  1102 non-null   int64  
 7   YearsAtCompany                     1102 non-null   int64  
 8   YearsInCurrentRole                 1102 non-null   int64  
 9   YearsWithCurrManager               1102 non-null   int64  
 10  JobRole_Healthcare Representative  1102 non-null   float64
 11  JobRole_Human Resources            1102 non-null   float64


In [536]:
X_test_encoded = X_test.copy()
X_test_encoded[job_role_encoder.get_feature_names_out()] = job_role_encoder.fit_transform(pd.DataFrame(X_test[ohe_cols[0]]))
X_test_encoded[ed_fieldencoder.get_feature_names_out()] = ed_fieldencoder.fit_transform(pd.DataFrame(X_test[ohe_cols[1]]))
X_test_encoded[overtime_encoder.get_feature_names_out()] = overtime_encoder.fit_transform(pd.DataFrame(X_test[ohe_cols[2]]))

X_test_encoded = X_test_encoded.drop(columns=ohe_cols)

In [537]:
job_role_encoder.get_feature_names_out()

array(['JobRole_Healthcare Representative', 'JobRole_Human Resources',
       'JobRole_Laboratory Technician', 'JobRole_Manager',
       'JobRole_Manufacturing Director', 'JobRole_Research Director',
       'JobRole_Research Scientist', 'JobRole_Sales Executive',
       'JobRole_Sales Representative'], dtype=object)

In [538]:
X_test_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 368 entries, 886 to 435
Data columns (total 27 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                368 non-null    int64  
 1   DistanceFromHome                   368 non-null    int64  
 2   EnvironmentSatisfaction            368 non-null    int64  
 3   JobLevel                           368 non-null    int64  
 4   JobSatisfaction                    368 non-null    int64  
 5   NumCompaniesWorked                 368 non-null    int64  
 6   TotalWorkingYears                  368 non-null    int64  
 7   YearsAtCompany                     368 non-null    int64  
 8   YearsInCurrentRole                 368 non-null    int64  
 9   YearsWithCurrManager               368 non-null    int64  
 10  JobRole_Healthcare Representative  368 non-null    float64
 11  JobRole_Human Resources            368 non-null    float64
 1

In [539]:
# Create a StandardScaler
std_scaler = StandardScaler()

# Fit the StandardScaler to the training data
# Scale the training and testing data
X_scaler = std_scaler.fit(X_train_encoded)
X_train_encoded_scaled = X_scaler.transform(X_train_encoded)
X_test_encoded_scaled = X_scaler.transform(X_test_encoded)


In [540]:
from sklearn.preprocessing import OneHotEncoder

# Create a OneHotEncoder for the Department column
yencoder_dept = OneHotEncoder(handle_unknown="ignore", sparse_output=False) 

# Fit the encoder to the training data
yencoder_dept.fit(pd.DataFrame(y_dept_train))

# Apply the encoder to the training and testing data
y_dept_train_encoded = pd.DataFrame(
    yencoder_dept.transform(pd.DataFrame(y_dept_train)), 
    columns=yencoder_dept.get_feature_names_out(),
    index=y_dept_train.index
)

y_attr_test_encoded = pd.DataFrame(
    yencoder_dept.transform(pd.DataFrame(y_dept_test)), 
    columns=yencoder_dept.get_feature_names_out(),
    index=y_dept_test.index 
)

In [541]:
# Create a OneHotEncoder for the Attrition column
yencoder_attr = OneHotEncoder(drop='first', handle_unknown="error", sparse_output=False) 

# Fit the encoder to the training data
yencoder_attr.fit(pd.DataFrame(y_attr_train))

# Apply the encoder to the training and testing data
y_attr_train_encoded = pd.DataFrame(
    yencoder_attr.transform(pd.DataFrame(y_attr_train)), 
    columns=yencoder_attr.get_feature_names_out(),
    index=y_attr_train.index
)

y_attr_test_encoded = pd.DataFrame(
    yencoder_attr.transform(pd.DataFrame(y_attr_test)), 
    columns=yencoder_attr.get_feature_names_out(),
    index=y_attr_test.index 
)


In [542]:
y_attr_train_encoded

Unnamed: 0,Attrition_Yes
589,1.0
272,0.0
843,0.0
42,1.0
427,0.0
...,...
962,0.0
1266,0.0
1232,0.0
982,0.0


## Part 2: Create, Compile, and Train the Model

In [543]:
X_train_encoded_scaled.shape[1]

27

In [544]:
# Find the number of columns in the X training data.
num_x_cols = X_train_encoded_scaled.shape[1]

# Create the input layer
input_layer = layers.Input(shape=num_x_cols, name='input_features')

# Create at least two shared layers
shared_layer1 = layers.Dense(64, activation='relu', name = "shared_layer1")(input_layer)
shared_layer2 = layers.Dense(128, activation='relu', name = "shared_layer2")(shared_layer1)

In [545]:
# Create a branch for Department
# with a hidden layer and an output layer
# Create the hidden layer
dept_hiddenlayer = layers.Dense(32, activation='relu', name='department_hiddenlayer')(shared_layer2)
# Create the output layer
department_output = layers.Dense(3, activation='softmax', name='department_output')(dept_hiddenlayer)


In [551]:
# Create a branch for Attrition
# with a hidden layer and an output layer
# Create the hidden layer

attr_hiddenlayer = layers.Dense(32, activation='relu', name='attrition_hiddenlayer')(shared_layer2)
# Create the output layer
attrition_output = layers.Dense(1, activation='sigmoid', name='attrition_output')(attr_hiddenlayer)


In [552]:
# Create the model

model = Model(inputs=input_layer, outputs=[department_output, attrition_output])

# Compile the model
model.compile(optimizer='adam',
              loss={'department_output': 'categorical_crossentropy', 'attrition_output': 'binary_crossentropy'},
              metrics={'department_output': 'accuracy', 'attrition_output': 'accuracy'})

# Summarize
model.summary()



Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_features (InputLayer)    [(None, 27)]         0           []                               
                                                                                                  
 shared_layer1 (Dense)          (None, 64)           1792        ['input_features[0][0]']         
                                                                                                  
 shared_layer2 (Dense)          (None, 128)          8320        ['shared_layer1[0][0]']          
                                                                                                  
 department_hiddenlayer (Dense)  (None, 32)          4128        ['shared_layer2[0][0]']          
                                                                                            

In [553]:
# Train the model
model.fit(
    X_train_encoded_scaled,
    {
        'department_output': y_dept_train_encoded,
        'attrition_output': y_attr_train_encoded
    },
    epochs=10,  # You can adjust the number of epochs based on your needs
    batch_size=32,  # You can adjust the batch size based on your available memory
    validation_split=0.2  # You can specify the validation split if you have a separate validation set
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x31452cc70>

In [554]:
# Evaluate the model with the testing data
test_results = model.evaluate(X_test, {'department_output': y_dept_test, 'attrition_output': y_attr_test})
test_results

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

In [None]:
# Print the accuracy for both department and attrition
test_results.

Attrition predictions accuracy: 0.7880434989929199
Department predictions accuracy: 0.5


In [None]:
loss, accuracy = model.evaluate(X_test, {'department_output': y_dept_test, 'attrition_output': y_attr_test}, verbose=0)
print(f"Loss: {loss:.4f}")
print(f"Accuracy: {accuracy:.4f}")

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. No. Precision would be better. You want to be able to very accurately predict who will leave. 
2. Sigmoid for attrition because it was a binary category. Softmax for department because it had more than two classes.
3. Add additional rows, add additional hidden layers to the model.