'''*Gender* - MODEL BASED IMPUTATION'''

In [None]:
# Importing all necessary libraries
import pandas as pd #data manipulation and analysis
from sklearn.model_selection import train_test_split #splitting the data into training and testing sets
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder #scaling the data, encoding the data
from sklearn.impute import SimpleImputer #handling missing values
from tensorflow.keras.models import Sequential #creating the model
from tensorflow.keras.layers import Dense, Dropout, Input #importing the layers
from tensorflow.keras.optimizers import Adam #the optimizer
from tensorflow.keras.callbacks import EarlyStopping #early stop the training
from tensorflow.keras.metrics import Precision, Recall, AUC, Accuracy, F1Score #metrics
from tensorflow.keras import regularizers #aids in preventing overfitting

In [None]:
# Input: CSV that is read to build the training data for the gender model
gender_path = r"C:\Users\D\OneDrive - Grunenthal Group\Desktop\VSC - Py\GRTend\Data\gender_data.csv"

# Outputs: where the trained Keras model and the final (imputed) dataset are written
model_path = r"C:\Users\D\OneDrive - Grunenthal Group\Desktop\VSC - Py\GRTend\Models\gender_model.keras"

final_dataset_path = r"C:\Users\D\OneDrive - Grunenthal Group\Desktop\VSC - Py\GRTend\Data\dataset_final.csv"

In [None]:
# Read the gender dataset and keep only the rows whose 'Gender' label is present
df = pd.read_csv(gender_path).dropna(subset=['Gender'])

df.info()

# Separate the label column (y) from the remaining feature columns (X)
y = df['Gender']
X = df.drop(columns='Gender')

df.shape

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Partition the feature columns by dtype: object-dtype columns are treated as
# categorical, float64/int64 columns as numerical
cat_cols = X_train.select_dtypes(include=['object']).columns
num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

In [None]:
# Fill gaps in the numerical columns with the column mean.
# The means are learned from the training split only and then applied to both splits.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train[num_cols])
X_train[num_cols] = imputer.transform(X_train[num_cols])
X_test[num_cols] = imputer.transform(X_test[num_cols])

In [None]:
# One-hot encode the categorical columns as dense arrays.
# Categories are learned from the training split; categories unseen at fit time
# are ignored (encoded as all zeros) rather than raising.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(X_train[cat_cols])
encoded_cols_train = encoder.transform(X_train[cat_cols])
encoded_cols_test = encoder.transform(X_test[cat_cols])

In [None]:
# Wrap the encoded numpy arrays in DataFrames labelled with the encoder's output
# feature names, each with a fresh 0..n-1 index
encoded_feature_names = encoder.get_feature_names_out(cat_cols)

encoded_cols_train = pd.DataFrame(encoded_cols_train, columns=encoded_feature_names)
encoded_cols_train = encoded_cols_train.reset_index(drop=True)

encoded_cols_test = pd.DataFrame(encoded_cols_test, columns=encoded_feature_names)
encoded_cols_test = encoded_cols_test.reset_index(drop=True)

In [None]:
# Keep only the numerical columns and renumber the rows from 0 so they line up
# positionally with the encoded categorical frames
X_test = X_test.loc[:, num_cols].reset_index(drop=True)
X_train = X_train.loc[:, num_cols].reset_index(drop=True)

In [None]:
# Join the numerical columns and the one-hot columns side by side
X_train = pd.concat([X_train[num_cols], encoded_cols_train], axis='columns')
X_test = pd.concat([X_test[num_cols], encoded_cols_test], axis='columns')

X_train.shape
X_test.shape

In [None]:
# Map the gender labels to integer class ids; the mapping is learned from the
# training labels and reused for the test labels
label_encoder = LabelEncoder().fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

In [None]:
# Min-max scale every feature; the per-column ranges are learned from the
# training split only and then applied to both splits
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Define the model architecture: four L2-regularised ReLU hidden layers
# (128, 128, 64, 64 units), each followed by dropout, feeding one output unit.
model = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),
    Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.002)), #was 0.001
    Dropout(0.2),
    Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.002)),
    Dropout(0.3),
    Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.002)),
    Dropout(0.3),
    Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.002)),
    Dropout(0.4),
    # Sigmoid keeps the output in [0, 1] so it can be read as a class probability,
    # which is what the binary cross-entropy loss this model is compiled with
    # expects. The previous tanh output ranged over [-1, 1], which is not a valid
    # probability and could round to the invalid class index -1 at prediction time.
    Dense(1, activation='sigmoid')
])

'''
Rectified Linear Activation (ReLU)
Logistic (Sigmoid)
Hyperbolic Tangent (Tanh)
'''

In [None]:
# Configure training: Adam with AMSGrad enabled, binary cross-entropy loss,
# and accuracy / precision / recall / AUC tracked as metrics
adam_optimizer = Adam(
    learning_rate=0.001,
    beta_1=0.90,
    beta_2=0.999,
    epsilon=1e-07,
    amsgrad=True,
)

model.compile(
    optimizer=adam_optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy', 'precision', 'recall', 'auc'],
)


'''
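Adam = Adaptive Moment Estimation, used here instead of plain Stochastic Gradient Descent
Learning rate = step size applied to each weight update
Beta 1 = exponential decay rate for the first-moment (mean) estimates
Beta 2 = exponential decay rate for the second-moment (uncentred variance) estimates
Epsilon = small constant added to the denominator to prevent division by zero
AMSGrad = variant of Adam that uses the maximum of past second-moment estimates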
'''

In [None]:
# Early-stopping callbacks: one watches validation AUC (higher is better), the
# other validation loss (lower is better). Each waits 369 epochs without
# improvement and is configured to restore the best weights it has seen.
early_stopping_auc = EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=369,
    restore_best_weights=True,
)
early_stopping_loss = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=369,
    restore_best_weights=True,
)
#early_stopping_acc = EarlyStopping(monitor='accuracy', patience=420, restore_best_weights=True, mode='max')

In [None]:
from tqdm.keras import TqdmCallback

# Callbacks used during training: both early-stopping monitors plus a tqdm progress bar
training_callbacks = [early_stopping_loss, early_stopping_auc, TqdmCallback(verbose=1)]

# Train for at most 1000 epochs in batches of 420 rows, validating on the test
# split after every epoch. verbose=0 silences Keras' own progress output so that
# only the tqdm bar is shown.
history = model.fit(
    x=X_train_scaled,
    y=y_train,
    batch_size=420,
    epochs=1000,
    verbose=0,
    callbacks=training_callbacks,
    validation_data=(X_test_scaled, y_test),
)

In [None]:
# Evaluate the model once per split and reuse the results for both the formatted
# report and the raw list print-out (previously each split was evaluated twice).
# model.evaluate returns the loss followed by the compiled metrics, in order:
# [loss, accuracy, precision, recall, auc].
train_results = model.evaluate(X_train_scaled, y_train, verbose=0)
test_results = model.evaluate(X_test_scaled, y_test, verbose=0)

train_loss, train_accuracy, train_precision, train_recall, train_auc = train_results
test_loss, test_accuracy, test_precision, test_recall, test_auc = test_results

print(f"Training Evaluation:\nLoss: {train_loss}\nAccuracy: {train_accuracy}\nPrecision: {train_precision}\nRecall: {train_recall}\nAUC: {train_auc}\n")
print(train_results)

print(f"Test Evaluation:\nLoss: {test_loss}\nAccuracy: {test_accuracy}\nPrecision: {test_precision}\nRecall: {test_recall}\nAUC: {test_auc}\n")
print(test_results)
#model.summary()

In [None]:
# Import matplotlib for plotting
import matplotlib.pyplot as plt

# Per-epoch values recorded during training, keyed by metric name
history_dict = history.history

# Metrics to plot; each one is expected to have a matching 'val_<name>' entry
metrics = ['loss', 'accuracy', 'precision', 'recall', 'auc']

# One subplot row per metric. The row count and figure height follow the length
# of the metrics list, so adding or removing a metric needs no other change.
# squeeze=False guarantees an array of axes even when only one metric is listed.
fig, axs = plt.subplots(len(metrics), 1, figsize=(10, 4 * len(metrics)), squeeze=False)
axs = axs.ravel()

for ax, metric in zip(axs, metrics):
    # Training curve and validation curve on the same axes
    ax.plot(history_dict[metric], label='train')
    ax.plot(history_dict['val_' + metric], label='val')
    # Title, axis labels and legend
    ax.set_title('Model ' + metric)
    ax.set_xlabel('epoch')
    ax.set_ylabel(metric)
    ax.legend()

# Display the plot
plt.tight_layout()
plt.show()

In [None]:
#model_path = r"C:\Users\DE88342\OneDrive - Grunenthal Group\Desktop\VSC - Py\GRTend\Models\gender_model.keras"

# Persist the trained model to model_path (an existing file is overwritten,
# which is also the default behaviour)
model.save(model_path, overwrite=True)

In [None]:
# Load the full dataset
full_df = pd.read_csv(r"C:\Users\D\OneDrive - Grunenthal Group\Desktop\VSC - Py\GRTend\Data\data.csv")

#full_df.info()

# Separate the rows with a known Gender from the rows where Gender is missing.
# unknown_gender_df is taken as an explicit copy: assigning into a filtered slice
# of full_df (even through .loc) triggers pandas' SettingWithCopyWarning, because
# pandas cannot tell whether the write is meant to reach full_df. With .copy() the
# frame is independent and the imputation below modifies only that copy.
known_gender_df = full_df.dropna(subset=['Gender'])
unknown_gender_df = full_df[full_df['Gender'].isna()].copy()

# Impute missing numerical values using the already-fitted imputer
unknown_gender_df.loc[:, num_cols] = imputer.transform(unknown_gender_df[num_cols])

In [None]:
# One-hot encode the categorical columns with the already-fitted encoder and wrap
# the result in a DataFrame labelled with the encoder's output feature names
encoded_cols_unknown = pd.DataFrame(
    encoder.transform(unknown_gender_df[cat_cols]),
    columns=encoder.get_feature_names_out(cat_cols),
).reset_index(drop=True)

In [None]:
# Keep only the numerical columns and renumber the rows from 0 so they line up
# positionally with encoded_cols_unknown
unknown_gender_df = unknown_gender_df.loc[:, num_cols].reset_index(drop=True)

In [None]:

# Join the numerical columns and the one-hot columns side by side
unknown_gender_df = pd.concat(
    [unknown_gender_df[num_cols], encoded_cols_unknown],
    axis='columns',
)

In [None]:
# Apply the already-fitted min-max scaler to the rows with unknown gender
unknown_gender_df_scaled = scaler.transform(X=unknown_gender_df)

In [None]:
# Run the trained model on the scaled rows whose 'Gender' is unknown
predicted_gender = model.predict(x=unknown_gender_df_scaled)

In [None]:
# Replace the missing 'Gender' values in the original dataset with the predicted labels.
# The model output is converted to a class index with an explicit 0.5 threshold,
# which always yields 0 or 1. Rounding instead would produce -1 for any output
# below -0.5, and label_encoder.inverse_transform rejects indices it has not seen.
missing_gender_mask = full_df['Gender'].isna()
predicted_classes = (predicted_gender.ravel() > 0.5).astype(int)
full_df.loc[missing_gender_mask, 'Gender'] = label_encoder.inverse_transform(predicted_classes)

In [None]:
# Remove the two columns that are not kept in the final dataset
full_df.drop(columns=['Job Function', 'Employment Details Termination Year'], inplace=True)

# index=False: without it the row index is written as an extra unnamed column,
# which comes back as 'Unnamed: 0' when the file is read again with pd.read_csv
full_df.to_csv(final_dataset_path, index=False)

In [None]:
# Reload the final dataset from disk
final_df = pd.read_csv(final_dataset_path)

'''
to exclude the unrated data we use:
'''

# Keep only the rows whose rating_num is not 0
has_rating = final_df['rating_num'] != 0
final_df = final_df.loc[has_rating]

#final_df.info()

In [None]:
import plotly.express as px

# 3D scatter plot of age, tenure and rating; points are coloured by rating
fig = px.scatter_3d(
    final_df,
    x='Age',
    y='Tenure',
    z='rating_num',
    color='rating_num',
    color_continuous_scale='Viridis',
    title='Age, Tenure and Rating',
    labels={'rating_num': 'Rating'},
)
fig.update_layout(coloraxis_colorbar={'title': 'Rating'})

fig.show()

'''
INFO: 1: Unsatisfactory, 2: Developing, 3: Performing, 4: Exceeding, 5: Outstanding
'''

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

relevant_columns = ['Age', 'Tenure', 'FTE', 'rating_num', 'Gender']
# .copy() makes relevant_df independent of final_df, so recoding Gender below does
# not write into a slice of final_df (which would raise SettingWithCopyWarning)
relevant_df = final_df[relevant_columns].copy()
# Encode Gender numerically for the correlation: 1 for 'Male', 0 for anything else
relevant_df['Gender'] = (relevant_df['Gender'] == 'Male').astype(int)

# Pairwise correlation of the selected columns, drawn as an annotated heatmap
plt.figure(figsize=(10, 8))
corr = relevant_df.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix of Relevant Features')
plt.show()