In [24]:
# NumPy for numerical computing
import numpy as np

# Pandas for DataFrames
import pandas as pd

pd.set_option("display.max_columns", 100)

# Pickle for reading model files
import pickle

# Scikit-Learn's train_test_split function
from sklearn.model_selection import train_test_split

# Area Under ROC Curve
from sklearn.metrics import roc_auc_score

from flask import Flask, request, jsonify

In [25]:
# Load the pickled estimator stored in final_model.pkl and bind it to `clf`.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load model files that come from a trusted source.
with open("final_model.pkl", "rb") as f:
    clf = pickle.load(f)

In [26]:
def clean_data(df):
    """Clean raw employee records before feature engineering.

    Steps, in order: drop exact duplicate rows, drop rows whose ``department``
    is "temp", replace missing ``filed_complaint`` / ``recently_promoted``
    values with 0, label missing departments as "Missing", record which rows
    lack a ``last_evaluation`` in a new ``last_evaluation_missing`` column
    (1 = missing, 0 = present), and finally fill missing ``last_evaluation``
    values with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records. Must contain the columns ``department``,
        ``filed_complaint``, ``recently_promoted`` and ``last_evaluation``.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. The input frame is left untouched.
    """
    # Drop duplicates, then temporary workers. The explicit copy gives us a
    # frame we own, so the column assignments below never write into a view of
    # the caller's data (and never raise SettingWithCopyWarning).
    df = df.drop_duplicates()
    df = df[df["department"] != "temp"].copy()

    # Missing complaint / promotion flags mean "did not happen".
    df["filed_complaint"] = df["filed_complaint"].fillna(0)
    df["recently_promoted"] = df["recently_promoted"].fillna(0)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a selected column: the in-place form operates on a temporary Series and
    # does not reliably update the frame (it is a no-op under Copy-on-Write).
    # NOTE(review): the sample output in this notebook contains both
    # department_IT and department_information_technology columns; presumably
    # the same department under two labels -- confirm against the training
    # pipeline before merging them here.
    df["department"] = df["department"].fillna("Missing")

    # The indicator must be computed BEFORE the fill, otherwise the
    # information about which evaluations were missing is lost.
    df["last_evaluation_missing"] = df["last_evaluation"].isnull().astype(int)
    df["last_evaluation"] = df["last_evaluation"].fillna(0)

    return df

In [27]:
def engineer_features(df):
    """Add indicator features and one-hot encode the categorical columns.

    New integer (0/1) columns, appended in this order:

    * ``underperformer`` -- ``last_evaluation`` below 0.6 for rows whose
      evaluation is actually present (``last_evaluation_missing == 0``).
    * ``unhappy`` -- ``satisfaction`` below 0.2.
    * ``overachiever`` -- ``last_evaluation`` above 0.8 and ``satisfaction``
      above 0.7.

    ``department`` and ``salary`` are then replaced by dummy columns via
    ``pd.get_dummies``. Only categories present in ``df`` get a column, so the
    resulting schema depends on the data passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned records containing ``last_evaluation``,
        ``last_evaluation_missing``, ``satisfaction``, ``department`` and
        ``salary``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra features. The input frame is not modified.
    """
    has_evaluation = df["last_evaluation_missing"] == 0

    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain columns as a side effect of calling this function.
    augmented = df.assign(
        underperformer=((df["last_evaluation"] < 0.6) & has_evaluation).astype(int),
        unhappy=(df["satisfaction"] < 0.2).astype(int),
        overachiever=(
            (df["last_evaluation"] > 0.8) & (df["satisfaction"] > 0.7)
        ).astype(int),
    )

    # Replace the categorical columns with dummy (one-hot) columns.
    return pd.get_dummies(augmented, columns=["department", "salary"])

In [28]:
# Load the raw, unseen employee records
raw_data = pd.read_csv("unseen_employee_data.csv")
# NOTE: this preview is never rendered -- a notebook cell only displays the
# value of its last expression.
raw_data.head()

# Create cleaned_data by applying the cleaning steps to the raw records
cleaned_data = clean_data(raw_data)

# Create augmented_data by adding the engineered features
augmented_data = engineer_features(cleaned_data)

# Display first 5 rows
augmented_data.head()

Unnamed: 0,avg_monthly_hrs,filed_complaint,last_evaluation,n_projects,recently_promoted,satisfaction,tenure,last_evaluation_missing,underperformer,unhappy,overachiever,department_IT,department_Missing,department_admin,department_engineering,department_finance,department_information_technology,department_management,department_marketing,department_procurement,department_product,department_sales,department_support,salary_high,salary_low,salary_medium
0,228,0.0,0.735618,2,0.0,0.805661,3.0,0,0,0,0,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False
1,229,0.0,1.0,4,0.0,0.719961,4.0,0,0,0,1,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False
2,196,1.0,0.557426,4,0.0,0.749835,2.0,0,1,0,0,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False
3,207,0.0,0.715171,3,0.0,0.987447,3.0,0,0,0,0,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False
4,129,0.0,0.484818,2,0.0,0.441219,3.0,0,1,0,0,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False


In [29]:
# Predict probabilities for the engineered features using the unpickled
# estimator `clf`
pred = clf.predict_proba(augmented_data)

# Print first 5 predictions (one row per record)
print(pred[:5])


[[1.   0.  ]
 [0.96 0.04]
 [1.   0.  ]
 [1.   0.  ]
 [0.   1.  ]]


## Construct Custom Model Class


In [30]:
class EmployeeRetentionModel:
    """Bundle a pickled estimator with the preprocessing it needs.

    The class loads an estimator from disk and exposes ``predict_proba``,
    which can optionally clean raw records and engineer features before
    delegating to the estimator's own ``predict_proba``.
    """

    # Prefixes of the one-hot columns produced by engineer_features().
    _DUMMY_PREFIXES = ("department_", "salary_")

    def __init__(self, model_location):
        """Load the pickled estimator found at ``model_location``.

        NOTE: ``pickle.load`` can execute arbitrary code embedded in the file;
        only load model files that come from a trusted source.
        """
        with open(model_location, "rb") as f:
            self.model = pickle.load(f)

    def predict_proba(self, X_new, clean=True, augment=True):
        """Preprocess ``X_new`` as requested and predict class probabilities.

        Parameters
        ----------
        X_new : pandas.DataFrame
            Records to score. The frame itself is never modified.
        clean : bool, default True
            Run ``clean_data`` first.
        augment : bool, default True
            Run ``engineer_features``. Afterwards the one-hot columns are
            aligned with the columns the estimator was fitted on, when the
            estimator exposes them through ``feature_names_in_``.

        Returns
        -------
        tuple
            ``(processed_frame, probabilities)`` where ``probabilities`` is
            whatever the wrapped estimator's ``predict_proba`` returns.
        """
        if clean:
            X_new = self.clean_data(X_new)

        if augment:
            X_new = self.engineer_features(X_new)
            X_new = self._align_dummy_columns(X_new)

        return X_new, self.model.predict_proba(X_new)

    def clean_data(self, df):
        """Return a cleaned copy of ``df``.

        Drops duplicate rows and rows whose ``department`` is "temp", fills
        missing ``filed_complaint`` / ``recently_promoted`` with 0, labels
        missing departments "Missing", adds ``last_evaluation_missing``
        (1 = missing) and fills missing ``last_evaluation`` with 0.
        """
        # The explicit copy gives us a frame we own, so the assignments below
        # never write into a view of the caller's data.
        df = df.drop_duplicates()
        df = df[df["department"] != "temp"].copy()

        df["filed_complaint"] = df["filed_complaint"].fillna(0)
        df["recently_promoted"] = df["recently_promoted"].fillna(0)

        # Assign back rather than fillna(inplace=True) on a selected column:
        # the in-place form acts on a temporary Series and does not reliably
        # update the frame (it is a no-op under Copy-on-Write).
        df["department"] = df["department"].fillna("Missing")

        # Compute the indicator before filling, or the information is lost.
        df["last_evaluation_missing"] = df["last_evaluation"].isnull().astype(int)
        df["last_evaluation"] = df["last_evaluation"].fillna(0)

        return df

    def engineer_features(self, df):
        """Return a copy of ``df`` with indicator and one-hot features added.

        Adds ``underperformer`` (evaluation present and below 0.6),
        ``unhappy`` (satisfaction below 0.2) and ``overachiever`` (evaluation
        above 0.8 and satisfaction above 0.7) as 0/1 integers, then replaces
        ``department`` and ``salary`` with ``pd.get_dummies`` columns.
        """
        has_evaluation = df["last_evaluation_missing"] == 0

        # assign() builds a new frame instead of adding columns to the input.
        augmented = df.assign(
            underperformer=(
                (df["last_evaluation"] < 0.6) & has_evaluation
            ).astype(int),
            unhappy=(df["satisfaction"] < 0.2).astype(int),
            overachiever=(
                (df["last_evaluation"] > 0.8) & (df["satisfaction"] > 0.7)
            ).astype(int),
        )

        return pd.get_dummies(augmented, columns=["department", "salary"])

    def _align_dummy_columns(self, df):
        """Match the one-hot columns of ``df`` to those the estimator expects.

        ``pd.get_dummies`` only emits columns for categories present in the
        data it is given, so a small batch (for example a single record) lacks
        most dummy columns. When the estimator exposes ``feature_names_in_``,
        missing dummy columns are added as 0, dummy columns for categories the
        estimator never saw are dropped, and -- if the column sets then match --
        columns are put in the fitted order. Non-dummy columns are never added
        or removed, so a genuinely wrong schema still fails in the estimator
        instead of being papered over. Without ``feature_names_in_`` the frame
        is returned unchanged.
        """
        expected = getattr(self.model, "feature_names_in_", None)
        if expected is None:
            return df

        expected = [str(name) for name in expected]
        known = set(expected)

        def is_dummy(name):
            return isinstance(name, str) and name.startswith(self._DUMMY_PREFIXES)

        kept = [col for col in df.columns if col in known or not is_dummy(col)]
        present = set(kept)
        absent = [
            name for name in expected if is_dummy(name) and name not in present
        ]

        if absent or len(kept) != len(df.columns):
            # fill_value only applies to the newly created columns.
            df = df.reindex(columns=kept + absent, fill_value=0)

        if set(df.columns) == known and list(df.columns) != expected:
            df = df[expected]

        return df

In [31]:
# Initialize an instance; the constructor unpickles the estimator stored in
# final_model.pkl
retention_model = EmployeeRetentionModel("final_model.pkl")

In [32]:
# Predict directly from the raw data: the class cleans it, engineers the
# features and then scores it. predict_proba returns
# (processed frame, probabilities); the processed frame is discarded here.
_, pred1 = retention_model.predict_proba(raw_data, clean=True, augment=True)


In [23]:
# Print the first 5 predictions produced through EmployeeRetentionModel
print(pred1[:5])

[[1.   0.  ]
 [0.96 0.04]
 [1.   0.  ]
 [1.   0.  ]
 [0.   1.  ]]


In [None]:
## Deploy the model using Flask

In [34]:
# Initialize the Flask app; routes are registered on this object with
# @app.route
app = Flask(__name__)

In [None]:
# Prediction endpoint: clients POST a JSON body of the form {"features": [...]}
@app.route('/predict', methods=['POST'])
def predict():
    """Score a single record sent as JSON.

    Expects a JSON object with a ``features`` key holding a flat list of
    numbers, already in the numeric layout the estimator was fitted on.

    Returns a JSON response ``{"prediction": <int>}`` on success, or
    ``{"error": <message>}`` with HTTP status 400 when the body is missing,
    is not valid JSON, or does not contain a usable ``features`` list.
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so every bad request gets the same explicit 400 response.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'features' not in data:
        return jsonify({'error': "Request body must be a JSON object with a 'features' list"}), 400

    try:
        # One record -> a single row with one column per feature.
        features = np.asarray(data['features'], dtype=float).reshape(1, -1)
    except (TypeError, ValueError):
        return jsonify({'error': "'features' must be a flat list of numbers"}), 400

    if features.size == 0:
        return jsonify({'error': "'features' must not be empty"}), 400

    # Use the estimator unpickled into `clf` earlier in the notebook; no
    # variable called `model` is defined anywhere in it.
    prediction = clf.predict(features)

    # Return the prediction as a JSON response
    return jsonify({'prediction': int(prediction[0])})

# Start the Flask development server.
# debug stays off: debug=True enables the Werkzeug reloader, which restarts the
# launching process and therefore does not work from a notebook kernel, and
# the interactive debugger, which lets anyone who can reach the server execute
# arbitrary Python code.
if __name__ == '__main__':
    app.run(debug=False)