In [24]:
# print_function for compatibility with Python 3
from __future__ import print_function
# NumPy for numerical computing
import numpy as np
# Pandas for DataFrames
import pandas as pd
pd.set_option('display.max_columns',100)

# Pickle for reading model files
import pickle
# Scikit-Learn for Modeling
import sklearn
from sklearn.model_selection import train_test_split

In [25]:
# Area under ROC curve
from sklearn.metrics import roc_auc_score

In [26]:
# Load final_model.pkl as model.
# Pickle streams are binary, so the file is opened in 'rb' mode (text mode
# 'r' breaks under Python 3 and can corrupt data on platforms that translate
# newlines).
# NOTE(security): pickle.load can execute arbitrary code embedded in the
# file -- only load model files from a trusted source.
with open('final_model.pkl', 'rb') as f:
    final_model = pickle.load(f)

In [27]:
# Display the loaded model object; as the cell's last expression its repr
# is rendered as the cell output.
final_model

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.33, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_i...imators=100, n_jobs=1,
            oob_score=False, random_state=123, verbose=0, warm_start=False))])

In [28]:
# Load the analytical base table (ABT) used in Module 4 from CSV into a
# DataFrame. The path is relative to the notebook's working directory.
abt_df=pd.read_csv('project_files/analytical base table.csv')

In [29]:
# Target vector: the 'status' column of the analytical base table
y = abt_df['status']

# Feature matrix: every column except the target
x = abt_df.drop('status', axis=1)

# Hold out 20% of the rows as a test set. The split is stratified on the
# target and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=.2,
    random_state=1234,
    stratify=y
)


In [30]:
# Class-probability predictions for the held-out test set
pred = final_model.predict_proba(X_test)

# Keep only the second entry of each row: the probability of the
# positive class (1)
pred = [row[1] for row in pred]

# Print the area under the ROC curve on the test set
print(roc_auc_score(y_test, pred))

0.991520189216


In [31]:
# Load the unseen raw data (not yet cleaned or feature-engineered)
raw_data = pd.read_csv('project_files/unseen_raw_data.csv')

# Print (rows, columns), then display the first 5 rows
print(raw_data.shape)
raw_data.head()

(750, 9)


Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,tenure
0,228,management,,0.735618,2,,high,0.805661,3.0
1,229,product,,1.0,4,,low,0.719961,4.0
2,196,sales,1.0,0.557426,4,,low,0.749835,2.0
3,207,IT,,0.715171,3,,high,0.987447,3.0
4,129,management,,0.484818,2,,low,0.441219,3.0


In [32]:
# Column dtypes of the analytical base table, for comparison with the
# dtypes of the raw data.
abt_df.dtypes

avg_monthly_hrs              int64
filed_complaint            float64
last_evaluation            float64
n_projects                   int64
recently_promoted          float64
satisfaction               float64
status                       int64
tenure                     float64
last_evaluation_missing      int64
underperformer               int64
unhappy                      int64
overachiever                 int64
department_IT                int64
department_Missing           int64
department_admin             int64
department_engineering       int64
department_finance           int64
department_management        int64
department_marketing         int64
department_procurement       int64
department_product           int64
department_sales             int64
department_support           int64
salary_high                  int64
salary_low                   int64
salary_medium                int64
dtype: object

In [33]:

# Column dtypes of the unseen raw data, for comparison with the
# analytical base table's dtypes.
raw_data.dtypes

avg_monthly_hrs        int64
department            object
filed_complaint      float64
last_evaluation      float64
n_projects             int64
recently_promoted    float64
salary                object
satisfaction         float64
tenure               float64
dtype: object

In [35]:
def clean_data(df):
    """Clean a raw employee DataFrame and return the cleaned copy.

    Steps, in order:
      1. Drop duplicate rows.
      2. Drop rows whose ``department`` is ``'temp'``.
      3. Fill missing ``filed_complaint`` / ``recently_promoted`` with 0.
      4. Rename department ``'information_technology'`` to ``'IT'`` and fill
         missing departments with ``'Missing'``.
      5. Add ``last_evaluation_missing`` (1 where ``last_evaluation`` was
         null, else 0), then fill missing ``last_evaluation`` with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``department``, ``filed_complaint``,
        ``recently_promoted`` and ``last_evaluation``.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned DataFrame. The input ``df`` is not modified.
    """
    # Drop duplicates
    df = df.drop_duplicates()

    # Drop temporary workers. The explicit copy makes the result an
    # independent frame, so the column assignments below neither raise
    # SettingWithCopyWarning nor depend on view/copy semantics.
    df = df[df['department'] != 'temp'].copy()

    # Missing filed_complaint / recently_promoted values should be 0
    df['filed_complaint'] = df['filed_complaint'].fillna(0)
    df['recently_promoted'] = df['recently_promoted'].fillna(0)

    # 'information_technology' should be 'IT'; missing departments become
    # 'Missing'. Assigned back instead of using inplace=True on a column
    # selection, which is a silent no-op under pandas Copy-on-Write.
    df['department'] = (
        df['department']
        .replace('information_technology', 'IT')
        .fillna('Missing')
    )

    # Indicator variable for missing last_evaluation -- must be computed
    # BEFORE the nulls are filled in below.
    df['last_evaluation_missing'] = df['last_evaluation'].isnull().astype(int)

    # Fill missing values in last_evaluation with 0
    df['last_evaluation'] = df['last_evaluation'].fillna(0)

    # Return cleaned dataframe
    return df

In [36]:
# Create cleaned_data by running the raw data through clean_data()
cleaned_data =clean_data(raw_data)

# Display first 5 rows
cleaned_data.head()

Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,tenure,last_evaluation_missing
0,228,management,0.0,0.735618,2,0.0,high,0.805661,3.0,0
1,229,product,0.0,1.0,4,0.0,low,0.719961,4.0,0
2,196,sales,1.0,0.557426,4,0.0,low,0.749835,2.0,0
3,207,IT,0.0,0.715171,3,0.0,high,0.987447,3.0,0
4,129,management,0.0,0.484818,2,0.0,low,0.441219,3.0,0


In [37]:
def engineer_features(df):
    """Add engineered indicator features and one-hot encode categoricals.

    Indicator columns added (each 0/1):
      * ``underperformer`` -- ``last_evaluation < 0.6`` and the evaluation
        was not missing (``last_evaluation_missing == 0``).
      * ``unhappy`` -- ``satisfaction < 0.2``.
      * ``overachiever`` -- ``last_evaluation > 0.8`` and
        ``satisfaction > 0.7``.

    ``department`` and ``salary`` are then replaced by dummy columns via
    ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned data containing ``last_evaluation``,
        ``last_evaluation_missing``, ``satisfaction``, ``department`` and
        ``salary``.

    Returns
    -------
    pandas.DataFrame
        A new, augmented DataFrame. The input ``df`` is not modified.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    # Create indicator features (assigned one at a time to keep a stable
    # column order: underperformer, unhappy, overachiever).
    df['underperformer'] = (
        (df['last_evaluation'] < .6) & (df['last_evaluation_missing'] == 0)
    ).astype(int)
    df['unhappy'] = (df['satisfaction'] < .2).astype(int)
    df['overachiever'] = (
        (df['last_evaluation'] > .8) & (df['satisfaction'] > .7)
    ).astype(int)

    # Create new dataframe with dummy features.
    # NOTE(review): get_dummies only creates columns for categories present
    # in ``df``; a batch missing some department/salary level will produce
    # fewer columns than a batch that contains every level -- confirm the
    # result matches the columns the downstream model expects.
    df = pd.get_dummies(df, columns=['department', 'salary'])

    # Return augmented DataFrame
    return df

In [38]:
# Create augmented_data by running the cleaned data through
# engineer_features()
augmented_data=engineer_features(cleaned_data)

# Display first 5 rows
augmented_data.head()

Unnamed: 0,avg_monthly_hrs,filed_complaint,last_evaluation,n_projects,recently_promoted,satisfaction,tenure,last_evaluation_missing,underperformer,unhappy,overachiever,department_IT,department_Missing,department_admin,department_engineering,department_finance,department_management,department_marketing,department_procurement,department_product,department_sales,department_support,salary_high,salary_low,salary_medium
0,228,0.0,0.735618,2,0.0,0.805661,3.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
1,229,0.0,1.0,4,0.0,0.719961,4.0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
2,196,1.0,0.557426,4,0.0,0.749835,2.0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
3,207,0.0,0.715171,3,0.0,0.987447,3.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
4,129,0.0,0.484818,2,0.0,0.441219,3.0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0


In [39]:
# Predict class probabilities for the cleaned + augmented unseen data.
# Note: this rebinds `pred`, which an earlier cell used for test-set scores.
pred= final_model.predict_proba(augmented_data)

# Display the first 5 predictions (one row per observation, one column
# per class)
pred[:5]

array([[ 1.  ,  0.  ],
       [ 0.98,  0.02],
       [ 1.  ,  0.  ],
       [ 1.  ,  0.  ],
       [ 0.  ,  1.  ]])

In [40]:
class EmployeeRetentionModel:
    """Bundle a pickled model with its data-cleaning and feature-engineering
    steps so that raw employee records can be scored in one call.
    """

    def __init__(self, model_location):
        """Load the pickled model stored at ``model_location``.

        NOTE(security): ``pickle.load`` can execute arbitrary code embedded
        in the file -- only pass paths to trusted model files.
        """
        with open(model_location, 'rb') as f:
            self.model = pickle.load(f)

    def predict_proba(self, X_new, clean=True, augment=True):
        """Optionally clean and augment ``X_new``, then score it.

        Parameters
        ----------
        X_new : pandas.DataFrame
            Raw, cleaned, or fully augmented data, matching the flags below.
        clean : bool, default True
            Run ``clean_data`` first (use for raw data).
        augment : bool, default True
            Run ``engineer_features`` before predicting.

        Returns
        -------
        tuple
            ``(X_new, probabilities)`` -- the (possibly transformed) frame
            that was fed to the model, and the result of the model's
            ``predict_proba`` on it.
        """
        if clean:
            X_new = self.clean_data(X_new)

        if augment:
            X_new = self.engineer_features(X_new)

        return X_new, self.model.predict_proba(X_new)

    def clean_data(self, df):
        """Return a cleaned copy of a raw employee DataFrame.

        Drops duplicate rows and ``'temp'`` department rows, fills missing
        ``filed_complaint`` / ``recently_promoted`` with 0, renames
        ``'information_technology'`` to ``'IT'``, fills missing departments
        with ``'Missing'``, adds the ``last_evaluation_missing`` 0/1 flag and
        fills missing ``last_evaluation`` with 0. The input is not modified.
        """
        # Drop duplicates
        df = df.drop_duplicates()

        # Drop temporary workers. The explicit copy gives an independent
        # frame so the assignments below are safe (no
        # SettingWithCopyWarning, no reliance on view/copy semantics).
        df = df[df['department'] != 'temp'].copy()

        # Missing filed_complaint / recently_promoted values should be 0
        df['filed_complaint'] = df['filed_complaint'].fillna(0)
        df['recently_promoted'] = df['recently_promoted'].fillna(0)

        # 'information_technology' should be 'IT'; missing departments
        # become 'Missing'. Assigned back rather than using inplace=True on
        # a column selection, which is a silent no-op under pandas
        # Copy-on-Write.
        df['department'] = (
            df['department']
            .replace('information_technology', 'IT')
            .fillna('Missing')
        )

        # Indicator variable for missing last_evaluation -- must be
        # computed BEFORE the nulls are filled in below.
        df['last_evaluation_missing'] = df['last_evaluation'].isnull().astype(int)

        # Fill missing values in last_evaluation with 0
        df['last_evaluation'] = df['last_evaluation'].fillna(0)

        # Return cleaned dataframe
        return df

    def engineer_features(self, df):
        """Return a copy of ``df`` with indicator features and dummy columns.

        Adds ``underperformer`` (``last_evaluation < 0.6`` and evaluation not
        missing), ``unhappy`` (``satisfaction < 0.2``) and ``overachiever``
        (``last_evaluation > 0.8`` and ``satisfaction > 0.7``) as 0/1
        columns, then one-hot encodes ``department`` and ``salary``.
        The input is not modified.
        """
        # Work on a copy so the caller's frame does not silently gain columns.
        df = df.copy()

        # Create indicator features (assigned one at a time to keep a
        # stable column order).
        df['underperformer'] = (
            (df['last_evaluation'] < .6) & (df['last_evaluation_missing'] == 0)
        ).astype(int)
        df['unhappy'] = (df['satisfaction'] < .2).astype(int)
        df['overachiever'] = (
            (df['last_evaluation'] > .8) & (df['satisfaction'] > .7)
        ).astype(int)

        # Create new dataframe with dummy features.
        # NOTE(review): get_dummies only creates columns for categories
        # present in ``df``; confirm the result matches the columns the
        # model expects when a batch lacks some department/salary level.
        df = pd.get_dummies(df, columns=['department', 'salary'])

        # Return augmented DataFrame
        return df

In [41]:
# Initialize an instance; the constructor unpickles the model stored in
# 'final_model.pkl'
retention_model = EmployeeRetentionModel('final_model.pkl')

In [42]:
# Path 1: raw data -- the wrapper both cleans and augments it
_, pred1 = retention_model.predict_proba(
    raw_data,
    clean=True,
    augment=True
)

# Path 2: already-cleaned data -- the wrapper only augments it
_, pred2 = retention_model.predict_proba(
    cleaned_data,
    clean=False,
    augment=True
)

# Path 3: cleaned and augmented data -- the wrapper only predicts
_, pred3 = retention_model.predict_proba(
    augmented_data,
    clean=False,
    augment=False
)

In [43]:
# Sanity check: all three entry points (raw, cleaned, augmented input)
# should yield identical predictions, so this should evaluate to True
np.array_equal(pred1, pred2) and np.array_equal(pred2, pred3)

True