<a href="https://colab.research.google.com/github/yzzhang/google-colab-notebooks/blob/master/interview_attendance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from   datetime import datetime
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

%matplotlib inline



The following cell uploads the dataset CSV file (e.g., Interview_Attendance_Data.csv) from your local machine to the Colab runtime. The dataset can be downloaded from Kaggle: https://www.kaggle.com/vishnusraghavan/the-interview-attendance-problem/

In [2]:

from google.colab import files

# Opens the Colab upload widget; the returned mapping is keyed by file name
# and holds the uploaded content of each file.
uploaded = files.upload()

# Report the name and size of every uploaded file.
for fn, content in uploaded.items():
  message = 'User uploaded file "{name}" with length {length} bytes'
  print(message.format(name=fn, length=len(content)))

Saving Interview_Attendance_Data.csv to Interview_Attendance_Data.csv
User uploaded file "Interview_Attendance_Data.csv" with length 224436 bytes


Main Goals:



1. Create a model that predicts whether a candidate will attend an interview. Attendance is indicated by the "Observed Attendance" column in the data set. Build the model using only the records where this column is not null.

2. Provide a probability and a prediction for the candidates where the "Observed Attendance" column is null.

In [0]:
class OneHotEncodeData(BaseEstimator, TransformerMixin):
    '''
    Transformer that one-hot encodes the categorical features listed in
    ``one_hot_feature_names``.

    ``fit`` trains one LabelEncoder and one OneHotEncoder per feature;
    ``transform`` replaces those columns by their one-hot codes and returns
    a NumPy array: the remaining (non-encoded) columns first, followed by the
    one-hot blocks in the order of ``one_hot_feature_names``.
    '''
    def __init__(self):
        self.one_hot_feature_names = ['Client name',
                        'Industry',
                        'Location',
                        'Position to be closed',
                        'Nature of Skillset',
                        'Interview Type',
                        'Gender',
                        'Candidate Current Location',
                        'Candidate Job Location',
                        'Interview Venue',
                        'Candidate Native location',
                        'Have you obtained the necessary permission to start at the required time',
                        'Hope there will be no unscheduled meetings',
                        'Can I Call you three hours before the interview and follow up on your attendance for the interview',
                        'Can I have an alternative number/ desk number. I assure you that I will not trouble you too much',
                        'Have you taken a printout of your updated resume. Have you read the JD and understood the same',
                        'Are you clear with the venue details and the landmark.',
                        'Has the call letter been shared',
                        'Marital Status']
        # Both dictionaries map feature name -> fitted encoder; None until fit() runs.
        self.label_encoders   = None
        self.one_hot_encoders = None

    def fit(self, X, y=None):
        '''
        Train a label encoder and a one-hot encoder for every categorical feature.

        X must expose the columns named in ``one_hot_feature_names``.
        y is ignored. Returns self.
        '''
        label_encoders   = {}
        one_hot_encoders = {}
        for fname in self.one_hot_feature_names:
            label_encoder   = LabelEncoder()
            one_hot_encoder = OneHotEncoder(categories='auto')
            # The one-hot encoder is fitted on the integer codes produced by the label encoder.
            codes = label_encoder.fit_transform(X[fname])
            one_hot_encoder.fit(codes.reshape(-1, 1))
            label_encoders[fname]   = label_encoder
            one_hot_encoders[fname] = one_hot_encoder

        # Keep the fitted encoders so that transform() can reuse them later on.
        self.label_encoders   = label_encoders
        self.one_hot_encoders = one_hot_encoders

        return self

    def transform(self, X, y=None):
        '''
        One-hot encode the categorical features with the encoders trained by fit().

        Returns a NumPy array holding the non-encoded columns of X followed by
        the one-hot codes. Raises NotFittedError when fit() has not been called.
        '''
        if self.label_encoders is None or self.one_hot_encoders is None:
            # Fail with an explicit message instead of "'NoneType' object is not subscriptable".
            from sklearn.exceptions import NotFittedError
            raise NotFittedError('OneHotEncodeData is not fitted yet; call fit() before transform().')

        # Collect the per-feature blocks and concatenate once, rather than
        # re-copying the growing matrix for every feature.
        blocks = [np.zeros((X.shape[0], 0))]
        for fname in self.one_hot_feature_names:
            codes = self.label_encoders[fname].transform(X[fname])
            blocks.append(self.one_hot_encoders[fname].transform(codes.reshape(-1, 1)).toarray())
        one_hot_features = np.hstack(blocks)

        # Drop the original features that have just been one-hot encoded.
        remaining = pd.DataFrame(X).drop(self.one_hot_feature_names, axis=1).values

        # Combine the one-hot codes with the remaining features.
        return np.c_[remaining, one_hot_features]

In [0]:
class FeaturesUppercase(BaseEstimator, TransformerMixin):
    '''
    Transformer that normalises text features (strip + uppercase) and then
    drops the less important features.
    '''
    def __init__(self, feature_names, drop_feature_names):
        '''
        feature_names      - names of the columns whose values are stripped and uppercased.
        drop_feature_names - names of the columns removed after the normalisation.
        '''
        self.feature_names      = feature_names
        self.drop_feature_names = drop_feature_names

    def fit(self, X, y=None):
        '''Nothing to learn; returns self.'''
        return self

    def transform(self, X, y=None):
        '''
        Return a copy of X in which every column in ``feature_names`` has its
        missing values replaced by 'NaN' and its values stripped and uppercased
        (so missing values end up as 'NAN'), and every column in
        ``drop_feature_names`` is removed.
        '''
        X_uppercase = X.copy()

        for fname in self.feature_names:
            # Vectorised string operations keep the row index aligned and hand
            # pandas a real Series instead of a lazy Python 3 ``map`` object.
            # astype(str) lets non-string cells be normalised instead of
            # raising AttributeError on .strip().
            X_uppercase[fname] = (X_uppercase[fname]
                                  .fillna('NaN')
                                  .astype(str)
                                  .str.strip()
                                  .str.upper())

        # drop less important features
        X_uppercase = X_uppercase.drop(self.drop_feature_names, axis=1)

        return X_uppercase

In [0]:
class ParseInterviewDate(BaseEstimator, TransformerMixin):
    '''
    Transformer that splits the 'Date of Interview' column into three numeric
    columns: 'Year', 'Month' and 'Day'. Dates that are missing or cannot be
    parsed are encoded as 0 in all three columns.
    '''
    # strptime templates tried in this order; '{0}' is replaced by the delimiter.
    _DATE_TEMPLATES = ('%d{0}%m{0}%Y',
                       '%d{0}%m{0}%y',
                       '%d{0}%b{0}%Y',
                       '%d{0}%b{0}%y',
                       '%b{0}%d{0}%Y',
                       '%b{0}%d{0}%y')

    # Candidate delimiters, checked in this order; the first one found in the text is used.
    _DELIMITERS = ('.', '/', '-', ' ')

    def __init__(self):
        pass

    def __parseDate(self, string, delimit):
        '''
        Parse a date string whose parts are separated by ``delimit``.

        When the text contains '&', only the part before the first '&' is parsed.
        Returns a datetime, or None when the input is not a string or matches
        none of the supported formats.
        '''
        if not isinstance(string, str):
            print ('TypeError: {}'.format(string))
            return None

        if '&' in string:
            string = string.split('&')[0]

        string = string.strip()

        for template in self._DATE_TEMPLATES:
            try:
                return datetime.strptime(string, template.format(delimit))
            except ValueError:
                # Only a format mismatch moves on to the next template; any
                # other exception is a real error and is allowed to propagate.
                continue
        return None

    def fit(self, X, y=None):
        '''Nothing to learn; returns self.'''
        return self

    def transform(self, X, y=None):
        '''
        Return a copy of X with the added columns 'Year', 'Month' and 'Day'
        derived from 'Date of Interview' (0 means missing or invalid date).
        '''
        X1 = X.copy()

        days, months, years = [], [], []
        for ditem in X1['Date of Interview'].values:
            d = None
            if isinstance(ditem, str) and len(ditem) > 0:
                delimit = next((c for c in self._DELIMITERS if c in ditem), None)
                if delimit is not None:
                    d = self.__parseDate(ditem, delimit)

            if d is None:
                days.append(0)  # 0 - NaN
                months.append(0)
                years.append(0)
            else:
                days.append(d.day)
                months.append(d.month)
                years.append(d.year)

        X1['Year'] = years
        X1['Month'] = months
        X1['Day'] = days

        return X1

In [0]:
class BucketSkillset(BaseEstimator, TransformerMixin):
    '''
    Transformer that re-buckets the skill set and candidate native location
    features: every value that is not in the corresponding list below is
    replaced by the single category 'Others'.
    '''
    def __init__(self):
        # Skill sets kept as their own category.
        self.skillset = ['JAVA/J2EE/Struts/Hibernate', 'Fresher', 'Accounting Operations', 'CDD KYC', 'Routine', 'Oracle',
          'JAVA/SPRING/HIBERNATE/JSF', 'Java J2EE', 'SAS', 'Oracle Plsql', 'Java Developer',
          'Lending and Liabilities', 'Banking Operations', 'Java', 'Core Java', 'Java J2ee', 'T-24 developer',
          'Senior software engineer-Mednet', 'ALS Testing', 'SCCM', 'COTS Developer', 'Analytical R & D',
          'Sr Automation Testing', 'Regulatory', 'Hadoop', 'testing', 'Java', 'ETL', 'Publishing']

        # Native locations kept as their own category.
        self.candidate_locations = ['Chennai', 'Hyderabad', 'Bangalore', 'Gurgaon', 'Cuttack', 'Cochin',
                          'Pune', 'Coimbatore', 'Allahabad', 'Noida', 'Visakapatinam', 'Nagercoil',
                          'Trivandrum', 'Kolkata', 'Trichy', 'Vellore']

    def fit(self, X, y=None):
        '''Nothing to learn; returns self.'''
        return self

    def transform(self, X, y=None):
        '''
        Return a copy of X in which 'Nature of Skillset' and
        'Candidate Native location' only contain the listed categories or 'Others'.

        Matching is exact (case- and whitespace-sensitive); missing values
        become 'Others' as well.
        '''
        X1 = X.copy()

        kept_values = {'Nature of Skillset': self.skillset,
                       'Candidate Native location': self.candidate_locations}
        for fname, kept in kept_values.items():
            column = X1[fname]
            # where/isin keep the original row index, so the result stays
            # aligned with X1 even when its index is not 0..n-1.
            X1[fname] = column.where(column.isin(kept), 'Others')

        return X1

In [0]:
class GridSearch(object):
    '''
    This class finds the best RandomForestClassifier via Grid Search,
    scored by F1.
    '''
    def __init__(self, cv=10, param_grid=None, random_state=None):
        '''
        cv           - cross-validation setting passed to GridSearchCV.
        param_grid   - parameter grid passed to GridSearchCV; when None, the
                       single-point grid n_estimators=68, max_depth=8 is used
                       (recorded as the best values of earlier, wider searches
                       over range(60, 70) and range(5, 10)).
        random_state - seed passed to RandomForestClassifier; None (the default)
                       leaves the forest unseeded.
        '''
        if param_grid is None:
            param_grid = [
                {'n_estimators': range(68,69), # range(60, 70) # best 68
                 'max_depth'   : range(8,9)}   # range(5, 10)}  # best 8
            ]
        self.grid_param = param_grid
        self.cv = cv
        self.random_state = random_state
        self.scoring_function = make_scorer(f1_score, greater_is_better=True)
        self.gridSearch = None

    def fit(self, X, y):
        '''
        Run the grid search on (X, y) and return the best estimator found.
        The fitted GridSearchCV object is kept in ``self.gridSearch``.
        '''
        rfc = RandomForestClassifier(random_state=self.random_state)
        self.gridSearch = GridSearchCV(rfc, self.grid_param, cv=self.cv, scoring=self.scoring_function)
        self.gridSearch.fit(X, y)
        return self.gridSearch.best_estimator_

In [0]:
class PredictInterview(object):
    '''
    This class is to predict the probability of a candidate attending scheduled interviews.

    Typical usage: loadData() -> PreprocessData() -> trainModel() ->
    predictClasses() -> getModelMetrics(), then the predictNull* methods for
    the records whose "Observed Attendance" is missing.
    '''
    def __init__(self):
        self.dataset_file_name = 'Interview_Attendance_Data.csv'

        # Columns whose text values are normalised (stripped + uppercased).
        self.feature_names = ['Date of Interview',
                       'Client name',
                       'Industry',
                       'Location',
                       'Position to be closed',
                       'Nature of Skillset',
                       'Interview Type',
                       'Gender',
                       'Candidate Current Location',
                       'Candidate Job Location',
                       'Interview Venue',
                       'Candidate Native location',
                       'Have you obtained the necessary permission to start at the required time',
                       'Hope there will be no unscheduled meetings',
                       'Can I Call you three hours before the interview and follow up on your attendance for the interview',
                       'Can I have an alternative number/ desk number. I assure you that I will not trouble you too much',
                       'Have you taken a printout of your updated resume. Have you read the JD and understood the same',
                       'Are you clear with the venue details and the landmark.',
                       'Has the call letter been shared', 'Marital Status']

        # Columns removed before one-hot encoding.
        self.drop_feature_names = [
                        'Name(Cand ID)',
                        'Date of Interview',
                        'Unnamed: 22',
                        'Unnamed: 23',
                        'Unnamed: 24',
                        'Unnamed: 25',
                        'Unnamed: 26']

        self.dataset = None
        self.rfc     = None
        self.gridSearch = None
        self.X_train = None
        self.y_train = None
        self.X_test  = None
        self.y_test  = None
        self.y_pred  = None
        self.X_clean = None
        self.y_clean = None
        self.X_train_encoded = None
        self.X_test_encoded  = None
        self.y_train_encoded = None
        self.accuracy_score  = None
        self.f1_score        = None
        self.oneHotEncoder   = None
        self.X_test_name_ids = None
        self.pipeline = None

    def loadData(self, path=None, random_state=None):
        '''
        This method loads a dataset file as a Pandas DataFrame, assuming that the dataset file is in csv format.
        It also shuffles the loaded dataset as part of data preprocessing.

        path         - directory containing the dataset file; None means the current directory.
        random_state - optional seed for the shuffle; None (the default) gives a
                       different order on every call.

        Returns the shuffled DataFrame, which is also stored in ``self.dataset``.
        '''
        if path is not None:
            path = os.path.join(path, self.dataset_file_name)
        else:
            path = self.dataset_file_name

        dataset = pd.read_csv(path)

        # shuffle data and renumber the rows 0..n-1
        self.dataset = dataset.sample(frac=1, random_state=random_state).reset_index(drop=True)

        return self.dataset

    def PreprocessData(self):
        '''
        This method preprocesses the loaded dataset and one-hot encodes it.

        It fits the preprocessing pipeline on all records, then separates the
        labeled records (stored in X_train_encoded / y_train_encoded) from the
        unlabeled ones (X_test_encoded, with their names/IDs in X_test_name_ids).
        Returns None.
        '''
        labels = self.dataset['Observed Attendance']           # extract labels y
        X = self.dataset.drop(['Observed Attendance'], axis=1) # extract features X

        self.oneHotEncoder = OneHotEncodeData()

        self.pipeline = Pipeline([
            ('bucket_skillset', BucketSkillset()),
            ('parse_interview_date', ParseInterviewDate()),
            ('features_to_uppercase', FeaturesUppercase(self.feature_names, self.drop_feature_names)),
            ('one_hot_encoder', self.oneHotEncoder)
        ])

        X_1hot = self.pipeline.fit_transform(X)

        # Fill up missing labels and then change labels to uppercase, so that a
        # missing label becomes the sentinel 'NAN'.
        y_uppercase = (labels.fillna('NaN')
                             .astype(str)
                             .str.strip()
                             .str.upper()
                             .reset_index(drop=True))

        # Positional boolean mask: X_1hot is a NumPy array, so rows are
        # selected by position rather than by index label.
        is_unlabeled = (y_uppercase == 'NAN').values

        # separate labeled records from unlabeled records
        self.X_train_encoded = X_1hot[~is_unlabeled]
        self.X_test_encoded  = X_1hot[is_unlabeled]

        # save Names/ID for reporting later on
        self.X_test_name_ids = self.dataset['Name(Cand ID)'][is_unlabeled]

        # encode labels as follows: 1 - YES, 0 - anything else
        y_train = y_uppercase[~is_unlabeled]
        self.y_train_encoded = (y_train == 'YES').astype('int64').values

        self.X_clean = X_1hot
        self.y_clean = y_uppercase

        return None

    def __splitData(self):
        '''
        This method splits the labeled, encoded records into training (75%) and
        testing (25%) datasets. PreprocessData() must have been called before.
        '''
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X_train_encoded,
                                                                                self.y_train_encoded,
                                                                                test_size = 0.25, random_state = 0)
        return (self.X_train, self.X_test, self.y_train, self.y_test)

    def trainModel(self):
        '''
        This method triggers splitting dataset and then find a best RandomForest model via grid search
        using the training features and labels. Returns the trained model.
        '''
        X_train, X_test, y_train, y_test = self.__splitData()
        self.gridSearch = GridSearch()
        self.rfc = self.gridSearch.fit(X_train, y_train)
        return self.rfc

    def _hasTrainedModel(self):
        '''
        Return True when a trained model is available; otherwise print a
        message and return False.
        '''
        if (self.rfc is None):
            print("No trained model available, please train a model first!")
            return False
        return True

    def predictClasses(self):
        '''
        This method predicts classes (1 - YES, 0 - NO) for the held-out test
        split using a trained model. Returns None when no model is trained.
        '''
        if not self._hasTrainedModel():
            return None

        self.y_pred = self.rfc.predict(self.X_test)
        return self.y_pred

    def getModelMetrics(self):
        '''
        This method obtains the class prediction scores as the tuple
        (accuracy score, F1 score), or None when there are no predictions yet.
        '''
        if (self.y_test is None or self.y_pred is None):
            print('Failed to get model performance metrics because y_test is null or y_pred is null!')
            return None

        self.accuracy_score = accuracy_score(self.y_test, self.y_pred)
        self.f1_score = f1_score(self.y_test, self.y_pred)

        return (self.accuracy_score, self.f1_score)

    def predictNullAttendanceProbability(self):
        '''
        This method uses a trained model to predict the attendance probability for
        the candidates where the "Observed Attendance" column is null.
        Returns None when no model is trained.
        '''
        if not self._hasTrainedModel():
            return None
        return self.rfc.predict_proba(self.X_test_encoded)

    def predictNullAttendanceClasses(self):
        '''
        This method predicts classes (1 - YES, 0 - NO) using a trained model for
        unlabeled data records. Returns None when no model is trained.
        '''
        if not self._hasTrainedModel():
            return None
        return self.rfc.predict(self.X_test_encoded)

    def predictAttendanceProbability(self, X):
        '''
        Given preprocessed (including one-hot encoding) data sample(s) X,
        this method returns the predicted class probabilities.
        Returns None when no model is trained.
        '''
        if not self._hasTrainedModel():
            return None
        return self.rfc.predict_proba(X)

    def predictAttendanceClass(self, X):
        '''
        Given preprocessed (including one-hot encoding) data sample(s) X,
        this method returns the predicted attendance class (1 - YES, 0 - NO).
        Returns None when no model is trained.
        '''
        if not self._hasTrainedModel():
            return None
        return self.rfc.predict(X)

Task 1 

(a) Create a model predicting if a candidate will attend an interview. This will be indicated by the "Observed Attendance" column in the data set. Create the model only using the records where this column is not null

In [9]:
predictInterview = PredictInterview()
predictInterview.loadData()
predictInterview.PreprocessData()
predictInterview.trainModel()
predictInterview.predictClasses()

# Deliberately NOT named accuracy_score / f1_score: those names are the
# sklearn.metrics functions used inside the classes above, and rebinding them
# to floats would break this cell (and GridSearch) on the next run.
test_accuracy, test_f1 = predictInterview.getModelMetrics()

print('accuracy score = {0}, F1 score = {1}'.format(test_accuracy, test_f1))

accuracy score = 0.6596491228070176, F1 score = 0.7759815242494226


Task 1 

(b) Provide a probability and a prediction for the candidates where the "Observed Attendance" column is null.

In [10]:
pred_probs   = predictInterview.predictNullAttendanceProbability()
pred_classes = predictInterview.predictNullAttendanceClasses()

candidate_ids = predictInterview.X_test_name_ids
answers = np.array(['no', 'yes'])  # indexed by the predicted class: 0 -> 'no', 1 -> 'yes'

# Build the frame column by column so that 'Probability' stays numeric;
# going through np.array() on mixed rows would turn every value into a string.
result_df = pd.DataFrame({
    'Names/ID': candidate_ids.values,   # .values: ignore the original row labels
    'Probability': pred_probs[:, 1],    # second column of predict_proba, i.e. class 1 ('yes')
    'Yes/No': answers[pred_classes],
}, columns=['Names/ID', 'Probability', 'Yes/No'])
result_df.to_csv('interview_prediction.csv')
result_df.head(100)

Unnamed: 0,Names/ID,Probability,Yes/No
0,Candidate 100,0.7667504205040899,yes
1,Candidate 427,0.6096332871255677,yes
2,Candidate 20,0.6383762065877965,yes
3,Candidate 10,0.3805053865137806,no
4,Candidate 1033,0.5447302129554077,yes
5,Candidate 436,0.6096332871255677,yes
6,Candidate 829,0.5880159815774028,yes
7,Candidate 300,0.6239011611102354,yes
8,Candidate 919,0.2076289929900855,no
9,Candidate 50,0.5234665066703718,yes
