# In [5]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns, leaving the rest as-is.

    Wraps a ``StandardScaler`` that is fitted and applied only to
    ``columns``; every other column of the input frame passes through
    ``transform`` untouched and the original column order is restored.

    :param columns: list of column labels to standardize
    :param copy: forwarded to ``StandardScaler``
    :param with_mean: forwarded to ``StandardScaler``
    :param with_std: forwarded to ``StandardScaler``
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # StandardScaler's options must be passed by keyword: positional use
        # is rejected by current scikit-learn releases.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.columns = columns
        # Keep the constructor arguments as attributes so that
        # BaseEstimator.get_params()/clone()/repr can find them.  The wrapped
        # scaler is configured once here; transform() relies only on
        # ``self.scaler`` and ``self.columns``.
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]``.

        Also records the per-column mean and population variance (ddof=0)
        of the selected columns in ``mean_`` and ``var_``.

        :param X: DataFrame containing at least ``self.columns``
        :param y: ignored by the statistics; forwarded to the wrapped scaler
        :return: self
        """
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Use explicit axis=0 so the statistics are always per-column,
        # independent of how np.mean/np.var dispatch on a DataFrame.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardized.

        :param X: DataFrame containing at least ``self.columns``
        :param y: unused
        :param copy: unused; kept for signature compatibility
        :return: DataFrame with the same columns, column order and index as ``X``
        """
        init_col_order = X.columns
        # Reuse X's index: with a fresh RangeIndex the concat below would
        # align on labels and produce NaN rows for any non-default index.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
    
class absenteeism_model():
    """Score new absenteeism records with a pickled model and scaler.

    Typical use: construct with the two pickle paths, call
    ``load_and_clean_data`` with a CSV, then call one of the
    ``predicted_*`` methods.  Until data has been loaded the
    ``predicted_*`` methods return ``None``.
    """

    def __init__(self, model_file, scaler_file):
        """Load the fitted model and scaler.

        :param model_file: path of the pickled model
        :param scaler_file: path of the pickled scaler
        """
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # files that come from a trusted source.
        with open(model_file, 'rb') as model_fh, open(scaler_file, 'rb') as scaler_fh:
            self.reg = pickle.load(model_fh)
            self.scaler = pickle.load(scaler_fh)
        self.data = None
        self.df_with_predictions = None
        self.preprocessed_data = None

    def load_and_clean_data(self, data_file):
        """
        Clean a raw CSV following the preprocessing Jupyter notebook.

        The raw frame is kept in ``self.df_with_predictions``, the cleaned
        (unscaled) features in ``self.preprocessed_data`` and the scaled
        features in ``self.data``.

        :param data_file: path or file-like object accepted by ``pd.read_csv``
        :return: None
        """
        df = pd.read_csv(data_file, delimiter=',')
        self.df_with_predictions = df.copy()
        df = df.drop(['ID'], axis=1)

        # Collapse the individual reason codes into 4 indicator groups.
        # Comparing the codes directly (instead of get_dummies with
        # drop_first plus label slices) keeps the grouping correct even when
        # a file does not contain reason code 0.
        reasons = df['Reason for Absence']
        reason_type_1 = reasons.between(1, 14).astype(int)
        reason_type_2 = reasons.between(15, 17).astype(int)
        reason_type_3 = reasons.between(18, 21).astype(int)
        reason_type_4 = (reasons >= 22).astype(int)

        # to avoid multicollinearity, drop the 'Reason for Absence' column from df
        df = df.drop(['Reason for Absence'], axis=1)

        # concatenate df and the 4 types of reason for absence
        df = pd.concat([df, reason_type_1, reason_type_2, reason_type_3, reason_type_4], axis=1)

        # Positional rename: assumes the CSV columns (after ID and
        # 'Reason for Absence') come in exactly this order.
        df.columns = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
                      'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children',
                      'Pet', 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']

        # Replace the date by its month (1-12) and weekday (Monday=0).
        dates = pd.to_datetime(df['Date'], format='%d/%m/%Y')
        df['Month'] = dates.dt.month
        df['Day of Week'] = dates.dt.weekday

        feature_columns = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month', 'Day of Week',
                           'Transportation Expense', 'Distance to Work', 'Age',
                           'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children',
                           'Pet']
        df = df[feature_columns].copy()

        # Binarize education: level 1 -> 0, levels 2-4 -> 1.  Any other
        # value maps to NaN and becomes 0 in the fillna below.
        df['Education'] = df['Education'].map({1: 0, 2: 1, 3: 1, 4: 1})

        df = df.fillna(value=0)

        self.preprocessed_data = df.copy()

        self.data = self.scaler.transform(df)

    def predicted_probability(self):
        """Return the second column of ``predict_proba`` for the loaded data.

        :return: 1-D array of probabilities, or None if no data is loaded
        """
        if self.data is not None:
            return self.reg.predict_proba(self.data)[:, 1]

    def predicted_output_category(self):
        """Return the model's predicted class for each loaded row.

        :return: result of ``self.reg.predict``, or None if no data is loaded
        """
        if self.data is not None:
            return self.reg.predict(self.data)

    def predicted_outputs(self):
        """Append 'Probability' and 'Prediction' columns to the cleaned data.

        :return: ``self.preprocessed_data`` with the two columns added,
                 or None if no data is loaded
        """
        if self.data is not None:
            self.preprocessed_data['Probability'] = self.predicted_probability()
            self.preprocessed_data['Prediction'] = self.predicted_output_category()
            return self.preprocessed_data