In [116]:
import pandas as pd,numpy as np, os, pickle as pkl, fancyimpute
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib
class absenteeism():
    """Score absenteeism records with a persisted classifier and scaler.

    Typical use: construct with the two artefact paths, call
    ``load_and_process_data`` once per CSV, then call one of the prediction
    methods.

    Attributes:
        model: object loaded from ``model_file``; the prediction methods call
            its ``predict`` and ``predict_proba``.
        scaler: object loaded from ``scaler_file``; its ``transform`` is
            applied to the scaled feature columns.
        data: model-ready feature frame, or None until a CSV was processed.
        preprocessed_data: copy of the filtered raw rows (before any
            encoding), or None until a CSV was processed.
    """

    # Columns screened for IQR outliers and re-filled by KNN imputation.
    # 'Work load Average/day ' really does end with a space in the CSV header.
    _NUMERIC_COLUMNS = ['Transportation expense', 'Distance from Residence to Work',
                        'Service time', 'Age', 'Work load Average/day ', 'Hit target',
                        'Disciplinary failure', 'Social drinker',
                        'Social smoker', 'Weight', 'Height', 'Body mass index',
                        'Absenteeism time in hours']

    # Raw 'Reason for absence' codes collapsed into groups 1..4 (in this order).
    # Code 0 and any code outside these ranges end up as the all-zero baseline.
    _REASON_GROUPS = (range(1, 15), range(15, 19), range(19, 22), range(22, 29))

    # Exact column order handed to the estimator.
    _FEATURE_ORDER = ['Reason for absence_1', 'Reason for absence_2',
                      'Reason for absence_3', 'Reason for absence_4',
                      'Month of absence', 'Day of the week', 'Seasons',
                      'Transportation expense', 'Distance from Residence to Work',
                      'Service time', 'Work load Average/day ', 'Hit target', 'Son',
                      'Social drinker', 'Height', 'Body mass index']

    # Subset of the features that goes through ``scaler.transform``.
    _SCALED_COLUMNS = ['Transportation expense', 'Distance from Residence to Work',
                       'Service time', 'Work load Average/day ', 'Hit target',
                       'Height', 'Body mass index']

    def __init__(self, model_file, scaler_file):
        """Load the persisted estimator and scaler.

        Args:
            model_file: path passed to ``joblib.load`` for the estimator.
            scaler_file: path passed to ``joblib.load`` for the scaler.
        """
        # NOTE: joblib.load unpickles its input, which can execute arbitrary
        # code -- only point this at artefacts from a trusted source.
        self.model = joblib.load(model_file)
        self.scaler = joblib.load(scaler_file)
        self.data = None
        self.preprocessed_data = None

    def load_and_process_data(self, data_path):
        """Read a ';'-separated CSV and turn it into model-ready features.

        Rows whose 'Month of absence' is 0 are discarded. On success
        ``self.preprocessed_data`` holds a copy of the remaining raw rows and
        ``self.data`` holds the encoded, imputed and scaled feature frame;
        both keep the row labels of the filtered CSV. The head of the scaled
        columns is printed.

        Args:
            data_path: path or file-like object accepted by ``pandas.read_csv``.

        Raises:
            ValueError: if a feature column still contains missing values
                after preprocessing (for example a 'Son' value that has no
                entry in the encoding map).
        """
        df = pd.read_csv(data_path, sep=';')
        # .copy() so later assignments never write into a slice of the CSV frame.
        df = df[df['Month of absence'] != 0].copy()
        raw_rows = df.copy()

        df = self._encode_categoricals(df)
        df = self._impute_outliers(df)
        features = self._build_features(df)

        scaled = self._SCALED_COLUMNS
        features[scaled] = self.scaler.transform(features[scaled])

        # Publish both frames together, only after every step succeeded, so
        # the raw rows and the features can never describe different files.
        self.preprocessed_data = raw_rows
        self.data = features
        print(self.data[scaled].head())

    def _encode_categoricals(self, df):
        """Group the reason codes and binarise Education, Pet and Son."""
        reason = df['Reason for absence'].copy()
        for group, codes in enumerate(self._REASON_GROUPS, start=1):
            # Masks come from the untouched copy, so a freshly written group
            # number can never be re-matched by a later range.
            df.loc[reason.isin(codes), 'Reason for absence'] = group
        df['Education'] = df['Education'].map({1: 0, 2: 1, 3: 1, 4: 1})
        df['Pet'] = df['Pet'].map({0: 0, 1: 1, 2: 1, 4: 1, 5: 1, 8: 1})
        df['Son'] = df['Son'].map({0: 0, 1: 0, 2: 1, 3: 1, 4: 1})
        return df

    def _impute_outliers(self, df):
        """Blank 1.5*IQR outliers in the numeric columns and KNN-fill them.

        The screen/fill cycle repeats until a pass finds nothing missing in
        the numeric columns.
        """
        num_cols = self._NUMERIC_COLUMNS
        while True:
            for col in num_cols:
                q1 = df[col].quantile(0.25)
                q3 = df[col].quantile(0.75)
                reach = 1.5 * (q3 - q1)
                outliers = (df[col] < q1 - reach) | (df[col] > q3 + reach)
                if outliers.any():
                    # mask() upcasts an integer column to float as needed.
                    df[col] = df[col].mask(outliers)
            # Only the numeric columns are imputed, so only they may decide
            # whether another pass is needed; checking every column would spin
            # forever on a gap the imputer never touches.
            if not df[num_cols].isnull().values.any():
                return df
            imputer = fancyimpute.KNN(k=3)
            # Older fancyimpute releases expose `complete`; fall back to
            # `fit_transform` when that method is not available.
            fill = getattr(imputer, 'complete', None) or imputer.fit_transform
            # index=df.index keeps the filled values on their own rows; a
            # default RangeIndex would be aligned by label and misplace or
            # drop values whenever the row filter left gaps in the index.
            df[num_cols] = pd.DataFrame(fill(df[num_cols]), columns=num_cols,
                                        index=df.index)

    def _build_features(self, df):
        """Add the reason indicator columns and select the model's columns."""
        reason = df['Reason for absence']
        for group in range(1, len(self._REASON_GROUPS) + 1):
            # Explicit indicators always yield all four columns, even when a
            # group (or the baseline code 0) is absent from this file.
            df['Reason for absence_%d' % group] = (reason == group).astype(int)
        features = df[self._FEATURE_ORDER].copy()
        incomplete = [col for col in features.columns if features[col].isnull().any()]
        if incomplete:
            raise ValueError('missing values remain in feature columns: %s'
                             % ', '.join(incomplete))
        return features

    def predicted_probability(self):
        """Return column 1 of ``model.predict_proba`` for the loaded rows.

        Returns None when no data has been loaded yet.
        """
        if self.data is None:
            return None
        return self.model.predict_proba(self.data)[:, 1]

    def predicted_class(self):
        """Return ``model.predict`` for the loaded rows, or None if none are loaded."""
        if self.data is None:
            return None
        return self.model.predict(self.data)

    def prediction_with_inputs(self):
        """Attach predictions to the raw rows and return that frame.

        Adds (or overwrites) the 'Prediction' and 'Prediction Prob' columns on
        ``self.preprocessed_data`` and returns it. Returns None when no data
        has been loaded yet.
        """
        if self.data is None:
            return None
        self.preprocessed_data['Prediction'] = self.model.predict(self.data)
        self.preprocessed_data['Prediction Prob'] = self.model.predict_proba(self.data)[:, 1]
        return self.preprocessed_data

In [111]:
# Build the scoring wrapper from the persisted estimator and scaler files.
model = absenteeism(model_file=r'model.pickle', scaler_file=r'scaler.pickle')

In [112]:
import os
# The raw CSV is read from ../data/raw relative to the working directory.
model.load_and_process_data(
    os.path.join(os.pardir, 'data', 'raw', 'Absenteeism_at_work.csv')
)

Imputing row 1/737 with 0 missing, elapsed time: 0.077
Imputing row 101/737 with 1 missing, elapsed time: 0.079
Imputing row 201/737 with 1 missing, elapsed time: 0.079
Imputing row 301/737 with 1 missing, elapsed time: 0.081
Imputing row 401/737 with 1 missing, elapsed time: 0.082
Imputing row 501/737 with 0 missing, elapsed time: 0.083
Imputing row 601/737 with 0 missing, elapsed time: 0.083
Imputing row 701/737 with 0 missing, elapsed time: 0.084
   Transportation expense  Distance from Residence to Work  Service time  \
0                1.031569                         0.429824      0.133912   
1               -1.553079                        -1.120707      1.331642   
2               -0.631070                         1.441040      1.331642   
3                0.880420                        -1.660022      0.373458   
4                1.031569                         0.429824      0.133912   

   Work load Average/day   Hit target    Height  Body mass index  
0               -0.841

In [113]:
# Preview the model-ready features; head() keeps the cell output small
# instead of rendering every processed row.
model.data.head(10)

Unnamed: 0,Reason for absence_1,Reason for absence_2,Reason for absence_3,Reason for absence_4,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Work load Average/day,Hit target,Son,Social drinker,Height,Body mass index
0,0,0,0,1,7,3,1,1.031569,0.429824,0.133912,-0.841499,0.670048,1,1.0,0.887013,0.782414
1,0,0,0,0,7,3,1,-1.553079,-1.120707,1.331642,-0.841499,0.670048,0,1.0,0.887018,1.016535
2,0,0,0,1,7,4,1,-0.631070,1.441040,1.331642,-0.841499,0.670048,0,1.0,-0.143485,1.016535
3,1,0,0,0,7,5,1,0.880420,-1.660022,0.373458,-0.841499,0.670048,1,1.0,-1.173983,-0.622310
4,0,0,0,1,7,5,1,1.031569,0.429824,0.133912,-0.841499,0.670048,1,1.0,0.887013,0.782414
5,0,0,0,1,7,6,1,-0.631070,1.441040,1.331642,-0.841499,0.670048,0,1.0,-0.143485,1.016535
6,0,0,0,1,7,6,1,2.119842,1.508454,-2.261548,-0.841499,0.670048,0,1.0,0.887013,0.080052
7,0,0,0,1,7,6,1,0.593237,1.373625,-0.345180,-0.841499,0.670048,1,1.0,-1.173983,-0.856431
8,0,0,1,0,7,2,1,-0.993828,-1.188122,0.373458,-0.841499,0.670048,1,1.0,0.887017,-0.388189
9,0,0,0,1,7,2,1,0.215364,-1.255536,0.373458,-0.841499,0.670048,0,0.0,0.887013,0.548294


In [114]:
# Coefficients of the wrapped estimator. The recorded output holds 16 values,
# matching the 16 feature columns of model.data -- presumably in the same
# order; verify against the training notebook.
model.model.coef_

array([[ 2.18789311,  1.95207458,  2.07277999,  0.62039073,  0.04807646,
        -0.08154004, -0.34185613,  0.52955715, -0.0794636 , -0.02714478,
         0.08941637, -0.10183254,  0.62221902,  0.40359874,  0.11107644,
         0.06070335]])

In [115]:
# Raw input rows with the 'Prediction' and 'Prediction Prob' columns attached;
# head() keeps the cell output small instead of rendering every row.
model.prediction_with_inputs().head(10)

Unnamed: 0,ID,Reason for absence,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,...,Son,Social drinker,Social smoker,Pet,Weight,Height,Body mass index,Absenteeism time in hours,Prediction,Prediction Prob
0,11,26,7,3,1,289,36,13,33,239.554,...,2,1,0,1,90,172,30,4,1,0.729151
1,36,0,7,3,1,118,13,18,50,239.554,...,1,1,0,0,98,178,31,0,0,0.180037
2,3,23,7,4,1,179,51,18,38,239.554,...,0,1,0,0,89,170,31,2,0,0.308526
3,7,7,7,5,1,279,5,14,39,239.554,...,2,1,1,0,68,168,24,4,1,0.896606
4,11,23,7,5,1,289,36,13,33,239.554,...,2,1,0,1,90,172,30,2,1,0.695771
5,3,23,7,6,1,179,51,18,38,239.554,...,0,1,0,0,89,170,31,2,0,0.274861
6,10,22,7,6,1,361,52,3,28,239.554,...,1,1,0,4,80,172,27,8,1,0.653953
7,20,23,7,6,1,260,50,11,36,239.554,...,4,1,0,0,65,168,23,4,1,0.530755
8,14,19,7,2,1,155,12,14,34,239.554,...,2,1,0,0,95,196,25,40,1,0.817976
9,1,22,7,2,1,235,11,14,37,239.554,...,1,0,0,1,88,172,29,8,0,0.432177


In [80]:
# Per-column variances stored on the loaded scaler.
# NOTE(review): the recorded output is exactly 1.0 for all seven columns --
# TODO confirm the scaler was fitted on raw (not already standardised) data.
model.scaler.var_

array([1., 1., 1., 1., 1., 1., 1.])

In [74]:
# The seven columns that are passed through the scaler.
num_cols = [
    'Transportation expense',
    'Distance from Residence to Work',
    'Service time',
    'Work load Average/day ',  # trailing space is part of the column name
    'Hit target',
    'Height',
    'Body mass index',
]

In [108]:
# Number of predictions produced for the processed feature frame.
# NOTE(review): the recorded result (737) differs from the 740 rows recorded
# for model.preprocessed_data, although the class as shown copies the rows
# after dropping 'Month of absence' == 0. The execution counts are out of
# order, so these outputs presumably predate the current class definition --
# restart the kernel and re-run to confirm.
len(model.model.predict(model.data))

737

In [109]:
# Number of raw rows kept alongside the features.
# NOTE(review): the recorded result (740) differs from the 737 predictions
# recorded for model.data; with the class as shown both should be equal, so
# this output presumably comes from an older definition -- re-run to confirm.
len(model.preprocessed_data)

740

In [98]:
# Path of the persisted scaler, kept in a scratch variable for inspection.
a = 'scaler.pickle'

In [99]:
# Load the scaler file directly to inspect the object it contains.
# joblib.load unpickles its input, so only use it on trusted files.
joblib.load(a)

StandardScaler(copy=True, with_mean=True, with_std=True)