# 1. Definition of user-defined functions

In this section, I define all the user-defined functions needed in this project. Most of them apply robust pre-processing methods to the training data set, e.g. winsorising the data robustly (i.e. using the median instead of the mean, and the MAD (median absolute deviation) instead of the standard deviation).

In [1]:
# Function to calculate the MAD (Median Absolute Deviation) while ignoring NAs.
#
# x: a numeric vector (may contain NA).
#
# Returns the median absolute deviation from the median, multiplied by 1.4826
# (the default scaling constant of stats::mad()). A MAD of exactly 0 is
# replaced by 0.1 so that the result can be used as a divisor. If every value
# is NA the result is NA. Non-numeric input is returned unchanged.
mad_na <- function(x) {
  if (!is.numeric(x)) {
    return(x)
  }
  centre <- median(x, na.rm = TRUE)
  spread <- median(abs(x - centre), na.rm = TRUE) * 1.4826
  # Guard against NA first: `NA == 0` is NA and would make `if` fail.
  if (!is.na(spread) && spread == 0) {
    spread <- 0.1
  }
  spread
}

In [2]:
# Function to transform values into robust z-scores.
#
# x:      numeric vector or matrix.
# center: TRUE, or the value to subtract from x.
# scale:  TRUE, or the value to divide by.
#
# When both `center` and `scale` are exactly TRUE (the default), x is
# standardised with its own median and MAD (via mad_na()), computed over all
# values of x. Otherwise the supplied values are applied as
# (x - center) / scale, e.g. to apply training-set statistics to test data.
z_rob <- function(x, center = TRUE, scale = TRUE) {
  # isTRUE() rather than `== TRUE`: a numeric 1 compares equal to TRUE and
  # would otherwise be mistaken for "estimate the statistic from x".
  if (isTRUE(center) && isTRUE(scale)) {
    return((x - median(x, na.rm = TRUE)) / mad_na(x))
  }
  (x - center) / scale
}

In [3]:
# Function to winsorise extreme values (|z| > cut, default 2.57).
#
# x:   a data frame, or anything as.data.frame() can convert.
# cut: z-score threshold beyond which values are pulled back.
#
# Every numeric column is standardised with its own mean and standard
# deviation; values whose z-score lies outside [-cut, cut] are replaced by the
# value at the threshold, expressed on the original scale
# (mean +/- cut * sd). NAs stay NA and non-numeric columns are left untouched.
# Returns a data frame with the same columns as the input.
wins <- function(x, cut = 2.57) {
  x <- as.data.frame(x)
  is_num <- vapply(x, is.numeric, logical(1))
  x[is_num] <- lapply(x[is_num], function(column) {
    centre <- mean(column, na.rm = TRUE)
    spread <- sd(column, na.rm = TRUE)
    # With fewer than two observed values sd() is NA; nothing to winsorise.
    if (is.na(spread)) {
      return(column)
    }
    # Clamping z to [-cut, cut] and back-transforming is the same as clamping
    # the raw values to [centre - cut * spread, centre + cut * spread].
    pmin(pmax(column, centre - cut * spread), centre + cut * spread)
  })
  x
}

In [4]:
# Robust function to winsorise extreme values (|z| > cut, default 2.57) using
# externally supplied centers and scales (e.g. training-set medians and MADs).
#
# x:         data frame or matrix whose columns are all numeric.
# cut:       non-negative z-score threshold.
# centers:   one centre per column of x (same order as the columns).
# scales:    one scale per column of x (same order as the columns).
# transform: if TRUE, return the winsorised z-scores; otherwise (default)
#            return the winsorised values on the original scale.
#
# Returns a data frame with the column names of x.
wins_rob <- function(x, cut = 2.57, centers, scales, transform = FALSE) {
  x <- as.data.frame(x)
  # Accept plain vectors as well as list-like inputs such as a one-row
  # data frame of per-column statistics.
  centers <- as.numeric(unlist(centers, use.names = FALSE))
  scales <- as.numeric(unlist(scales, use.names = FALSE))
  n_col <- ncol(x)
  if (length(centers) < n_col || length(scales) < n_col) {
    stop("`centers` and `scales` need one value per column of `x`.",
         call. = FALSE)
  }
  as_z <- isTRUE(as.logical(transform))

  result <- matrix(NA_real_, nrow = nrow(x), ncol = n_col)
  # seq_len() so that a zero-column input simply yields an empty result.
  for (i in seq_len(n_col)) {
    z <- (x[[i]] - centers[i]) / scales[i]
    z <- pmin(pmax(z, -cut), cut)
    result[, i] <- if (as_z) z else z * scales[i] + centers[i]
  }

  result <- as.data.frame(result)
  names(result) <- names(x)
  result
}

In [5]:
# Function to detect multivariate extreme values with Mahalanobis' Distance (MD)
# (adapted from a Stack Overflow answer).
#
# x:       a data frame, or anything as.data.frame() can convert.
# showout: FALSE (default) returns the rows that are NOT flagged;
#          TRUE returns only the flagged rows.
#
# The squared distance of every row is computed on the numeric columns only,
# using the column means and the covariance matrix of the complete
# observations. A row is flagged when its distance exceeds the 0.999 quantile
# of the chi-squared distribution with one degree of freedom per numeric
# column. Rows whose distance is NA are never flagged.
# All columns of x (numeric or not) are returned, always as a data frame.
md <- function(x, showout = FALSE) {
  x <- as.data.frame(x)
  numeric_part <- x[vapply(x, is.numeric, logical(1))]

  # Keep the distances in their own vector instead of a temporary column so
  # that an existing column called "MD" (or "x") cannot interfere.
  distance <- mahalanobis(
    numeric_part,
    colMeans(numeric_part, na.rm = TRUE),
    cov(numeric_part, use = "complete.obs")
  )
  crit <- qchisq(.999, ncol(numeric_part))
  flagged <- !is.na(distance) & distance > crit

  if (isTRUE(as.logical(showout))) {
    x[flagged, , drop = FALSE]
  } else {
    x[!flagged, , drop = FALSE]
  }
}

In [6]:
# Function to create a mosaic plot using ggplot2
# (adapted from a Stack Overflow answer).
#
# data:        data frame containing both variables.
# x, y:        unquoted column names; x defines the bars (bar width = count of
#              each x level), y the segments within a bar (segment height =
#              proportion of each y level within that x level).
# statDigits:  decimals used for the chi-squared statistic in the title.
# residDigits: decimals used for the standardised residuals in the cells.
# pDigits:     significant digits used for the p-value in the title.
# ...:         accepted for compatibility; not used.
#
# Returns a ggplot object. Each cell is labelled with its standardised
# residual from chisq.test() on the x-by-y contingency table.
ggMMplot <- function(data, x, y, statDigits = 1, residDigits = 1, pDigits = 3, ...){
  xvar <- deparse(substitute(x))
  yvar <- deparse(substitute(y))
  cell_counts <- table(data[c(xvar, yvar)])
  n_x <- nrow(cell_counts)
  n_y <- ncol(cell_counts)

  # Cumulative bar edges on the x axis (counts) and, per x level, cumulative
  # segment edges on the y axis (proportions). `heights` is (n_y + 1) x n_x.
  widths <- c(0, cumsum(unname(rowSums(cell_counts))))
  heights <- apply(cell_counts, 1, function(counts) {
    c(0, cumsum(counts / sum(counts)))
  })

  # One row per cell, x level varying slowest - the same order in which the
  # residuals are flattened below.
  x_id <- rep(seq_len(n_x), each = n_y)
  y_id <- rep(seq_len(n_y), times = n_x)
  cells <- data.frame(
    xmin = widths[x_id],
    xmax = widths[x_id + 1L],
    ymin = heights[cbind(y_id, x_id)],
    ymax = heights[cbind(y_id + 1L, x_id)]
  )
  # Fixed column name for the fill so that no code has to be built from the
  # user's column name (which may not be a syntactic R name).
  cells$fill_level <- colnames(cell_counts)[y_id]

  test <- chisq.test(cell_counts)
  cells$xcent <- (cells$xmin + cells$xmax) / 2
  cells$ycent <- (cells$ymin + cells$ymax) / 2
  # t() so that the residuals are read row by row, matching `cells`.
  cells$stdres <- round(as.vector(t(test$stdres)), residDigits)

  titleTxt1 <- paste0("Mosaic plot of ",
                      yvar,
                      " against ",
                      xvar,
                      ", ")
  titleTxt2 <- paste0("chisq(",
                      test$parameter,
                      ") = ",
                      round(test$statistic, statDigits),
                      ", p = ",
                      format.pval(test$p.value, digits = pDigits))
  titleTxt <- paste0(titleTxt1, titleTxt2)
  subTitleTxt <- "Cell labels are standardised residuals"

  ggplot(data = cells,
         aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax)) +
    geom_rect(color = "black", aes(fill = fill_level)) +
    geom_text(aes(x = xcent, y = ycent, label = stdres)) +
    labs(fill = yvar) +
    xlab(paste0("Count of '",
                xvar,
                "', total = ",
                max(cells$xmax))) + # tweaked by CE
    ylab(paste0("Proportion of '",
                yvar,
                "' per level of '",
                xvar,
                "'")) +
    ggtitle(titleTxt,
            subtitle = subTitleTxt) +
    theme_bw() +
    theme(plot.title = element_text(hjust = .5),
          plot.subtitle = element_text(hjust = .5))
}

In [7]:
# Function to display numerous plots in a grid
# (adapted from https://rstudio-pubs-static.s3.amazonaws.com/285012_aca1535265c24407bf17e05741936f7f.html)
#
# ...:      plot objects to draw.
# plotlist: optional list of further plot objects, appended after `...`.
# file:     accepted for compatibility; not used.
# cols:     number of columns in the grid (ignored when `layout` is given).
# layout:   optional matrix of plot indices; plot i is drawn in the cell(s)
#           of the matrix that contain the value i.
#
# With no plots nothing is drawn. A single plot is simply printed. Otherwise a
# new grid page is opened and every plot is printed into its viewport.
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  num_plots <- length(plots)

  # Nothing to draw: return before touching the graphics device.
  if (num_plots == 0) {
    return(invisible(NULL))
  }

  if (num_plots == 1) {
    print(plots[[1]])
  } else {
    # If layout is NULL, then use 'cols' to determine layout:
    # ncol = number of columns, nrow = rows needed for that many columns.
    if (is.null(layout)) {
      n_rows <- ceiling(num_plots / cols)
      layout <- matrix(seq(1, cols * n_rows), ncol = cols, nrow = n_rows)
    }

    # Set up the page. grid is called via `grid::` so that the package does
    # not have to be attached from inside the function.
    grid::grid.newpage()
    grid::pushViewport(
      grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout)))
    )

    # Make each plot, in the correct location
    for (i in seq_len(num_plots)) {
      # Get the row/column positions of the regions that contain this subplot
      cell <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = grid::viewport(layout.pos.row = cell$row,
                                            layout.pos.col = cell$col))
    }
  }
}

# 2. Save user-defined functions in local folder
To re-use the user-defined functions in other notebooks, I will save them as an R object that those notebooks can load. I know there are more elegant ways to do this (e.g. creating my own package), but for now, this workaround will do. 

In [8]:
# Set the working directory to the project folder, so that the relative path
# used when saving the functions resolves against it.
# NOTE(review): hard-coded, machine-specific absolute path - adjust it before
# running this notebook on another machine.
setwd("C:/Users/veren/github/ML_Project_Predict_Employee_Performance")

In [9]:
# Save the user-defined functions to an .RData file (path relative to the
# working directory).
# save.image() stores every object in the current workspace, so anything else
# defined in this session is written to the file alongside the functions.
save.image(file = "03_Objects/ud_functions.RData")