Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

file 206 lines (182 sloc) 7.408 kb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206
# These functions provide template for creating common plots.
# They are also useful to illustrate some different capabilities of
# ggplot.

# Parallel coordinates plot.
# Generate a plot ``template'' for a parallel coordinates plot.
#
# One way to think about a parallel coordinates plot, is as plotting
# the data after it has transformation been transformed to gain a new
# variable. This function does this using \code{\link[reshape]{melt}}.
#
# This gives us enormous flexibility as we have separated out the
# type of drawing (lines by tradition) and can now use any of the existing
# geom functions. In particular this makes it very easy to create parallel
# boxplots, as shown in the example.
#
# Three different scaling function are available:
# \itemize{
# \item "range": scale coordinates to have common range $[0, 1]
# \item "var": scale coordinates to have mean 0 and variance 1
# \item "I": don't scale the coordinates at all
# }
# @arguments data frame
# @arguments variables to include in parallel coordinates plot
# @arguments scaling function, one of "range", "var" or "I"
# @arguments other arguments passed on plot creation
# @keyword hplot
#X ggpcp(mtcars) + geom_line()
#X ggpcp(mtcars, scale="var") + geom_line()
#X ggpcp(mtcars, vars=names(mtcars)[3:6], formula= . ~cyl, scale="I") + geom_line()
#X ggpcp(mtcars, scale="I") + geom_boxplot(aes(group=variable))
#X ggpcp(mtcars, vars=names(mtcars[2:6])) + geom_line()
#X p <- ggpcp(mtcars, vars=names(mtcars[2:6]))
#X p + geom_line()
#X p + geom_line(aes(colour=mpg))
ggpcp <- function(data, vars=names(data), scale="range", ...) {
  force(vars)
  scaled <- rescaler(data[, vars], type=scale)
  data <- cunion(scaled, data)
  
  data$ROWID <- 1:nrow(data)
  molten <- melt(data, m=vars)

  ggplot(molten, aes_string(x = "variable", y = "value", group = "ROWID"), ...)
}

# Fluctuation plot
# Create a fluctuation plot.
#
# A fluctutation diagram is a graphical representation of a contingency
# table. This fuction currently only supports 2D contingency tabless
# but extension to more should be relatively straightforward.
#
# With the default size fluctuation diagram, area is proportional to the
# count (length of sides proportional to sqrt(count))
#
# @arguments a table of values, or a data frame with three columns, the last column being frequency
# @arguments size, or colour to create traditional heatmap
# @arguments don't display cells smaller than this value
# @arguments
# @keyword hplot
#X ggfluctuation(table(movies$Action, movies$Comedy))
#X ggfluctuation(table(movies$Action, movies$mpaa))
#X ggfluctuation(table(movies$Action, movies$Comedy), type="colour")
#X ggfluctuation(table(warpbreaks$breaks, warpbreaks$tension))
ggfluctuation <- function(table, type="size", floor=0, ceiling=max(table$freq, na.rm=TRUE)) {
  if (is.table(table)) table <- as.data.frame(t(table))

  oldnames <- names(table)
  names(table) <- c("x","y", "result")
  
  table <- add.all.combinations(table, list("x","y"))
  table <- transform(table,
    x = as.factor(x),
    y = as.factor(y),
    freq = result
 )

  if (type =="size") {
    table <- transform(table,
      freq = sqrt(pmin(freq, ceiling) / ceiling),
      border = ifelse(is.na(freq), "grey90", ifelse(freq > ceiling, "grey30", "grey50"))
    )
    table[is.na(table$freq), "freq"] <- 1
    table <- subset(table, freq * ceiling >= floor)
  }

  if (type=="size") {
    nx <- length(levels(table$x))
    ny <- length(levels(table$y))
    
    p <- ggplot(table,
      aes_string(x="x", y="y", height="freq", width="freq", fill="border")) +
      geom_tile(colour="white") +
      scale_fill_identity() +
      opts(aspect.ratio = ny / nx)

      # geom_rect(aes(xmin = as.numeric(x), ymin = as.numeric(y), xmax = as.numeric(x) + freq, ymax = as.numeric(y) + freq), colour="white") +
    
  } else {
    p <- ggplot(table, aes_string(x="x", y="y", fill="freq")) +
      geom_tile(colour="grey50") +
      scale_fill_gradient2(low="white", high="darkgreen")
  }

  p$xlabel <- oldnames[1]
  p$ylabel <- oldnames[2]
  p
}

# Missing values plot
# Create a plot to illustrate patterns of missing values
#
# The missing values plot is a useful tool to get a rapid
# overview of the number of missings in a dataset. It's strength
# is much more apparent when used with interactive graphics, as you can
# see in Mondrian (\url{http://rosuda.org/mondrian}) where this plot was
# copied from.
#
# @arguments data.frame
# @arguments whether missings should be stacked or dodged, see \code{\link{geom_bar}} for more details
# @arguments whether variable should be ordered by number of missings
# @arguments whether only variables containing some missing values should be shown
# @keyword hplot
# @seealso \code{\link{ggstructure}}, \code{\link{ggorder}}
#X mmissing <- movies
#X mmissing[sample(nrow(movies), 1000), sample(ncol(movies), 5)] <- NA
#X ggmissing(mmissing)
#X ggmissing(mmissing, order=FALSE, missing.only = FALSE)
#X ggmissing(mmissing, avoid="dodge") + scale_y_sqrt()
ggmissing <- function(data, avoid="stack", order=TRUE, missing.only = TRUE) {
  missings <- mapply(function(var, name) cbind(as.data.frame(table(missing=factor(is.na(var), levels=c(TRUE, FALSE), labels=c("yes", "no")))), variable=name),
    data, names(data), SIMPLIFY=FALSE
  )
  df <- do.call("rbind", missings)
  
  prop <- df[df$missing == "yes", "Freq"] / (df[df$missing == "no", "Freq"] + df[df$missing == "yes", "Freq"])
  df$prop <- rep(prop, each=2)
  
  if (order) {
    var <- df$variable
    var <- factor(var, levels = levels(var)[order(1 - prop)])
    df$variable <- var
  }

  if (missing.only) {
    df <- df[df$prop > 0 & df$prop < 1, , drop=FALSE]
    df$variable <- factor(df$variable)
  }
  
  ggplot(df, aes_string(y="Freq", x="variable", fill="missing")) + geom_bar(position=avoid)
}

# Structure plot
# A plot which aims to reveal gross structural anomalies in the data
#
# @arguments data set to plot
# @arguments type of scaling to use. See \code{\link[reshape]{rescaler}} for options
# @keyword hplot
#X ggstructure(mtcars)
ggstructure <- function(data, scale = "rank") {
  ggpcp(data, scale=scale) +
    aes_string(y="ROWID", fill="value", x="variable") +
    geom_tile() +
    scale_y_continuous("row number", expand = c(0, 1)) +
    scale_fill_gradient2(low="blue", mid="white", high="red", midpoint=0)
}

# Order plot
# A plot to investigate the order in which observations were recorded.
#
# @arguments data set to plot
# @arguments type of scaling to use. See \code{\link[reshape]{rescaler}} for options
# @keyword hplot
ggorder <- function(data, scale="rank") {
  ggpcp(data, scale="rank") +
    aes_string(x="ROWID", group="variable", y="value") +
    facet_grid(. ~ variable) +
    geom_line() +
    scale_x_continuous("row number")
}

# Distribution plot
# Experimental template
#
# @keyword internal
ggdist <- function(data, vars=names(data), facets = . ~ .) {
  cat <- sapply(data[vars], is.factor)
  facets <- deparse(substitute(facets))
  
  grid.newpage()
  pushViewport(viewport(layout=grid.layout(ncol = ncol(data))))
  
  mapply(function(name, cat, i) {
    p <- ggplot(data) +
      facet_grid(facets) +
      aes_string(x=name, y=1) +
      geom_bar()

    pushViewport(viewport(layout.pos.col=i))
    grid.draw(ggplotGrob(p))
    popViewport()
  }, names(data[vars]), cat, 1:ncol(data[vars]))
  invisible()
  
}
Something went wrong with that request. Please try again.