Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Added clustergram.r

  • Loading branch information...
commit b2f542afea2494313d5fd830b0e99fa606455033 1 parent 269711d
Tal Galili authored

Showing 1 changed file with 166 additions and 0 deletions. Show diff stats Hide diff stats

  1. +166 0 clustergram.r
166 clustergram.r
... ... @@ -0,0 +1,166 @@
  1 +###### published on:
  2 +# http://www.r-statistics.com/2010/06/clustergram-visualization-and-diagnostics-for-cluster-analysis-r-code/
  3 +## Main author of the function: Tal Galili (tal.galili@gmail.com)
  4 +
  5 +
  6 +
  7 +clustergram.kmeans <- function(Data, k, ...)
  8 +{
  9 + # this is the type of function that the clustergram
  10 + # function takes for the clustering.
  11 + # using similar structure will allow implementation of different clustering algorithms
  12 +
  13 + # It returns a list with two elements:
  14 + # cluster = a vector of length of n (the number of subjects/items)
  15 + # indicating to which cluster each item belongs.
  16 + # centers = a k dimensional vector. Each element is 1 number that represent that cluster
  17 + # In our case, we are using the weighted mean of the cluster dimensions by
  18 + # Using the first component (loading) of the PCA of the Data.
  19 +
  20 + cl <- kmeans(Data, k,...)
  21 +
  22 + cluster <- cl$cluster
  23 + centers <- cl$centers %*% princomp(Data)$loadings[,1] # 1 number per center
  24 + # here we are using the weighted mean for each
  25 +
  26 + return(list(
  27 + cluster = cluster,
  28 + centers = centers
  29 + ))
  30 +}
  31 +
  32 +clustergram.plot.matlines <- function(X,Y, k.range,
  33 + x.range, y.range , COL,
  34 + add.center.points , centers.points)
  35 + {
  36 + plot(0,0, col = "white", xlim = x.range, ylim = y.range,
  37 + axes = F,
  38 + xlab = "Number of clusters (k)", ylab = "PCA weighted Mean of the clusters", main = c("Clustergram of the PCA-weighted Mean of" ,"the clusters k-mean clusters vs number of clusters (k)"))
  39 + axis(side =1, at = k.range)
  40 + axis(side =2)
  41 + abline(v = k.range, col = "grey")
  42 +
  43 + matlines(t(X), t(Y), pch = 19, col = COL, lty = 1, lwd = 1.5)
  44 +
  45 + if(add.center.points)
  46 + {
  47 + require(plyr)
  48 +
  49 + xx <- ldply(centers.points, rbind)
  50 + points(xx$y~xx$x, pch = 19, col = "red", cex = 1.3)
  51 +
  52 + # add points
  53 + # temp <- l_ply(centers.points, function(xx) {
  54 + # with(xx,points(y~x, pch = 19, col = "red", cex = 1.3))
  55 + # points(xx$y~xx$x, pch = 19, col = "red", cex = 1.3)
  56 + # return(1)
  57 + # })
  58 + # We assign the lapply to a variable (temp) only to suppress the lapply "NULL" output
  59 + }
  60 + }
  61 +
  62 +
  63 +
  64 +clustergram <- function(Data, k.range = 2:10 ,
  65 + clustering.function = clustergram.kmeans,
  66 + clustergram.plot = clustergram.plot.matlines,
  67 + line.width = .004, add.center.points = T)
  68 +{
  69 + # Data - should be a scales matrix. Where each column belongs to a different dimension of the observations
  70 + # k.range - is a vector with the number of clusters to plot the clustergram for
  71 + # clustering.function - this is not really used, but offers a bases to later extend the function to other algorithms
  72 + # Although that would more work on the code
  73 + # line.width - is the amount to lift each line in the plot so they won't superimpose eachother
  74 + # add.center.points - just assures that we want to plot points of the cluster means
  75 +
  76 + n <- dim(Data)[1]
  77 +
  78 + PCA.1 <- Data %*% princomp(Data)$loadings[,1] # first principal component of our data
  79 +
  80 + if(require(colorspace)) {
  81 + COL <- heat_hcl(n)[order(PCA.1)] # line colors
  82 + } else {
  83 + COL <- rainbow(n)[order(PCA.1)] # line colors
  84 + warning('Please consider installing the package "colorspace" for prittier colors')
  85 + }
  86 +
  87 + line.width <- rep(line.width, n)
  88 +
  89 + Y <- NULL # Y matrix
  90 + X <- NULL # X matrix
  91 +
  92 + centers.points <- list()
  93 +
  94 + for(k in k.range)
  95 + {
  96 + k.clusters <- clustering.function(Data, k)
  97 +
  98 + clusters.vec <- k.clusters$cluster
  99 + # the.centers <- apply(cl$centers,1, mean)
  100 + the.centers <- k.clusters$centers
  101 +
  102 + noise <- unlist(tapply(line.width, clusters.vec, cumsum))[order(seq_along(clusters.vec)[order(clusters.vec)])]
  103 + # noise <- noise - mean(range(noise))
  104 + y <- the.centers[clusters.vec] + noise
  105 + Y <- cbind(Y, y)
  106 + x <- rep(k, length(y))
  107 + X <- cbind(X, x)
  108 +
  109 + centers.points[[k]] <- data.frame(y = the.centers , x = rep(k , k))
  110 + # points(the.centers ~ rep(k , k), pch = 19, col = "red", cex = 1.5)
  111 + }
  112 +
  113 +
  114 + x.range <- range(k.range)
  115 + y.range <- range(PCA.1)
  116 +
  117 + clustergram.plot(X,Y, k.range,
  118 + x.range, y.range , COL,
  119 + add.center.points , centers.points)
  120 +
  121 +
  122 +}
  123 +
  124 +
  125 +
  126 +
  127 +if(F) {
  128 +
  129 +#Examples:
  130 +
  131 +png("d:\\clustergram_plots_%03d.png",650,650, pointsize = 15)
  132 +
  133 +data(iris)
  134 +set.seed(250)
  135 +par(cex.lab = 1.5, cex.main = 1.2)
  136 +Data <- scale(iris[,-5]) # notice I am scaling the vectors)
  137 +clustergram(Data, k.range = 2:8, line.width = 0.004) # notice how I am using line.width. Play with it on your problem, according to the scale of Y.
  138 +
  139 +set.seed(500)
  140 +Data <- scale(iris[,-5]) # notice I am scaling the vectors)
  141 +par(cex.lab = 1.2, cex.main = .7)
  142 +par(mfrow = c(3,2))
  143 +for(i in 1:6) clustergram(Data, k.range = 2:8 , line.width = .004, add.center.points = T)
  144 +par(mfrow = c(1,1))
  145 +
  146 +set.seed(250)
  147 +Data <- rbind(
  148 + cbind(rnorm(100,0, sd = 0.3),rnorm(100,0, sd = 0.3),rnorm(100,0, sd = 0.3)),
  149 + cbind(rnorm(100,1, sd = 0.3),rnorm(100,1, sd = 0.3),rnorm(100,1, sd = 0.3)),
  150 + cbind(rnorm(100,2, sd = 0.3),rnorm(100,2, sd = 0.3),rnorm(100,2, sd = 0.3))
  151 + )
  152 +clustergram(Data, k.range = 2:5 , line.width = .004, add.center.points = T)
  153 +
  154 +set.seed(250)
  155 +Data <- rbind(
  156 + cbind(rnorm(100,1, sd = 0.3),rnorm(100,0, sd = 0.3),rnorm(100,0, sd = 0.3),rnorm(100,0, sd = 0.3)),
  157 + cbind(rnorm(100,0, sd = 0.3),rnorm(100,1, sd = 0.3),rnorm(100,0, sd = 0.3),rnorm(100,0, sd = 0.3)),
  158 + cbind(rnorm(100,0, sd = 0.3),rnorm(100,1, sd = 0.3),rnorm(100,1, sd = 0.3),rnorm(100,0, sd = 0.3)),
  159 + cbind(rnorm(100,0, sd = 0.3),rnorm(100,0, sd = 0.3),rnorm(100,0, sd = 0.3),rnorm(100,1, sd = 0.3))
  160 + )
  161 +clustergram(Data, k.range = 2:8 , line.width = .004, add.center.points = T)
  162 +
  163 +
  164 +dev.off()
  165 +}
  166 +

0 comments on commit b2f542a

Please sign in to comment.
Something went wrong with that request. Please try again.