Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
branch: master
Fetching contributors…

Octocat-spinner-32-eaf2f5

Cannot retrieve contributors at this time

file 167 lines (123 sloc) 5.592 kb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166
###### published on:
# http://www.r-statistics.com/2010/06/clustergram-visualization-and-diagnostics-for-cluster-analysis-r-code/
## Main author of the function: Tal Galili (tal.galili@gmail.com)


 
clustergram.kmeans <- function(Data, k, ...)
{
# this is the type of function that the clustergram
# function takes for the clustering.
# using similar structure will allow implementation of different clustering algorithms
 
# It returns a list with two elements:
# cluster = a vector of length of n (the number of subjects/items)
# indicating to which cluster each item belongs.
# centers = a k dimensional vector. Each element is 1 number that represent that cluster
# In our case, we are using the weighted mean of the cluster dimensions by
# Using the first component (loading) of the PCA of the Data.
 
cl <- kmeans(Data, k,...)
 
cluster <- cl$cluster
centers <- cl$centers %*% princomp(Data)$loadings[,1] # 1 number per center
# here we are using the weighted mean for each
 
return(list(
cluster = cluster,
centers = centers
))
}
 
clustergram.plot.matlines <- function(X,Y, k.range,
x.range, y.range , COL,
add.center.points , centers.points)
{
plot(0,0, col = "white", xlim = x.range, ylim = y.range,
axes = F,
xlab = "Number of clusters (k)", ylab = "PCA weighted Mean of the clusters", main = c("Clustergram of the PCA-weighted Mean of" ,"the clusters k-mean clusters vs number of clusters (k)"))
axis(side =1, at = k.range)
axis(side =2)
abline(v = k.range, col = "grey")
 
matlines(t(X), t(Y), pch = 19, col = COL, lty = 1, lwd = 1.5)
 
if(add.center.points)
{
require(plyr)
 
xx <- ldply(centers.points, rbind)
points(xx$y~xx$x, pch = 19, col = "red", cex = 1.3)
 
# add points
# temp <- l_ply(centers.points, function(xx) {
# with(xx,points(y~x, pch = 19, col = "red", cex = 1.3))
# points(xx$y~xx$x, pch = 19, col = "red", cex = 1.3)
# return(1)
# })
# We assign the lapply to a variable (temp) only to suppress the lapply "NULL" output
}
}
 
 
 
clustergram <- function(Data, k.range = 2:10 ,
clustering.function = clustergram.kmeans,
clustergram.plot = clustergram.plot.matlines,
line.width = .004, add.center.points = T)
{
# Data - should be a scales matrix. Where each column belongs to a different dimension of the observations
# k.range - is a vector with the number of clusters to plot the clustergram for
# clustering.function - this is not really used, but offers a bases to later extend the function to other algorithms
# Although that would more work on the code
# line.width - is the amount to lift each line in the plot so they won't superimpose eachother
# add.center.points - just assures that we want to plot points of the cluster means
 
n <- dim(Data)[1]
 
PCA.1 <- Data %*% princomp(Data)$loadings[,1] # first principal component of our data
 
if(require(colorspace)) {
COL <- heat_hcl(n)[order(PCA.1)] # line colors
} else {
COL <- rainbow(n)[order(PCA.1)] # line colors
warning('Please consider installing the package "colorspace" for prittier colors')
}
 
line.width <- rep(line.width, n)
 
Y <- NULL # Y matrix
X <- NULL # X matrix
 
centers.points <- list()
 
for(k in k.range)
{
k.clusters <- clustering.function(Data, k)
 
clusters.vec <- k.clusters$cluster
# the.centers <- apply(cl$centers,1, mean)
the.centers <- k.clusters$centers
 
noise <- unlist(tapply(line.width, clusters.vec, cumsum))[order(seq_along(clusters.vec)[order(clusters.vec)])]
# noise <- noise - mean(range(noise))
y <- the.centers[clusters.vec] + noise
Y <- cbind(Y, y)
x <- rep(k, length(y))
X <- cbind(X, x)
 
centers.points[[k]] <- data.frame(y = the.centers , x = rep(k , k))
# points(the.centers ~ rep(k , k), pch = 19, col = "red", cex = 1.5)
}
 
 
x.range <- range(k.range)
y.range <- range(PCA.1)
 
clustergram.plot(X,Y, k.range,
x.range, y.range , COL,
add.center.points , centers.points)
 
 
}




if(F) {

#Examples:

png("d:\\clustergram_plots_%03d.png",650,650, pointsize = 15)

data(iris)
set.seed(250)
par(cex.lab = 1.5, cex.main = 1.2)
Data <- scale(iris[,-5]) # notice I am scaling the vectors)
clustergram(Data, k.range = 2:8, line.width = 0.004) # notice how I am using line.width. Play with it on your problem, according to the scale of Y.

set.seed(500)
Data <- scale(iris[,-5]) # notice I am scaling the vectors)
par(cex.lab = 1.2, cex.main = .7)
par(mfrow = c(3,2))
for(i in 1:6) clustergram(Data, k.range = 2:8 , line.width = .004, add.center.points = T)
par(mfrow = c(1,1))

set.seed(250)
Data <- rbind(
cbind(rnorm(100,0, sd = 0.3),rnorm(100,0, sd = 0.3),rnorm(100,0, sd = 0.3)),
cbind(rnorm(100,1, sd = 0.3),rnorm(100,1, sd = 0.3),rnorm(100,1, sd = 0.3)),
cbind(rnorm(100,2, sd = 0.3),rnorm(100,2, sd = 0.3),rnorm(100,2, sd = 0.3))
)
clustergram(Data, k.range = 2:5 , line.width = .004, add.center.points = T)

set.seed(250)
Data <- rbind(
cbind(rnorm(100,1, sd = 0.3),rnorm(100,0, sd = 0.3),rnorm(100,0, sd = 0.3),rnorm(100,0, sd = 0.3)),
cbind(rnorm(100,0, sd = 0.3),rnorm(100,1, sd = 0.3),rnorm(100,0, sd = 0.3),rnorm(100,0, sd = 0.3)),
cbind(rnorm(100,0, sd = 0.3),rnorm(100,1, sd = 0.3),rnorm(100,1, sd = 0.3),rnorm(100,0, sd = 0.3)),
cbind(rnorm(100,0, sd = 0.3),rnorm(100,0, sd = 0.3),rnorm(100,0, sd = 0.3),rnorm(100,1, sd = 0.3))
)
clustergram(Data, k.range = 2:8 , line.width = .004, add.center.points = T)


dev.off()
}

Something went wrong with that request. Please try again.