########################################################################################################
########################################################################################################
#                           CODE FOR THE EMPIRICAL STUDY IN SECTION 6                                   #
########################################################################################################
########################################################################################################


# Creating the new data set by drawing the top 100 and bottom 30 genes from the original data
#######################################################################################################
data=read.csv("khan_train.csv",header=TRUE, sep=",")   # Reading the original dataset.
data$gene=NULL
data=as.matrix(data)
data=t(data)

# Evaluating the F statistics in (6.14)
EWS=data[1:23,]
BL=data[24:31,]
NB=data[32:43,]
RMS=data[44:64,]
xbar=colMeans(data)
xbar1=colMeans(EWS)
xbar2=colMeans(BL)
xbar3=colMeans(NB)
xbar4=colMeans(RMS)

n=dim(data)[1]
k=4
difsq1=dim(EWS)[1]*(xbar1-xbar)^2
difsq2=dim(BL)[1]*(xbar2-xbar)^2
difsq3=dim(NB)[1]*(xbar3-xbar)^2
difsq4=dim(RMS)[1]*(xbar4-xbar)^2
num=(difsq1+difsq2+difsq3+difsq4)/(k-1)   # Numerator of the F statistics
# Function for computing the column-wise sample variances of x about the group mean vector y
cowVar <- function(x,y) {
  one=matrix(rep(1,dim(x)[1]),dim(x)[1])
  bar=one%*%y                            # Matrix whose rows all equal the group mean vector y
  S=colSums((x-bar)^2)/(dim(x)[1]-1)     # Unbiased column-wise sample variances
  return(S)
}
S1=cowVar(EWS,xbar1)
S2=cowVar(BL,xbar2)
S3=cowVar(NB,xbar3)
S4=cowVar(RMS,xbar4)

dnr=(((dim(EWS)[1]-1)*S1)+((dim(BL)[1]-1)*S2)+((dim(NB)[1]-1)*S3)+((dim(RMS)[1]-1)*S4))/(n-k)   # Denominator of the F statistic

Fstat=num/dnr   # F statistics
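# For each gene j, Fstat[j] is the one-way ANOVA F statistic across the four tumour classes:
#   F_j = [ sum_g n_g (xbar_gj - xbar_j)^2 / (k-1) ] / [ sum_g (n_g - 1) s_gj^2 / (n-k) ],
# where g indexes the groups EWS, BL, NB and RMS, n_g is the group size and s_gj^2 the
# within-group sample variance; these are exactly the quantities num and dnr computed above.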
jtop=order(Fstat)[1:100]
jbot=order(Fstat)[2279:2308]
topgene=data[,jtop]              # Choosing the top 100 genes using the F statistics
botgene=data[,jbot]              # Choosing the bottom 30 genes using the F statistics
newdata=cbind(topgene,botgene)   # Combining top and bottom genes to create new data set
write.table(newdata,file="newdata.csv",sep=",",col.names=NA,qmethod="double")

############ The end of the code for creating the new data set of selected top and bottom genes ###############


# Case study using the proposed method
#######################################
newdata=data.matrix(read.csv("newdata.csv", header = TRUE, row.names = 1, sep = ","))   # Reading the created sub data set using top 100 and bottom 30 genes.
dimnames(newdata) <- NULL
newdata=as.matrix(newdata)
library(reshape2)
#library(gplots)
library(ggplot2)
covmat=cov(newdata)
p=dim(covmat)[1]
n=dim(newdata)[1]
thetahat = matrix(0, p, p)
mean.new = colMeans(newdata)
# Evaluating thetahat
t1=proc.time()[3]
for(i in 1 : p){
  for(j in 1 : p){
    s = c()
    for(k in 1 : n){
      s[k] = (((newdata[k, i] - mean.new[i]) * (newdata[k, j] - mean.new[j])) - covmat[i, j])^2
    }
    thetahat[i, j] = sum(s) / n
  }
}
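# thetahat[i, j] estimates the variance of the (i, j) entry of the sample covariance matrix:
# the average over observations k of ((X_ki - xbar_i)(X_kj - xbar_j) - sigmahat_ij)^2.
# It is used to standardise each covariance entry before thresholding.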
c1c2Est = function(n, Sn, thetahat, a1, a2, c0){
  p = dim(Sn)[1]
  ALamhat1 = c()
  kk = 0
  for (i in 1 : (p-1)){
    for (j in (i+1) : p){
      kk = kk + 1
      ALamhat1[kk] = abs(Sn[i, j] / sqrt(thetahat[i, j]) * sqrt(n / log(p)))
    }
  }
  Mk = sum((ALamhat1 > a1) & (ALamhat1 < a2))
  nhat = Mk - 2 * (pnorm((a2) * sqrt(log(p))) - pnorm((a1) * sqrt(log(p)))) * ((p^2 - p) / 2)
  if(nhat > c0) {on = (log(nhat / sqrt(log(p)))) / log(p); deltaStarhat = sqrt(2 * (2 - on))}
  if(nhat <= c0) {deltaStarhat = 2}
  list(deltaStarhat = deltaStarhat, nhat = nhat, ALamhat = ALamhat1, Mk = Mk)
}
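# c1c2Est computes the data-driven threshold constant deltaStarhat used by the proposed method.
# Each off-diagonal entry of Sn is standardised as |Sn_ij / sqrt(thetahat_ij)| * sqrt(n / log(p));
# Mk counts the standardised entries falling in (a1, a2), and nhat corrects Mk by the number of
# such entries expected under pure noise (normal approximation) among the (p^2 - p)/2 pairs.
# A larger nhat (more apparent non-zero entries) yields a smaller deltaStarhat; otherwise deltaStarhat = 2.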
#t1=proc.time()[3]
a = min(sqrt(2 + log(n) / log(p)), 2)
b = sqrt(1 / log(log(p)))
temp = c1c2Est(n, covmat, thetahat, a1 = 2 - a + b, a2 = 2, c0 = sqrt(log(p)))
t2=proc.time()[3]
time.prop=t2-t1


# Evaluating the adaptive thresholding covariance estimator using the proposed method
SigmaProp = matrix(0, p, p)
indexM = matrix(1, p, p) - diag(rep(1, p))
thetahatDiag = thetahat * indexM
deltaProp = temp$deltaStarhat
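# Adaptive thresholding rule: an entry of the sample covariance is retained only if
# |covmat[i, j]| > deltaProp * sqrt(thetahatDiag[i, j] * log(p) / n); all other entries stay zero.
# Because thetahatDiag has a zero diagonal, the diagonal of covmat is always kept.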
SigmaProp[which(abs(covmat) > deltaProp * sqrt(thetahatDiag * log(p) / n), arr.ind = TRUE)] = covmat[which(abs(covmat) > deltaProp * sqrt(thetahatDiag * log(p) / n), arr.ind = TRUE)]
# Calculating the proportion of zeros produced by the proposed method
zeros.prop=sum(colSums(SigmaProp == 0))/(p^2)


# Heatmap of absolute values of the covariance #
#################################################
library("lattice")
par(mar=c(3,4,2,2))
SigmaProp[abs(SigmaProp) > 1] = 1
ramp <- colorRamp(c("white","black"))
greys = rgb(ramp(seq(0, 1, length = 16)), max = 255)
xscale <- list(cex=1.3)
yscale <- list(cex=1.3)
levelplot(abs(SigmaProp), main=list(label="Heatmap (Proposed)", cex=1.5), col.regions = greys,
          scales=list(x=xscale, y=yscale), xlab="", ylab="", colorkey=list(labels=list(cex=1), space="bottom"))


# Gene co-expression networks on positive and negative correlations #
#####################################################################
library(igraph)
rprop=matrix(rep(0,p^2),p)
# Evaluating correlations using the adaptive thresholding covariance estimator
for (i in 1 : (p - 1)){
  for (j in (i + 1) : p){
    rprop[i,j]=SigmaProp[i,j]/sqrt(SigmaProp[i,i]*SigmaProp[j,j])
  }
}

trh=min(abs(rprop[which(abs(rprop)>0)]))   # Choosing the smallest non-zero absolute correlation as the cutoff value
edgesP=which(rprop>=trh,arr.ind = T)       # Choosing edges with positive correlations
edgesN=which(-1*rprop>=trh,arr.ind = T)    # Choosing edges with negative correlations
lp=as.vector(t(edgesP))
gp<-graph(lp, directed=F)
ln=as.vector(t(edgesN))
gn<-graph(ln, directed=F)

# Function for wrapping strings
wrap <- function(strings, width){
  as.character(sapply(strings, FUN=function(x){
    paste(strwrap(x, width=width), collapse="\n")
  }))
}

# Wrapping the node labels
V(gp)$label = wrap(V(gp)$label, 12)
V(gn)$label = wrap(V(gn)$label, 12)

# Change font size
V(gp)$label.cex = .8
V(gn)$label.cex = .8
# Function for increasing node separation
layoutattr <- function(graph, wc, strength=1, layout=layout.auto) {
  g <- graph.edgelist(get.edgelist(graph))   # create a lightweight copy of graph w/o the attributes.
  E(g)$weight <- 1

  attr <- cbind(id=1:vcount(g), val=wc)
  g <- g + vertices(unique(attr[,2])) + igraph::edges(unlist(t(attr)), weight=strength)

  l <- layout(g, weights=E(g)$weight)[1:vcount(graph),]
  return(l)
}
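# layoutattr adds one auxiliary vertex per unique value of wc and links every original vertex to the
# auxiliary vertex of its group with edges of weight `strength`; the layout is computed on this
# augmented graph and only the coordinates of the original vertices are returned. This is intended to
# pull vertices of the same group together, giving better node separation in the plotted networks.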
top=1:100   # Indices of the top 100 genes (the first 100 columns of newdata)
# Plotting positively correlated genes
plot(gp, vertex.shape="none", vertex.size=1, vertex.label.color=ifelse(V(gp) %in% top, "blue", "black"), layout=layoutattr(gp, wc=1))
title("Co-expression network of positively correlated genes \n(Proposed)", cex.main=1.5)
legend("topleft", bty = "n", legend=c("Top genes", "Bottom genes"), col=c("blue", "black"), pch=c(15,15), cex=1)

# Plotting negatively correlated genes
plot(gn, vertex.shape="none", vertex.size=1, vertex.label.color=ifelse(V(gn) %in% top, "blue", "black"),
     layout=layoutattr(gn, wc=1))
title("Co-expression network of negatively correlated genes \n(Proposed)", cex.main=1.5)
legend("topleft", bty = "n", legend=c("Top genes", "Bottom genes"), col=c("blue", "black"), pch=c(15,15), cex=1)
################### The end of the code for the empirical study under the proposed method #################################


# Case study using the CV method
#################################
newdata=data.matrix(read.csv("newdata.csv", header = TRUE, row.names = 1, sep = ","))
dimnames(newdata) <- NULL
newdata=as.matrix(newdata)
library(reshape2)
#library(gplots)
library(ggplot2)
covmat=cov(newdata)

p=dim(covmat)[1]
n=dim(newdata)[1]
thetahat = matrix(0, p, p)
mean.new = colMeans(newdata)

# CV method to find delta
############################################
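# find.del selects the thresholding constant by cross-validation: the rows of X are split H times at
# random into a training part of nn1 = floor(n*(1 - 1/log(n))) observations and a test part of the rest.
# For each candidate value am = m/N (m = 1, ..., 2N) the training covariance is thresholded at
# am * sqrt(t.hat * log(p)/n1) and compared with the test covariance via the squared Frobenius norm;
# the am with the smallest average loss over the H splits is returned as delta.hat.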
find.del=function(n,p,H,N,X){
  nn1=floor(n*(1-(1/log(n))))
  sub=c()
  Ra.ij=array(0,dim=c(H,2*N))
  for (l in 1:H){
    sub=sample(nrow(X),nn1,replace = FALSE, prob = NULL)
    sam1=X[sub,]
    sam2=X[-sub,]
    n1=nrow(sam1)
    n2=nrow(sam2)
    Sig.hat1=((n1-1)/n1)*cov(sam1)
    Sig.hat2=((n2-1)/n2)*cov(sam2)
    one=matrix(1,nrow=n1,ncol=n1)
    Xbar=one%*%sam1/n1
    t.hat=array(0,dim=c(p,p))
    that1=c()
    for( i in 1:p){
      for(j in 1:p){
        s=c()
        for(k in 1:n1){
          s[k]=((sam1[k,i]-Xbar[k,i])*(sam1[k,j]-Xbar[k,j])-Sig.hat1[i,j])^2
        }
        that1[j]=sum(s)/n1
      }
      t.hat[i,]=that1
    }
    Raj.i=c()
    to=2*N
    for (m in 1:to){
      Sig.hat.star1=matrix(0,nrow=p,ncol=p)
      am=m/N
      Lam.am=am*sqrt(t.hat*log(p)/n1)
      Sig.hat.star1[which(abs(Sig.hat1)>Lam.am)]=Sig.hat1[which(abs(Sig.hat1)>Lam.am)]
      Raj.i[m]=sum((Sig.hat.star1-Sig.hat2)^2)
    }
    Ra.ij[l,]=Raj.i
  }
  aj=c()
  Rhat.a=c()
  for (j in 1:to){
    Rhat.a=c(Rhat.a,sum(Ra.ij[,j])/H)
    aj[j]=j/N
  }
  j.hat=which(Rhat.a==min(Rhat.a))
  delta.hat=aj[min(j.hat)]
  return(delta.hat)
}


thetahat = matrix(0, p, p)
mean.new = colMeans(newdata)
# Evaluating thetahat
for(i in 1 : p){
  for(j in 1 : p){
    s = c()
    for(k in 1 : n){
      s[k] = (((newdata[k, i] - mean.new[i]) * (newdata[k, j] - mean.new[j])) - covmat[i, j])^2
    }
    thetahat[i, j] = sum(s) / n
  }
}


# Evaluating the adaptive thresholding estimator using the CV method
indexM = matrix(1, p, p) - diag(rep(1, p))
thetahatDiag = thetahat * indexM

H = 50
N = 100
tc1=proc.time()[3]
deltaCV = find.del(n,p,H,N,newdata)
tc2=proc.time()[3]
time.CV=tc2-tc1
SigmaCV = matrix(0, p, p)
SigmaCV[which(abs(covmat) > deltaCV * sqrt(thetahatDiag * log(p) / n), arr.ind = TRUE)] = covmat[which(abs(covmat) > deltaCV * sqrt(thetahatDiag * log(p) / n), arr.ind = TRUE)]
zeros.CV=sum(colSums(SigmaCV == 0))/(p^2)   # Calculating the proportion of zeros for the CV method


# Creating co-expression networks
library(igraph)
rCV=matrix(rep(0,p^2),p)
for (i in 1 : (p - 1)){
  for (j in (i + 1) : p){
    rCV[i,j]=SigmaCV[i,j]/sqrt(SigmaCV[i,i]*SigmaCV[j,j])
  }
}
trh=min(abs(rCV[which(abs(rCV)>0)]))
edgesP=which(rCV>=trh,arr.ind = T)      # Positively correlated edges
edgesN=which(-1*rCV>=trh,arr.ind = T)   # Negatively correlated edges
lp=as.vector(t(edgesP))
gp<-graph(lp, directed=F)
ln=as.vector(t(edgesN))
gn<-graph(ln, directed=F)

# Function for wrapping strings
wrap <- function(strings, width){
  as.character(sapply(strings, FUN=function(x){
    paste(strwrap(x, width=width), collapse="\n")
  }))
}

V(gp)$label = wrap(V(gp)$label, 12)
V(gn)$label = wrap(V(gn)$label, 12)

V(gp)$label.cex = .8
V(gn)$label.cex = .8

layoutattr <- function(graph, wc, strength=1, layout=layout.auto) {
  g <- graph.edgelist(get.edgelist(graph))
  E(g)$weight <- 1
  attr <- cbind(id=1:vcount(g), val=wc)
  g <- g + vertices(unique(attr[,2])) + igraph::edges(unlist(t(attr)), weight=strength)
  l <- layout(g, weights=E(g)$weight)[1:vcount(graph),]
  return(l)
}

top=1:100
plot(gp, vertex.shape="none", vertex.size=1, vertex.label.color=ifelse(V(gp) %in% top, "blue", "black"),
     layout=layoutattr(gp, wc=1))
title("Co-expression network of positively correlated genes \n(CV)", cex.main=1.5)
legend("topleft", bty = "n", legend=c("Top genes", "Bottom genes"), col=c("blue", "black"), pch=c(15,15), cex=1)

plot(gn, vertex.shape="none", vertex.size=1, vertex.label.color=ifelse(V(gn) %in% top, "blue", "black"),
     layout=layoutattr(gn, wc=1))
title("Co-expression network of negatively correlated genes \n(CV)", cex.main=1.5)
legend("topleft", bty = "n", legend=c("Top genes", "Bottom genes"), col=c("blue", "black"), pch=c(15,15), cex=1)

# Creating heatmap
###################
library("lattice")
par(mar=c(3,4,2,2))
SigmaCV[abs(SigmaCV) > 1] = 1
ramp <- colorRamp(c("white","black"))
greys = rgb(ramp(seq(0, 1, length = 16)), max = 255)
xscale <- list(cex=1.3)
yscale <- list(cex=1.3)
levelplot(abs(SigmaCV), main=list(label="Heatmap (CV)", cex=1.5), col.regions = greys,
          scales=list(x=xscale, y=yscale), xlab="", ylab="", colorkey=list(labels=list(cex=1), space="bottom"))

############################### The end of the code for the empirical study under the CV method #################################
cat("The estimated threshold constant (delta) under the proposed method:", deltaProp, "\n")
cat("The estimated threshold constant (delta) under the CV method:", deltaCV, "\n")

cat("The proportion of zeros in the resulting covariance estimator under the proposed method:", zeros.prop, "\n")
cat("The proportion of zeros in the resulting covariance estimator under the CV method:", zeros.CV, "\n")

cat("Time taken by the proposed method:", time.prop, "\n")
cat("Time taken by the CV method:", time.CV, "\n")


###################################################################################################################
###################################################################################################################
#                                 THE END OF THE CODE FOR THE EMPIRICAL STUDY                                      #
###################################################################################################################
###################################################################################################################