

########################################################################################################
########################################################################################################
# CODE FOR THE EMPIRICAL STUDY IN SECTION 6 #
########################################################################################################
########################################################################################################

# Creating the new data set by drawing the top 100 and bottom 30 genes from the original data
#######################################################################################################
data=read.csv("khan_train.csv",header=TRUE, sep=",") # Reading the original dataset.
data$gene=NULL
data=as.matrix(data)
data=t(data)
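# Rows of the transposed matrix index the tumour samples (grouped below into EWS, BL, NB and RMS)
# and columns index the 2,308 genes of the Khan training data read above.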

# Evaluating the F statistics in (6.14)
EWS=data[1:23,]
BL=data[24:31,]
NB=data[32:43,]
RMS=data[44:64,]
xbar=colMeans(data)
xbar1=colMeans(EWS)
xbar2=colMeans(BL)
xbar3=colMeans(NB)
xbar4=colMeans(RMS)

n=dim(data)[1]
k=4
difsq1=dim(EWS)[1]*(xbar1-xbar)^2
difsq2=dim(BL)[1]*(xbar2-xbar)^2
difsq3=dim(NB)[1]*(xbar3-xbar)^2
difsq4=dim(RMS)[1]*(xbar4-xbar)^2
num=(difsq1+difsq2+difsq3+difsq4)/(k-1)# Numerator of the F statistics
# Column-wise (per-gene) sample variances within a group, given its mean vector y
colVar <- function(x, y) {
  one = matrix(rep(1, dim(x)[1]), dim(x)[1])
  bar = one %*% y
  S = colSums((x - bar)^2) / (dim(x)[1] - 1)
  return(S)
}
S1 = colVar(EWS, xbar1)
S2 = colVar(BL, xbar2)
S3 = colVar(NB, xbar3)
S4 = colVar(RMS, xbar4)

dnr=(((dim(EWS)[1]-1)*S1)+((dim(BL)[1]-1)*S2)+((dim(NB)[1]-1)*S3)+((dim(RMS)[1]-1)*S4))/(n-k)# Denominator of F statistic

Fstat=num/dnr # F statistics
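# Sanity-check sketch (not part of the original analysis; assumes the four row blocks above cover
# all samples): the per-gene F statistics are the usual one-way ANOVA F statistics across the four
# tumour groups, so any single gene can be cross-checked against aov(). 'group' is introduced here
# only for this check.
group = factor(c(rep("EWS", 23), rep("BL", 8), rep("NB", 12), rep("RMS", 21)))
# summary(aov(data[, 1] ~ group))   # the reported F value should match Fstat[1]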
jtop=order(Fstat)[1:100]
jbot=order(Fstat)[2279:2308]
topgene=data[,jtop] # Choosing the top 100 genes using the F statistics
botgene=data[,jbot] # Choosing the bottom 30 genes using the F statistics
newdata=cbind(topgene,botgene) # Combining top and bottom genes to create new data set
write.table(newdata,file="newdata.csv",sep=",",col.names=NA,qmethod="double")

############The end of the code for creating the new data set of selected top and bottom genes ###############


# Case study using the proposed method
#######################################
newdata=data.matrix(read.csv("newdata.csv", header = TRUE, row.names = 1,sep = ",")) #Reading the created sub data set using top 100 and bottom 30 genes.
dimnames(newdata) <- NULL
newdata=as.matrix(newdata)
library(reshape2)
#library(gplots)
library(ggplot2)
covmat=cov(newdata)
p=dim(covmat)[1]
n=dim(newdata)[1]
thetahat = matrix(0, p, p)
mean.new = colMeans(newdata)
# Evaluating thetahat[i, j] = (1 / n) * sum_k {(X[k, i] - mean.new[i]) * (X[k, j] - mean.new[j]) - covmat[i, j]}^2
t1=proc.time()[3]
for(i in 1 : p){
  for(j in 1 : p){
    s = c()
    for(k in 1 : n){
      s[k] = (((newdata[k, i] - mean.new[i]) * (newdata[k, j] - mean.new[j])) - covmat[i, j])^2
    }
    thetahat[i, j] = sum(s) / n
  }
}
# Estimating the thresholding parameter deltaStarhat from the standardized off-diagonal entries of Sn
c1c2Est = function(n, Sn, thetahat, a1, a2, c0){
  p = dim(Sn)[1]
  ALamhat1 = c()
  kk = 0
  for (i in 1 : (p - 1)){
    for (j in (i + 1) : p){
      kk = kk + 1
      # Standardized absolute off-diagonal entry |Sn[i, j]| / sqrt(thetahat[i, j]) * sqrt(n / log(p))
      ALamhat1[kk] = abs(Sn[i, j] / sqrt(thetahat[i, j]) * sqrt(n / log(p)))
    }
  }
  # Count of standardized entries in (a1, a2), corrected by the count expected for pure-noise entries
  Mk = sum((ALamhat1 > a1) & (ALamhat1 < a2))
  nhat = Mk - 2 * (pnorm(a2 * sqrt(log(p))) - pnorm(a1 * sqrt(log(p)))) * ((p^2 - p) / 2)
  if(nhat > c0) {on = (log(nhat / sqrt(log(p)))) / log(p); deltaStarhat = sqrt(2 * (2 - on))}
  if(nhat <= c0) {deltaStarhat = 2}
  list(deltaStarhat = deltaStarhat, nhat = nhat, ALamhat = ALamhat1, Mk = Mk)
}
# Tuning inputs: search window (a1, a2) = (2 - a + b, 2) and cutoff c0 = sqrt(log(p))
a = min(sqrt(2 + log(n) / log(p)), 2)
b = sqrt(1 / log(log(p)))
temp = c1c2Est(n, covmat, thetahat, a1 = 2 - a + b, a2 = 2, c0 = sqrt(log(p)))
t2 = proc.time()[3]
time.prop = t2 - t1   # Elapsed time (seconds) for the proposed method, including the thetahat loop
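
# Optional vectorized cross-check of thetahat (a sketch, not part of the original script; it
# assumes cov() uses the default n - 1 denominator). Expanding the square in the loop above gives
# a closed form that avoids the triple loop:
centered = sweep(newdata, 2, mean.new)                              # column-centred data
thetahat.vec = crossprod(centered^2) / n + covmat^2 * (2 - n) / n   # equals thetahat analytically
# max(abs(thetahat.vec - thetahat))   # should be numerically negligible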


# Evaluating the adaptive thresholding covariance estimator using the proposed method
SigmaProp = matrix(0, p, p)
indexM = matrix(1, p, p) - diag(rep(1, p))
thetahatDiag = thetahat * indexM   # zero the diagonal so diagonal entries are never thresholded
deltaProp = temp$deltaStarhat
# Keep covmat[i, j] only when |covmat[i, j]| exceeds the entry-wise threshold deltaProp * sqrt(thetahat[i, j] * log(p) / n)
keepProp = which(abs(covmat) > deltaProp * sqrt(thetahatDiag * log(p) / n), arr.ind = TRUE)
SigmaProp[keepProp] = covmat[keepProp]
# Calculating the proportion of zeros produced by the proposed method
zeros.prop = sum(colSums(SigmaProp == 0)) / (p^2)



# Heatmap of absolute values of the covariance #
#################################################
library("lattice")
par(mar=c(3,4,2,2))
SigmaProp[abs(SigmaProp) >1] =1
ramp <- colorRamp(c( "white","black"))
c=rgb( ramp(seq(0, 1, length = 16)), max = 255)
xscale <- list(cex=1.3)
yscale <- list(cex=1.3)
levelplot(abs(SigmaProp),main=list(label="Heatmap (Proposed)",cex=1.5 ), col.regions = c ,
scales=list(x=xscale, y=yscale),xlab="", ylab="" ,colorkey=list(labels=list(cex=1),space="bottom"))


# Gene co-expression networks on positive and negative correlations #
#####################################################################
library(igraph)
rprop = matrix(rep(0, p^2), p)
# Evaluating correlations using the adaptive thresholding covariance estimator (upper triangle only)
for (i in 1 : (p - 1)){
  for (j in (i + 1) : p){
    rprop[i, j] = SigmaProp[i, j] / sqrt(SigmaProp[i, i] * SigmaProp[j, j])
  }
}

trh = min(abs(rprop[which(abs(rprop) > 0)])) # Smallest nonzero absolute correlation, used as the cutoff (so every nonzero entry defines an edge)
edgesP = which(rprop >= trh, arr.ind = T)    # Edges with positive correlations
edgesN = which(-1 * rprop >= trh, arr.ind = T) # Edges with negative correlations
lp=as.vector(t(edgesP))
gp<-graph(lp, directed=F)
ln=as.vector(t(edgesN))
gn<-graph(ln, directed=F)

# Function for wrapping strings
wrap <- function(strings, width){
  as.character(sapply(strings, FUN = function(x){
    paste(strwrap(x, width = width), collapse = "\n")
  }))
}

# Wrapping the node labels
V(gp)$label = wrap(V(gp)$label, 12)
V(gn)$label = wrap(V(gn)$label, 12)

#Change font size
V(gp)$label.cex = .8
V(gn)$label.cex = .8

# Function for increasing node separation
layoutattr <- function(graph, wc, strength = 1, layout = layout.auto) {
  g <- graph.edgelist(get.edgelist(graph)) # create a lightweight copy of graph w/o the attributes
  E(g)$weight <- 1

  # Augment the copy with extra vertices and edges built from the wc labels
  attr <- cbind(id = 1 : vcount(g), val = wc)
  g <- g + vertices(unique(attr[, 2])) + igraph::edges(unlist(t(attr)), weight = strength)

  # Lay out the augmented graph and keep only the coordinates of the original vertices
  l <- layout(g, weights = E(g)$weight)[1 : vcount(graph), ]
  return(l)
}
top = 1 : 100   # vertex indices of the 100 top genes (columns 1-100 of newdata)
# Plotting positively correlated genes
plot(gp, vertex.shape="none", vertex.size=1,vertex.label.color =ifelse(V(gp) %in% top, "blue", "black"),layout=layoutattr(gp, wc=1))
title("Co-expression network of positively correlated genes \n(Proposed)",cex.main=1.5)
legend("topleft",bty = "n", legend=c("Top genes", "Bottom genes"),col=c("blue", "black"), pch=c(15,15),cex=1)

# Plotting negatively correlated genes
plot(gn, vertex.shape="none", vertex.size=1,vertex.label.color =ifelse(V(gn) %in% top, "blue", "black"),
layout=layoutattr(gn, wc=1))
title("Co-expression network of negatively correlated genes \n(Proposed)",cex.main=1.5)
legend("topleft",bty = "n", legend=c("Top genes", "Bottom genes"),col=c("blue", "black"), pch=c(15,15),cex=1)
################### The end of the code for the empirical study under the proposed method #################################


# Case study using the CV method
#################################
newdata=data.matrix(read.csv("newdata.csv", header = TRUE, row.names = 1,sep = ","))
dimnames(newdata) <- NULL
newdata=as.matrix(newdata)
library(reshape2)
#library(gplots)
library(ggplot2)
covmat=cov(newdata)

p=dim(covmat)[1]
n=dim(newdata)[1]
thetahat = matrix(0, p, p)
mean.new = colMeans(newdata)

# CV method to find delta
############################################
# Cross-validation over H random splits: threshold the sub-sample-1 covariance at each candidate
# delta on the grid {m / N : m = 1, ..., 2 * N} and pick the delta minimizing the average squared
# Frobenius distance to the sub-sample-2 covariance.
find.del = function(n, p, H, N, X){
  nn1 = floor(n * (1 - (1 / log(n))))   # size of sub-sample 1
  sub = c()
  Ra.ij = array(0, dim = c(H, 2 * N))
  for (l in 1 : H){
    sub = sample(nrow(X), nn1, replace = FALSE, prob = NULL)
    sam1 = X[sub, ]
    sam2 = X[-sub, ]
    n1 = nrow(sam1)
    n2 = nrow(sam2)
    Sig.hat1 = ((n1 - 1) / n1) * cov(sam1)
    Sig.hat2 = ((n2 - 1) / n2) * cov(sam2)
    one = matrix(1, nrow = n1, ncol = n1)
    Xbar = one %*% sam1 / n1   # each row holds the column means of sam1
    # theta estimates for sub-sample 1 (same form as thetahat above)
    t.hat = array(0, dim = c(p, p))
    that1 = c()
    for(i in 1 : p){
      for(j in 1 : p){
        s = c()
        for(k in 1 : n1){
          s[k] = ((sam1[k, i] - Xbar[k, i]) * (sam1[k, j] - Xbar[k, j]) - Sig.hat1[i, j])^2
        }
        that1[j] = sum(s) / n1
      }
      t.hat[i, ] = that1
    }
    # Loss at each candidate delta a_m = m / N
    Raj.i = c()
    to = 2 * N
    for (m in 1 : to){
      Sig.hat.star1 = matrix(0, nrow = p, ncol = p)
      am = m / N
      Lam.am = am * sqrt(t.hat * log(p) / n1)
      Sig.hat.star1[which(abs(Sig.hat1) > Lam.am)] = Sig.hat1[which(abs(Sig.hat1) > Lam.am)]
      Raj.i[m] = sum((Sig.hat.star1 - Sig.hat2)^2)
    }
    Ra.ij[l, ] = Raj.i
  }
  # Average the loss over the H splits and return the minimizing delta
  aj = c()
  Rhat.a = c()
  for (j in 1 : to){
    Rhat.a = c(Rhat.a, sum(Ra.ij[, j]) / H)
    aj[j] = j / N
  }
  j.hat = which(Rhat.a == min(Rhat.a))
  delta.hat = aj[min(j.hat)]
  return(delta.hat)
}
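
# In short, find.del() returns the grid value delta.hat = argmin_m Rhat.a(m / N), where Rhat.a is
# the squared Frobenius loss averaged over the H random splits. Because every split re-estimates
# the p x p theta matrix with a triple loop, this CV search is considerably more expensive than
# the single pass of the proposed method (compare time.CV with time.prop printed at the end).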


thetahat = matrix(0, p, p)
mean.new = colMeans(newdata)
# Evaluating thetahat (same quantity as in the proposed-method section above)
for(i in 1 : p){
  for(j in 1 : p){
    s = c()
    for(k in 1 : n){
      s[k] = (((newdata[k, i] - mean.new[i]) * (newdata[k, j] - mean.new[j])) - covmat[i, j])^2
    }
    thetahat[i, j] = sum(s) / n
  }
}



# Evaluating the adaptive thresholding estimator using the CV method
indexM = matrix(1, p, p) - diag(rep(1, p))
thetahatDiag = thetahat * indexM   # zero the diagonal so diagonal entries are never thresholded

H = 50    # number of random splits
N = 100   # grid resolution: candidate deltas are m / N for m = 1, ..., 2 * N
tc1 = proc.time()[3]
deltaCV = find.del(n, p, H, N, newdata)
tc2 = proc.time()[3]
time.CV = tc2 - tc1   # Elapsed time (seconds) for the CV method
SigmaCV = matrix(0, p, p)
keepCV = which(abs(covmat) > deltaCV * sqrt(thetahatDiag * log(p) / n), arr.ind = TRUE)
SigmaCV[keepCV] = covmat[keepCV]
zeros.CV = sum(colSums(SigmaCV == 0)) / (p^2) # Calculating the proportion of zeros for the CV method



# Creating co-expression networks
library(igraph)
rCV = matrix(rep(0, p^2), p)
for (i in 1 : (p - 1)){
  for (j in (i + 1) : p){
    rCV[i, j] = SigmaCV[i, j] / sqrt(SigmaCV[i, i] * SigmaCV[j, j])
  }
}
trh = min(abs(rCV[which(abs(rCV) > 0)])) # Smallest nonzero absolute correlation, used as the cutoff
edgesP = which(rCV >= trh, arr.ind = T)  # Positively correlated edges
edgesN = which(-1 * rCV >= trh, arr.ind = T) # Negatively correlated edges
lp=as.vector(t(edgesP))
gp<-graph(lp, directed=F)
ln=as.vector(t(edgesN))
gn<-graph(ln, directed=F)

# Function for wrapping strings
wrap <- function(strings, width){
  as.character(sapply(strings, FUN = function(x){
    paste(strwrap(x, width = width), collapse = "\n")
  }))
}


V(gp)$label = wrap(V(gp)$label, 12)
V(gn)$label = wrap(V(gn)$label, 12)


V(gp)$label.cex = .8
V(gn)$label.cex = .8

layoutattr <- function(graph, wc, strength = 1, layout = layout.auto) {
  g <- graph.edgelist(get.edgelist(graph))
  E(g)$weight <- 1
  attr <- cbind(id = 1 : vcount(g), val = wc)
  g <- g + vertices(unique(attr[, 2])) + igraph::edges(unlist(t(attr)), weight = strength)
  l <- layout(g, weights = E(g)$weight)[1 : vcount(graph), ]
  return(l)
}

top = 1 : 100   # vertex indices of the 100 top genes (columns 1-100 of newdata)
plot(gp, vertex.shape="none", vertex.size=1,vertex.label.color =ifelse(V(gp) %in% top, "blue", "black"),
layout=layoutattr(gp, wc=1))
title("Co-expression network of positively correlated genes \n(CV)",cex.main=1.5)
legend("topleft",bty = "n", legend=c("Top genes", "Bottom genes"),col=c("blue", "black"), pch=c(15,15),cex=1)


plot(gn, vertex.shape="none", vertex.size=1,vertex.label.color =ifelse(V(gn) %in% top, "blue", "black"),
layout=layoutattr(gn, wc=1))
title("Co-expression network of negatively correlated genes \n(CV)",cex.main=1.5)
legend("topleft",bty = "n", legend=c("Top genes", "Bottom genes"),col=c("blue", "black"), pch=c(15,15),cex=1)


# Creating heatmap
###################
library("lattice")
par(mar=c(3,4,2,2))
SigmaCV[abs(SigmaCV) >1] =1
ramp <- colorRamp(c( "white","black"))
c=rgb( ramp(seq(0, 1, length = 16)), max = 255)
xscale <- list(cex=1.3)
yscale <- list(cex=1.3)
levelplot(abs(SigmaCV),main=list(label="Heatmap (CV)",cex=1.5 ), col.regions = c ,
scales=list(x=xscale, y=yscale),xlab="", ylab="" ,colorkey=list(labels=list(cex=1),space="bottom"))

############################### The end of the code for the empirical study under the CV method #################################
cat("The estimators under the proposed method:",deltaProp,"\n" )
cat("The estimators under the CV method:",deltaCV,"\n" )

cat("The percentage of zeros in the resulting covariance estimator under the proposed method:",zeros.prop,"\n")
cat("The percentage of zeros in the resulting covariance estimator under the CV method:",zeros.CV,"\n")

cat("Time taken by the proposed method:",time.prop,"\n")
cat("Time taken by the CV method:",time.CV)



###################################################################################################################
###################################################################################################################
# THE END OF THE CODE FOR THE EMPIRICAL STUDY #
###################################################################################################################
###################################################################################################################





