

########################################################################################################
########################################################################################################
# CODE FOR THE EMPIRICAL STUDY IN SECTION 6 #
########################################################################################################
########################################################################################################

# Creating the new data set by drawing the top 100 and bottom 30 genes from the original data
#######################################################################################################
data=read.csv("khan_train.csv",header=TRUE, sep=",") # Reading the original dataset.
data$gene=NULL
data=as.matrix(data)
data=t(data)
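# Rows of the transposed matrix index the tumour samples (grouped below into EWS, BL, NB and RMS)
# and columns index the 2,308 genes of the Khan training data read above.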

# Evaluating the F statistics in (6.14)
EWS=data[1:23,]
BL=data[24:31,]
NB=data[32:43,]
RMS=data[44:64,]
xbar=colMeans(data)
xbar1=colMeans(EWS)
xbar2=colMeans(BL)
xbar3=colMeans(NB)
xbar4=colMeans(RMS)

n=dim(data)[1]
k=4
difsq1=dim(EWS)[1]*(xbar1-xbar)^2
difsq2=dim(BL)[1]*(xbar2-xbar)^2
difsq3=dim(NB)[1]*(xbar3-xbar)^2
difsq4=dim(RMS)[1]*(xbar4-xbar)^2
num=(difsq1+difsq2+difsq3+difsq4)/(k-1)# Numerator of the F statistics
# Column-wise (per-gene) sample variances within a group, given its mean vector y
colVar <- function(x, y) {
  one = matrix(rep(1, dim(x)[1]), dim(x)[1])
  bar = one %*% y
  S = colSums((x - bar)^2) / (dim(x)[1] - 1)
  return(S)
}
S1 = colVar(EWS, xbar1)
S2 = colVar(BL, xbar2)
S3 = colVar(NB, xbar3)
S4 = colVar(RMS, xbar4)

dnr=(((dim(EWS)[1]-1)*S1)+((dim(BL)[1]-1)*S2)+((dim(NB)[1]-1)*S3)+((dim(RMS)[1]-1)*S4))/(n-k)# Denominator of F statistic

Fstat=num/dnr # F statistics
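# Sanity-check sketch (not part of the original analysis; assumes the four row blocks above cover
# all samples): the per-gene F statistics are the usual one-way ANOVA F statistics across the four
# tumour groups, so any single gene can be cross-checked against aov(). 'group' is introduced here
# only for this check.
group = factor(c(rep("EWS", 23), rep("BL", 8), rep("NB", 12), rep("RMS", 21)))
# summary(aov(data[, 1] ~ group))   # the reported F value should match Fstat[1]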
jtop=order(Fstat)[1:100]
jbot=order(Fstat)[2279:2308]
topgene=data[,jtop] # Choosing the top 100 genes using the F statistics
botgene=data[,jbot] # Choosing the bottom 30 genes using the F statistics
newdata=cbind(topgene,botgene) # Combining top and bottom genes to create new data set
write.table(newdata,file="newdata.csv",sep=",",col.names=NA,qmethod="double")

############The end of the code for creating the new data set of selected top and bottom genes ###############


# Case study using the proposed method
#######################################
newdata=data.matrix(read.csv("newdata.csv", header = TRUE, row.names = 1,sep = ",")) #Reading the created sub data set using top 100 and bottom 30 genes.
dimnames(newdata) <- NULL
newdata=as.matrix(newdata)
library(reshape2)
#library(gplots)
library(ggplot2)
covmat=cov(newdata)
p=dim(covmat)[1]
n=dim(newdata)[1]
thetahat = matrix(0, p, p)
mean.new = colMeans(newdata)
# Evaluating thetahat[i, j] = (1 / n) * sum_k {(X[k, i] - mean.new[i]) * (X[k, j] - mean.new[j]) - covmat[i, j]}^2
t1=proc.time()[3]
for(i in 1 : p){
  for(j in 1 : p){
    s = c()
    for(k in 1 : n){
      s[k] = (((newdata[k, i] - mean.new[i]) * (newdata[k, j] - mean.new[j])) - covmat[i, j])^2
    }
    thetahat[i, j] = sum(s) / n
  }
}
# Estimating the thresholding parameter deltaStarhat from the standardized off-diagonal entries of Sn
c1c2Est = function(n, Sn, thetahat, a1, a2, c0){
  p = dim(Sn)[1]
  ALamhat1 = c()
  kk = 0
  for (i in 1 : (p - 1)){
    for (j in (i + 1) : p){
      kk = kk + 1
      # Standardized absolute off-diagonal entry |Sn[i, j]| / sqrt(thetahat[i, j]) * sqrt(n / log(p))
      ALamhat1[kk] = abs(Sn[i, j] / sqrt(thetahat[i, j]) * sqrt(n / log(p)))
    }
  }
  # Count of standardized entries in (a1, a2), corrected by the count expected for pure-noise entries
  Mk = sum((ALamhat1 > a1) & (ALamhat1 < a2))
  nhat = Mk - 2 * (pnorm(a2 * sqrt(log(p))) - pnorm(a1 * sqrt(log(p)))) * ((p^2 - p) / 2)
  if(nhat > c0) {on = (log(nhat / sqrt(log(p)))) / log(p); deltaStarhat = sqrt(2 * (2 - on))}
  if(nhat <= c0) {deltaStarhat = 2}
  list(deltaStarhat = deltaStarhat, nhat = nhat, ALamhat = ALamhat1, Mk = Mk)
}
# Tuning inputs: search window (a1, a2) = (2 - a + b, 2) and cutoff c0 = sqrt(log(p))
a = min(sqrt(2 + log(n) / log(p)), 2)
b = sqrt(1 / log(log(p)))
temp = c1c2Est(n, covmat, thetahat, a1 = 2 - a + b, a2 = 2, c0 = sqrt(log(p)))
t2 = proc.time()[3]
time.prop = t2 - t1   # Elapsed time (seconds) for the proposed method, including the thetahat loop
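
# Optional vectorized cross-check of thetahat (a sketch, not part of the original script; it
# assumes cov() uses the default n - 1 denominator). Expanding the square in the loop above gives
# a closed form that avoids the triple loop:
centered = sweep(newdata, 2, mean.new)                              # column-centred data
thetahat.vec = crossprod(centered^2) / n + covmat^2 * (2 - n) / n   # equals thetahat analytically
# max(abs(thetahat.vec - thetahat))   # should be numerically negligible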


# Evaluating the adaptive thresholding covariance estimator using the proposed method
SigmaProp = matrix(0, p, p)
indexM = matrix(1, p, p) - diag(rep(1, p))
thetahatDiag = thetahat * indexM   # zero the diagonal so diagonal entries are never thresholded
deltaProp = temp$deltaStarhat
# Keep covmat[i, j] only when |covmat[i, j]| exceeds the entry-wise threshold deltaProp * sqrt(thetahat[i, j] * log(p) / n)
keepProp = which(abs(covmat) > deltaProp * sqrt(thetahatDiag * log(p) / n), arr.ind = TRUE)
SigmaProp[keepProp] = covmat[keepProp]
# Calculating the proportion of zeros produced by the proposed method
zeros.prop = sum(colSums(SigmaProp == 0)) / (p^2)



# Heatmap of absolute values of the covariance #
#################################################
library("lattice")
par(mar=c(3,4,2,2))
SigmaProp[abs(SigmaProp) >1] =1
ramp <- colorRamp(c( "white","black"))
c=rgb( ramp(seq(0, 1, length = 16)), max = 255)
xscale <- list(cex=1.3)
yscale <- list(cex=1.3)
levelplot(abs(SigmaProp),main=list(label="Heatmap (Proposed)",cex=1.5 ), col.regions = c ,
scales=list(x=xscale, y=yscale),xlab="", ylab="" ,colorkey=list(labels=list(cex=1),space="bottom"))


# Gene co-expression networks on positive and negative correlations #
#####################################################################
library(igraph)
rprop = matrix(rep(0, p^2), p)
# Evaluating correlations using the adaptive thresholding covariance estimator (upper triangle only)
for (i in 1 : (p - 1)){
  for (j in (i + 1) : p){
    rprop[i, j] = SigmaProp[i, j] / sqrt(SigmaProp[i, i] * SigmaProp[j, j])
  }
}

trh = min(abs(rprop[which(abs(rprop) > 0)])) # Smallest nonzero absolute correlation, used as the cutoff (so every nonzero entry defines an edge)
edgesP = which(rprop >= trh, arr.ind = T)    # Edges with positive correlations
edgesN = which(-1 * rprop >= trh, arr.ind = T) # Edges with negative correlations
lp=as.vector(t(edgesP))
gp<-graph(lp, directed=F)
ln=as.vector(t(edgesN))
gn<-graph(ln, directed=F)

# Function for wrapping strings
wrap <- function(strings, width){
  as.character(sapply(strings, FUN = function(x){
    paste(strwrap(x, width = width), collapse = "\n")
  }))
}

# Wrapping the node labels
V(gp)$label = wrap(V(gp)$label, 12)
V(gn)$label = wrap(V(gn)$label, 12)

#Change font size
V(gp)$label.cex = .8
V(gn)$label.cex = .8

# Function for increasing node separation
layoutattr <- function(graph, wc, strength = 1, layout = layout.auto) {
  g <- graph.edgelist(get.edgelist(graph)) # create a lightweight copy of graph w/o the attributes
  E(g)$weight <- 1

  # Augment the copy with extra vertices and edges built from the wc labels
  attr <- cbind(id = 1 : vcount(g), val = wc)
  g <- g + vertices(unique(attr[, 2])) + igraph::edges(unlist(t(attr)), weight = strength)

  # Lay out the augmented graph and keep only the coordinates of the original vertices
  l <- layout(g, weights = E(g)$weight)[1 : vcount(graph), ]
  return(l)
}
top = 1 : 100   # vertex indices of the 100 top genes (columns 1-100 of newdata)
# Plotting positively correlated genes
plot(gp, vertex.shape="none", vertex.size=1,vertex.label.color =ifelse(V(gp) %in% top, "blue", "black"),layout=layoutattr(gp, wc=1))
title("Co-expression network of positively correlated genes \n(Proposed)",cex.main=1.5)
legend("topleft",bty = "n", legend=c("Top genes", "Bottom genes"),col=c("blue", "black"), pch=c(15,15),cex=1)

# Plotting negatively correlated genes
plot(gn, vertex.shape="none", vertex.size=1,vertex.label.color =ifelse(V(gn) %in% top, "blue", "black"),
layout=layoutattr(gn, wc=1))
title("Co-expression network of negatively correlated genes \n(Proposed)",cex.main=1.5)
legend("topleft",bty = "n", legend=c("Top genes", "Bottom genes"),col=c("blue", "black"), pch=c(15,15),cex=1)
################### The end of the code for the empirical study under the proposed method #################################


# Case study using the CV method
#################################
newdata=data.matrix(read.csv("newdata.csv", header = TRUE, row.names = 1,sep = ","))
dimnames(newdata) <- NULL
newdata=as.matrix(newdata)
library(reshape2)
#library(gplots)
library(ggplot2)
covmat=cov(newdata)

p=dim(covmat)[1]
n=dim(newdata)[1]
thetahat = matrix(0, p, p)
mean.new = colMeans(newdata)

# CV method to find delta
############################################
# Cross-validation over H random splits: threshold the sub-sample-1 covariance at each candidate
# delta on the grid {m / N : m = 1, ..., 2 * N} and pick the delta minimizing the average squared
# Frobenius distance to the sub-sample-2 covariance.
find.del = function(n, p, H, N, X){
  nn1 = floor(n * (1 - (1 / log(n))))   # size of sub-sample 1
  sub = c()
  Ra.ij = array(0, dim = c(H, 2 * N))
  for (l in 1 : H){
    sub = sample(nrow(X), nn1, replace = FALSE, prob = NULL)
    sam1 = X[sub, ]
    sam2 = X[-sub, ]
    n1 = nrow(sam1)
    n2 = nrow(sam2)
    Sig.hat1 = ((n1 - 1) / n1) * cov(sam1)
    Sig.hat2 = ((n2 - 1) / n2) * cov(sam2)
    one = matrix(1, nrow = n1, ncol = n1)
    Xbar = one %*% sam1 / n1   # each row holds the column means of sam1
    # theta estimates for sub-sample 1 (same form as thetahat above)
    t.hat = array(0, dim = c(p, p))
    that1 = c()
    for(i in 1 : p){
      for(j in 1 : p){
        s = c()
        for(k in 1 : n1){
          s[k] = ((sam1[k, i] - Xbar[k, i]) * (sam1[k, j] - Xbar[k, j]) - Sig.hat1[i, j])^2
        }
        that1[j] = sum(s) / n1
      }
      t.hat[i, ] = that1
    }
    # Loss at each candidate delta a_m = m / N
    Raj.i = c()
    to = 2 * N
    for (m in 1 : to){
      Sig.hat.star1 = matrix(0, nrow = p, ncol = p)
      am = m / N
      Lam.am = am * sqrt(t.hat * log(p) / n1)
      Sig.hat.star1[which(abs(Sig.hat1) > Lam.am)] = Sig.hat1[which(abs(Sig.hat1) > Lam.am)]
      Raj.i[m] = sum((Sig.hat.star1 - Sig.hat2)^2)
    }
    Ra.ij[l, ] = Raj.i
  }
  # Average the loss over the H splits and return the minimizing delta
  aj = c()
  Rhat.a = c()
  for (j in 1 : to){
    Rhat.a = c(Rhat.a, sum(Ra.ij[, j]) / H)
    aj[j] = j / N
  }
  j.hat = which(Rhat.a == min(Rhat.a))
  delta.hat = aj[min(j.hat)]
  return(delta.hat)
}
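
# In short, find.del() returns the grid value delta.hat = argmin_m Rhat.a(m / N), where Rhat.a is
# the squared Frobenius loss averaged over the H random splits. Because every split re-estimates
# the p x p theta matrix with a triple loop, this CV search is considerably more expensive than
# the single pass of the proposed method (compare time.CV with time.prop printed at the end).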


thetahat = matrix(0, p, p)
mean.new = colMeans(newdata)
# Evaluating thetahat (same quantity as in the proposed-method section above)
for(i in 1 : p){
  for(j in 1 : p){
    s = c()
    for(k in 1 : n){
      s[k] = (((newdata[k, i] - mean.new[i]) * (newdata[k, j] - mean.new[j])) - covmat[i, j])^2
    }
    thetahat[i, j] = sum(s) / n
  }
}



# Evaluating the adaptive thresholding estimator using the CV method
indexM = matrix(1, p, p) - diag(rep(1, p))
thetahatDiag = thetahat * indexM   # zero the diagonal so diagonal entries are never thresholded

H = 50    # number of random splits
N = 100   # grid resolution: candidate deltas are m / N for m = 1, ..., 2 * N
tc1 = proc.time()[3]
deltaCV = find.del(n, p, H, N, newdata)
tc2 = proc.time()[3]
time.CV = tc2 - tc1   # Elapsed time (seconds) for the CV method
SigmaCV = matrix(0, p, p)
keepCV = which(abs(covmat) > deltaCV * sqrt(thetahatDiag * log(p) / n), arr.ind = TRUE)
SigmaCV[keepCV] = covmat[keepCV]
zeros.CV = sum(colSums(SigmaCV == 0)) / (p^2) # Calculating the proportion of zeros for the CV method



# Creating co-expression networks
library(igraph)
rCV = matrix(rep(0, p^2), p)
for (i in 1 : (p - 1)){
  for (j in (i + 1) : p){
    rCV[i, j] = SigmaCV[i, j] / sqrt(SigmaCV[i, i] * SigmaCV[j, j])
  }
}
trh = min(abs(rCV[which(abs(rCV) > 0)])) # Smallest nonzero absolute correlation, used as the cutoff
edgesP = which(rCV >= trh, arr.ind = T)  # Positively correlated edges
edgesN = which(-1 * rCV >= trh, arr.ind = T) # Negatively correlated edges
lp=as.vector(t(edgesP))
gp<-graph(lp, directed=F)
ln=as.vector(t(edgesN))
gn<-graph(ln, directed=F)

# Function for wrapping strings
wrap <- function(strings, width){
  as.character(sapply(strings, FUN = function(x){
    paste(strwrap(x, width = width), collapse = "\n")
  }))
}


V(gp)$label = wrap(V(gp)$label, 12)
V(gn)$label = wrap(V(gn)$label, 12)


V(gp)$label.cex = .8
V(gn)$label.cex = .8

layoutattr <- function(graph, wc, strength = 1, layout = layout.auto) {
  g <- graph.edgelist(get.edgelist(graph))
  E(g)$weight <- 1
  attr <- cbind(id = 1 : vcount(g), val = wc)
  g <- g + vertices(unique(attr[, 2])) + igraph::edges(unlist(t(attr)), weight = strength)
  l <- layout(g, weights = E(g)$weight)[1 : vcount(graph), ]
  return(l)
}

top = 1 : 100   # vertex indices of the 100 top genes (columns 1-100 of newdata)
plot(gp, vertex.shape="none", vertex.size=1,vertex.label.color =ifelse(V(gp) %in% top, "blue", "black"),
layout=layoutattr(gp, wc=1))
title("Co-expression network of positively correlated genes \n(CV)",cex.main=1.5)
legend("topleft",bty = "n", legend=c("Top genes", "Bottom genes"),col=c("blue", "black"), pch=c(15,15),cex=1)


plot(gn, vertex.shape="none", vertex.size=1,vertex.label.color =ifelse(V(gn) %in% top, "blue", "black"),
layout=layoutattr(gn, wc=1))
title("Co-expression network of negatively correlated genes \n(CV)",cex.main=1.5)
legend("topleft",bty = "n", legend=c("Top genes", "Bottom genes"),col=c("blue", "black"), pch=c(15,15),cex=1)


# Creating heatmap
###################
library("lattice")
par(mar=c(3,4,2,2))
SigmaCV[abs(SigmaCV) >1] =1
ramp <- colorRamp(c( "white","black"))
c=rgb( ramp(seq(0, 1, length = 16)), max = 255)
xscale <- list(cex=1.3)
yscale <- list(cex=1.3)
levelplot(abs(SigmaCV),main=list(label="Heatmap (CV)",cex=1.5 ), col.regions = c ,
scales=list(x=xscale, y=yscale),xlab="", ylab="" ,colorkey=list(labels=list(cex=1),space="bottom"))

############################### The end of the code for the empirical study under the CV method #################################
cat("The estimators under the proposed method:",deltaProp,"\n" )
cat("The estimators under the CV method:",deltaCV,"\n" )

cat("The percentage of zeros in the resulting covariance estimator under the proposed method:",zeros.prop,"\n")
cat("The percentage of zeros in the resulting covariance estimator under the CV method:",zeros.CV,"\n")

cat("Time taken by the proposed method:",time.prop,"\n")
cat("Time taken by the CV method:",time.CV)



###################################################################################################################
###################################################################################################################
# THE END OF THE CODE FOR THE EMPIRICAL STUDY #
###################################################################################################################
###################################################################################################################





