## Compare HE and GCTA standardized results

In [1]:
library(dplyr)
library(pheatmap)
options(stringsAsFactors = FALSE)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [2]:
## load HE data
## Read the HE results table and label its columns.
HE_std <- read.table("/Users/marygordon/Documents/YeLab/Lupus/all.HE.txt")
#HE_std=read.table("/Users/marygordon/Documents/YeLab/Lupus/non_standard_H2/all.HE.results.txt")
HE_header <- c(
    "cell_pair", "gene", "v",
    "sg1", "se1", "sg2", "se2",
    "sgx", "sex", "rg", "re"
)
colnames(HE_std) <- HE_header

## load GCTA data
## (nothing is loaded in this cell)

##load cell pairs
## One whitespace-separated token per cell pair, read as character.
pairs <- scan(
    "/Users/marygordon/Documents/YeLab/Lupus/phen_pairs.txt",
    what = character()
)

“cannot open file '/Users/marygordon/Documents/YeLab/Lupus/all.HE.txt': No such file or directory”

ERROR: Error in file(file, "rt"): cannot open the connection


In [None]:
# Preview the first rows of the HE table to check the column names assigned from HE_header.
head(HE_std)

### Some general helper functions

In [None]:
# Print a labelled set of summary statistics for a vector.
#
# vect:  values to summarise; coerced with as.numeric().
# label: heading printed before the statistics.
#
# Prints min, max, mean, median and variance, followed by how many values fall
# inside the closed interval [0, 1] out of the total count.
print_stats <- function(vect, label) {
    values <- as.numeric(vect)
    print(label)

    summaries <- list(min = min, max = max, mean = mean, median = median, var = var)
    for (stat_name in names(summaries)) {
        print(c(stat_name, summaries[[stat_name]](values)))
    }

    inside_unit <- values >= 0 & values <= 1
    print(c("in 0,1 range", sum(inside_unit), "of", length(values)))
}

In [None]:
# Keep only the rows whose standard errors for trait 1 (se1) and trait 2 (se2)
# are both strictly below a threshold.
#
# df:        data frame with columns se1 and se2.
# se_filter: numeric threshold; rows need se1 < se_filter AND se2 < se_filter.
#
# Returns a plain data.frame with the surviving rows, renumbered from 1.
#
# The standard errors are coerced to numeric before comparing. Some tables in
# this notebook are assembled with cbind(), which turns every column into
# character; comparing those directly would be lexicographic ("10.5" < "2").
# Rows whose standard error is missing or not parseable are dropped.
filter_std_err <- function(df, se_filter) {
    se1 <- as.numeric(as.character(df$se1))
    se2 <- as.numeric(as.character(df$se2))

    # which() discards NA comparisons, so such rows are excluded rather than
    # showing up as all-NA rows.
    keep <- which(se1 < se_filter & se2 < se_filter)

    out <- as.data.frame(df[keep, , drop = FALSE])
    rownames(out) <- NULL
    out
}


In [None]:
# Inverse variance weighting.
#
# Divides every rg value by the variance of the rg column and every re value by
# the variance of the re column.
#
# df: data frame with columns cell_pair, gene, rg and re.
#
# Returns a data.frame with columns cell_pair, gene (character) and rg, re
# (numeric, the weighted values).
#
# Fix: the rE weight was previously computed from df$rg instead of df$re.
# The result is built with data.frame() rather than cbind() so the weighted
# values stay numeric instead of being converted to strings.
calc_inverse_variance <- function(df) {
    rg <- as.numeric(as.character(df$rg))
    re <- as.numeric(as.character(df$re))

    data.frame(
        cell_pair = as.character(df$cell_pair),
        gene = as.character(df$gene),
        rg = rg / var(rg),
        re = re / var(re),
        stringsAsFactors = FALSE
    )
}

In [None]:
# Summarise and plot rG against rE for one table.
#
# df:    data frame with columns rg and re.
# title: text used as the heading of both plots.
#
# Prints summary statistics (via print_stats) for rg, re and their sum, draws an
# rG-vs-rE scatter plot whose title also carries the correlation between the two
# columns, then draws a histogram of rG+rE restricted to the range -10..10.
plot_scatter_hist <- function(df, title) {
    rg_values <- as.numeric(df$rg)
    re_values <- as.numeric(df$re)
    rg_plus_re <- rg_values + re_values

    # summary statistics first
    print_stats(df$rg, "rG info")
    print_stats(df$re, "rE info")
    print_stats(rg_plus_re, "summary stats rG+rE")

    # scatter: rG on x, rE on y, correlation appended to the title
    print(
        plot(
            df$rg, df$re,
            xlab = "rG", ylab = "rE",
            main = paste(title, cor(rg_values, re_values))
        )
    )

    # histogram of the summed correlations
    hist(rg_plus_re, breaks = 10000, main = paste(title), xlim = c(-10, 10))
}

In [None]:
# For every cell pair: filter by standard error, plot rG vs rE (raw, inverse
# variance weighted, and weighted rG against raw rE), and collect per-pair
# summary values.
#
# df:      data frame with columns cell_pair, gene, se1, se2, rg and re.
# se_filt: standard-error threshold handed to filter_std_err().
#
# The pairs iterated over come from the global character vector `pairs`, not
# from df.
#
# Returns a data.frame with one row per entry of `pairs` and the columns
# pairs, avg_rG, avg_rG_w, var_rG, avg_rE, avg_rE_w, var_rE, followed by three
# columns that as.data.frame() auto-names (V8, V9, V10) holding avg_rG+avg_rE,
# avg_rG_w+avg_rE_w and avg_rG_w+avg_rE. The avg_* columns hold medians.
# Because the table is assembled with cbind() together with the character
# vector `pairs`, every column is character; convert with as.numeric().
#
# Changes from the previous version:
#  * rg/re are coerced to numeric before median()/var(), so tables whose
#    columns are stored as character no longer fail;
#  * result vectors are preallocated and filled by position, which keeps each
#    row aligned with its pair;
#  * a pair with no rows left after filtering is reported and left as NA
#    instead of aborting the whole run;
#  * the unused accumulators avg_rG_FW / avg_rE_FW were removed.
plot_cell_cell_scatter_hist <- function(df, se_filt) {
    n_pairs <- length(pairs)
    avg_rG <- rep(NA_real_, n_pairs)
    avg_rG_w <- rep(NA_real_, n_pairs)
    avg_rE <- rep(NA_real_, n_pairs)
    avg_rE_w <- rep(NA_real_, n_pairs)
    var_rG <- rep(NA_real_, n_pairs)
    var_rE <- rep(NA_real_, n_pairs)

    for (i in seq_along(pairs)) {
        p <- pairs[i]
        print(p)
        df_tmp <- filter_std_err(as.data.frame(filter(df, cell_pair == as.character(p))), se_filt)
        print(head(df_tmp))

        if (nrow(df_tmp) == 0) {
            message("no rows left for ", p, " after the standard-error filter; skipping")
            next
        }

        raw_rg <- as.numeric(df_tmp$rg)
        raw_re <- as.numeric(df_tmp$re)
        plot_scatter_hist(df_tmp, paste(p, "rG vs rE"))

        # inverse variance weighted rG and rE for this cell pair
        inv_tmp <- calc_inverse_variance(df_tmp)
        inv_rg <- as.numeric(inv_tmp$rg)
        inv_re <- as.numeric(inv_tmp$re)
        plot_scatter_hist(inv_tmp, paste(p, "inverse var weight rG vs inverse var weight rE"))

        # weighted rG against raw rE
        mixed_inv_tmp <- data.frame(
            cell_pair = as.character(df_tmp$cell_pair),
            gene = as.character(df_tmp$gene),
            rg = inv_rg,
            re = raw_re,
            stringsAsFactors = FALSE
        )
        plot_scatter_hist(mixed_inv_tmp, paste(p, "inverse var weight rG vs rE"))

        # per-pair summaries, stored at the pair's own position
        avg_rG[i] <- median(raw_rg)
        avg_rG_w[i] <- median(inv_rg)
        avg_rE[i] <- median(raw_re)
        avg_rE_w[i] <- median(inv_re)
        var_rG[i] <- var(raw_rg)
        var_rE[i] <- var(raw_re)
    }

    all_avgs <- as.data.frame(cbind(pairs, avg_rG, avg_rG_w, var_rG, avg_rE, avg_rE_w, var_rE, (avg_rG + avg_rE), (avg_rG_w + avg_rE_w), (avg_rG_w + avg_rE)))

    return(all_avgs)
}



## Analyze raw heritability

In [None]:
#plot HE stats over all pairs
# Runs plot_scatter_hist on the unfiltered HE table (all cell pairs pooled):
# summary statistics for rg, re and rg+re, an rG-vs-rE scatter, and an rG+rE histogram.
plot_scatter_hist(HE_std,"all rG vs rE")

In [None]:
# Inverse variance weighted HE values, pooled over all pairs:
# preview the input, compute the weights, preview them and their sum, then plot.
head(HE_std)
inv_w <- calc_inverse_variance(HE_std)
head(inv_w)
weighted_sum <- as.numeric(inv_w$re) + as.numeric(inv_w$rg)
print(head(weighted_sum))
plot_scatter_hist(inv_w, "all inverse weighted rG vs rE")
#len=dim(HE_std)[1]


In [None]:
# Per-pair plots and summary table for the HE results, keeping rows with se1 and se2 below 2.
he_outs=plot_cell_cell_scatter_hist(HE_std,2)
# Show up to the first 36 rows of the per-pair summary.
head(he_outs,36)

In [None]:
#standard SE1
#head(all_avgs,36)

In [None]:
#non-standard SE1
#head(all_avgs,36)

### Analyze GCTA

In [None]:
# Read the aggregated GCTA bivariate results (tab separated, with a header row).
gcta_std_full <- read.table('/Users/marygordon/Documents/YeLab/Lupus/GCTA/all.cells.bivar.standardized.txt', header = TRUE, sep = '\t')
#head(gcta_std)

# Reduce to the same column layout used for the HE table. The table is built
# with data.frame() instead of cbind(): cbind() on a mix of character and
# numeric vectors yields a character matrix, which turned every estimate and
# standard error into a string and made later numeric comparisons (e.g. the
# standard-error filter) lexicographic.
# NOTE(review): rg is taken from Var_rG and re from Var_C.e._tr12 exactly as
# before; confirm these are the intended GCTA columns.
gcta_std <- data.frame(
    cell_pair = as.character(gcta_std_full$cell),
    gene = as.character(gcta_std_full$gene),
    sg1 = as.numeric(gcta_std_full$Var_V.G._tr1),
    se1 = as.numeric(gcta_std_full$SE_V.G._tr1),
    sg2 = as.numeric(gcta_std_full$Var_V.G._tr2),
    se2 = as.numeric(gcta_std_full$SE_V.G._tr2),
    rg = as.numeric(gcta_std_full$Var_rG),
    re = as.numeric(gcta_std_full$Var_C.e._tr12),
    stringsAsFactors = FALSE
)




In [None]:
# Preview the reduced GCTA table (columns cell_pair, gene, sg1, se1, sg2, se2, rg, re).
head(gcta_std)

In [None]:
#plot gcta stats over all pairs
# Same summary statistics, scatter and histogram as for HE, on the full GCTA table.
plot_scatter_hist(gcta_std, "all rG vs rE")

In [None]:
# Inverse variance weighted GCTA values, pooled over all pairs:
# preview the table and its rg column, compute the weights, preview them and
# their sum, then plot.
head(gcta_std)
head(gcta_std$rg)

inv_w <- calc_inverse_variance(gcta_std)

print("made invw")
head(inv_w)
weighted_sum <- as.numeric(inv_w$re) + as.numeric(inv_w$rg)
print(head(weighted_sum))
plot_scatter_hist(inv_w, "all inverse weighted rG vs rE")

In [None]:
# Preview the GCTA table before the per-pair run.
head(gcta_std)
#ONLY ONE CELL PAIR- fixing aggregated file
# NOTE(review): the note above suggests the aggregated GCTA file held a single
# cell pair when this was written - confirm the file has been regenerated.
# Per-pair plots and summary table for GCTA, keeping rows with se1 and se2 below 2.
gcta_outs=plot_cell_cell_scatter_hist(gcta_std,2)


## Compare rG from HE with rG from GCTA

In [3]:
# HE rows for a single reference pair (kept for interactive inspection).
tmp_he <- filter(HE_std, cell_pair == "NK.cells_PBMC")

# Optional pre-filter on rG < 1, currently disabled:
#gcta_f=filter(gcta_std,rg<1)
#he_f=filter(tmp_he,rg<1)

# For every cell pair, keep the genes present in both the HE and the GCTA
# table, order both by gene so the rows line up, and plot HE rG against GCTA rG
# with the Spearman correlation in the title. Matched rows are collected per
# pair and bound once after the loop.
match_gcta_list <- vector("list", length(pairs))
match_he_list <- vector("list", length(pairs))
for (i in seq_along(pairs)) {
    p <- pairs[i]

    #filter for cell pair
    gcta_f <- as.data.frame(filter(gcta_std, cell_pair == as.character(p)))
    he_f <- as.data.frame(filter(HE_std, cell_pair == as.character(p)))

    #match items
    matched_he <- subset(he_f, gene %in% as.character(gcta_f$gene))
    matched_gcta <- subset(gcta_f, gene %in% as.character(he_f$gene))

    # head()/dim() do not auto-print inside a for loop, so print them explicitly.
    print('ordered he')
    s_matched_he <- matched_he[order(matched_he$gene), ]
    print(head(s_matched_he))

    print('ordered gcta')
    s_matched_gcta <- matched_gcta[order(matched_gcta$gene), ]
    print(head(s_matched_gcta))

    print(dim(s_matched_he))
    print(dim(s_matched_gcta))

    if (nrow(s_matched_he) == 0) {
        message("no genes shared between HE and GCTA for ", p, "; skipping")
        next
    }
    # Row-wise pairing below is only valid when each gene occurs once per table.
    if (nrow(s_matched_he) != nrow(s_matched_gcta)) {
        stop("HE and GCTA have different numbers of matched rows for ", p,
             " (duplicate genes?)", call. = FALSE)
    }

    #no weight
    he_rg <- as.numeric(s_matched_he$rg)
    gcta_rg <- as.numeric(s_matched_gcta$rg)
    plot(he_rg, gcta_rg, xlim = c(-2, 2), xlab = "HE rG", ylab = "GCTA rG",
         main = paste(p, "rG vs rG", cor(he_rg, gcta_rg, method = "spearman")))

    match_gcta_list[[i]] <- s_matched_gcta
    match_he_list[[i]] <- s_matched_he
}

# rbind() ignores the NULL entries left by skipped pairs.
all_match_gcta <- do.call(rbind, match_gcta_list)
all_match_he <- do.call(rbind, match_he_list)

# Pooled plot. The correlation in the title is computed from the pooled rows;
# it previously reused the last pair's s_matched_* tables.
all_he_rg <- as.numeric(all_match_he$rg)
all_gcta_rg <- as.numeric(all_match_gcta$rg)
plot(all_he_rg, all_gcta_rg, xlim = c(-2, 2), xlab = "HE rG", ylab = "GCTA rG",
     main = paste("All cells: rG vs rG", cor(all_he_rg, all_gcta_rg, method = "spearman")))






#inverse variance weighted




ERROR: Error in filter(HE_std, cell_pair == "NK.cells_PBMC"): object 'HE_std' not found


In [None]:
# Write a drawable object to a PDF file.
#
# x:        object whose `gtable` element is drawn with grid::grid.draw().
# filename: path of the PDF to create.
# width, height: page size passed to pdf().
#
# Returns the value of dev.off(), as before.
#
# The PDF device is now closed even when drawing fails; previously an error
# after pdf() left the device open and later plots went into that file.
save_pheatmap_pdf <- function(x, filename, width=7, height=7) {
   stopifnot(!missing(x))
   stopifnot(!missing(filename))
   pdf(filename, width=width, height=height)
   pdf_dev <- dev.cur()
   still_open <- TRUE
   # Safety net for the error path only; the normal path closes explicitly below.
   on.exit(if (still_open) dev.off(pdf_dev), add = TRUE)
   grid::grid.newpage()
   grid::grid.draw(x$gtable)
   closed <- dev.off(pdf_dev)
   still_open <- FALSE
   closed
}

In [None]:
# Draw a cell-by-cell heatmap of the per-pair rG summary.
#
# df_outs: table with a `pairs` column (names of the form "<cellA>_<cellB>")
#          and an `avg_rG` column, as returned by plot_cell_cell_scatter_hist().
#
# Builds a symmetric matrix with 1 on the diagonal and each pair's avg_rG in
# the two mirrored off-diagonal cells, removes two cell types, prints the
# matrix and draws it with pheatmap().
#
# Changes from the previous version:
#  * the matrix is sized from the number of distinct cell names instead of a
#    fixed 9 x 9, which failed whenever that number was not 9;
#  * pair names are taken from df_outs itself rather than the global `pairs`,
#    so each value is always placed by its own row's name;
#  * the loop variable no longer shadows base::c.
plot_heatmap <- function(df_outs) {
    res <- data.frame(
        cell_pair = as.character(df_outs$pairs),
        rg = as.numeric(df_outs$avg_rG),
        stringsAsFactors = FALSE
    )
    print(head(res))

    cells <- unique(unlist(strsplit(res$cell_pair, "_", fixed = TRUE)))
    mat <- matrix(0, nrow = length(cells), ncol = length(cells),
                  dimnames = list(cells, cells))
    diag(mat) <- 1

    for (i in seq_len(nrow(res))) {
        ends <- strsplit(res$cell_pair[i], "_", fixed = TRUE)[[1]]
        row_cell <- ends[1]
        col_cell <- ends[2]
        mat[row_cell, col_cell] <- res$rg[i]
        mat[col_cell, row_cell] <- res$rg[i]
    }

    # Drop the 7th and 9th cell types (in order of first appearance).
    # NOTE(review): these positions are hard-coded in the original analysis;
    # confirm which cell types are meant to be excluded.
    keep <- setdiff(seq_along(cells), c(7, 9))
    mat2 <- mat[keep, keep, drop = FALSE]
    print(mat2)

    #p=pheatmap(mat2,breaks=seq(.9, 1, by = 0.005))
    #p=pheatmap(mat2,breaks=seq(.85, 1, by = 0.01))
    #p=pheatmap(mat2,breaks=seq(-0.2, 0.2, by = 0.005))
    #p=pheatmap(mat2,breaks=seq(0, 1, by = 0.01))
    p <- pheatmap(mat2, breaks = seq(-0.2, 1, by = 0.02))
    print(p)

}

In [None]:
# Preview the per-pair HE summary, then draw its rG heatmap.
head(he_outs)
plot_heatmap(he_outs)

In [None]:
# Preview the per-pair GCTA summary, then draw its rG heatmap.
head(gcta_outs)
plot_heatmap(gcta_outs)