In [3]:

############## hydrophobicity vs gene age

# Unpack gene.tar.gz into the working directory by shelling out to `tar`.
# NOTE(review): presumably this archive provides the gene_age_seq.* files that
# are read further down and deleted again at the end of the script -- confirm.
system('tar zxvf gene.tar.gz')

# Remove every object from the current environment so the analysis starts from
# a clean slate. NOTE(review): this also wipes anything defined by earlier
# notebook cells.
rm(list = ls())  

# A negative `warn` makes R ignore all warnings from here on (session-wide).
# NOTE(review): presumably meant to silence warnings raised while scoring
# non-standard residue codes -- confirm, since it hides every other warning too.
options(warn=-1)

# Peptides is presumably the source of hydrophobicity(), which is called
# inside avg_hydrophobicity_per_position().
library(Peptides)


# Per-position average hydrophobicity over the C-terminal ends of a group of
# sequences of varying length, together with the number of sequences that
# contributed at each position.
#
# Arguments:
#   seqs                 vector (or list) of amino-acid sequences, one string
#                        each; NA entries are skipped
#   L_max                number of C-terminal residues kept per sequence; the
#                        result has one entry per kept position, the last entry
#                        being the C-terminal residue
#   L_min                only sequences at least L_min residues long are used
#   hydrophobicity_scale scale name handed unchanged to hydrophobicity()
#
# Sequences shorter than L_max are left-padded with 'x' so that all sequences
# are aligned on their last residue. Padding and any other character that is
# not one of the 20 standard amino-acid letters (case-insensitive) contributes
# to neither the sum nor the count at its position.
#
# Returns list(H, N): H is the mean hydrophobicity at each of the L_max
# positions (NaN where no sequence contributed), N the per-position count.
avg_hydrophobicity_per_position <- function(seqs, L_max, L_min, hydrophobicity_scale) {
  standard_aa <- strsplit("ACDEFGHIKLMNPQRSTVWY", "")[[1]]
  padding <- strrep("x", L_max)  # loop-invariant, so build it once

  H <- numeric(L_max)  # summed hydrophobicity at each position
  N <- numeric(L_max)  # number of sequences counted at each position
  i <- 0
  for (s in seqs) {
    # print progress for every 1000 sequences
    i <- i + 1
    if (i %% 1000 == 0) {
      print(i)
    }

    if (is.na(s) || nchar(s) < L_min) {
      next
    }

    # Left-pad, then keep exactly the last L_max characters.
    padded <- paste0(padding, s)
    n_chars <- nchar(padded)
    residues <- strsplit(substr(padded, n_chars - L_max + 1, n_chars), "")[[1]]

    hi <- hydrophobicity(residues, hydrophobicity_scale)

    # Decide what to count from the residue itself rather than from
    # `hi != 0`: a scale may assign exactly 0 to a real residue, and such
    # residues must still count towards the denominator.
    counted <- toupper(residues) %in% standard_aa
    hi[!counted] <- 0  # padding / unknown codes never add to the sum

    H <- H + hi
    N <- N + counted
  }
  list(H / N, N)
}

# ---- Analysis parameters ----
L_max <- 100    # number of C-terminal positions profiled per sequence
L_min <- 200    # sequences shorter than this are skipped
hydrophobicity_scale <- "Miyazawa"
tail_len <- 30  # the averaged window starts tail_len residues from the C-terminus
skip_last <- 2  # ... and leaves out the final skip_last residues

# Indices into the per-position profile (index L_max = last residue) that are
# averaged into one "C-tail hydrophobicity" value per age class.
# Alternative window previously tried:
#   (L_max - tail_len - 60 + 1):(L_max - 60 + 1)
tail_window <- (L_max - tail_len + 1):(L_max - skip_last)

# Mean C-tail hydrophobicity for every age class of one species.
# Age class k is read from the file "gene_age_seq.<species>-k"; the sequences
# are taken from its first column (V1). Returns one number per age class, in
# the order given by `age_classes`.
mean_tail_hydrophobicity <- function(species, age_classes) {
  vapply(age_classes, function(k) {
    x <- read.table(paste(paste0("gene_age_seq.", species), k, sep = "-"),
                    stringsAsFactors = FALSE)
    hyd <- avg_hydrophobicity_per_position(x$V1, L_max, L_min,
                                           hydrophobicity_scale)
    mean(hyd[[1]][tail_window])
  }, numeric(1))
}

human_avg <- mean_tail_hydrophobicity("human", 0:12)
mouse_avg <- mean_tail_hydrophobicity("mouse", 0:11)

# Gene age (million years) assigned to each age class, in class order.
human_age <- c(500, 407, 337, 252, 182, 134, 90, 78, 57, 30, 20, 9, 3)
mouse_age <- c(500, 407, 337, 252, 182, 134, 90, 78, 66, 54, 31, 9)
stopifnot(length(human_age) == length(human_avg),
          length(mouse_age) == length(mouse_avg))

# ---- Figure ----
# The '-' separator is also placed before '.pdf', so with the parameters above
# the file is named "Fig-gene-age-200-30-2-.pdf"; kept as is so existing
# references to that file keep working.
filename <- paste('Fig-gene-age', L_min, tail_len, skip_last, '.pdf', sep = '-')
pdf(filename, width = 4.2, height = 4.6)

# human: points plus a loess trend line
plot(human_age, human_avg,
     log = "x", bty = "n", lwd = 2, col = "magenta", pch = 15,
     xlab = "Gene age (million years)",
     ylab = "Average C-tail hydrophobicity",
     ylim = range(human_avg, mouse_avg) + c(0, 0.1))
lines(human_age, predict(loess(human_avg ~ human_age), human_age),
      col = "magenta", lwd = 2)
# linear alternative:
# lines(human_age, lm(human_avg ~ human_age)$fitted.values, lwd = 2, col = "blue")

# mouse: points plus a loess trend line
points(mouse_age, mouse_avg, bty = "n", col = "blue", lwd = 2, pch = 16)
lines(mouse_age, predict(loess(mouse_avg ~ mouse_age), mouse_age),
      lwd = 2, col = "blue")
# linear alternative:
# lines(mouse_age, lm(mouse_avg ~ mouse_age)$fitted.values, lwd = 2, col = "blue")

# Spearman rank correlation between gene age and C-tail hydrophobicity.
r1 <- cor(human_age, human_avg, method = "spearman")
r2 <- cor(mouse_age, mouse_avg, method = "spearman")
# Left unassigned at top level on purpose so the test summaries are printed.
# NOTE(review): with more than 9 points R's "exact" Spearman p-value is an
# Edgeworth-series approximation, so very small p-values are only approximate.
cor.test(human_age, human_avg, method = "spearman")
cor.test(mouse_age, mouse_avg, method = "spearman")

# Single-legend alternative:
# legend('topright', paste(c("Mouse, Rs =", "Human, Rs ="), c(format(r2, digits = 2, nsmall = 2), format(r1, digits = 2, nsmall = 2))), text.col = c("blue", "magenta"), bty = 'n')
# Legend anchors are hard-coded in data coordinates (x = age, y = hydrophobicity).
legend(75, 5.75, paste("Mouse\nRs =", format(r2, digits = 2, nsmall = 2)),
       text.col = "blue", bty = "n")
legend(2.1, 5.35, paste("Human\nRs =", format(r1, digits = 2, nsmall = 2)),
       text.col = "magenta", bty = "n")
dev.off()

# Remove the gene_age* input files from the working directory (shell glob).
system('rm gene_age*')


[1] 1000
[1] 2000
[1] 3000
[1] 4000
[1] 5000
[1] 6000
[1] 7000
[1] 8000
[1] 9000
[1] 10000
[1] 11000
[1] 12000
[1] 13000
[1] 14000
[1] 15000
[1] 16000
[1] 17000
[1] 18000
[1] 19000
[1] 20000
[1] 21000
[1] 22000
[1] 23000
[1] 24000
[1] 25000
[1] 26000
[1] 27000
[1] 28000
[1] 29000
[1] 30000
[1] 31000
[1] 32000
[1] 33000
[1] 34000
[1] 35000
[1] 36000
[1] 37000
[1] 38000
[1] 39000
[1] 40000
[1] 41000
[1] 42000
[1] 43000
[1] 44000
[1] 45000
[1] 46000
[1] 47000
[1] 48000
[1] 49000
[1] 50000
[1] 51000
[1] 52000
[1] 53000
[1] 54000
[1] 55000
[1] 56000
[1] 57000
[1] 58000
[1] 59000
[1] 60000
[1] 61000
[1] 62000
[1] 1000
[1] 2000
[1] 3000
[1] 1000
[1] 2000
[1] 3000
[1] 4000
[1] 1000
[1] 2000
[1] 1000
[1] 1000
[1] 2000
[1] 1000
[1] 2000
[1] 3000
[1] 4000
[1] 5000
[1] 6000
[1] 7000
[1] 8000
[1] 9000
[1] 10000
[1] 11000
[1] 12000
[1] 13000
[1] 14000
[1] 15000
[1] 16000
[1] 17000
[1] 18000
[1] 19000
[1] 20000
[1] 21000
[1] 22000
[1] 23000
[1] 24000
[1] 25000
[1] 26000
[1] 27000
[1] 28000
[1] 29000



	Spearman's rank correlation rho

data:  human_age and human_avg
S = 444, p-value = 0.4704
alternative hypothesis: true rho is not equal to 0
sample estimates:
       rho 
-0.2197802 



	Spearman's rank correlation rho

data:  mouse_age and mouse_avg
S = 562, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
-0.965035 
