In [1]:
%load_ext rpy2.ipython

  "The symbol '%s' is not in this R namespace/package." % name


In [2]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
import urllib

In [3]:
%%R
# Working directory of the R session; relative paths used by later R cells
# resolve against it.
data_folder <- './'
setwd(data_folder)

In [5]:
import re
from collections import Counter

def clean_PeptideSequence(pep_seq):
    """Return the bare peptide sequence with all annotations stripped.

    Every "." is removed first, then every parenthesised group
    (shortest match between "(" and ")") is deleted.

    Args:
        pep_seq: annotated peptide string, e.g. ".(Acetyl)M(Oxidation)K".

    Returns:
        The string without dots and without "(...)" groups.
    """
    without_dots = pep_seq.replace(".", "")
    return re.sub(r"\(.*?\)", "", without_dots)

def modified_PeptideSequence(pep_seq):
    """Return the peptide with abbreviated modification names, wrapped in "_".

    Dots are removed, then the modification names "Acetyl",
    "Carbamidomethyl" and "Oxidation" are shortened to "ac", "ca" and "ox"
    wherever they occur in the string. Any other text (including other
    modification names) is left untouched.

    Args:
        pep_seq: annotated peptide string, e.g. ".(Acetyl)M(Oxidation)K".

    Returns:
        The rewritten string with a leading and a trailing underscore,
        e.g. "_(ac)M(ox)K_".
    """
    # Applied in this order, as plain substring replacements.
    abbreviations = (
        ("Acetyl", "ac"),
        ("Carbamidomethyl", "ca"),
        ("Oxidation", "ox"),
    )
    t_pep_seq = pep_seq.replace(".", "")
    for full_name, short_name in abbreviations:
        t_pep_seq = t_pep_seq.replace(full_name, short_name)
    return "_" + t_pep_seq + "_"

def get_modifications(pep_seq):
    """Summarise the modifications annotated in a peptide string.

    Every parenthesised group in ``pep_seq`` is treated as one modification
    occurrence. The result lists each distinct modification exactly once, in
    order of first appearance, prefixed with its count when it occurs more
    than once (e.g. "2 Oxidation (M)").

    Args:
        pep_seq: annotated peptide string, e.g. "M(Oxidation)AM(Oxidation)K".

    Returns:
        "Unmodified" when no parenthesised group is present, otherwise a
        comma-separated summary such as
        "Acetyl (Protein N-term),2 Oxidation (M)". Tokens that are not in the
        label table below are reported verbatim (parentheses included).
    """
    # Long-form labels for the recognised modification tokens.
    labels = {
        "(Acetyl)": "Acetyl (Protein N-term)",
        "(Carbamidomethyl)": "Carbamidomethyl (C)",
        "(Oxidation)": "Oxidation (M)",
        "(Phospho)": "Phospho (STY)",
    }

    # Counter keeps first-seen order, so the output order is deterministic.
    counts = Counter(re.findall(r"\(.*?\)", pep_seq))
    if not counts:
        return "Unmodified"

    # Build one entry per *distinct* modification. Replacing inside the
    # per-occurrence join would repeat a counted entry once per occurrence
    # ("2 Oxidation (M),2 Oxidation (M)").
    entries = []
    for token, n_occurrences in counts.items():
        label = labels.get(token, token)
        if n_occurrences == 1:
            entries.append(label)
        else:
            entries.append(str(n_occurrences) + " " + label)
    return ",".join(entries)

def get_experiment(row):
    """Derive the experiment label from a row's 'Reference' value.

    The text before the first "." of ``row['Reference']`` is split on "_";
    the last two fields are combined as "<second-to-last>_TechRep<last>".

    Args:
        row: mapping-like object exposing a 'Reference' string
            (e.g. a DataFrame row).

    Returns:
        The label, e.g. "4_TechRep2" for "sample_4_2.mzML".
    """
    run_name = row['Reference'].split(".")[0]
    fields = run_name.split("_")
    return fields[-2] + "_" + "TechRep" + str(fields[-1])
    
    
# Build the evidence table that the R cell reads back from evi.txt and passes
# to makePeptideTable.
quantms = pd.read_csv("../out_msstats_filter.csv", sep=',', header=0)

# Drop decoy entries. `~` is the boolean inversion operator; the marker is
# matched literally, and rows without a protein name are kept rather than
# producing a mask that contains NaN.
is_decoy = quantms['ProteinName'].str.contains("DECOY_", regex=False, na=False)
quantms = quantms[~is_decoy]

# Single-column transforms go through Series.map instead of a row-wise
# DataFrame.apply. 'reverse' and 'contaminant' are not provided here and are
# therefore written as empty columns.
evi = pd.DataFrame(
    {
        'sequence': quantms['PeptideSequence'].map(clean_PeptideSequence),
        'modified_sequence': quantms['PeptideSequence'].map(modified_PeptideSequence),
        'modifications': quantms['PeptideSequence'].map(get_modifications),
        'protein_group': quantms['ProteinName'],
        'protein': quantms['ProteinName'],
        'experiment': quantms.apply(get_experiment, axis=1),
        'charge': quantms['PrecursorCharge'],
        'intensity': quantms['Intensity'],
    },
    columns=['sequence', 'modified_sequence', 'modifications', 'protein_group',
             'protein', 'experiment', 'charge', 'reverse', 'contaminant', 'intensity'],
)
evi.to_csv("./evi.txt", sep='\t', index=False)

# Sample annotation written to metadata.txt: three conditions with six
# technical replicates each. 'experiment' and 'sample' carry the same label.
condition_labels = ["1", "4", "10"]
replicate_numbers = [1, 2, 3, 4, 5, 6]
run_labels = ["%s_TechRep%d" % (cond, rep)
              for cond in condition_labels
              for rep in replicate_numbers]

metadata = pd.DataFrame({
    'experiment': run_labels,
    'measure': ['Intensity'] * len(run_labels),
    'sample': list(run_labels),
    'condition': [cond for cond in condition_labels for _ in replicate_numbers],
    'replicate': replicate_numbers * len(condition_labels),
})
metadata.to_csv("./metadata.txt", sep="\t", index=False)



In [5]:
%%R

### peptide

library(proteus)

# quantms: read the evidence table and sample annotation, then aggregate
# to peptide and protein level.
evi <- read.csv("evi.txt", row.names = NULL, sep="\t")
meta <- read.csv("metadata.txt", sep="\t")
pepdat <- makePeptideTable(evi, meta)
prodat <- makeProteinTable(pepdat)

# Three variants of the protein table; the tag becomes part of the output
# file name: EM = normalizeData defaults, NN = not normalised,
# Q = limma quantile normalisation.
prodat.med <- normalizeData(prodat)
prodat.q <- normalizeData(prodat, norm.fun=limma::normalizeQuantiles)
variants <- list(EM = prodat.med, NN = prodat, Q = prodat.q)

report.cols <- c("protein", "logFC", "adj.P.Val")

for (tag in names(variants)) {
  dat <- variants[[tag]]

  # Differential expression of condition "4" and of condition "10" vs "1".
  res4 <- limmaDE(dat, sig.level=0.05, conditions=c("4","1"))
  res10 <- limmaDE(dat, sig.level=0.05, conditions=c("10","1"))

  # Significant rows only (kept for inspection; the full tables are saved).
  r4 <- res4[which(res4$significant), report.cols]
  r10 <- res10[which(res10$significant), report.cols]

  write.csv(res4, paste0("proteus-", tag, "-quantms_4.csv"))
  write.csv(res10, paste0("proteus-", tag, "-quantms_10.csv"))
}


In [6]:
# quantms, EM table, contrast 4 vs 1: confusion counts and PPV / NPV.
proteus = pd.read_csv("./proteus-EM-quantms_4.csv", header=0, sep=",", index_col=0)
proteus = proteus.dropna(subset=["logFC", "P.Value"], how="any")

# A protein is called positive when |logFC| > 1 and adj.P.Val < 0.05.
abs_log_fc = proteus["logFC"].abs()
adj_p = proteus["adj.P.Val"]
positive = proteus[(abs_log_fc > 1) & (adj_p < 0.05)]
negative = proteus[(abs_log_fc <= 1) | (adj_p >= 0.05)]

# Proteins whose name contains "YEAST" are the ones counted as true changes
# (presumably the spiked-in species; confirm against the dataset description).
yeast_called = positive["protein"].str.contains("YEAST")
TP = len(positive.loc[yeast_called])
FP = positive.shape[0] - TP

yeast_missed = negative["protein"].str.contains("YEAST")
FN = len(negative.loc[yeast_missed])
TN = negative.shape[0] - FN

for count in (TP, FP, TN, FN):
    print(count)

ppv = TP / (TP + FP)
npv = TN / (TN + FN)
print("PPV = %.6f" % ppv)
print("NPV = %.6f" % npv)



365
122
1932
168
PPV = 0.749487
NPV = 0.920000


In [7]:
# quantms, EM table, contrast 10 vs 1: confusion counts and PPV / NPV.
proteus = pd.read_csv("./proteus-EM-quantms_10.csv", header=0, sep=",", index_col=0)
proteus = proteus.dropna(subset=["logFC", "P.Value"], how="any")

# A protein is called positive when |logFC| > 1 and adj.P.Val < 0.05.
abs_log_fc = proteus["logFC"].abs()
adj_p = proteus["adj.P.Val"]
positive = proteus[(abs_log_fc > 1) & (adj_p < 0.05)]
negative = proteus[(abs_log_fc <= 1) | (adj_p >= 0.05)]

# Proteins whose name contains "YEAST" are the ones counted as true changes
# (presumably the spiked-in species; confirm against the dataset description).
yeast_called = positive["protein"].str.contains("YEAST")
TP = len(positive.loc[yeast_called])
FP = positive.shape[0] - TP

yeast_missed = negative["protein"].str.contains("YEAST")
FN = len(negative.loc[yeast_missed])
TN = negative.shape[0] - FN

for count in (TP, FP, TN, FN):
    print(count)

ppv = TP / (TP + FP)
npv = TN / (TN + FN)
print("PPV = %.6f" % ppv)
print("NPV = %.6f" % npv)

432
324
1673
106
PPV = 0.571429
NPV = 0.940416


In [8]:
# quantms, Q (quantile-normalised) table, contrast 4 vs 1: confusion counts
# and PPV / NPV.
proteus = pd.read_csv("./proteus-Q-quantms_4.csv", header=0, sep=",", index_col=0)
proteus = proteus.dropna(subset=["logFC", "P.Value"], how="any")

# A protein is called positive when |logFC| > 1 and adj.P.Val < 0.05.
abs_log_fc = proteus["logFC"].abs()
adj_p = proteus["adj.P.Val"]
positive = proteus[(abs_log_fc > 1) & (adj_p < 0.05)]
negative = proteus[(abs_log_fc <= 1) | (adj_p >= 0.05)]

# Proteins whose name contains "YEAST" are the ones counted as true changes
# (presumably the spiked-in species; confirm against the dataset description).
yeast_called = positive["protein"].str.contains("YEAST")
TP = len(positive.loc[yeast_called])
FP = positive.shape[0] - TP

yeast_missed = negative["protein"].str.contains("YEAST")
FN = len(negative.loc[yeast_missed])
TN = negative.shape[0] - FN

for count in (TP, FP, TN, FN):
    print(count)

ppv = TP / (TP + FP)
npv = TN / (TN + FN)
print("PPV = %.6f" % ppv)
print("NPV = %.6f" % npv)

362
119
1935
171
PPV = 0.752599
NPV = 0.918803


In [9]:
# quantms, Q (quantile-normalised) table, contrast 10 vs 1: confusion counts
# and PPV / NPV.
proteus = pd.read_csv("./proteus-Q-quantms_10.csv", header=0, sep=",", index_col=0)
proteus = proteus.dropna(subset=["logFC", "P.Value"], how="any")

# A protein is called positive when |logFC| > 1 and adj.P.Val < 0.05.
abs_log_fc = proteus["logFC"].abs()
adj_p = proteus["adj.P.Val"]
positive = proteus[(abs_log_fc > 1) & (adj_p < 0.05)]
negative = proteus[(abs_log_fc <= 1) | (adj_p >= 0.05)]

# Proteins whose name contains "YEAST" are the ones counted as true changes
# (presumably the spiked-in species; confirm against the dataset description).
yeast_called = positive["protein"].str.contains("YEAST")
TP = len(positive.loc[yeast_called])
FP = positive.shape[0] - TP

yeast_missed = negative["protein"].str.contains("YEAST")
FN = len(negative.loc[yeast_missed])
TN = negative.shape[0] - FN

for count in (TP, FP, TN, FN):
    print(count)

ppv = TP / (TP + FP)
npv = TN / (TN + FN)
print("PPV = %.6f" % ppv)
print("NPV = %.6f" % npv)

428
323
1674
110
PPV = 0.569907
NPV = 0.938341


In [10]:
# quantms, NN table: the same evaluation for contrast 4 vs 1 and then
# 10 vs 1. Each pass prints TP, FP, TN, FN followed by PPV and NPV.
for result_csv in ("./proteus-NN-quantms_4.csv", "./proteus-NN-quantms_10.csv"):
    proteus = pd.read_csv(result_csv, header=0, sep=",", index_col=0)
    proteus = proteus.dropna(subset=["logFC", "P.Value"], how="any")

    # A protein is called positive when |logFC| > 1 and adj.P.Val < 0.05.
    abs_log_fc = proteus["logFC"].abs()
    adj_p = proteus["adj.P.Val"]
    positive = proteus[(abs_log_fc > 1) & (adj_p < 0.05)]
    negative = proteus[(abs_log_fc <= 1) | (adj_p >= 0.05)]

    # Proteins whose name contains "YEAST" are the ones counted as true
    # changes (presumably the spiked-in species; confirm against the
    # dataset description).
    yeast_called = positive["protein"].str.contains("YEAST")
    TP = len(positive.loc[yeast_called])
    FP = positive.shape[0] - TP

    yeast_missed = negative["protein"].str.contains("YEAST")
    FN = len(negative.loc[yeast_missed])
    TN = negative.shape[0] - FN

    for count in (TP, FP, TN, FN):
        print(count)

    ppv = TP / (TP + FP)
    npv = TN / (TN + FN)
    print("PPV = %.6f" % ppv)
    print("NPV = %.6f" % npv)

386
125
1929
147
PPV = 0.755382
NPV = 0.929191
450
233
1764
88
PPV = 0.658858
NPV = 0.952484


In [None]:
%%R -w 800 -h 600
### peptide

library(proteus)

report.cols <- c("protein", "logFC", "adj.P.Val")


# out_proteus
evi <- read.csv("out_proteus.csv", row.names = NULL)
meta <- read.csv("metadata.csv")
colnames(meta)[1] <- 'experiment'
pepdat <- makePeptideTable(evi, meta)
prodat <- makeProteinTable(pepdat)
prodat.med <- normalizeData(prodat)

# Keep both contrasts under separate names; assigning both to `res` discards
# the fold4 result before it is ever used.
res4 <- limmaDE(prodat.med, sig.level=0.05, conditions=c("fold4","fold1"))
res10 <- limmaDE(prodat.med, sig.level=0.05, conditions=c("fold10","fold1"))
write.csv(res4, "proteus-quantms_fold4.csv")

# `res` and "proteus-quantms.csv" keep holding the fold10 contrast, as before.
res <- res10
r <- res[which(res$significant), report.cols]
write.csv(res, "proteus-quantms.csv")


# proteinGroups
# The location can be overridden with the PROTEIN_GROUPS_FILE environment
# variable; the default is the original machine-specific absolute path.
proteinGroupsFile <- Sys.getenv(
  "PROTEIN_GROUPS_FILE",
  unset = "D:/dataset/R downstream analysis/0-paper/data_benchmark/0-reviewer/peptide based/PXD007145/proteinGroups.txt"
)

# NOTE(review): unlike the quantms section, the first column of `meta` is not
# renamed to 'experiment' here - confirm readProteinGroups accepts the file as is.
meta <- read.csv("./metadata.csv")
prot.MQ <- readProteinGroups(proteinGroupsFile, meta)
#It is possible to read these data directly into Proteus and skip peptide and protein aggregation steps.
#equalize medians (EM), quantile (Q), no normalization (NN)
prodat.EM <- normalizeData(prot.MQ)

# Run the contrasts on the MaxQuant table (prodat.EM). Using prodat.med here
# would write the quantms results into the MaxQuant output file.
res4 <- limmaDE(prodat.EM, sig.level=0.05, conditions=c("fold4","fold1"))
res10 <- limmaDE(prodat.EM, sig.level=0.05, conditions=c("fold10","fold1"))
write.csv(res4, "proteus-maxquant_fold4.csv")

# `res` and "proteus-maxquant.csv" hold the fold10 contrast.
res <- res10
r <- res[which(res$significant), report.cols]
write.csv(res, "proteus-maxquant.csv")
