-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
new scripts for protein abundance data loading
- Loading branch information
Showing
2 changed files
with
380 additions
and
0 deletions.
There are no files selected for viewing
166 changes: 166 additions & 0 deletions
166
scripts/loading/protein/load_abundance_data-29361465.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
import urllib | ||
import gzip | ||
import shutil | ||
import logging | ||
import os | ||
from datetime import datetime | ||
import sys | ||
reload(sys) # Reload does the trick! | ||
from src.models import Taxonomy, Source, Efo, Eco, Chebi, Go, Locusdbentity, Referencedbentity, \ | ||
Proteinabundanceannotation | ||
from scripts.loading.database_session import get_session | ||
from scripts.loading.util import get_strain_taxid_mapping | ||
|
||
# from src.helpers import upload_file | ||
|
||
__author__ = 'sweng66'

# Bare-message logging (no timestamps/levels) at INFO, matching the other
# loading scripts in this directory.
logging.basicConfig(format='%(message)s')
log = logging.getLogger()
log.setLevel(logging.INFO)

# Curator account the inserted rows are attributed to; the environment
# variable must be set before running, or this raises KeyError at import time.
CREATED_BY = os.environ['DEFAULT_USER']

# Input table produced by the companion preprocess script, and the log file
# of inserted rows; both paths are relative to the repository root.
datafile = "scripts/loading/protein/data/proteinAbundanceData-29361465.txt"
logfile = "scripts/loading/protein/logs/load_abundance_data-29361465.log"
# PubMed ID of the publication this dataset comes from.
PMID = 29361465
|
||
def load_data():
    """Load protein abundance annotations for PMID:29361465 into the database.

    Reads the tab-delimited ``datafile`` (produced by the companion
    preprocess script), resolves every external identifier to its internal
    surrogate key, and inserts one Proteinabundanceannotation row per data
    line, committing in batches of 500.  Rows referencing identifiers that
    are not in the database are reported and skipped.
    """

    nex_session = get_session()

    sgd = nex_session.query(Source).filter_by(display_name='SGD').one_or_none()
    source_id = sgd.source_id

    # In-memory lookup tables: external identifier -> internal surrogate key.
    name_to_dbentity_id = {x.systematic_name: x.dbentity_id for x in nex_session.query(Locusdbentity).all()}
    pmid_to_reference_id = {x.pmid: x.dbentity_id for x in nex_session.query(Referencedbentity).all()}
    ecoid_to_eco_id = {x.ecoid: x.eco_id for x in nex_session.query(Eco).all()}
    efoid_to_efo_id = {x.efoid: x.efo_id for x in nex_session.query(Efo).all()}
    chebiid_to_chebi_id = {x.chebiid: x.chebi_id for x in nex_session.query(Chebi).all()}
    goid_to_go_id = {x.goid: x.go_id for x in nex_session.query(Go).all()}
    taxid_to_taxonomy_id = {x.taxid: x.taxonomy_id for x in nex_session.query(Taxonomy).all()}
    strain_to_taxid_mapping = get_strain_taxid_mapping()

    reference_id = pmid_to_reference_id.get(PMID)
    if reference_id is None:
        print("The PMID: %d is not in the database." % PMID)
        return

    log.info("Start loading:\n")
    log.info(str(datetime.now()) + "\n")

    rows_since_commit = 0
    # BUG FIX: the log handle was opened and never closed; "with" closes both
    # files even if an insert raises.
    with open(logfile, "w") as fw, open(datafile) as f:
        for line in f:
            if line.startswith("SYSTEMATIC_NMAE"):
                # Header row; the misspelling is intentional — it must match
                # the header emitted by the preprocess script.
                continue
            pieces = line.strip().replace("None", "").split("\t")
            # BUG FIX: pad to the full 14 columns so the optional trailing
            # fields (indices 7-13) can be read without IndexError on short
            # rows; the old "len(pieces) >= 8" guard only protected index 7.
            pieces += [""] * (14 - len(pieces))

            dbentity_id = name_to_dbentity_id.get(pieces[0])
            if dbentity_id is None:
                print("The ORF name is not in the Locusdbentity table: " + pieces[0])
                continue
            original_reference_id = pmid_to_reference_id.get(int(pieces[2]))
            data_value = int(pieces[3])
            eco_id = ecoid_to_eco_id.get(pieces[4])
            if eco_id is None:
                print("The ECOID: " + pieces[4] + " is not in the database.")
                continue
            efo_id = efoid_to_efo_id.get(pieces[5])
            if efo_id is None:
                print("The EFOID: " + pieces[5] + " is not in the database.")
                continue
            taxid = strain_to_taxid_mapping.get(pieces[6])
            if taxid is None:
                print("The strain: " + pieces[6] + " is not in the mapping list.")
                continue
            taxonomy_id = taxid_to_taxonomy_id.get(taxid)
            if taxonomy_id is None:
                print("The TAXID: " + str(taxid) + " is not in the database.")
                continue

            # Optional treatment columns — empty for untreated experiments.
            chebi_id = None
            go_id = None
            time_value = None
            time_unit = None
            conc_value = None
            conc_unit = None
            fold_change = None
            if pieces[7]:
                chebi_id = chebiid_to_chebi_id.get(pieces[7])
                if chebi_id is None:
                    print("The chebiid: " + pieces[7] + " is not in the database.")
                    continue
            if pieces[8]:
                go_id = goid_to_go_id.get(pieces[8])
                if go_id is None:
                    print("The goid: " + pieces[8] + " is not in the database.")
                    continue
            if pieces[9]:
                time_value = int(pieces[9])
            if pieces[10]:
                # Normalize free-text time units to the controlled vocabulary.
                time_unit = pieces[10]
                if time_unit.startswith('hour'):
                    time_unit = 'hr'
                elif time_unit.startswith('day'):
                    time_unit = 'd'
                elif time_unit.startswith('min'):
                    time_unit = 'min'
            if pieces[11]:
                conc_value = float(pieces[11])
                conc_unit = pieces[12]
            if pieces[13]:
                fold_change = float(pieces[13])

            insert_proteinabundanceannotation(nex_session, fw, dbentity_id, source_id,
                                              taxonomy_id, reference_id,
                                              original_reference_id, eco_id, efo_id,
                                              chebi_id, go_id, data_value, fold_change,
                                              time_value, time_unit, conc_value, conc_unit)

            rows_since_commit += 1
            if rows_since_commit > 500:
                # Commit in batches so the transaction stays small and a
                # late failure does not discard all earlier rows.
                nex_session.commit()
                rows_since_commit = 0

    nex_session.commit()
    nex_session.close()

    log.info("Done loading\n")
    log.info(str(datetime.now()) + "\n")
|
||
|
||
def insert_proteinabundanceannotation(nex_session, fw, dbentity_id, source_id, taxonomy_id, reference_id, original_reference_id, eco_id, efo_id, chebi_id, go_id, data_value, fold_change, time_value, time_unit, conc_value, conc_unit):
    """Add one Proteinabundanceannotation row to the session and log it.

    The ECO id is stored as the assay, the EFO id as the medium, the ChEBI id
    as the chemical, and the GO id as the process, per the table's column
    naming.  The caller is responsible for committing the session.

    :param nex_session: open database session
    :param fw: open log file handle the insert is recorded to
    (remaining parameters are the resolved surrogate keys / parsed values
    for one data row; optional ones may be None)
    """

    x = Proteinabundanceannotation(dbentity_id=dbentity_id,
                                   source_id=source_id,
                                   taxonomy_id=taxonomy_id,
                                   reference_id=reference_id,
                                   original_reference_id=original_reference_id,
                                   assay_id=eco_id,
                                   media_id=efo_id,
                                   data_value=data_value,
                                   # The dataset reports molecules per cell.
                                   data_unit="molecules/cell",
                                   fold_change=fold_change,
                                   chemical_id=chebi_id,
                                   process_id=go_id,
                                   concentration_value=conc_value,
                                   concentration_unit=conc_unit,
                                   time_value=time_value,
                                   time_unit=time_unit,
                                   created_by=CREATED_BY)

    nex_session.add(x)

    # BUG FIX: log message said "aasay_id" and omitted "=" after
    # original_reference_id.
    fw.write("Insert new row for dbentity_id = %s, original_reference_id = %s, assay_id = %s, media_id = %s, chemical_id = %s, process_id = %s\n"
             % (dbentity_id, original_reference_id, eco_id, efo_id, chebi_id, go_id))
|
||
# Run the load only when executed as a script, not on import.
if __name__ == "__main__":

    load_data()
|
||
|
||
|
||
|
214 changes: 214 additions & 0 deletions
214
scripts/loading/protein/preprocess_abundance_data-29361465.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,214 @@ | ||
# Per-experiment metadata grids (one row per author/experiment), curated
# separately for treated and untreated conditions.
metadatafileTreated = "data/Grid-with-metadata_final_treated.txt"
metadatafileUntreated = "data/Grid-with-metadata_final_untreated.txt"
# Abundance and fold-change supplementary tables for PMID:29361465.
datafileUntreated = "data/Table-S4-final-abundance-no-stress-29361465.txt"
datafileTreated = "data/Table-S8-abundance-in-stress-29361465.txt"
foldfile = "data/Table-S9-fold-change-abundance-in-stress-29361465.txt"
|
||
|
||
def generate_data():
    """Print the merged, loader-ready abundance table to stdout.

    Combines the untreated and treated abundance tables with their metadata
    grids and the fold-change table into one 14-column tab-delimited table.
    """

    author2metadataUntreated = get_untreated_metadata()
    author2metadataTreated = get_treated_metadata()
    geneAuthor2fold = get_fold_change()

    # NOTE: "SYSTEMATIC_NMAE" is misspelled on purpose — the downstream load
    # script skips the header by matching this exact string.
    print("SYSTEMATIC_NMAE\tAUTHOR\tPMID\tMOLECULES_PER_CELL\tECO\tEFO\tSTRAIN\tCHEBI\tGOID\tTIME_VALUE\tTIME_UNIT\tCOND_VALUE\tCONT_UNIT\tCHANGE_FOLD")

    generate_data_for_untreated_expts(author2metadataUntreated)
    generate_data_for_treated_expts(author2metadataTreated, geneAuthor2fold)
|
||
def generate_data_for_untreated_expts(author2metadataUntreated, input_path=None):
    """Print one output row per non-empty (gene, author) untreated cell.

    :param author2metadataUntreated: dict author -> (pmid, eco, efo, strain)
    :param input_path: abundance table to read; defaults to
        ``datafileUntreated`` (kept lazy so the default file is only touched
        when actually used)

    The first six columns of the table are gene annotation; per-author
    measurement columns start at index 6.  The seven trailing empty output
    columns are the treatment fields, which do not apply here.
    """
    if input_path is None:
        input_path = datafileUntreated

    header = []
    with open(input_path) as f:
        for line in f:
            pieces = line.strip().split("\t")
            if line.startswith('Systematic Name'):
                header = pieces[6:]
                continue
            if len(pieces) < 6:
                continue
            gene = pieces[0]
            data = pieces[6:]
            for i, author in enumerate(header):
                if i >= len(data):
                    break
                molecules = data[i]
                if molecules == "":
                    # No measurement by this author for this gene.
                    continue
                # KeyError here means the metadata grid is missing an author
                # column — fail fast, as the original did.
                (pmid, eco, efo, strain) = author2metadataUntreated[author]
                print(gene + "\t" + author + "\t" + pmid + "\t" + molecules + "\t" + eco + "\t" + efo + "\t" + strain + "\t\t\t\t\t\t\t")
|
||
def generate_data_for_treated_expts(author2metadataTreated, geneAuthor2fold, input_path=None):
    """Print one output row per non-empty (gene, author, experiment) treated cell.

    :param author2metadataTreated: dict AUTHOR -> list of metadata tuples
        (pmid, eco, efo, strain, chebi, goid, time_value, time_unit,
        conc_value, conc_unit), one per experiment, in file column order
    :param geneAuthor2fold: dict (gene, AUTHOR) -> list of fold-change strings
    :param input_path: treated abundance table; defaults to ``datafileTreated``

    Columns named "...Untreated..." and unnamed columns are skipped.  The
    value lists are index-aligned with the metadata lists, so empty cells
    still advance the index.
    """
    if input_path is None:
        input_path = datafileTreated

    # Pass 1: collect per-(gene, author) value lists in column order.
    geneAuthor2data = {}
    header = []
    with open(input_path) as f:
        for line in f:
            pieces = line.strip().split("\t")
            if line.startswith('Systematic Name'):
                # Columns from index 2 are "AUTHOR:experiment" labels.
                header = pieces[2:]
                continue
            if len(pieces) < 3:
                continue
            gene = pieces[0]
            data = pieces[2:]
            for i, authorExpt in enumerate(header):
                if i >= len(data):
                    break
                if authorExpt == '' or "Untreated" in authorExpt:
                    continue
                author = authorExpt.split(':')[0]
                geneAuthor2data.setdefault((gene, author), []).append(data[i])

    # Pass 2: join values with metadata and fold changes, then print.
    for (gene, author) in geneAuthor2data:
        data = geneAuthor2data[(gene, author)]
        metadata = author2metadataTreated.get(author)
        fold = geneAuthor2fold.get((gene, author))
        if metadata is None:
            print("BAD: no metadata for " + author)
            continue
        for i, molecules in enumerate(data):
            if molecules == '':
                continue
            # NOTE(review): assumes metadata has at least as many entries as
            # the data columns collected for this author — an IndexError here
            # means the grid and table disagree (as in the original code).
            (pmid, eco, efo, strain, chebi, goid, time_value, time_unit, conc_value, conc_unit) = metadata[i]
            thisFold = None
            if fold is not None and len(fold) > i:
                thisFold = fold[i]
            if thisFold == '':
                thisFold = None
            if conc_value == '':
                conc_value = None
            if conc_unit == '':
                conc_unit = None
            print(gene + "\t" + author + "\t" + pmid + "\t" + molecules + "\t" + eco + "\t" + efo + "\t" + strain + "\t" + str(chebi) + "\t" + str(goid) + "\t" + time_value + "\t" + time_unit + "\t" + str(conc_value) + "\t" + str(conc_unit) + "\t" + str(thisFold))
|
||
def get_fold_change(input_path=None):
    """Return {(gene, AUTHOR): [fold_change, ...]} from the fold-change table.

    :param input_path: fold-change table to read; defaults to ``foldfile``

    Per-author lists follow the file's experiment column order.  Empty cells
    are appended as empty strings so indices stay aligned with the abundance
    columns.
    """
    if input_path is None:
        input_path = foldfile

    geneAuthor2fold = {}
    # BUG FIX: header was previously unbound until a 'Systematic Name' line
    # appeared — a data row before the header raised NameError.
    header = []
    with open(input_path) as f:
        for line in f:
            pieces = line.strip().split("\t")
            if line.startswith('Systematic Name'):
                header = pieces[2:]
                continue
            if len(pieces) < 3:
                continue
            gene = pieces[0]
            data = pieces[2:]
            for i, authorExpt in enumerate(header):
                if i >= len(data):
                    break
                author = authorExpt.split(':')[0]
                geneAuthor2fold.setdefault((gene, author), []).append(data[i])

    return geneAuthor2fold
|
||
|
||
def get_treated_metadata(input_path=None):
    """Return {AUTHOR: [metadata_tuple, ...]} for the treated experiments.

    :param input_path: metadata grid to read; defaults to ``metadatafileTreated``

    Each tuple is (pmid, eco, efo, strain, chebi, goid, time_value,
    time_unit, conc_value, conc_unit); one per experiment row, appended in
    file order so indices match the abundance-table column order.
    """
    if input_path is None:
        input_path = metadatafileTreated

    author2metadataTreated = {}
    with open(input_path) as f:
        for line in f:
            pieces = line.strip().split("\t")
            # BUG FIX: columns up to index 16 (conc_unit) are read below, but
            # the original guard only required 10 fields, so 10-16 column
            # rows raised IndexError.  Short rows are now skipped.
            if len(pieces) < 17:
                continue
            author = pieces[0].upper()
            pmid = pieces[2]
            eco = pieces[5]
            efo = pieces[8]
            strain = pieces[10]
            chebi = pieces[13] or None
            # Column 14 is "<value> <unit>", e.g. "30 min".
            time_parts = pieces[14].split(" ")
            time_value = time_parts[0]
            time_unit = time_parts[1]
            conc_value = pieces[15]
            conc_unit = pieces[16]
            # GO id (column 18) is optional; column 17 is its term name,
            # which is not used downstream.
            goid = None
            if len(pieces) >= 19:
                goid = pieces[18]

            author2metadataTreated.setdefault(author, []).append(
                (pmid, eco, efo, strain, chebi, goid, time_value, time_unit, conc_value, conc_unit))

    return author2metadataTreated
|
||
|
||
def get_untreated_metadata(input_path=None):
    """Return {AUTHOR: (pmid, eco, efo, strain)} for the untreated experiments.

    :param input_path: metadata grid to read; defaults to
        ``metadatafileUntreated``

    One entry per author; later rows for the same author overwrite earlier
    ones, as in the original implementation.
    """
    if input_path is None:
        input_path = metadatafileUntreated

    author2metadataUntreated = {}
    with open(input_path) as f:
        for line in f:
            pieces = line.strip().split("\t")
            # BUG FIX: pieces[10] (strain) is read below, but the original
            # guard only required 10 fields, so 10-column rows raised
            # IndexError.  (The unused taxonomy_id read of column 11 was
            # dropped.)
            if len(pieces) < 11:
                continue
            author = pieces[0].upper()
            author2metadataUntreated[author] = (pieces[2], pieces[5], pieces[8], pieces[10])

    return author2metadataUntreated
|
||
# Emit the combined loader-ready table on stdout when run as a script.
if __name__ == '__main__':

    generate_data()