Skip to content

Commit

Permalink
new scripts for protein abundance data loading
Browse files Browse the repository at this point in the history
  • Loading branch information
sweng66 committed Oct 9, 2018
1 parent bb3a7fe commit 375425c
Show file tree
Hide file tree
Showing 2 changed files with 380 additions and 0 deletions.
166 changes: 166 additions & 0 deletions scripts/loading/protein/load_abundance_data-29361465.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import urllib
import gzip
import shutil
import logging
import os
from datetime import datetime
import sys
reload(sys) # Reload does the trick!
from src.models import Taxonomy, Source, Efo, Eco, Chebi, Go, Locusdbentity, Referencedbentity, \
Proteinabundanceannotation
from scripts.loading.database_session import get_session
from scripts.loading.util import get_strain_taxid_mapping

# from src.helpers import upload_file

__author__ = 'sweng66'

# Progress messages go through the root logger, formatted as the bare message text.
logging.basicConfig(format='%(message)s')
log = logging.getLogger()
log.setLevel(logging.INFO)

# User name recorded in the created_by column of every inserted row.
# Raises KeyError at import time when DEFAULT_USER is not set in the environment.
CREATED_BY = os.environ['DEFAULT_USER']

# Tab-delimited input read by load_data().
datafile = "scripts/loading/protein/data/proteinAbundanceData-29361465.txt"
# Per-row insert log written by load_data().
logfile = "scripts/loading/protein/logs/load_abundance_data-29361465.log"
# PubMed ID looked up in Referencedbentity; its dbentity_id becomes reference_id.
PMID = 29361465

def load_data():
    """Load the protein abundance rows of ``datafile`` into the database.

    Every data line is resolved against lookup tables built from the
    database (locus, reference, ECO, EFO, ChEBI, GO, taxonomy) and handed to
    ``insert_proteinabundanceannotation``.  A line whose identifier cannot be
    resolved is reported on stdout and skipped.  Inserts are committed in
    batches and once more at the end.

    Raises:
        ValueError: if a PMID, abundance, time, concentration or fold-change
            column holds text that is not a number.
    """
    nex_session = get_session()
    try:
        _load_abundance_rows(nex_session)
    finally:
        # Release the session even when a malformed row aborts the load.
        nex_session.close()


def _load_abundance_rows(nex_session):
    """Insert one annotation per resolvable line of ``datafile``."""
    sgd = nex_session.query(Source).filter_by(display_name='SGD').one_or_none()
    if sgd is None:
        print("The source SGD is not in the database.")
        return
    source_id = sgd.source_id

    lookups = _build_lookups(nex_session)
    reference_id = lookups['reference'].get(PMID)
    if reference_id is None:
        print("The PMID: %s  is not in the database." % (PMID,))
        return

    log.info("Start loading:\n")
    log.info(str(datetime.now()) + "\n")

    pending = 0
    # Both files are closed on exit, including when a row raises.
    with open(logfile, "w") as fw, open(datafile) as f:
        for line in f:
            # The data file header is spelled "SYSTEMATIC_NMAE"; the correct
            # spelling is accepted as well so a fixed header is still skipped.
            if line.startswith(("SYSTEMATIC_NMAE", "SYSTEMATIC_NAME")):
                continue
            if not line.strip():
                continue
            row = _parse_abundance_row(_split_abundance_line(line), lookups)
            if row is None:
                continue
            insert_proteinabundanceannotation(nex_session, fw,
                                              source_id=source_id,
                                              reference_id=reference_id,
                                              **row)
            pending += 1
            if pending > 500:
                # Commit in batches to keep the pending unit of work small.
                nex_session.commit()
                pending = 0

    nex_session.commit()

    log.info("Done loading\n")
    log.info(str(datetime.now()) + "\n")


def _build_lookups(nex_session):
    """Return the identifier -> primary key dictionaries used to resolve rows."""
    return {
        'locus': {x.systematic_name: x.dbentity_id
                  for x in nex_session.query(Locusdbentity).all()},
        'reference': {x.pmid: x.dbentity_id
                      for x in nex_session.query(Referencedbentity).all()},
        'eco': {x.ecoid: x.eco_id for x in nex_session.query(Eco).all()},
        'efo': {x.efoid: x.efo_id for x in nex_session.query(Efo).all()},
        'chebi': {x.chebiid: x.chebi_id
                  for x in nex_session.query(Chebi).all()},
        'go': {x.goid: x.go_id for x in nex_session.query(Go).all()},
        'taxonomy': {x.taxid: x.taxonomy_id
                     for x in nex_session.query(Taxonomy).all()},
        'strain': get_strain_taxid_mapping(),
    }


def _split_abundance_line(line, width=14):
    """Split a data line into at least ``width`` stripped tab-delimited fields."""
    fields = [field.strip() for field in line.rstrip("\r\n").split("\t")]
    # Missing values appear as the literal text "None"; blank out only whole
    # fields so a value that merely contains "None" is left intact.
    fields = ["" if field == "None" else field for field in fields]
    # Trailing empty columns may be absent from the file; pad so that every
    # optional column can be indexed without an IndexError.
    fields.extend([""] * (width - len(fields)))
    return fields


def _normalize_time_unit(time_unit):
    """Abbreviate units starting with 'hour', 'day' or 'min' to 'hr', 'd', 'min'."""
    for prefix, abbreviation in (('hour', 'hr'), ('day', 'd'), ('min', 'min')):
        if time_unit.startswith(prefix):
            return abbreviation
    return time_unit


def _parse_abundance_row(pieces, lookups):
    """Resolve one padded row to keyword arguments for the insert helper.

    Returns None, after reporting the reason on stdout, when an identifier
    in the row is unknown.
    """
    dbentity_id = lookups['locus'].get(pieces[0])
    if dbentity_id is None:
        print("The ORF name is not in the Locusdbentity table: %s" % (pieces[0],))
        return None
    # An unknown original PMID is not an error: the row is stored with
    # original_reference_id set to None.
    original_reference_id = lookups['reference'].get(int(pieces[2]))
    data_value = int(pieces[3])
    eco_id = lookups['eco'].get(pieces[4])
    if eco_id is None:
        print("The ECOID: %s  is not in the database." % (pieces[4],))
        return None
    efo_id = lookups['efo'].get(pieces[5])
    if efo_id is None:
        print("The EFOID: %s  is not in the database." % (pieces[5],))
        return None
    taxid = lookups['strain'].get(pieces[6])
    if taxid is None:
        print("The strain: %s  is not in the mapping list." % (pieces[6],))
        return None
    taxonomy_id = lookups['taxonomy'].get(taxid)
    if taxonomy_id is None:
        print("The TAXID: %s  is not in the database." % (taxid,))
        return None

    # Columns 7-13 are optional; an empty field leaves the value as None.
    chebi_id = None
    if pieces[7]:
        chebi_id = lookups['chebi'].get(pieces[7])
        if chebi_id is None:
            print("The chebiid: %s  is not in the database." % (pieces[7],))
            return None
    go_id = None
    if pieces[8]:
        go_id = lookups['go'].get(pieces[8])
        if go_id is None:
            print("The goid: %s  is not in the database." % (pieces[8],))
            return None
    time_value = int(pieces[9]) if pieces[9] else None
    time_unit = _normalize_time_unit(pieces[10]) if pieces[10] else None
    conc_value = None
    conc_unit = None
    if pieces[11]:
        conc_value = float(pieces[11])
        conc_unit = pieces[12]
    fold_change = float(pieces[13]) if pieces[13] else None

    return {
        'dbentity_id': dbentity_id,
        'taxonomy_id': taxonomy_id,
        'original_reference_id': original_reference_id,
        'eco_id': eco_id,
        'efo_id': efo_id,
        'chebi_id': chebi_id,
        'go_id': go_id,
        'data_value': data_value,
        'fold_change': fold_change,
        'time_value': time_value,
        'time_unit': time_unit,
        'conc_value': conc_value,
        'conc_unit': conc_unit,
    }


def insert_proteinabundanceannotation(nex_session, fw, dbentity_id, source_id, taxonomy_id, reference_id, original_reference_id, eco_id, efo_id, chebi_id, go_id, data_value, fold_change, time_value, time_unit, conc_value, conc_unit):
    """Add one Proteinabundanceannotation to the session and log the insert.

    The row is only added to ``nex_session``; committing is left to the
    caller.  ``eco_id`` is stored as ``assay_id``, ``efo_id`` as ``media_id``,
    ``chebi_id`` as ``chemical_id`` and ``go_id`` as ``process_id``.  The
    abundance unit is always "molecules/cell".

    Args:
        nex_session: session object the new row is added to.
        fw: writable file-like object that receives one log line per row.
    """
    x = Proteinabundanceannotation(dbentity_id=dbentity_id,
                                   source_id=source_id,
                                   taxonomy_id=taxonomy_id,
                                   reference_id=reference_id,
                                   original_reference_id=original_reference_id,
                                   assay_id=eco_id,
                                   media_id=efo_id,
                                   data_value=data_value,
                                   data_unit="molecules/cell",
                                   fold_change=fold_change,
                                   chemical_id=chebi_id,
                                   process_id=go_id,
                                   concentration_value=conc_value,
                                   concentration_unit=conc_unit,
                                   time_value=time_value,
                                   time_unit=time_unit,
                                   created_by=CREATED_BY)

    nex_session.add(x)

    # Log message fixed: "aasay_id" typo and the "=" missing after
    # original_reference_id.
    fw.write("Insert new row for dbentity_id = %s, original_reference_id = %s, "
             "assay_id = %s, media_id = %s, chemical_id = %s, process_id = %s\n"
             % (dbentity_id, original_reference_id, eco_id, efo_id,
                chebi_id, go_id))

# Run the loader only when this file is executed as a script, not on import.
if __name__ == "__main__":

load_data()




214 changes: 214 additions & 0 deletions scripts/loading/protein/preprocess_abundance_data-29361465.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
# Input files, given relative to the directory the script is run from.
# Experiment metadata; opened by get_treated_metadata() and get_untreated_metadata().
metadatafileTreated = "data/Grid-with-metadata_final_treated.txt"
metadatafileUntreated = "data/Grid-with-metadata_final_untreated.txt"
# Abundance tables; opened by generate_data_for_untreated_expts() and
# generate_data_for_treated_expts() respectively.
datafileUntreated = "data/Table-S4-final-abundance-no-stress-29361465.txt"
datafileTreated = "data/Table-S8-abundance-in-stress-29361465.txt"
# Fold-change table; opened by get_fold_change().
foldfile = "data/Table-S9-fold-change-abundance-in-stress-29361465.txt"


def generate_data():
    """Print the combined abundance table (header, untreated, treated) to stdout."""
    untreated_metadata = get_untreated_metadata()
    treated_metadata = get_treated_metadata()
    fold_changes = get_fold_change()

    # The header text is kept exactly as is: load_abundance_data-29361465.py
    # skips the line by matching the "SYSTEMATIC_NMAE" prefix.
    print("SYSTEMATIC_NMAE\tAUTHOR\tPMID\tMOLECULES_PER_CELL\tECO\tEFO\tSTRAIN\tCHEBI\tGOID\tTIME_VALUE\tTIME_UNIT\tCOND_VALUE\tCONT_UNIT\tCHANGE_FOLD")

    generate_data_for_untreated_expts(untreated_metadata)
    generate_data_for_treated_expts(treated_metadata, fold_changes)

def generate_data_for_untreated_expts(author2metadataUntreated, datafile=None):
    """Print one tab-delimited row per non-empty untreated abundance value.

    The header line (starting with 'Systematic Name') names one author per
    column from the seventh column on; each data line holds a gene followed
    by one value per author.  Rows are written to stdout with the seven
    treatment columns left empty.

    Args:
        author2metadataUntreated: maps an author column name to a
            ``(pmid, eco, efo, strain)`` tuple.
        datafile: path of the abundance table; defaults to the module-level
            ``datafileUntreated``.
    """
    if datafile is None:
        datafile = datafileUntreated

    header = []
    # The context manager closes the file even if a row raises.
    with open(datafile) as f:
        for line in f:
            pieces = line.strip().split("\t")
            if line.startswith('Systematic Name'):
                header = pieces[6:]
                continue
            if len(pieces) < 6:
                continue
            gene = pieces[0]
            # zip stops at the shorter sequence, so values without a header
            # column (and header columns without a value) are ignored.
            for author, molecules in zip(header, pieces[6:]):
                if molecules == "":
                    continue
                metadata = author2metadataUntreated.get(author)
                if metadata is None:
                    # Report and skip, as the treated-experiment path does,
                    # instead of aborting with a KeyError.
                    print("BAD: no metadata for  " + author)
                    continue
                (pmid, eco, efo, strain) = metadata
                print("\t".join([gene, author, pmid, molecules, eco, efo, strain])
                      + "\t" * 7)

def generate_data_for_treated_expts(author2metadataTreated, geneAuthor2fold, datafile=None):
    """Print one tab-delimited row per non-empty treated abundance value.

    Values are grouped per (gene, author) in column order; the n-th value of
    a group is paired with the n-th metadata entry of that author and the
    n-th fold change of that (gene, author).  Groups are written in sorted
    key order so the output is reproducible.

    Args:
        author2metadataTreated: maps an author to a list of
            ``(pmid, eco, efo, strain, chebi, goid, time_value, time_unit,
            conc_value, conc_unit)`` tuples, one per experiment.
        geneAuthor2fold: maps ``(gene, author)`` to a list of fold changes.
        datafile: path of the abundance table; defaults to the module-level
            ``datafileTreated``.
    """
    if datafile is None:
        datafile = datafileTreated

    geneAuthor2data = _collect_treated_values(datafile)

    for (gene, author) in sorted(geneAuthor2data):
        metadata = author2metadataTreated.get(author)
        if metadata is None:
            print("BAD: no metadata for  " + author)
            continue
        fold = geneAuthor2fold.get((gene, author))
        for i, molecules in enumerate(geneAuthor2data[(gene, author)]):
            if molecules == '':
                continue
            if i >= len(metadata):
                # More values than metadata entries: report it instead of
                # failing with an IndexError; later values overflow as well.
                print("BAD: no metadata for  %s (experiment %d)" % (author, i + 1))
                break
            (pmid, eco, efo, strain, chebi, goid, time_value, time_unit,
             conc_value, conc_unit) = metadata[i]
            thisFold = None
            if fold is not None and len(fold) > i:
                thisFold = _blank_to_none(fold[i])
            # Missing values are written as the text "None".
            print("\t".join([gene, author, pmid, molecules, eco, efo, strain,
                             str(chebi), str(goid), time_value, time_unit,
                             str(_blank_to_none(conc_value)),
                             str(_blank_to_none(conc_unit)),
                             str(thisFold)]))


def _collect_treated_values(path):
    """Return {(gene, author): [value, ...]} for the treated columns of ``path``."""
    geneAuthor2data = {}
    header = []
    with open(path) as f:
        for line in f:
            pieces = line.strip().split("\t")
            if line.startswith('Systematic Name'):
                header = pieces[2:]
                continue
            if len(pieces) < 3:
                continue
            gene = pieces[0]
            for authorExpt, molecules in zip(header, pieces[2:]):
                # Unnamed columns and untreated controls are not loaded here.
                if authorExpt == '' or "Untreated" in authorExpt:
                    continue
                author = authorExpt.split(':')[0]
                geneAuthor2data.setdefault((gene, author), []).append(molecules)
    return geneAuthor2data


def _blank_to_none(value):
    """Return None for an empty string, otherwise the value unchanged."""
    return None if value == '' else value

def get_fold_change(datafile=None):
    """Read the fold-change table into {(gene, author): [fold, ...]}.

    The header line (starting with 'Systematic Name') names one
    'AUTHOR:experiment' column from the third column on; the author is the
    text before the first ':'.  Values are kept as strings, in column order,
    including empty ones.  Data lines that precede the header contribute
    nothing.

    Args:
        datafile: path of the fold-change table; defaults to the module-level
            ``foldfile``.
    """
    if datafile is None:
        datafile = foldfile

    geneAuthor2fold = {}
    # Start with an empty header so a data line before the header line does
    # not raise a NameError.
    header = []

    with open(datafile) as f:
        for line in f:
            pieces = line.strip().split("\t")
            if line.startswith('Systematic Name'):
                header = pieces[2:]
                continue
            if len(pieces) < 3:
                continue
            gene = pieces[0]
            # zip stops at the shorter of header and values.
            for authorExpt, fold in zip(header, pieces[2:]):
                author = authorExpt.split(':')[0]
                geneAuthor2fold.setdefault((gene, author), []).append(fold)

    return geneAuthor2fold


def get_treated_metadata(metadatafile=None):
    """Read the treated-experiment metadata into {AUTHOR: [tuple, ...]}.

    Each accepted line appends
    ``(pmid, eco, efo, strain, chebi, goid, time_value, time_unit,
    conc_value, conc_unit)`` to the list of its upper-cased author, in file
    order.  An empty chebi column becomes None; goid is None unless the line
    has at least 19 columns.  Lines with fewer than 11 columns (no strain)
    are skipped.

    Args:
        metadatafile: path of the metadata table; defaults to the
            module-level ``metadatafileTreated``.
    """
    if metadatafile is None:
        metadatafile = metadatafileTreated

    author2metadataTreated = {}

    with open(metadatafile) as f:
        for line in f:
            pieces = line.strip().split("\t")
            column_count = len(pieces)
            if column_count < 11:
                continue
            # strip() drops trailing empty columns; pad so the optional
            # treatment columns can be read without an IndexError.
            pieces.extend([''] * (19 - column_count))

            author = pieces[0].upper()
            pmid = pieces[2]
            eco = pieces[5]
            efo = pieces[8]
            strain = pieces[10]
            chebi = pieces[13] if pieces[13] != '' else None
            # Column 14 holds "<value> <unit>"; tolerate a missing unit.
            time_parts = pieces[14].split(" ")
            time_value = time_parts[0]
            time_unit = time_parts[1] if len(time_parts) > 1 else ''
            conc_value = pieces[15]
            conc_unit = pieces[16]
            goid = pieces[18] if column_count >= 19 else None

            author2metadataTreated.setdefault(author, []).append(
                (pmid, eco, efo, strain, chebi, goid, time_value, time_unit,
                 conc_value, conc_unit))

    return author2metadataTreated


def get_untreated_metadata(metadatafile=None):
    """Read the untreated-experiment metadata into {AUTHOR: (pmid, eco, efo, strain)}.

    The author name is upper-cased; a later line for the same author
    replaces the earlier one.  Lines with fewer than 11 columns (no strain)
    are skipped.

    Args:
        metadatafile: path of the metadata table; defaults to the
            module-level ``metadatafileUntreated``.
    """
    if metadatafile is None:
        metadatafile = metadatafileUntreated

    author2metadataUntreated = {}

    with open(metadatafile) as f:
        for line in f:
            pieces = line.strip().split("\t")
            # Column 10 (strain) is the last one read, so 11 columns are
            # required; a lower threshold lets short lines raise IndexError.
            if len(pieces) < 11:
                continue
            author = pieces[0].upper()
            author2metadataUntreated[author] = (pieces[2], pieces[5],
                                                pieces[8], pieces[10])

    return author2metadataUntreated

# Write the combined abundance table to stdout when run as a script.
if __name__ == '__main__':

generate_data()

0 comments on commit 375425c

Please sign in to comment.