new scripts for protein abundance data loading

yeastgenome · Oct 9, 2018 · 375425c · 375425c
1 parent bb3a7fe
commit 375425c
Show file tree

Hide file tree

Showing 2 changed files with 380 additions and 0 deletions.
diff --git a/scripts/loading/protein/load_abundance_data-29361465.py b/scripts/loading/protein/load_abundance_data-29361465.py
@@ -0,0 +1,166 @@
+import urllib
+import gzip
+import shutil
+import logging
+import os
+from datetime import datetime
+import sys
+reload(sys)  # Reload does the trick!
+from src.models import Taxonomy, Source, Efo, Eco, Chebi, Go, Locusdbentity, Referencedbentity, \
+                       Proteinabundanceannotation
+from scripts.loading.database_session import get_session
+from scripts.loading.util import get_strain_taxid_mapping
+
+# from src.helpers import upload_file
+
+__author__ = 'sweng66'
+
+logging.basicConfig(format='%(message)s')  # log records are emitted as the bare message text
+log = logging.getLogger()  # root logger, used by load_data() for start/finish messages
+log.setLevel(logging.INFO)
+
+CREATED_BY = os.environ['DEFAULT_USER']  # raises KeyError on import if DEFAULT_USER is not set
+
+datafile = "scripts/loading/protein/data/proteinAbundanceData-29361465.txt"  # tab-delimited input read by load_data()
+logfile = "scripts/loading/protein/logs/load_abundance_data-29361465.log"  # opened for writing by load_data(); one line per inserted row
+PMID = 29361465  # load_data() resolves this PMID to the reference_id stored on every row
+
+def load_data():
+
+    nex_session = get_session()
+
+    sgd = nex_session.query(Source).filter_by(display_name='SGD').one_or_none()
+    source_id = sgd.source_id
+    name_to_dbentity_id = dict([(x.systematic_name, x.dbentity_id) for x in nex_session.query(Locusdbentity).all()])
+    pmid_to_reference_id = dict([(x.pmid, x.dbentity_id) for x in nex_session.query(Referencedbentity).all()])
+    ecoid_to_eco_id = dict([(x.ecoid, x.eco_id) for x in nex_session.query(Eco).all()])
+    efoid_to_efo_id = dict([(x.efoid, x.efo_id) for x in nex_session.query(Efo).all()])
+    chebiid_to_chebi_id = dict([(x.chebiid, x.chebi_id) for x in nex_session.query(Chebi).all()])
+    goid_to_go_id = dict([(x.goid, x.go_id) for x in nex_session.query(Go).all()])
+    taxid_to_taxonomy_id =  dict([(x.taxid, x.taxonomy_id) for x in nex_session.query(Taxonomy).all()])
+    strain_to_taxid_mapping = get_strain_taxid_mapping()
+    reference_id = pmid_to_reference_id.get(PMID)
+    if reference_id is None:
+        print "The PMID:", PMID, " is not in the database."
+        return
+
+    log.info("Start loading:\n") 
+    log.info(str(datetime.now()) + "\n")
+
+    fw = open(logfile, "w")
+    f = open(datafile)
+
+    i = 0
+
+    for line in f:
+        if line.startswith("SYSTEMATIC_NMAE"):
+            continue
+        pieces = line.strip().replace("None", "").split("\t")
+        dbentity_id = name_to_dbentity_id.get(pieces[0])
+        if dbentity_id is None:
+            print "The ORF name is not in the Locusdbentity table:", pieces[0]
+            continue
+        original_reference_id = pmid_to_reference_id.get(int(pieces[2]))
+        data_value = int(pieces[3])
+        eco_id = ecoid_to_eco_id.get(pieces[4])
+        if eco_id is None:
+            print "The ECOID:", pieces[4], " is not in the database."
+            continue
+        efo_id = efoid_to_efo_id.get(pieces[5])
+        if efo_id is None:
+            print "The EFOID:", pieces[5], " is not in the database."
+            continue
+        taxid = strain_to_taxid_mapping.get(pieces[6])
+        if taxid is None:
+            print "The strain:", pieces[6], " is not in the mapping list."
+            continue
+        taxonomy_id = taxid_to_taxonomy_id.get(taxid)
+        if taxonomy_id is None:
+            print "The TAXID:", taxid, " is not in the database."
+            continue
+        chebi_id = None
+        go_id = None
+        time_value = None
+        time_unit = None
+        conc_value = None
+        conc_unit = None
+        fold_change = None
+        if len(pieces) >= 8:
+            if pieces[7]:
+                chebi_id = chebiid_to_chebi_id.get(pieces[7])
+                if chebi_id is None:
+                    print "The chebiid:", pieces[7], " is not in the database."
+                    continue
+            if pieces[8]:
+                go_id = goid_to_go_id.get(pieces[8])
+                if go_id is None:
+                    print "The goid:", pieces[8], " is not in the database."
+                    continue
+            if pieces[9]:
+                time_value = int(pieces[9])
+            if pieces[10]:
+                time_unit = pieces[10]
+                if time_unit.startswith('hour'):
+                    time_unit = 'hr'
+                if time_unit.startswith('day'):
+                    time_unit = 'd'
+                if time_unit.startswith('min'):
+                    time_unit = 'min'
+            if pieces[11]:
+                conc_value = float(pieces[11])
+                conc_unit = pieces[12]
+            if pieces[13]:
+                fold_change = float(pieces[13])
+
+        insert_proteinabundanceannotation(nex_session, fw, dbentity_id, source_id, taxonomy_id,
+                                          reference_id, original_reference_id, eco_id, efo_id, 
+                                          chebi_id, go_id, data_value, fold_change,
+                                          time_value, time_unit, conc_value, conc_unit)
+
+        i = i + 1
+        if i > 500:
+            # nex_session.rollback()
+            nex_session.commit()  
+            i = 0
+
+    f.close()
+
+    # nex_session.rollback()
+    nex_session.commit()
+    nex_session.close()
+
+    log.info("Done loading\n")
+    log.info(str(datetime.now()) + "\n")
+
+
+def insert_proteinabundanceannotation(nex_session, fw, dbentity_id, source_id, taxonomy_id, reference_id, original_reference_id, eco_id, efo_id, chebi_id, go_id, data_value, fold_change, time_value, time_unit, conc_value, conc_unit):
+
+    x = Proteinabundanceannotation(dbentity_id = dbentity_id,
+                                   source_id = source_id,
+                                   taxonomy_id = taxonomy_id,
+                                   reference_id = reference_id,
+                                   original_reference_id = original_reference_id,
+                                   assay_id = eco_id,
+                                   media_id = efo_id,
+                                   data_value = data_value,
+                                   data_unit = "molecules/cell",
+                                   fold_change = fold_change,
+                                   chemical_id = chebi_id,
+                                   process_id = go_id,
+                                   concentration_value = conc_value,
+                                   concentration_unit = conc_unit,
+                                   time_value = time_value,
+                                   time_unit = time_unit,
+                                   created_by = CREATED_BY)
+
+    nex_session.add(x)
+
+    fw.write("Insert new row for dbentity_id = " + str(dbentity_id) + ", original_reference_id " + str(original_reference_id) + ", aasay_id = " + str(eco_id) + ", media_id = " + str(efo_id) + ", chemical_id = " + str(chebi_id) + ", process_id = " + str(go_id) + "\n")
+
+if __name__ == "__main__":  # run the load only when executed as a script, not on import
+
+    load_data()
+
+
+
+
diff --git a/scripts/loading/protein/preprocess_abundance_data-29361465.py b/scripts/loading/protein/preprocess_abundance_data-29361465.py
@@ -0,0 +1,214 @@
+metadatafileTreated = "data/Grid-with-metadata_final_treated.txt"  # read by get_treated_metadata()
+metadatafileUntreated = "data/Grid-with-metadata_final_untreated.txt"  # read by get_untreated_metadata()
+datafileUntreated = "data/Table-S4-final-abundance-no-stress-29361465.txt"  # read by generate_data_for_untreated_expts()
+datafileTreated = "data/Table-S8-abundance-in-stress-29361465.txt"  # read by generate_data_for_treated_expts()
+foldfile = "data/Table-S9-fold-change-abundance-in-stress-29361465.txt"  # read by get_fold_change()
+
+
+def generate_data():
+
+    author2metadataUntreated = get_untreated_metadata()
+    author2metadataTreated = get_treated_metadata()
+    geneAuthor2fold = get_fold_change()
+
+    print "SYSTEMATIC_NMAE\tAUTHOR\tPMID\tMOLECULES_PER_CELL\tECO\tEFO\tSTRAIN\tCHEBI\tGOID\tTIME_VALUE\tTIME_UNIT\tCOND_VALUE\tCONT_UNIT\tCHANGE_FOLD"
+
+    generate_data_for_untreated_expts(author2metadataUntreated)
+    generate_data_for_treated_expts(author2metadataTreated, geneAuthor2fold)
+
+def generate_data_for_untreated_expts(author2metadataUntreated):
+
+    f = open(datafileUntreated)
+
+    header = []
+    for line in f:
+        pieces = line.strip().split("\t")
+        if line.startswith('Systematic Name'):
+            header = pieces[6:]
+            continue
+        if len(pieces) < 6:
+            continue
+        gene = pieces[0]
+        data = pieces[6:]
+        i = 0
+        for author in header:
+            if i >= len(data):
+                break
+            if data[i] == "":
+                i = i + 1
+                continue
+            molecules = data[i]
+            (pmid, eco, efo, strain) = author2metadataUntreated[author]
+            print gene + "\t" + author + "\t" + pmid + "\t" + molecules + "\t" + eco + "\t" + efo + "\t" + strain + "\t\t\t\t\t\t\t"
+            i = i + 1
+
+    f.close()
+
+def generate_data_for_treated_expts(author2metadataTreated, geneAuthor2fold):
+
+    f = open(datafileTreated)
+
+    geneAuthor2data = {}
+    header = []
+    for line in f:
+        pieces = line.strip().split("\t")
+        if line.startswith('Systematic Name'):
+            header = pieces[2:]
+            continue
+        if len(pieces) < 3:
+            continue
+        gene = pieces[0]
+        data = pieces[2:]
+        i = 0
+        for authorExpt in header:
+            if i >= len(data):
+                break
+            if authorExpt == '':
+                i = i + 1
+                continue
+            if "Untreated" in authorExpt:
+                i = i + 1
+                continue            
+            molecules = data[i]
+            author = authorExpt.split(':')[0]
+            values = []
+            if (gene, author) in geneAuthor2data:
+                values = geneAuthor2data[(gene, author)]
+            values.append(molecules)
+            geneAuthor2data[(gene, author)] = values
+            i = i + 1
+    f.close()
+
+    for (gene, author) in geneAuthor2data:
+        data = geneAuthor2data[(gene, author)]
+        metadata = author2metadataTreated.get(author)
+        fold = geneAuthor2fold.get((gene, author))
+        if metadata is None:
+            print "BAD: no metadata for ", author
+            continue
+        i = 0
+
+        # if len(data) > len(metadata):
+        #    print (gene, author), ", data=", data
+        #    print (gene, author), ", metadata=", metadata
+
+        for molecules in data:
+            if molecules == '':
+                i = i + 1
+                continue
+            (pmid, eco, efo, strain, chebi, goid, time_value, time_unit, conc_value, conc_unit) = metadata[i]
+            thisFold = None
+            if fold is not None and len(fold) > i:
+                thisFold = fold[i]
+                if thisFold == '':
+                    thisFold = None
+            if conc_value == '':
+                conc_value = None
+            if conc_unit == '':
+                conc_unit = None
+            print gene + "\t" + author + "\t" + pmid + "\t" + molecules + "\t" + eco + "\t" + efo + "\t" + strain + "\t" + str(chebi) + "\t" + str(goid) + "\t" + time_value + "\t" + time_unit + "\t" + str(conc_value) + "\t" + str(conc_unit) + "\t" + str(thisFold)
+            i = i + 1
+
+def get_fold_change():
+
+    geneAuthor2fold = {}
+
+    f = open(foldfile)
+
+    for line in f:
+        pieces = line.strip().split("\t")
+        if line.startswith('Systematic Name'):
+            header = pieces[2:]
+            continue
+        if len(pieces) < 3:
+            continue
+        gene = pieces[0]
+        data = pieces[2:]
+        i = 0
+        for authorExpt in header:
+            if i >= len(data):
+                break
+            fold = data[i]
+            author = authorExpt.split(':')[0]
+            values = []
+            if (gene, author) in geneAuthor2fold:
+                values = geneAuthor2fold[(gene, author)]
+            values.append(fold)
+            geneAuthor2fold[(gene, author)] = values
+            i = i + 1
+
+    f.close()
+
+    return geneAuthor2fold
+
+
+def get_treated_metadata():
+
+    f = open(metadatafileTreated)
+
+    author2metadataTreated = {}
+
+    for line in f:
+        pieces = line.strip().split("\t")
+        if len(pieces) < 10:
+            continue
+        author = pieces[0].upper()
+        pmid = pieces[2]
+        eco = pieces[5]
+        efo = pieces[8]
+        strain = pieces[10]
+        taxonomy_id = pieces[11]
+        chebi = pieces[13]
+        if chebi == '':
+            chebi = None
+        time_value = pieces[14].split(" ")[0]
+        time_unit = pieces[14].split(" ")[1]
+        conc_value = pieces[15]
+        conc_unit = pieces[16]
+        goid = None
+        goterm = None
+        if len(pieces) >= 19:
+            goterm = pieces[17]
+            goid = pieces[18]
+
+        data = []
+        if author in author2metadataTreated:
+            data = author2metadataTreated[author]
+        data.append((pmid, eco, efo, strain, chebi, goid, time_value, time_unit, conc_value, conc_unit))
+        author2metadataTreated[author] = data
+
+        # print "author=", author, ", pimd=", pmid, ", eco=", eco, ", efo=", efo, ", strain=", strain, ", chebi=", chebi, ", chemical=", chemical, ", time=", time_value, ", time_unit=", time_unit, ", conc_value=", conc_value, ", conc_unit=", conc_unit, ", goid=", goid, ", goTerm=", goterm
+
+    f.close()
+
+    return author2metadataTreated
+
+
+def get_untreated_metadata():
+
+    f = open(metadatafileUntreated)
+
+    author2metadataUntreated = {}
+
+    for line in f:
+
+        pieces = line.strip().split("\t")
+        if len(pieces) < 10:
+            continue
+        author = pieces[0].upper()
+        pmid = pieces[2]
+        eco = pieces[5]
+        efo = pieces[8]
+        strain = pieces[10]
+        taxonomy_id = pieces[11]
+
+        author2metadataUntreated[author] = (pmid, eco, efo, strain)
+
+        # print "author=", author, ", pimd=", pmid, ", eco=", eco, ", efo=", efo, ", strain=", strain, ", taxonomy_id=", taxonomy_id    
+
+    f.close()
+    return author2metadataUntreated
+
+if __name__ == '__main__':  # print the table only when executed as a script, not on import
+
+    generate_data()