# EDA: [exploratory data analysis](https://en.wikipedia.org/wiki/Exploratory_data_analysis)

In [1]:
from eda_imports import *

In [2]:
def parse_term(stanza, tag):
    """
    Convert the ``key: value`` lines of one OBO stanza into a dict.

    e.g. tag could be Term or Typedef
    (http://owlcollab.github.io/oboformat/doc/GO.format.obo-1_2.html)

    Parameters
    ----------
    stanza : iterable of str
        Lines of the stanza body (without the ``[Term]`` header line).
        Blank lines and OBO comment lines (starting with ``!``) are skipped.
    tag : str
        Stanza type; stored under the ``'tag'`` key of the result.

    Returns
    -------
    dict
        Maps each key to its value. A key that occurs once maps to a plain
        str; a key that occurs several times maps to a list of str in the
        order the values appeared.

    Raises
    ------
    ValueError
        If a line has no ``': '`` separator, or if a repeated key already
        holds a value that is neither a str nor a list.
    """
    res = {'tag': tag}
    for raw_line in stanza:
        line = raw_line.strip()
        # nothing to parse on blank lines or whole-line OBO comments
        if not line or line.startswith('!'):
            continue
        # split on the first ': ' only, values may themselves contain ': '
        key, sep, val = line.partition(': ')
        if not sep:
            raise ValueError(
                "malformed stanza line (expected 'key: value'): {0!r}".format(line))
        if key not in res:
            res[key] = val
            continue
        # if a key corresponds to multiple values, convert the values to a list
        old_val = res.pop(key)
        if isinstance(old_val, str):
            res[key] = [old_val, val]
        elif isinstance(old_val, list):
            res[key] = old_val + [val]
        else:
            raise ValueError('unknown value type for key {0}: {1}'.format(key, type(old_val)))
    return res

In [3]:
def collect_stanza(inf):
    """
    Read lines from ``inf`` until the first blank line or end of file.

    ``inf`` must provide ``readline()``. The terminating blank line is
    consumed. Returns the stripped, non-empty lines that were read, in order.
    """
    lines = []
    # readline() yields '' at end of file, which stops the iterator
    for raw in iter(inf.readline, ''):
        stripped = raw.strip()
        if not stripped:
            # a blank (or whitespace-only) line closes the stanza
            break
        lines.append(stripped)
    return lines

In [4]:
import gzip

In [5]:
# Walk through the gzipped OBO file and turn every stanza whose header
# matches one of the tags below into a dict appended to `res`.
res = []
with gzip.open('./go-basic.obo.gz', 'rt') as inf:
    current_stanza = None
    count = 0
    for line in inf:
        # add 'Typedef' to this list to collect [Typedef] stanzas as well
        for tag in ['Term']:
            if not line.startswith('[{0}]'.format(tag)):
                continue
            # the header line was just consumed; read the body up to the
            # next blank line from the same file handle
            current_stanza = collect_stanza(inf)
            parsed = parse_term(current_stanza, tag=tag)
            res.append(parsed)
            current_stanza = None
            count += 1

In [6]:
# some simple test cases: spot-check two parsed stanzas against known values

# 46921 [Term] and 5 [Typedef]
# (check kept disabled: the parsing loop only collects [Term] stanzas,
#  so count does not include the [Typedef] ones)
# assert count == 46921 + 5

# a key that occurs several times in a stanza is stored as a list (subset),
# a key that occurs once stays a plain string (is_a)
obj = res[2]
assert obj['id'] == 'GO:0000003'
assert len(obj['subset']) == 6
assert obj['is_a'] == 'GO:0008150 ! biological_process'

# an obsolete term: the flag is still the raw string 'true' at this point,
# and its repeated 'consider' entries are collected in file order
obj = res[3]
assert obj['id'] == 'GO:0000005'
assert obj['is_obsolete'] == 'true'
assert obj['consider'] == ['GO:0042254', 'GO:0044183', 'GO:0051082']

In [7]:
df = pd.DataFrame.from_records(res)

In [8]:
# Turn the raw OBO flag (the string 'true', or missing) into a real boolean
# column. isin() maps everything that is not 'true'/True -- including NaN --
# to False, always yields a bool dtype, and gives the same result when the
# cell is re-run on the already converted column.
df['is_obsolete'] = df.is_obsolete.isin(['true', True])

In [9]:
df.shape

(46921, 15)

In [10]:
df.tag.value_counts()

Term    46921
Name: tag, dtype: int64

In [11]:
df.columns.values.shape

(15,)

So each Term in this OBO file has at most 15 distinct keys (counting the `tag` column added by the parser). That is not many, so we'll sample values from each key to get a sense of what they look like.

In [12]:
np.sort(df.columns.values).tolist()

['alt_id',
 'comment',
 'consider',
 'def',
 'id',
 'is_a',
 'is_obsolete',
 'name',
 'namespace',
 'relationship',
 'replaced_by',
 'subset',
 'synonym',
 'tag',
 'xref']

In [13]:
df.namespace.value_counts()

biological_process    30566
molecular_function    12062
cellular_component    4293 
Name: namespace, dtype: int64

In [14]:
df.is_obsolete.value_counts()

False    44881
True     2040 
Name: is_obsolete, dtype: int64

In [15]:
df.alt_id.dropna().head(2)

2    [GO:0019952, GO:0050876]
6    GO:0000013              
Name: alt_id, dtype: object

In [16]:
df.consider.dropna().head(2)

3    [GO:0042254, GO:0044183, GO:0051082]
6    [GO:0003756, GO:0015036]            
Name: consider, dtype: object

In [17]:
df.xref.dropna().sample(10).values.tolist()

['MetaCyc:RXN-8680',
 ['EC:3.4.24.19', 'MetaCyc:3.4.24.19-RXN'],
 'MetaCyc:FASYN-ELONG-PWY',
 ['EC:6.2.1.23', 'MetaCyc:DICARBOXYLATE--COA-LIGASE-RXN'],
 ['EC:1.6.6', 'UM-BBD_reactionID:r0464'],
 ['EC:3.4.19.3', 'MetaCyc:PYROGLUTAMYL-PEPTIDASE-I-RXN'],
 ['EC:1.11.1.11', 'MetaCyc:L-ASCORBATE-PEROXIDASE-RXN'],
 ['EC:3.2.1.77', 'MetaCyc:3.2.1.77-RXN'],
 ['EC:2.7.1.30',
  'KEGG:R00847',
  'MetaCyc:GLYCEROL-KIN-RXN',
  'Reactome:REACT_724 "Conversion of Glycerol to Glycerol-3-phosphate, Homo sapiens"',
  'RHEA:21647'],
 ['EC:3.1.3.18', 'KEGG:R01334', 'MetaCyc:GPH-RXN', 'RHEA:14372']]

In [18]:
# keep only the rows that actually carry a replaced_by value
has_replacement = df.replaced_by.notna()
df.loc[has_replacement, ['is_obsolete', 'replaced_by']].sample(5)

Unnamed: 0,is_obsolete,replaced_by
5853,True,GO:0000750
7543,True,GO:0045155
1109,True,GO:0005868
24400,True,GO:0019609
320,True,GO:0000384


In [19]:
df.relationship.dropna().head(2)

15    regulates GO:0006310 ! DNA recombination    
16    regulates GO:0006312 ! mitotic recombination
Name: relationship, dtype: object

In [20]:
df['def'].dropna().sample(2)

4504     "The protein complexes that form the mitochondrial electron transport system (the respiratory chain), associated with the inner mitochondrial membrane. The respiratory chain complexes transfer electrons from an electron donor to an electron acceptor and are associated with a proton pump to create a transmembrane electrochemical gradient." [GOC:curators, GOC:ecd, ISBN:0198547684]
30499    "The developmental process, independent of morphogenetic (shape) change, that is required for a columna/cuboidal epithelial cell of the intestine to attain its fully functional state. A columnar/cuboidal epithelial cell of the intestine mature as they migrate from the intestinal crypt to the villus." [GOC:dph, PMID:18824147]                                                       
Name: def, dtype: object

In [21]:
df.subset.apply(lambda v: '|'.join(v) if isinstance(v, list) else v).value_counts().head(10)

gosubset_prok                       7805
goslim_synapse                      326 
virus_checked                       226 
goslim_pir|gosubset_prok            163 
goslim_pir                          147 
goslim_chembl                       80  
gocheck_do_not_annotate             70  
termgenie_unvetted                  61  
gocheck_do_not_manually_annotate    58  
goslim_chembl|gosubset_prok         49  
Name: subset, dtype: int64

In [22]:
# many comments appear to be explanation of why a GO term is deprecated:
# count obsolete vs. current terms among the rows that have a comment
df.loc[df.comment.notna(), 'is_obsolete'].value_counts()

False    3190
True     2002
Name: is_obsolete, dtype: int64

In [23]:
df[['is_obsolete', 'comment']].dropna().sample(10)

Unnamed: 0,is_obsolete,comment
28241,False,"Note that this term should not be used for direct annotation. If you are trying to make an annotation to x phase, it is likely that the correct annotation is 'regulation of x/y phase transition' or to a process which occurs during the reported phase (i.e mitotic DNA replication for mitotic S-phase). To capture the phase when a specific location or process is observed, the phase term can be used in an annotation extension (PMID:24885854) applied to a cellular component term (with the relation exists_during) or a biological process term (with the relation happens_during)."
533,False,Chromosomes include parts that are not part of the chromatin. Examples include the kinetochore.
36849,False,Examples include human speech and learned bird song.
43485,False,wrm-1 in C. Elegans (Q10953) in PMID:17476329 (IMP)
45193,False,An example of this is HSP90AB1 in human (UniProt symbol P08238) in PMID:21855797 (inferred from direct assay).
28984,True,This term was made obsolete because it contained a conjunction (or). It has been replaced with the terms 'growth of symbiont during interaction with host ; GO:0044116' and 'development of symbiont during interaction with host ; GO:0044115'.
7562,True,This term was made obsolete because it represents a gene product.
13088,False,This term was added by GO_REF:0000021.
17661,False,Note that this term is not synonymous with 'homophilic cell adhesion ; GO:0007156'; the process may occur by homophilic or heterophilic mechanisms.
11106,True,This term was made obsolete because it represents a gene product.
