# Ontology Table p-values

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import hypergeom
from rdflib import Graph, Literal, RDF, URIRef, OWL, RDFS, URIRef

---

## load ontology
- create list of classes and leaf classes
- add instances to ancestor classes
- calculate total number of instances
- build dict with number of instances per class

In [3]:
# Read the merged cancer ontology (Turtle file, classes and individuals) into an rdflib graph.
g = Graph()
g.parse('../ontology/cancer_instances_merged.ttl')

In [4]:
# Every subject declared as an owl:Class in the graph.
classes = list(g.subjects(RDF.type, OWL.Class))

# A leaf class is one that no other resource names as its rdfs:subClassOf parent.
leaf_classes = [
    cls for cls in classes
    if not any(True for _ in g.subjects(RDFS.subClassOf, cls))
]

In [5]:
# Materialise inherited types: each instance of a leaf class is also typed as
# every class reachable from that leaf by following rdfs:subClassOf.
for leaf_uri in leaf_classes:
    # Snapshot both lists before adding triples, so the graph is not mutated while it is being iterated.
    members = list(g.subjects(RDF.type, leaf_uri))
    lineage = list(g.transitive_objects(leaf_uri, RDFS.subClassOf))

    inherited_types = [
        (member, RDF.type, ancestor)
        for ancestor in lineage
        for member in members
    ]
    for triple in inherited_types:
        g.add(triple)

In [6]:
# Population size: number of distinct subjects typed as owl:NamedIndividual.
num_instances = sum(1 for _ in g.subjects(RDF.type, OWL.NamedIndividual, unique=True))

In [7]:
# Map each class name (namespace prefix stripped) to its number of distinct instances.
class_count_dict = {
    str(cls).replace("http://example.com/cancers.owl/", ""): sum(
        1 for _ in g.subjects(RDF.type, cls, unique=True)
    )
    for cls in classes
}
# Two bookkeeping totals are stored in the same dict next to the per-class counts.
class_count_dict.update(num_classes=len(classes), num_instances=num_instances)

---

## build association dataframe

In [8]:
# Build the class-to-class association table, one row per (subject class, object class) pair:
#   subject / object : class names with the ontology namespace prefix stripped
#   assoc            : subject == object -> number of instances of that class;
#                      otherwise the number of distinct instances of the object class
#                      that appear as the object of any triple whose subject is an
#                      instance of the subject class (pairs with zero links are omitted)
#   p_value          : 1.0 placeholder value
namespace = "http://example.com/cancers.owl/"

# Resolve every class's display name and instance set once, instead of
# re-querying the graph for each (subject, object) pair.
class_names = {cls: str(cls).replace(namespace, "") for cls in classes}
class_instances = {cls: set(g.subjects(RDF.type, cls)) for cls in classes}

assoc_records = []
for subj_class in classes:
    subj_instances = class_instances[subj_class]
    if not subj_instances:
        # Classes without instances cannot take part in any association.
        continue

    # Everything the subject instances point at, through any predicate. This
    # depends only on the subject class, so it is computed once per subject
    # rather than once per object class.
    linked_objects = {o for si in subj_instances for o in g.objects(si)}

    for obj_class in classes:
        if subj_class == obj_class:
            # Diagonal entry: a class is associated with all of its own instances.
            assoc = len(subj_instances)
        else:
            assoc = len(linked_objects & class_instances[obj_class])
            if assoc == 0:
                # Covers both "object class has no instances" and "no shared links".
                continue

        assoc_records.append({
            'subject': class_names[subj_class],
            'object': class_names[obj_class],
            'assoc': assoc,
            'p_value': 1.0 # place holder
        })

# Create the frame in one step; appending rows through .loc re-allocates on every insert.
class_assoc_df = pd.DataFrame(assoc_records, columns=['subject', 'object', 'assoc', 'p_value'])

In [9]:
# Preview the first five association rows.
class_assoc_df.head(n=5)

Unnamed: 0,subject,object,assoc,p_value
0,alchohol_use,alchohol_use,291,1.0
1,asthma,asthma,87,1.0
2,cancer,cancer,500,1.0
3,castration_resistant_prostate_cancer,castration_resistant_prostate_cancer,55,1.0
4,castration_sensitive_prostate_cancer,castration_sensitive_prostate_cancer,44,1.0


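---

## calculate p-values
- hypergeometric test for each subject/object class pair in the association dataframe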

In [16]:
# Fill in the hypergeometric p-value for every association row.
#   population : all named individuals in the ontology
#   successes  : instances of the object class
#   draws      : instances of the subject class
#   observed   : the row's `assoc` count
population_size = class_count_dict['num_instances']

# Loop variables are named subj_name / obj_name so the builtin `object`
# is not shadowed in the notebook namespace.
for ix, subj_name, obj_name, assoc, _ in class_assoc_df.itertuples():
    obj_total = class_count_dict[obj_name]    # number of "successes" in the population
    subj_total = class_count_dict[subj_name]  # number of "draws"

    # sf(k) = P(X > k), so the upper tail P(X >= assoc) is sf(assoc - 1).
    # Subtracting 1 twice would test P(X >= assoc - 1) instead.
    pval = hypergeom.sf(assoc - 1, population_size, obj_total, subj_total)
    class_assoc_df.loc[ix, 'p_value'] = round(pval, 4)

In [17]:
# Show every association whose subject class is 'patient'.
is_patient_subject = class_assoc_df['subject'] == 'patient'
class_assoc_df.loc[is_patient_subject]

Unnamed: 0,subject,object,assoc,p_value
21,patient,alchohol_use,291,0.0
22,patient,asthma,87,0.0
23,patient,cancer,500,0.0
24,patient,castration_resistant_prostate_cancer,55,0.0
25,patient,castration_sensitive_prostate_cancer,44,0.0
26,patient,cervical_cancer,52,0.0
27,patient,comorbidity,500,0.0
28,patient,diabetes,175,0.0
29,patient,fallopian_tube_adenocarcinoma,47,0.0
30,patient,fallopian_tube_cancer,129,0.0
