In [1]:
import scoped_mapping
import sqlite3
import pandas as pd
import yaml
import sys
import datetime

In [2]:
# --- Input files -------------------------------------------------------------
# NOTE(review): both paths are absolute paths on the author's machine; adjust
# them before running anywhere else.
yaml_file = "/Users/MAM/condensed_traits_NCBI.yaml"  # model read by read_yaml_model
rdftab_file = "/Users/MAM/multi_obo.db"  # database opened with sqlite3.connect

# --- Enum to map -------------------------------------------------------------
selected_enum = "cell_shape_enum"

# the category name is the text before the first underscore of the enum name
cat_split = selected_enum.split("_")
bare_cat = cat_split[0]

# --- Search settings (passed to search_get_annotations_wrapper) --------------
cd_ontologies = "omp,micro,ncbitaxon"  # passed as ontoprefix
rr = 5  # passed as rr -- presumably the number of rows requested; TODO confirm
chars_to_wo = "\\.\\_\\-"  # passed as bad_chars

# shingle/ngram size
string_dist_arg = 2

# --- Acceptance threshold (passed to get_best_acceptable) --------------------
max_string_dist = 0.05

In [3]:
# Author's timing note: ~30 sec for NCBITaxon with transitive closure over
# subclasses. Open question: does that make a difference? The statements
# table is indexed pretty thoroughly.
#
# This uses straight rdftab (no need to expose the transitive closure from
# semantic sql? other features?).

# One row per subject that has an rdfs:label in its own stanza and is typed
# as an owl:Class; labels are lower-cased by the query itself.
rdftab_query = """
select distinct s1.subject as class, lower(s1.value) as label
from statements s1
join statements s2
on s1.subject = s2.subject

where s1.predicate = 'rdfs:label' 
and s1.subject = s1.stanza
and s2.predicate = 'rdf:type'
and s2.object = 'owl:Class'

-- limit 999
"""

rdftab_con = sqlite3.connect(rdftab_file)

# wall-clock timestamps printed before and after the query
ct = datetime.datetime.now()
print(ct)

rdftab_res = pd.read_sql(sql=rdftab_query, con=rdftab_con)

ct = datetime.datetime.now()
print(ct)

rdftab_res


2021-07-12 17:49:07.088036
2021-07-12 17:49:07.795599


Unnamed: 0,class,label
0,BFO:0000002,continuant
1,BFO:0000003,occurrent
2,BFO:0000004,independent continuant
3,BFO:0000015,process
4,BFO:0000016,disposition
...,...,...
17958,UBERON:8410063,myenteric nerve plexus of small intestine
17959,UBERON:8410064,submucous nerve plexus of small intestine
17960,UBERON:8410065,lymph node follicle marginal zone
17961,UBERON:8410066,lymph node paracortex


In [4]:
# Drop classes whose labels and IDs are essentially the same, i.e. the label
# is just the right-hand side of the class ID.

# Split each ID on ':' or '#_'. IDs with fewer pieces than the widest row get
# missing values in the trailing column(s).
x = rdftab_res['class'].str.split(pat=':|#_', expand=True)
x_col_count = len(x.columns)
last_col = x_col_count - 1

# Where the last piece is missing, fall back to the piece before it.
# Assign through a single .loc call: the chained form x[col][mask] = ...
# raises SettingWithCopyWarning and is a silent no-op under pandas
# Copy-on-Write. The guard avoids a KeyError when the split produced a
# single column (no delimiter found in any ID).
na_flag = x[last_col].isna()
if x_col_count > 1:
    x.loc[na_flag, last_col] = x.loc[na_flag, last_col - 1]

# Build a new frame instead of mutating the query result in place, and take
# an explicit copy after filtering so later column assignments are safe.
rdftab_res = rdftab_res.assign(id_rhs=x[last_col])
fake_lab_flag = rdftab_res['label'] == rdftab_res['id_rhs']
rdftab_res = rdftab_res[~fake_lab_flag].copy()

# NCBItaxon? a small number of IDs like obo:NCBITaxon#_species_group remain

# rdftab_res.to_csv("rdftab_res.csv")

In [5]:
# Check whether any subjects are full IRIs (start with "http", any case);
# if so, the prefix SQL may need to be updated.
temp_flag = rdftab_res["class"].str.contains(pat="^http", case=False, na=None)
temp_flag.value_counts()

False    17963
Name: class, dtype: int64

In [6]:
# Add an "ontology_prefix" column computed from the "class" column.
rdftab_res = scoped_mapping.add_prefix_col(
    rdftab_res,
    "class",
    "ontology_prefix",
)
rdftab_res

Unnamed: 0,class,label,id_rhs,ontology_prefix
0,BFO:0000002,continuant,0000002,BFO
1,BFO:0000003,occurrent,0000003,BFO
2,BFO:0000004,independent continuant,0000004,BFO
3,BFO:0000015,process,0000015,BFO
4,BFO:0000016,disposition,0000016,BFO
...,...,...,...,...
17958,UBERON:8410063,myenteric nerve plexus of small intestine,8410063,UBERON
17959,UBERON:8410064,submucous nerve plexus of small intestine,8410064,UBERON
17960,UBERON:8410065,lymph node follicle marginal zone,8410065,UBERON
17961,UBERON:8410066,lymph node paracortex,8410066,UBERON


In [7]:
# Number of retained classes per ontology prefix.
rdftab_res["ontology_prefix"].value_counts()

UBERON    15180
PATO       2726
GO           40
BFO          11
CARO          4
CL            2
Name: ontology_prefix, dtype: int64

In [8]:
# Load the model and pull out the permissible values of the selected enum.
yaml_model = scoped_mapping.read_yaml_model(yaml_file)
avaialbe_enums = scoped_mapping.get_avaialbe_enums(yaml_model)
permissible_values = scoped_mapping.get_permissible_values(yaml_model, selected_enum)
# To inspect: print(avaialbe_enums) / print(permissible_values)

# Whiteout frame (the "raw" and "woed" columns) and the matching list.
woedf = scoped_mapping.get_whiteout_frame(raw_list=permissible_values)
woedl = scoped_mapping.get_wo_list(woed_frame=woedf)

# Left join on the rdftab labels: every permissible value is kept, and values
# with no matching label end up with a missing "class".
woed_merge = woedf.merge(
    rdftab_res,
    how="left",
    left_on="woed",
    right_on="label",
)
woed_merge

Unnamed: 0,raw,woed,class,label,id_rhs,ontology_prefix
0,triangular,triangular,PATO:0001875,triangular,1875.0,PATO
1,pleomorphic,pleomorphic,PATO:0001356,pleomorphic,1356.0,PATO
2,branced,branced,,,,
3,,na,,,,
4,spindle,spindle,,,,
5,flask,flask,,,,
6,bacillus,bacillus,,,,
7,vibrio,vibrio,,,,
8,spiral,spiral,,,,
9,square,square,PATO:0000413,square,413.0,PATO


In [11]:
# Ontology prefixes of the permissible values matched by the label join.
woed_merge["ontology_prefix"].value_counts()

PATO    4
Name: ontology_prefix, dtype: int64

In [12]:
# Split the permissible values by whether the rdftab label join found a class.
c_flag = woed_merge["class"].isna()

# c: raw values that still have no class after the join
c_subset = woed_merge.loc[c_flag]
c = c_subset["raw"]

# d: every other raw value
d = set(woed_merge["raw"]).difference(set(c))

In [10]:
# Search for the raw values that had no rdftab label match; wall-clock
# timestamps are printed before and after the search.
ct = datetime.datetime.now()
print(ct)

yaml_mapped = scoped_mapping.search_get_annotations_wrapper(
    c,
    ontoprefix=cd_ontologies,
    cat_name=bare_cat,
    bad_chars=chars_to_wo,
    string_dist_arg=string_dist_arg,
    query_fields="",
    rr=rr,
)

ct = datetime.datetime.now()
print(ct)

yaml_mapped

2021-07-12 17:52:31.857711
2021-07-12 17:52:42.165817


Unnamed: 0,category,raw,query,name,string_dist_rank,string_dist,obo_id,label,search_rank,ontology_prefix,scope,type,iri,ontology_name
0,cell,bacillus,bacillus,bacillus,1,0.000,MICRO:0000401,bacillus,1,MICRO,label,label,http://purl.obolibrary.org/obo/MICRO_0000401,micro
4,cell,bacillus,bacillus,bacillus,2,0.000,MICRO:0000401,bacillus,1,MICRO,has_exact_synonym,annotation,http://purl.obolibrary.org/obo/MICRO_0000401,micro
8,cell,bacillus,bacillus,bacillus,3,0.000,OMP:0000076,rod-shaped cells,2,OMP,synonym,synonym,http://purl.obolibrary.org/obo/OMP_0000076,omp
9,cell,bacillus,bacillus,Bacillus sp. 'Bacillus M9',4,0.174,NCBITaxon:427569,Bacillus sp. 'Bacillus M9',3,NCBITAXON,label,label,http://purl.obolibrary.org/obo/NCBITaxon_427569,ncbitaxon
10,cell,bacillus,bacillus,Bacillus sp. 'Bacillus M12',5,0.184,NCBITaxon:427557,Bacillus sp. 'Bacillus M12',4,NCBITAXON,label,label,http://purl.obolibrary.org/obo/NCBITaxon_427557,ncbitaxon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,cell,vibrio,vibrio,Pacinia,11,1.000,NCBITaxon:662,Vibrio,1,NCBITAXON,hasRelatedSynonym,synonym,http://purl.obolibrary.org/obo/NCBITaxon_662,ncbitaxon
235,cell,vibrio,vibrio,Microspira,12,1.000,NCBITaxon:662,Vibrio,1,NCBITAXON,has_related_synonym,annotation,http://purl.obolibrary.org/obo/NCBITaxon_662,ncbitaxon
236,cell,vibrio,vibrio,Pacinia,13,1.000,NCBITaxon:662,Vibrio,1,NCBITAXON,has_related_synonym,annotation,http://purl.obolibrary.org/obo/NCBITaxon_662,ncbitaxon
237,cell,vibrio,vibrio,Listonella,14,1.000,NCBITaxon:662,Vibrio,1,NCBITAXON,has_related_synonym,annotation,http://purl.obolibrary.org/obo/NCBITaxon_662,ncbitaxon


In [13]:
# Are there any raw permitted values that neither
#   join with a label in rdftab, nor
#   retrieve a hit from OLS?

any_successes = d.union(yaml_mapped["raw"])
no_resutls = set(permissible_values).difference(any_successes)
no_resutls

set()

In [14]:
# From the OLS results, get one best acceptable mapping, using
# max_string_dist as the acceptance threshold.
best_acceptable = scoped_mapping.get_best_acceptable(
    mappings=yaml_mapped,
    max_string_dist=max_string_dist,
)

In [16]:
# Combine the rdftab label lookups with the best acceptable OLS mappings.

# Rows where the label join found a class. They matched on the label itself,
# so they are recorded as rank-1, zero-distance "label" hits and given the
# same column names the OLS mapping frame uses.
woed_merge_success = (
    woed_merge[~c_flag]
    .copy()
    .assign(
        string_dist_rank=1,
        search_rank=1,
        string_dist=0,
        scope="label",
        type="label",
        name=lambda frame: frame["label"],
        category=bare_cat,
    )
    .rename(columns={"woed": "query", "class": "obo_id"})
)

# Recompute the prefix from the renamed ID column and upper-case it.
woed_merge_success = scoped_mapping.add_prefix_col(
    woed_merge_success, "obo_id", "ontology_prefix"
)
woed_merge_success["ontology_prefix"] = woed_merge_success["ontology_prefix"].str.upper()

# Tag each row with where its mapping came from.
woed_merge_success["method"] = "rdftab"

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent. Helper columns that only exist on one side are
# dropped from the combined frame.
best_acceptable = pd.concat(
    [best_acceptable.assign(method="best OLS search result"), woed_merge_success],
    ignore_index=True,
).drop(columns=["ontology_name", "iri", "id_rhs"])

best_acceptable

Unnamed: 0,category,raw,query,name,string_dist_rank,string_dist,obo_id,label,search_rank,ontology_prefix,scope,type,method
0,cell,bacillus,bacillus,bacillus,1,0.0,MICRO:0000401,bacillus,1,MICRO,label,label,best OLS search result
1,cell,coccobacillus,coccobacillus,coccobacillus,1,0.0,MICRO:0000366,coccobacillus,1,MICRO,label,label,best OLS search result
2,cell,coccus,coccus,coccus,1,0.0,MICRO:0000402,coccus,1,MICRO,label,label,best OLS search result
3,cell,filament,filament,filament,1,0.0,MICRO:0000042,cell filament,1,MICRO,has_exact_synonym,annotation,best OLS search result
4,cell,flask,flask,flask,1,0.0,MICRO:0000406,pear-shaped cell,2,MICRO,has_exact_synonym,annotation,best OLS search result
5,cell,irregular,irregular,irregular,1,0.0,MICRO:0000333,irregular cell,1,MICRO,has_broad_synonym,annotation,best OLS search result
6,cell,spiral,spiral,spiral,1,0.0,PATO:0000404,coiled,1,MICRO,synonym,synonym,best OLS search result
7,cell,spirochete,spirochete,spirochete,1,0.0,OMP:0000125,spirochete,1,OMP,label,label,best OLS search result
8,cell,vibrio,vibrio,Vibrio,1,0.0,NCBITaxon:662,Vibrio,1,NCBITAXON,label,label,best OLS search result
9,cell,triangular,triangular,triangular,1,0.0,PATO:0001875,triangular,1,PATO,label,label,rdftab


In [17]:
# Ontology prefixes across the combined best mappings.
best_acceptable["ontology_prefix"].value_counts()

MICRO        7
PATO         4
OMP          1
NCBITAXON    1
Name: ontology_prefix, dtype: int64

In [19]:
# Which raw permitted values have no acceptable mapping at all,
#   neither via the rdftab label lookup
#   nor via an *acceptable* OLS mapping?
no_acceptable_mappings = scoped_mapping.get_no_acceptable_mappings(
    yaml_mapped,
    best_acceptables=best_acceptable,
)
# no_acceptable_mappings

In [20]:
# Queries that obtained results, none of them acceptable, plus the values
# that obtained no result at all.
# Also look for erroneous mappings above.
no_acceptable_queries = set(no_acceptable_mappings["query"])
needs_followup = no_acceptable_queries | no_resutls
needs_followup

# use this for commenting enums that need manual intervention?

{'branced', 'disc', 'na', 'ring', 'spindle', 'star', 'tailed'}

In [21]:
# Silently saves the mappings back into the model object.
# Repeat as necessary for other enums.
#
# TODO: add confidence value
# TODO: also save mappings as SSSOM

scoped_mapping.rewrite_yaml(
    yaml_model,
    selected_enum,
    best_acceptable,
)

In [22]:
# Dump the model to standard output in block style.
yaml.safe_dump(
    yaml_model,
    stream=sys.stdout,
    default_flow_style=False,
)

classes:
  condensed_traits_NCBI:
    slot_usage: {}
    slots:
    - tax_id
    - species_tax_id
    - data_source
    - org_name
    - species
    - genus
    - family
    - order
    - class
    - phylum
    - superkingdom
    - gram_stain
    - metabolism
    - pathways
    - carbon_substrates
    - sporulation
    - motility
    - range_tmp
    - range_salinity
    - cell_shape
    - isolation_source
    - d1_lo
    - d1_up
    - d2_lo
    - d2_up
    - doubling_h
    - genome_size
    - gc_content
    - coding_genes
    - optimum_tmp
    - optimum_ph
    - growth_tmp
    - rRNA16S_genes
    - tRNA_genes
    - ref_id
default_prefix: organism
description: organism
enums:
  cell_shape_enum:
    permissible_values:
      NA:
        description: NA
      bacillus:
        description: bacillus
        meaning: MICRO:0000401
      branced:
        description: branced
      coccobacillus:
        description: coccobacillus
        meaning: MICRO:0000366
      coccus:
        descripti