In [1]:
import scoped_mapping
import sqlite3
import pandas as pd
import yaml
import sys
import datetime

ModuleNotFoundError: No module named 'tdda'

## Convert (from xlsx/gsheet) to tsv to yaml

```Bash
# Run infer_model.py's tsv2model command on the synbio TSV (class name
# synbio_element, schema name synbio, `species` passed via --enum-columns);
# stdout is redirected into the *_experimental.yaml file.
./linkml_model_enrichment/infer_model.py tsv2model \
--class_name synbio_element \
--schema_name synbio \
--enum-columns species /Users/MAM/Documents/gitrepos/linkml-model-enrichment_keep/local/Ontology_example_20210317_P2B1_allmods_categorytype_different_scores_per_mod-1.tsv > \
/Users/MAM/Ontology_example_20210317_P2B1_allmods_categorytype_different_scores_per_mod-1_experimental.yaml
```

---

```Bash
# Same tsv2model command for the comma-separated condensed_traits_NCBI.csv
# (hence --sep ','); stdout is redirected into condensed_traits_NCBI.yaml,
# which is the file the notebook's configuration cell points yaml_file at.
./linkml_model_enrichment/infer_model.py tsv2model \
--class_name condensed_traits_NCBI \
--schema_name organism \
--sep ',' \
/Users/MAM/Documents/gitrepos/bacteria-archaea-traits/output/condensed_traits_NCBI.csv > /Users/MAM/condensed_traits_NCBI.yaml

```

---

```Bash
# Fetch the rdftab release binary (x86_64 macOS build) and the prefix.sql
# file from the rdftab.rs repository.
curl -L -o rdftab https://github.com/ontodev/rdftab.rs/releases/download/v0.1.1/rdftab-x86_64-apple-darwin
chmod +x rdftab
curl -L -o prefix.sql https://raw.githubusercontent.com/ontodev/rdftab.rs/master/test/prefix.sql

# Download the ontologies that will be loaded into one database.
curl -L -o pato.owl http://purl.obolibrary.org/obo/pato.owl
curl -L -o uberon.owl https://raw.githubusercontent.com/obophenotype/uberon/master/uberon.owl
curl -L -o micro.owl http://purl.obolibrary.org/obo/micro.owl

# Rebuild the database from scratch. -f keeps rm from failing on the first
# run, when multi_obo.db does not exist yet.
rm -f multi_obo.db
sqlite3 multi_obo.db < prefix.sql
./rdftab multi_obo.db < pato.owl
./rdftab multi_obo.db < uberon.owl
./rdftab multi_obo.db < micro.owl


```


In [2]:
# ---------------------------------------------------------------------------
# Notebook configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------

# YAML model to enrich (output of the tsv2model step described above).
# yaml_file = "/Users/MAM/Ontology_example_20210317_P2B1_allmods_categorytype_different_scores_per_mod-1_experimental.yaml"
yaml_file = "/Users/MAM/condensed_traits_NCBI.yaml"

# Comma-delimited ontology list handed to the search wrapper as `ontoprefix`.
# cd_ontologies = "ncbitaxon,obi"
cd_ontologies = "omp,micro,ncbitaxon"

# Passed to the search wrapper as `rr`.
rr = 5
# Characters handed to the search wrapper as `bad_chars`.
chars_to_wo = "\\.\\_\\-"

# Upper bound on string distance accepted by get_best_acceptable.
max_string_dist = 0.05

# shingle/ngram size
string_dist_arg = 2

# Category/column name used by the spreadsheet cells further down
# (`problem_spreadsheet[orgocat]`, `cat_name=orgocat`). It was commented out,
# which made those cells fail with NameError on a fresh kernel.
orgocat = "organism"

# Enum from the YAML model whose permissible values get mapped.
# selected_enum = "species_enum"
selected_enum = "cell_shape_enum"

# The first underscore-delimited token of the enum name is used as the
# category name for searches (e.g. "cell_shape_enum" -> "cell").
cat_split = selected_enum.split("_")
bare_cat = cat_split[0]

# SQLite database produced by rdftab (see the shell snippet above).
# rdftab_file = "/Users/MAM/Documents/gitrepos/scoped-mapping/semantic-sql/db/ncbitaxon.db"
rdftab_file = "/Users/MAM/multi_obo.db"



In [3]:
# Load (class ID, lower-cased rdfs:label) pairs for every owl:Class from the
# rdftab `statements` table.
#
# Author's notes:
#   - ~30 sec for NCBITaxon with transitive closure over subclasses;
#     does that make a difference?
#   - the statements table is indexed pretty thoroughly
#   - use straight rdftab (no need to expose the transitive closure from
#     semantic sql? other features?)
#   - any reason to use get_sqlite_con or timed_query?

rdftab_query = """
select distinct s1.subject as class, lower(s1.value) as label
from statements s1
join statements s2
on s1.subject = s2.subject

where s1.predicate = 'rdfs:label' 
and s1.subject = s1.stanza
and s2.predicate = 'rdf:type'
and s2.object = 'owl:Class'

-- limit 999
"""

rdftab_con = sqlite3.connect(rdftab_file)
try:
    # Timestamps before and after the query show how long the read took.
    ct = datetime.datetime.now()
    print(ct)

    rdftab_res = pd.read_sql(rdftab_query, rdftab_con)

    ct = datetime.datetime.now()
    print(ct)
finally:
    # The connection is not used by any other cell; close it explicitly so
    # the database file handle is released even if the query fails.
    rdftab_con.close()

rdftab_res


2021-07-12 17:11:06.723676
2021-07-12 17:11:06.951849


Unnamed: 0,class,label
0,BFO:0000002,continuant
1,BFO:0000003,occurrent
2,BFO:0000004,independent continuant
3,BFO:0000015,process
4,BFO:0000016,disposition
...,...,...
17958,UBERON:8410063,myenteric nerve plexus of small intestine
17959,UBERON:8410064,submucous nerve plexus of small intestine
17960,UBERON:8410065,lymph node follicle marginal zone
17961,UBERON:8410066,lymph node paracortex


In [4]:
# Derive `id_rhs`: the right-most piece of each class ID after splitting on
# ":" or "#_" (e.g. "BFO:0000002" -> "0000002").
id_parts = rdftab_res["class"].str.split(pat=":|#_", expand=True)

# IDs with fewer separators leave the last column empty; fall back to the
# column before it. fillna replaces the original chained assignment
# (`x[a][flag] = ...`), which raises SettingWithCopyWarning and does nothing
# at all under pandas Copy-on-Write.
id_rhs = id_parts[id_parts.columns[-1]]
if len(id_parts.columns) > 1:
    id_rhs = id_rhs.fillna(id_parts[id_parts.columns[-2]])

rdftab_res["id_rhs"] = id_rhs

In [5]:
# Discard rows whose "label" is just the right-hand side of the ID repeated.
fake_lab_flag = rdftab_res["label"].eq(rdftab_res["id_rhs"])
rdftab_res = rdftab_res.loc[~fake_lab_flag]

# TODO: may need to update prefix sql
# Optional dump for inspection:
# rdftab_res.to_csv("rdftab_res.csv")

In [6]:
# Count class IDs that start with "http" (case-insensitive).
temp_flag = rdftab_res["class"].str.contains("^http", case=False, na=None)
temp_flag.value_counts()

False    17963
Name: class, dtype: int64

In [7]:
# Add an `ontology_prefix` column derived from the `class` column.
rdftab_res = scoped_mapping.add_prefix_col(
    dataframe=rdftab_res,
    col_with_prefixes="class",
    prefix_col="ontology_prefix",
)

# Author's note: a small number of IDs like obo:NCBITaxon#_species_group remain.
rdftab_res

Unnamed: 0,class,label,id_rhs,ontology_prefix
0,BFO:0000002,continuant,0000002,BFO
1,BFO:0000003,occurrent,0000003,BFO
2,BFO:0000004,independent continuant,0000004,BFO
3,BFO:0000015,process,0000015,BFO
4,BFO:0000016,disposition,0000016,BFO
...,...,...,...,...
17958,UBERON:8410063,myenteric nerve plexus of small intestine,8410063,UBERON
17959,UBERON:8410064,submucous nerve plexus of small intestine,8410064,UBERON
17960,UBERON:8410065,lymph node follicle marginal zone,8410065,UBERON
17961,UBERON:8410066,lymph node paracortex,8410066,UBERON


In [8]:
# How many labelled classes each ontology prefix contributes.
rdftab_res["ontology_prefix"].value_counts()

UBERON    15180
PATO       2726
GO           40
BFO          11
CARO          4
CL            2
Name: ontology_prefix, dtype: int64

In [9]:
# Load the YAML model and collect the permissible values of `selected_enum`.
yaml_model = scoped_mapping.read_yaml_model(yaml_file)

# Not used further down; handy for choosing `selected_enum`.
avaialbe_enums = scoped_mapping.get_avaialbe_enums(yaml_model)
# print(avaialbe_enums)
permissible_values = scoped_mapping.get_permissible_values(yaml_model, selected_enum)
# print(permissible_values)

# woedf pairs each raw value with its whited-out form (columns `raw`, `woed`,
# as seen in the merge output below).
woedf = scoped_mapping.get_whiteout_frame(raw_list=permissible_values)
# NOTE(review): presumably the list of whited-out strings — confirm in scoped_mapping.
woedl = scoped_mapping.get_wo_list(woed_frame=woedf)

# woedl

In [10]:
# Exact-match the whited-out enum values against the rdftab labels.
# The left join keeps every enum value; unmatched ones get NaN in `class`.
woed_merge = pd.merge(
    woedf,
    rdftab_res,
    how="left",
    left_on="woed",
    right_on="label",
)
woed_merge

Unnamed: 0,raw,woed,class,label,id_rhs,ontology_prefix
0,triangular,triangular,PATO:0001875,triangular,1875.0,PATO
1,pleomorphic,pleomorphic,PATO:0001356,pleomorphic,1356.0,PATO
2,branced,branced,,,,
3,,na,,,,
4,spindle,spindle,,,,
5,flask,flask,,,,
6,bacillus,bacillus,,,,
7,vibrio,vibrio,,,,
8,spiral,spiral,,,,
9,square,square,PATO:0000413,square,413.0,PATO


In [11]:
# c_flag marks enum values with no rdftab label match; `c` holds their raw form.
c_flag = woed_merge["class"].isna()
c_subset = woed_merge.loc[c_flag]
c = c_subset.loc[:, "raw"]

In [12]:
# Raw values that did get an rdftab label match.
d = set(woed_merge["raw"]).difference(c)

In [13]:
# Run the search wrapper over the values that had no rdftab label match.
# string_dist_arg = shingle/ngram size

ct = datetime.datetime.now()
print(ct)

search_options = {
    "bad_chars": chars_to_wo,
    "cat_name": bare_cat,
    "ontoprefix": cd_ontologies,
    "query_fields": "",
    "string_dist_arg": string_dist_arg,
    "rr": rr,
}
yaml_mapped = scoped_mapping.search_get_annotations_wrapper(c, **search_options)

ct = datetime.datetime.now()
print(ct)

yaml_mapped

2021-07-12 17:11:11.334380
2021-07-12 17:11:23.598108


Unnamed: 0,category,raw,query,name,string_dist_rank,string_dist,obo_id,label,search_rank,ontology_prefix,scope,type,iri,ontology_name
0,cell,bacillus,bacillus,bacillus,1,0.000,MICRO:0000401,bacillus,1,MICRO,label,label,http://purl.obolibrary.org/obo/MICRO_0000401,micro
4,cell,bacillus,bacillus,bacillus,2,0.000,MICRO:0000401,bacillus,1,MICRO,has_exact_synonym,annotation,http://purl.obolibrary.org/obo/MICRO_0000401,micro
8,cell,bacillus,bacillus,bacillus,3,0.000,OMP:0000076,rod-shaped cells,2,OMP,synonym,synonym,http://purl.obolibrary.org/obo/OMP_0000076,omp
9,cell,bacillus,bacillus,Bacillus sp. 'Bacillus M9',4,0.174,NCBITaxon:427569,Bacillus sp. 'Bacillus M9',3,NCBITAXON,label,label,http://purl.obolibrary.org/obo/NCBITaxon_427569,ncbitaxon
10,cell,bacillus,bacillus,Bacillus sp. 'Bacillus M12',5,0.184,NCBITaxon:427557,Bacillus sp. 'Bacillus M12',4,NCBITAXON,label,label,http://purl.obolibrary.org/obo/NCBITaxon_427557,ncbitaxon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,cell,vibrio,vibrio,Pacinia,11,1.000,NCBITaxon:662,Vibrio,1,NCBITAXON,hasRelatedSynonym,synonym,http://purl.obolibrary.org/obo/NCBITaxon_662,ncbitaxon
235,cell,vibrio,vibrio,Microspira,12,1.000,NCBITaxon:662,Vibrio,1,NCBITAXON,has_related_synonym,annotation,http://purl.obolibrary.org/obo/NCBITaxon_662,ncbitaxon
236,cell,vibrio,vibrio,Pacinia,13,1.000,NCBITaxon:662,Vibrio,1,NCBITAXON,has_related_synonym,annotation,http://purl.obolibrary.org/obo/NCBITaxon_662,ncbitaxon
237,cell,vibrio,vibrio,Listonella,14,1.000,NCBITaxon:662,Vibrio,1,NCBITAXON,has_related_synonym,annotation,http://purl.obolibrary.org/obo/NCBITaxon_662,ncbitaxon


In [14]:
# Values that matched in rdftab or produced at least one search hit.
any_successes = d | set(yaml_mapped["raw"])

# Permissible values that got nothing from either route.
no_resutls = set(permissible_values).difference(any_successes)

no_resutls

set()

In [15]:
# Reduce the search hits to the best acceptable ones, using `max_string_dist`
# from the configuration cell as the threshold.
# NOTE(review): the exact selection rule lives in scoped_mapping.get_best_acceptable.
best_acceptable = scoped_mapping.get_best_acceptable(
    mappings=yaml_mapped, max_string_dist=max_string_dist
)

In [16]:
# Give the rdftab exact-label matches the same columns as the search results
# so the two frames can be stacked. An exact label match is recorded as
# rank 1 with string distance 0.
woed_merge_success = (
    woed_merge[~c_flag]
    .copy()
    .assign(
        string_dist_rank=1,
        search_rank=1,
        string_dist=0,
        scope="label",
        type="label",
        name=lambda frame: frame["label"],
        category=bare_cat,
    )
    .rename(columns={"woed": "query", "class": "obo_id"})
)

woed_merge_success = scoped_mapping.add_prefix_col(woed_merge_success, "obo_id", "ontology_prefix")

In [17]:
# NOTE(review): this repeats, verbatim, the final statement of the previous
# cell. Presumably redundant — confirm add_prefix_col is idempotent before
# removing it.
woed_merge_success = scoped_mapping.add_prefix_col(woed_merge_success, "obo_id", "ontology_prefix")

In [18]:
# Stack the rdftab exact matches under the search results, tagging each row
# with the method that produced it.
woed_merge_success["ontology_prefix"] = woed_merge_success["ontology_prefix"].str.upper()

best_acceptable["method"] = "best OLS search result"
woed_merge_success["method"] = "rdftab"

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
best_acceptable = pd.concat([best_acceptable, woed_merge_success], ignore_index=True)

# Drop columns that only one of the two sources provides.
best_acceptable = best_acceptable.drop(["ontology_name", "iri", "id_rhs"], axis=1)

# NOTE: this cell is not idempotent — re-running it stacks the rdftab rows
# again and the drop then fails; re-run from get_best_acceptable instead.
best_acceptable

Unnamed: 0,category,raw,query,name,string_dist_rank,string_dist,obo_id,label,search_rank,ontology_prefix,scope,type,method
0,cell,bacillus,bacillus,bacillus,1,0.0,MICRO:0000401,bacillus,1,MICRO,label,label,best OLS search result
1,cell,coccobacillus,coccobacillus,coccobacillus,1,0.0,MICRO:0000366,coccobacillus,1,MICRO,label,label,best OLS search result
2,cell,coccus,coccus,coccus,1,0.0,MICRO:0000402,coccus,1,MICRO,label,label,best OLS search result
3,cell,filament,filament,filament,1,0.0,MICRO:0000042,cell filament,1,MICRO,has_exact_synonym,annotation,best OLS search result
4,cell,flask,flask,flask,1,0.0,MICRO:0000406,pear-shaped cell,2,MICRO,has_exact_synonym,annotation,best OLS search result
5,cell,irregular,irregular,irregular,1,0.0,MICRO:0000333,irregular cell,1,MICRO,has_broad_synonym,annotation,best OLS search result
6,cell,spiral,spiral,spiral,1,0.0,PATO:0000404,coiled,1,MICRO,synonym,synonym,best OLS search result
7,cell,spirochete,spirochete,spirochete,1,0.0,OMP:0000125,spirochete,1,OMP,label,label,best OLS search result
8,cell,vibrio,vibrio,Vibrio,1,0.0,NCBITaxon:662,Vibrio,1,NCBITAXON,label,label,best OLS search result
9,cell,triangular,triangular,triangular,1,0.0,PATO:0001875,triangular,1,PATO,label,label,rdftab


In [19]:

# Which ontologies the accepted mappings come from.
best_acceptable["ontology_prefix"].value_counts()

MICRO        7
PATO         4
OMP          1
NCBITAXON    1
Name: ontology_prefix, dtype: int64

In [20]:
# Compare all search hits against the accepted mappings; the result has a
# `query` column that the next cell reads.
# NOTE(review): presumably the hits whose query has no accepted mapping —
# confirm in scoped_mapping.get_no_acceptable_mappings.
no_acceptable_mappings = scoped_mapping.get_no_acceptable_mappings(
    yaml_mapped, best_acceptables=best_acceptable
)
# no_acceptable_mappings

In [21]:
# Queries that returned hits, none of them acceptable, plus values that
# returned nothing at all. Also look for erroneous mappings above.
no_acceptable_queries = set(no_acceptable_mappings["query"])
needs_followup = no_acceptable_queries | no_resutls
needs_followup

# TODO: use this for commenting enums that need manual intervention?

{'branced', 'disc', 'na', 'ring', 'spindle', 'star', 'tailed'}

In [22]:
# Write the accepted mappings for `selected_enum` into the model.
# The return value is not captured: the call silently saves back into the
# `yaml_model` object (the dump in the next cell shows the added `meaning` keys).
# Repeat as necessary for other enums.

# TODO: add confidence value

# TODO: also save mappings as SSSOM

scoped_mapping.rewrite_yaml(yaml_model, selected_enum, best_acceptable)

In [23]:
# Print the updated model as block-style YAML.
yaml.safe_dump(
    yaml_model,
    stream=sys.stdout,
    default_flow_style=False,
)

classes:
  condensed_traits_NCBI:
    slot_usage: {}
    slots:
    - tax_id
    - species_tax_id
    - data_source
    - org_name
    - species
    - genus
    - family
    - order
    - class
    - phylum
    - superkingdom
    - gram_stain
    - metabolism
    - pathways
    - carbon_substrates
    - sporulation
    - motility
    - range_tmp
    - range_salinity
    - cell_shape
    - isolation_source
    - d1_lo
    - d1_up
    - d2_lo
    - d2_up
    - doubling_h
    - genome_size
    - gc_content
    - coding_genes
    - optimum_tmp
    - optimum_ph
    - growth_tmp
    - rRNA16S_genes
    - tRNA_genes
    - ref_id
default_prefix: organism
description: organism
enums:
  cell_shape_enum:
    permissible_values:
      NA:
        description: NA
      bacillus:
        description: bacillus
        meaning: MICRO:0000401
      branced:
        description: branced
      coccobacillus:
        description: coccobacillus
        meaning: MICRO:0000366
      coccus:
        descripti

---

In [None]:
# Per-prefix term patterns for the class IDs loaded from rdftab.
scoped_mapping.get_multi_term_patterns(
    dataframe=rdftab_res,
    col_with_prefixes="class",
    prefix_col="ontology_prefix",
)

In [None]:
# Discover a pattern describing the class IDs.
# Author's timing note: 2 minutes and memory hungry for NCBITaxon.
#
# Open questions:
#   - handle empty, string or list?
#   - what about ontologies that import a lot of terms?
#   - what about multi-ontology rdftabs?

ct = datetime.datetime.now()
print(ct)

class_ids = rdftab_res["class"].tolist()
discovered_id_pattern = scoped_mapping.discover_id_pattern(class_ids)

ct = datetime.datetime.now()
print(ct)

# May not give 100% coverage: with elimination of SOME but not all textual
# NCBITaxon IDs, the discovered ID pattern is 'NCBITaxon:\\d+'.
discovered_id_pattern

In [None]:
# Spreadsheet with the values to be mapped in the cells below.
problem_file = "/Users/MAM/icbo.xlsx"
problem_spreadsheet = pd.read_excel(io=problem_file)
problem_spreadsheet

In [None]:
# Pull the column named by `orgocat` out of the spreadsheet.
# Requires `orgocat` to be defined before this cell runs.
pso = problem_spreadsheet[orgocat]
pso

In [None]:
# Frequency of each distinct value in the selected column.
pso_vc = pso.value_counts()
pso_vc

In [None]:
# Distinct values, in value_counts order.
pso_unique = pso_vc.index.tolist()
pso_unique

In [None]:
# orgolist = ['homo--sapiens', 'MUS.MUSCULUS', 'NCBITaxon:9598', 'NCBITaxon:Rhinoceros']


# # # woed = scoped_mapping.whiteout(raw_string="", char_string="\\.\\_\\-")

# woedf = scoped_mapping.get_whiteout_frame(raw_list = orgolist)

# woedl = scoped_mapping.get_wo_list(woed_frame=woedf)
# # # woedl

# # # socr = scoped_mapping.search_over_category('organisms', woedl, ontology_phrase=cd_ontologies,
# # #                                                         qf_phrase='', row_req=rr)

# # # search_over_category returns list of frames?
# # # part of get_csr_frame
# # #  # part of search_get_annotations_wrapper

# # csr_frame = scoped_mapping.get_csr_frame(woedl, bad_chars=chars_to_wo, category_name=orgocat,
# #                   ontoprefix=cd_ontologies, query_fields='', rows_requested=rr)

# # # # lowercase
# # # scoped_mapping.one_ols_submission(woed, ontology_phrase='ncbitaxon,obi', qf_phrase='', row_req=5)

# # first_csr_row = csr_frame.iloc[1,:]
# # # category	raw	query	label	iri	obo_id	ontology_name	ontology_prefix	search_rank

# # scoped_mapping.get_label_like(first_csr_row)
# # # embedded in get_bulk_label_like
# # #   embedded in search_get_annotations_wrapper
# # prep_for_label_like
# # #   embedded in search_get_annotations_wrapper
# # merge_and_compare
# # #   embedded in search_get_annotations_wrapper

# White out the distinct spreadsheet values and run them through the search
# wrapper. NOTE: this overwrites `woedf` / `woedl` from the enum-mapping cells
# above, so re-run those cells before going back to the enum workflow.
woedf = scoped_mapping.get_whiteout_frame(raw_list=pso_unique)
woedl = scoped_mapping.get_wo_list(woed_frame=woedf)

# TODO: better retention of counts
aw_res = scoped_mapping.search_get_annotations_wrapper(
    raw_list=woedl,
    bad_chars=chars_to_wo,
    cat_name=orgocat,
    ontoprefix=cd_ontologies,
    query_fields="",
    rr=rr,
    # Use the configured shingle/ngram size rather than a second hard-coded
    # copy, so this search stays consistent with the enum search above.
    string_dist_arg=string_dist_arg,
)

# aw_res

# scoped_mapping.get_best_acceptable(aw_res)

In [None]:
# The rdftab query aliases `subject` to `class`, so the frame has no
# `subject` attribute; the IDs live in the `class` column.
id_series = rdftab_res["class"]
id_series

In [None]:
# NCBITaxon IDs whose local part does not start with a digit.
# (was expecting 'NCBITaxon:[a-z0-9]+')
id_series.loc[id_series.str.contains("^NCBITaxon:[^\\d]")]

In [None]:
# IDs of the form obo:NCBITaxon#...
# TODO: check predicate oio:hasOBONamespace
# rexpy reference: https://tdda.readthedocs.io/en/v1.0.30/rexpy.html
id_series.loc[id_series.str.contains("^obo:NCBITaxon#")]

In [None]:
# Split each input string using the discovered ID pattern.
# Requires `orgolist` and `discovered_id_pattern` to be defined first.
# NOTE(review): the only `orgolist` assignment visible in this notebook is
# commented out, so this cell raises NameError on a fresh kernel as written.
# vectorize?
decomposed = scoped_mapping.decompose_series(pd.Series(orgolist), discovered_id_pattern)
# make_tidy_col is embedded

# REMOVE PREFIX FROM remaining_tidied SEE NCBITaxon:Rhinoceros

decomposed

In [None]:
# Add an `onto_prefix` column derived from the `extract` column of `decomposed`.
# (`demposed_with_prefix` is spelled this way in the display cell below as well.)
demposed_with_prefix = scoped_mapping.add_prefix_col(
    dataframe=decomposed, col_with_prefixes="extract", prefix_col="onto_prefix"
)

In [None]:
# Display the decomposed frame with its added prefix column.
demposed_with_prefix

```Python
def get_multi_term_patterns(dataframe, col_with_prefixes, prefix_col):

def get_no_acceptable_mappings(all_mappings, best_acceptables):
def add_unique_to_list(uniquelist, non_unique):
def add_overrides(dataframe, incol, outcol, override_dict):
def flag_canonical(dataframe, incol, outcol, canonicals):
                                   
# yaml specific
def rewrite_yaml(model, enum, best_acceptable):

# biosample specific      
def get_package_dictionary(biosample_packages_file):
def env_package_nomralizastion(dataframe, col_to_normalize, id_replacement_rule):
```