In [1]:
from datetime import datetime
from pkg_resources import get_distribution, DistributionNotFound
from strsimpy.cosine import Cosine
import pandas as pd
import re
import requests as requests
import sqlite3
import string
import urllib
import yaml
from xml.etree import ElementTree
from tdda import rexpy
import scoped_mapping

## User-provided data
See repo README for notes on setting up SQLite databases of OBO ontologies with semantic-sql, relation-graph and rdftab

In [2]:
biosample_packages_file = '../target/biosample_packages.xml'

biosample_sqlite_file   = "../target/harmonized_table.db"
ncbitaxon_sqlite_file   = "../semantic-sql/db/ncbitaxon.db"
envo_sqlite_file        = "../semantic-sql/db/envo.db"

biosample_cnx = sqlite3.connect(biosample_sqlite_file)
ncbitaxon_cnx = sqlite3.connect(ncbitaxon_sqlite_file)
envo_cnx      = sqlite3.connect(envo_sqlite_file)

target_onto_prefix = 'ENVO'
chars_to_whiteout  = '._-'
my_query_fields    = '' # OLS weighted default
my_row_req         = 3
my_string_dist_arg = 2
my_max_string_dist = 0.1

env_package_overrides = {
    'built environment': 'built',
    'misc environment': 'miscellaneous',
    'missing': 'no environmental package',
    'unknown': 'no environmental package',
    'default': 'no environmental package',
    'unspecified': 'no environmental package',
    'not available': 'no environmental package',
    'not collected': 'no environmental package'
}


## Print a sample of the data we're working with
Specifically, INSDC/NCBI Biosample Metadata

In [3]:

q = """
select
    id,
    env_package,
    package,
    package_name,
    host_taxid,
    taxonomy_id,
    env_broad_scale,
    env_local_scale,
    env_medium
    from biosample b
limit 10
"""
biosample_first_ten = pd.read_sql(q, biosample_cnx)
biosample_first_ten

Unnamed: 0,id,env_package,package,package_name,host_taxid,taxonomy_id,env_broad_scale,env_local_scale,env_medium
0,BIOSAMPLE:SAMN00000002,missing,MIGS.ba.5.0,MIGS: cultured bacteria/archaea; version 5.0,9606.0,445970,terrestrial biome [ENVO:00000446],human-associated habitat [ENVO:00009003],biological product [ENVO:02000043]
1,BIOSAMPLE:SAMN00000003,missing,MIGS.ba.5.0,MIGS: cultured bacteria/archaea; version 5.0,9606.0,445972,terrestrial biome [ENVO:00000446],human-associated habitat [ENVO:00009003],biological product [ENVO:02000043]
2,BIOSAMPLE:SAMN00000004,missing,MIGS.ba.5.0,MIGS: cultured bacteria/archaea; version 5.0,9606.0,449673,terrestrial biome [ENVO:00000446],human-associated habitat [ENVO:00009003],biological product [ENVO:02000043]
3,BIOSAMPLE:SAMN00000005,,Generic.1.0,Generic,,6526,,,
4,BIOSAMPLE:SAMN00000006,,Generic.1.0,Generic,,9483,,,
5,BIOSAMPLE:SAMN00000007,missing,MIGS.ba.5.0,MIGS: cultured bacteria/archaea; version 5.0,9606.0,445974,terrestrial biome [ENVO:00000446],human-associated habitat [ENVO:00009003],biological product [ENVO:02000043]
6,BIOSAMPLE:SAMN00000008,missing,MIGS.ba.5.0,MIGS: cultured bacteria/archaea; version 5.0,9606.0,411461,terrestrial biome [ENVO:00000446],human-associated habitat [ENVO:00009003],biological product [ENVO:02000043]
7,BIOSAMPLE:SAMN00000009,,Generic.1.0,Generic,,13616,,,
8,BIOSAMPLE:SAMN00000010,,Generic.1.0,Generic,,451639,,,
9,BIOSAMPLE:SAMN00000011,,Generic.1.0,Generic,,451638,,,


## Get the canonical checklist and package terms from NCBI

Unfortunately it doesn't do a very good job of differentiating checklists (MIMAG, MIMARKS, etc.) from packages (soil, water, etc.)

_What about .ba, .euk, etc?_

In [4]:
package_dictionary = scoped_mapping.get_package_dictionary(biosample_packages_file)
package_dictionary.to_sql('package_dictionary', biosample_cnx, if_exists='replace', index=False)
package_dictionary

Unnamed: 0,Name,DisplayName,ShortName,EnvPackage,EnvPackageDisplay,NotAppropriateFor,Description,Example
0,Generic.1.0,Generic,,,,,Generic,
1,SARS-CoV-2.cl.1.0,SARS-CoV-2: clinical or host-associated; versi...,SARS-CoV-2: clinical or host-associated,,,wgs_single;wgs_batch;wgs_diploid,Use for SARS-CoV-2 samples that are relevant t...,
2,Pathogen.cl.1.0,Pathogen: clinical or host-associated; version...,Pathogen: clinical or host-associated,,,,Clinical or host-associated pathogen,SAMN02928182
3,Pathogen.env.1.0,Pathogen: environmental/food/other; version 1.0,Pathogen: environmental/food/other,,,,"Environmental, food or other pathogen",SAMN02730065
4,Microbe.1.0,Microbe; version 1.0,Microbe,,,,Use for bacteria or other unicellular microbes...,SAMN02911891
...,...,...,...,...,...,...,...,...
149,MIUVIG.plant-associated.5.0,"MIUVIG: uncultivated virus genome, plant-assoc...",MIUVIG Uncultivated Virus Genome,plant-associated,plant-associated,wgs_single;wgs_batch;wgs_diploid,Use for uncultivated virus genome identified i...,
150,MIUVIG.sediment.5.0,"MIUVIG: uncultivated virus genome, sediment; v...",MIUVIG Uncultivated Virus Genome,sediment,sediment,wgs_single;wgs_batch;wgs_diploid,Use for uncultivated virus genome identified i...,
151,MIUVIG.soil.5.0,"MIUVIG: uncultivated virus genome, soil; versi...",MIUVIG Uncultivated Virus Genome,soil,soil,wgs_single;wgs_batch;wgs_diploid,Use for uncultivated virus genome identified i...,
152,MIUVIG.wastewater.5.0,"MIUVIG: uncultivated virus genome, wastewater;...",MIUVIG Uncultivated Virus Genome,wastewater,wastewater,wgs_single;wgs_batch;wgs_diploid,Use for uncultivated virus genome identified i...,


## Do the Biosample checklist/package fields match any of the cannonical values?

Start by counting the Biosample rows/records. See XXX notes on extracting this *harmonized* database

In [5]:

q = """
select count(*) as biosample_row_count
from biosample b
"""
[biosample_row_count, query_duration] = scoped_mapping.timed_query(q, biosample_cnx, print_timing=False)

print(query_duration)
biosample_row_count

0:00:00.073628


Unnamed: 0,biosample_row_count
0,14300584


## How many of those rows can be inner-joined with the canonical checklists/packages?
Specifically, joining `biosample.package_name` with `package_dictionary.DisplayName`

_Note that indices are built as part of the makefile_


- create index biosample_package_name_idx on biosample(package_name);
- create index package_dictionary_DisplayName_idx on package_dictionary(DisplayName);
- create index biosample_package_idx on biosample(package);
- create index biosample_p_pn_idx on biosample(package, package_name);

In [6]:

q = """
select
    count(*) as cannonical_package_name_count
from
    biosample b
inner join package_dictionary pd on
    b.package_name = pd.DisplayName
"""
[cannonical_package_name_count, query_duration] = scoped_mapping.timed_query(q, biosample_cnx, print_timing=True)

print(query_duration)

cannonical_package_name_count

2021-05-26 12:50:36.366903
2021-05-26 12:50:38.109956
0:00:01.743053
0:00:01.743053


Unnamed: 0,cannonical_package_name_count
0,14300584


## Combinations of `package` and `package_name` values in the Biosample dataset

In [7]:

q = """
select
    package,
    package_name,
    count(*) as count
from
    biosample b
group by
    package ,
    package_name
order by
    package ,
    package_name
"""
[package_name_combos, query_duration] = scoped_mapping.timed_query(q, biosample_cnx, print_timing=True)

print(query_duration)

package_name_combos

2021-05-26 12:50:38.117583
2021-05-26 12:50:40.011075
0:00:01.893492
0:00:01.893492


Unnamed: 0,package,package_name,count
0,Beta-lactamase.1.0,Beta-lactamase; version 1.0,556
1,Generic.1.0,Generic,10186430
2,Human.1.0,Human; version 1.0,368893
3,Invertebrate.1.0,Invertebrate; version 1.0,131085
4,MIGS.ba.5.0,MIGS: cultured bacteria/archaea; version 5.0,28686
...,...,...,...
124,Model.organism.animal.1.0,Model organism or animal; version 1.0,382980
125,Pathogen.cl.1.0,Pathogen: clinical or host-associated; version...,511040
126,Pathogen.env.1.0,Pathogen: environmental/food/other; version 1.0,246497
127,Plant.1.0,Plant; version 1.0,385939


## What about the Biosample `env_package` values?
Are they also a small, highly regular set, like the `package` and `package_name` combinations?

In [8]:

q = """
select
    env_package,
    count(*) as count
from
    biosample b
group by
    env_package
order by
    count(*) desc
"""
[env_package_count, query_duration] = scoped_mapping.timed_query(q, biosample_cnx)

print(query_duration)

env_package_count

0:00:00.906749


Unnamed: 0,env_package,count
0,,14083847
1,host-associated,49254
2,human-gut,47921
3,water,16367
4,human-skin,13706
...,...,...
87,env_package,1
88,gut microbiome,1
89,marine sediment (ENVO:00002113),1
90,saliva,1


## `env_package` is a mixture of `ENVO` term ids and strings
Those strings may or may not be the term's labels. There are many redundancies due to small spelling and punctiation variations.

Start by getting a set of all canonical `env_package` values recognized by INSDC

In [9]:

package_dictionary = scoped_mapping.make_tidy_col(package_dictionary, 'EnvPackage', 'eptidy')
package_dictionary =scoped_mapping.make_tidy_col(package_dictionary, 'EnvPackageDisplay', 'epdtidy')
# update in sqlite
package_dictionary.to_sql('package_dictionary', biosample_cnx, if_exists='replace', index=False)
valid_combo = []
valid_combo = scoped_mapping.add_unique_to_list(valid_combo, package_dictionary['eptidy'])
valid_combo = scoped_mapping.add_unique_to_list(valid_combo, package_dictionary['epdtidy'])

valid_combo

['',
 'air',
 'built',
 'host associated',
 'human associated',
 'human gut',
 'human oral',
 'human skin',
 'human vaginal',
 'microbial',
 'microbial mat biofilm',
 'miscellaneous',
 'miscellaneous or artificial',
 'no environmental package',
 'plant associated',
 'sediment',
 'soil',
 'wastewater',
 'wastewater sludge',
 'water']

## Determine ID patterns for common ontologies, like `ENVO`

In [10]:

q = """
select
    distinct stanza
    from statements s
where
    predicate = 'rdf:type'
    and "object" = 'owl:Class'
    and stanza = subject"""
# include non-envo IDs that come from envo?
[ids_from_envo, query_duration] = scoped_mapping.timed_query(q, envo_cnx)
print(query_duration)
ids_from_envo = scoped_mapping.add_prefix_col(ids_from_envo, 'stanza', 'prefix')

id_patterns = scoped_mapping.get_multi_term_patterns(ids_from_envo, 'stanza', 'prefix')



0:00:00.029921


In [11]:
id_patterns

{'BFO': 'BFO:\\d{7}',
 'CARO': 'CARO:\\d{7}',
 'CHEBI': 'CHEBI:\\d{4,6}',
 'ENVO': 'ENVO:\\d{7,8}',
 'FAO': 'FAO:0000001',
 'FOODON': 'FOODON:\\d{8}',
 'GO': 'GO:\\d{7}',
 'IAO': 'IAO:\\d{7}',
 'NCBITaxon': 'NCBITaxon:\\d+',
 'OBI': 'OBI:\\d{7}',
 'PATO': 'PATO:\\d{7}',
 'PCO': 'PCO:\\d{7}',
 'PO': 'PO:\\d{7}',
 'RO': 'RO:0002577',
 'UBERON': 'UBERON:\\d{7}'}

## Apply some normalizastion rules to the `env_package` values

In [12]:
env_package_normalized = scoped_mapping.env_package_nomralizastion(env_package_count, 'env_package',
                                                                   id_patterns['ENVO'])

env_package_normalized = scoped_mapping.add_overrides(env_package_normalized, 'remaining_tidied', 'ep_override',
                                                   env_package_overrides)

env_package_normalized = scoped_mapping.flag_canonical(env_package_normalized, 'ep_override', 'is_canonical',
                                                    valid_combo)

env_package_normalized.to_sql('env_package_normalized', biosample_cnx, if_exists='replace', index=False)


## What do the successful normalizations look like?

In [13]:

q = """
select
    env_package,
    count,
    lhs,
    extract,
    ep_override
from
    env_package_normalized
where
    is_canonical = 1
"""
[successful_normalizastions, query_duration] = scoped_mapping.timed_query(q, biosample_cnx)

print(query_duration)

successful_normalizastions

0:00:00.002106


Unnamed: 0,env_package,count,lhs,extract,ep_override
0,,14083847,,,
1,host-associated,49254,,,host associated
2,human-gut,47921,,,human gut
3,water,16367,,,water
4,human-skin,13706,,,human skin
5,built environment,12391,,,built
6,soil,11974,,,soil
7,misc environment,11715,,,miscellaneous
8,missing,8453,,,no environmental package
9,human-oral,7882,,,human oral


# Are there any normalization failures?

In [14]:

q = """
select
    env_package,
    count,
    lhs,
    extract,
    ep_override
from
    env_package_normalized
where
    is_canonical = 0
"""
[normalizastion_failures, query_duration] = scoped_mapping.timed_query(q, biosample_cnx)

print(query_duration)

normalizastion_failures

0:00:00.001773


Unnamed: 0,env_package,count,lhs,extract,ep_override
0,miscellaneous natural or artificial environment,611,,,miscellaneous natural or artificial environment
1,mimarks,479,,,mimarks
2,mouse-gut,406,,,mouse gut
3,gut,172,,,gut
4,biofilm,114,,,biofilm
5,human-not providedsopharyngeal,107,,,human not providedsopharyngeal
6,mice gut,87,,,mice gut
7,CV,60,,,cv
8,"home, outdoor environment",44,,,home outdoor environment
9,fermentation-associated,42,,,fermentation associated


# Utilizing taxonomy for broad subsetting


**This uses an SQLite database in which the transitive closure over subClassOf has already been materialized. See the README and Makefile.**

Specifically, flag the biosamples whose `taxon_id` indicates they are an unclassified entity. Ignoring the others will throw out samples of multicellular organisms, like fruit flies.



## Get a listing of all taxa that are transitive subclasses of `NCBITaxon:2787823`

I.e. 'unclassified entities'

In [15]:
q = """
select
    distinct s.subject
from
    entailed_edge ee
join statements s on
    ee.subject = s.subject
where
    ee.predicate = 'rdfs:subClassOf'
    and ee.object = 'NCBITaxon:2787823'
    and s.predicate = 'rdfs:label'
"""
[unclassified_taxa, query_duration] = scoped_mapping.timed_query(q, ncbitaxon_cnx)
unclassified_taxa['unclassified'] = True

print(query_duration)

unclassified_taxa

0:00:07.726242


Unnamed: 0,subject,unclassified
0,NCBITaxon:1006967,True
1,NCBITaxon:1041057,True
2,NCBITaxon:1046002,True
3,NCBITaxon:1046003,True
4,NCBITaxon:1046004,True
...,...,...
989,NCBITaxon:939928,True
990,NCBITaxon:941420,True
991,NCBITaxon:941421,True
992,NCBITaxon:941422,True


## Get taxon counuts from the Biosample metadata

In [16]:

q = """
select
    taxonomy_id biosample_taxid,
    count(*) as count
from
    biosample b
group by
    taxonomy_id
order by
    count(*) desc
"""
[biosample_tax_id_counts, query_duration] = scoped_mapping.timed_query(q, biosample_cnx)
biosample_tax_id_counts['curie'] = 'NCBITaxon:' + biosample_tax_id_counts['biosample_taxid'].astype(str)

print(query_duration)

0:00:01.406164


## Merge the two taxonomy dataframes

I.e. flag the the Biosample records whose `taxonomy_id` field belongs to a subclass of 'unclassified entries'.

In [17]:
biosample_tax_id_counts = biosample_tax_id_counts.merge(unclassified_taxa, left_on='curie',
                                                        right_on='subject', how='left')
biosample_tax_id_counts.unclassified.fillna(False, inplace=True)

biosample_tax_id_counts

Unnamed: 0,biosample_taxid,count,curie,subject,unclassified
0,9606,6819707,NCBITaxon:9606,,False
1,10090,964219,NCBITaxon:10090,,False
2,408170,290862,NCBITaxon:408170,NCBITaxon:408170,True
3,410658,280666,NCBITaxon:410658,NCBITaxon:410658,True
4,646099,208741,NCBITaxon:646099,NCBITaxon:646099,True
...,...,...,...,...,...
163372,999891,1,NCBITaxon:999891,,False
163373,999892,1,NCBITaxon:999892,,False
163374,999898,1,NCBITaxon:999898,,False
163375,999931,1,NCBITaxon:999931,,False


## Add labels to all taxa

In [18]:

q = """
select
    subject ,
    value
from statements
where
    predicate = 'rdfs:label' and subject = stanza
"""
[all_tax_labels, query_duration] = scoped_mapping.timed_query(q, ncbitaxon_cnx)

biosample_tax_id_counts = biosample_tax_id_counts.merge(all_tax_labels, left_on='curie',
                                                        right_on='subject', how='left')

biosample_tax_id_counts = biosample_tax_id_counts[['curie', 'biosample_taxid', 'count', 'unclassified', 'value']]
biosample_tax_id_counts.columns = ['curie', 'biosample_taxid', 'count', 'unclassified', 'label']

print(query_duration)
biosample_tax_id_counts.to_sql('biobiosample_tax_id_counts', biosample_cnx, if_exists='replace', index=False)

biosample_tax_id_counts

0:00:03.939463


Unnamed: 0,curie,biosample_taxid,count,unclassified,label
0,NCBITaxon:9606,9606,6819707,False,Homo sapiens
1,NCBITaxon:10090,10090,964219,False,Mus musculus
2,NCBITaxon:408170,408170,290862,True,human gut metagenome
3,NCBITaxon:410658,410658,280666,True,soil metagenome
4,NCBITaxon:646099,646099,208741,True,human metagenome
...,...,...,...,...,...
163372,NCBITaxon:999891,999891,1,False,Bacillus amyloliquefaciens TA208
163373,NCBITaxon:999892,999892,1,False,[Propionibacterium] humerusii P08
163374,NCBITaxon:999898,999898,1,False,Peptococcaceae bacterium CEB3
163375,NCBITaxon:999931,999931,1,False,Barrientosiimonas humi


**Almost all of the taxa that are common in the biosample collection are either unclassified/metagenomes or easily recognized cellular organisms. Cellular organism samples are deprioritized in this exercise**

Exceptions include:
- 32630 = synthetic construct (other entries; other sequences; artificial sequences)
    - 'other entries' would add 16k rows on top of the 1k 'unclassified entities'
    - metagenomes account for 331 of the 'unclassified entities'
    - there are also a small number of uncultured/unclassified microorganisms in the biosample dataset
- 77133 = uncultured bacterium (cellular organisms; Bacteria; environmental samples)
    - 'cellular organisms' would add 2M rows on top of the 1k 'unclassified entities'
    - 'cellular organisms; Bacteria; environmental samples' adds 26k
    
----

## Get a table of MIxS annotations to be mapped to ontology classes.

Explicitly scope based on normalized package data


In [19]:
biosample_col_to_map = 'env_broad_scale'
scoping_col          = 'env_package_normalized.ep_override'
scoping_value        = 'water'

**In this case, the scoping includes an inner join requirement for 'unclassified entities'**

In [20]:
q = 'select ' + biosample_col_to_map + """, count(*) as count
from
    biosample b
join env_package_normalized on
    b.env_package = env_package_normalized.env_package
inner join biobiosample_tax_id_counts stic on
    b.taxonomy_id = stic.biosample_taxid
where """ + scoping_col + " = '" + scoping_value + \
    "' group by " + biosample_col_to_map + """
order by
    count(*) desc"""

[mapping_candidates, query_duration] = scoped_mapping.timed_query(q, biosample_cnx)

mapping_candidates

Unnamed: 0,env_broad_scale,count
0,small lake biome,3264
1,marine biome (ENVO:00000447),1382
2,marine biome,1355
3,large lake biome,1198
4,freshwater biome,1051
...,...,...
252,Arctic,1
253,Aquatic biome,1
254,01000035,1
255,00000891,1


## The Biosample format allows for pipe-delimited environmental package lists. 

Separate those out into their components.

In [21]:

multi_frames = []
for row in mapping_candidates.itertuples(index=True, name='Pandas'):
    split_check = row.env_broad_scale
    if split_check is None:
        split_check = ''
    splitted = pd.Series(split_check.split("|"))
    splitted_count = len(splitted)
    repeated = [split_check] * splitted_count
    repeated = pd.Series(repeated)
    as_frame = pd.DataFrame(dict(repeated=repeated, splitted=splitted)).reset_index()
    multi_frames.append(as_frame)
concat_frame = pd.concat(multi_frames)
concat_frame = concat_frame[['repeated', 'splitted']]
mapping_candidates = mapping_candidates.merge(concat_frame, left_on=biosample_col_to_map,
                                              right_on='repeated', how='left')

mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted
0,small lake biome,3264,small lake biome,small lake biome
1,marine biome (ENVO:00000447),1382,marine biome (ENVO:00000447),marine biome (ENVO:00000447)
2,marine biome,1355,marine biome,marine biome
3,large lake biome,1198,large lake biome,large lake biome
4,freshwater biome,1051,freshwater biome,freshwater biome
...,...,...,...,...
258,Arctic,1,Arctic,Arctic
259,Aquatic biome,1,Aquatic biome,Aquatic biome
260,01000035,1,01000035,01000035
261,00000891,1,00000891,00000891


## Normalize a few different ways `ENVO` IDs have been entered**

In [22]:

mapping_candidates['envo_tidy'] = mapping_candidates.splitted.str.replace('envo[:_ ]', 'ENVO:', regex=True, case=False)

mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted,envo_tidy
0,small lake biome,3264,small lake biome,small lake biome,small lake biome
1,marine biome (ENVO:00000447),1382,marine biome (ENVO:00000447),marine biome (ENVO:00000447),marine biome (ENVO:00000447)
2,marine biome,1355,marine biome,marine biome,marine biome
3,large lake biome,1198,large lake biome,large lake biome,large lake biome
4,freshwater biome,1051,freshwater biome,freshwater biome,freshwater biome
...,...,...,...,...,...
258,Arctic,1,Arctic,Arctic,Arctic
259,Aquatic biome,1,Aquatic biome,Aquatic biome,Aquatic biome
260,01000035,1,01000035,01000035,01000035
261,00000891,1,00000891,00000891,00000891


# Now try to extract ontology terms that are already present

In [23]:
candidate_series_decomposition = scoped_mapping.decompose_series(mapping_candidates['envo_tidy'],
                                                                 id_patterns[target_onto_prefix])

mapping_candidates = pd.concat([mapping_candidates, candidate_series_decomposition], axis=1)

mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted,envo_tidy,string,extract,remaining_string,remaining_tidied
0,small lake biome,3264,small lake biome,small lake biome,small lake biome,small lake biome,,small lake biome,small lake biome
1,marine biome (ENVO:00000447),1382,marine biome (ENVO:00000447),marine biome (ENVO:00000447),marine biome (ENVO:00000447),marine biome (ENVO:00000447),ENVO:00000447,marine biome (),marine biome
2,marine biome,1355,marine biome,marine biome,marine biome,marine biome,,marine biome,marine biome
3,large lake biome,1198,large lake biome,large lake biome,large lake biome,large lake biome,,large lake biome,large lake biome
4,freshwater biome,1051,freshwater biome,freshwater biome,freshwater biome,freshwater biome,,freshwater biome,freshwater biome
...,...,...,...,...,...,...,...,...,...
258,Arctic,1,Arctic,Arctic,Arctic,Arctic,,Arctic,arctic
259,Aquatic biome,1,Aquatic biome,Aquatic biome,Aquatic biome,Aquatic biome,,Aquatic biome,aquatic biome
260,01000035,1,01000035,01000035,01000035,01000035,,01000035,01000035
261,00000891,1,00000891,00000891,00000891,00000891,,00000891,00000891


## Join the extracted IDs with their labels

Start by connecting to the rdftab database from which the terms and label-like annotatiosn will be obtained

In [24]:

ontodb = '../semantic-sql/db/' + target_onto_prefix.lower() + '.db'
ontocon = sqlite3.connect(ontodb)

## extracting the labels

In [25]:
q = """
select
    subject ,
    value
from
    statements s
where
    predicate = 'rdfs:label'
"""
[onto_labels, query_duration] = scoped_mapping.timed_query(q, ontocon)

onto_labels

Unnamed: 0,subject,value
0,IAO:0000111,editor preferred term~editor preferred label
1,IAO:0000112,example of usage
2,IAO:0000114,has curation status
3,IAO:0000115,definition
4,IAO:0000116,editor note
...,...,...
6774,ENVO:01001862,Solar radiation
6775,<https://www.wikidata.org/wiki/Q2>,Earth
6776,<https://www.wikidata.org/wiki/Q2306597>,Suni
6777,<https://www.wikidata.org/wiki/Q525>,Sol


## and merging 

In [26]:
mapping_candidates = mapping_candidates.merge(onto_labels, left_on='extract', right_on='subject', how='left')
mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted,envo_tidy,string,extract,remaining_string,remaining_tidied,subject,value
0,small lake biome,3264,small lake biome,small lake biome,small lake biome,small lake biome,,small lake biome,small lake biome,,
1,marine biome (ENVO:00000447),1382,marine biome (ENVO:00000447),marine biome (ENVO:00000447),marine biome (ENVO:00000447),marine biome (ENVO:00000447),ENVO:00000447,marine biome (),marine biome,ENVO:00000447,marine biome
2,marine biome,1355,marine biome,marine biome,marine biome,marine biome,,marine biome,marine biome,,
3,large lake biome,1198,large lake biome,large lake biome,large lake biome,large lake biome,,large lake biome,large lake biome,,
4,freshwater biome,1051,freshwater biome,freshwater biome,freshwater biome,freshwater biome,,freshwater biome,freshwater biome,,
...,...,...,...,...,...,...,...,...,...,...,...
258,Arctic,1,Arctic,Arctic,Arctic,Arctic,,Arctic,arctic,,
259,Aquatic biome,1,Aquatic biome,Aquatic biome,Aquatic biome,Aquatic biome,,Aquatic biome,aquatic biome,,
260,01000035,1,01000035,01000035,01000035,01000035,,01000035,01000035,,
261,00000891,1,00000891,00000891,00000891,00000891,,00000891,00000891,,


## Use cosine string distance to see if the labels match closely enough

I.e. the labels claimed by the Biosample data set and the labels asserted in the ontology. if they're close enough, consider the assigned ID legit


_How close is close enough?_

In [27]:

my_cosine_obj = Cosine(my_string_dist_arg)
mapping_candidates['value'] = mapping_candidates['value'].fillna('')
mapping_candidates['cosine'] = mapping_candidates.apply(
    lambda my_row: my_cosine_obj.distance(my_row['remaining_tidied'].lower(), my_row['value'].lower()), axis=1)
mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted,envo_tidy,string,extract,remaining_string,remaining_tidied,subject,value,cosine
0,small lake biome,3264,small lake biome,small lake biome,small lake biome,small lake biome,,small lake biome,small lake biome,,,1.0
1,marine biome (ENVO:00000447),1382,marine biome (ENVO:00000447),marine biome (ENVO:00000447),marine biome (ENVO:00000447),marine biome (ENVO:00000447),ENVO:00000447,marine biome (),marine biome,ENVO:00000447,marine biome,0.0
2,marine biome,1355,marine biome,marine biome,marine biome,marine biome,,marine biome,marine biome,,,1.0
3,large lake biome,1198,large lake biome,large lake biome,large lake biome,large lake biome,,large lake biome,large lake biome,,,1.0
4,freshwater biome,1051,freshwater biome,freshwater biome,freshwater biome,freshwater biome,,freshwater biome,freshwater biome,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
258,Arctic,1,Arctic,Arctic,Arctic,Arctic,,Arctic,arctic,,,1.0
259,Aquatic biome,1,Aquatic biome,Aquatic biome,Aquatic biome,Aquatic biome,,Aquatic biome,aquatic biome,,,1.0
260,01000035,1,01000035,01000035,01000035,01000035,,01000035,01000035,,,1.0
261,00000891,1,00000891,00000891,00000891,00000891,,00000891,00000891,,,1.0


**Previously, we did a reality check on the claimed IDs and labels. If a label is claimed without any ID, that could still be a path to an ontology term.**

We'll be doing some merging, so make sure column names aren't reused


In [28]:
mapping_candidates.columns = ['env_broad_scale', 'count', 'repeated', 'splitted', 'envo_tidy', 'string', 'extract',
                              'remaining_string', 'remaining_tidied', 'term_id', 'lab_from_id', 'lfi_cosine']
mapping_candidates = mapping_candidates.merge(onto_labels, left_on='remaining_tidied', right_on='value', how='left')

mapping_candidates.columns = ['env_broad_scale', 'count', 'repeated', 'splitted', 'envo_tidy', 'string', 'extract',
                              'remaining_string', 'remaining_tidied', 'term_id', 'lab_from_id',
                              'lfi_cosine', 'term_id_from_lab', 'value']
mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted,envo_tidy,string,extract,remaining_string,remaining_tidied,term_id,lab_from_id,lfi_cosine,term_id_from_lab,value
0,small lake biome,3264,small lake biome,small lake biome,small lake biome,small lake biome,,small lake biome,small lake biome,,,1.0,,
1,marine biome (ENVO:00000447),1382,marine biome (ENVO:00000447),marine biome (ENVO:00000447),marine biome (ENVO:00000447),marine biome (ENVO:00000447),ENVO:00000447,marine biome (),marine biome,ENVO:00000447,marine biome,0.0,ENVO:00000447,marine biome
2,marine biome,1355,marine biome,marine biome,marine biome,marine biome,,marine biome,marine biome,,,1.0,ENVO:00000447,marine biome
3,large lake biome,1198,large lake biome,large lake biome,large lake biome,large lake biome,,large lake biome,large lake biome,,,1.0,,
4,freshwater biome,1051,freshwater biome,freshwater biome,freshwater biome,freshwater biome,,freshwater biome,freshwater biome,,,1.0,ENVO:00000873,freshwater biome
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258,Arctic,1,Arctic,Arctic,Arctic,Arctic,,Arctic,arctic,,,1.0,,
259,Aquatic biome,1,Aquatic biome,Aquatic biome,Aquatic biome,Aquatic biome,,Aquatic biome,aquatic biome,,,1.0,ENVO:00002030,aquatic biome
260,01000035,1,01000035,01000035,01000035,01000035,,01000035,01000035,,,1.0,,
261,00000891,1,00000891,00000891,00000891,00000891,,00000891,00000891,,,1.0,,


In [29]:
# mapping_candidates.to_sql('mapping_scratch', biosample_cnx, if_exists='replace', index=False)

## Find consensus term IDs and labels


In [30]:

mapping_candidates['consensus_id'] = mapping_candidates['term_id_from_lab']
mapping_candidates['consensus_lab'] = mapping_candidates['value']

flag = mapping_candidates['consensus_id'].isnull() &\
    (( ~ mapping_candidates['term_id'].isnull() &\
    mapping_candidates.lfi_cosine.le(my_max_string_dist)) |\
     (~ mapping_candidates['term_id'].isnull() &\
      mapping_candidates['remaining_tidied'].eq('')))
    
replacements = mapping_candidates.loc[flag, 'term_id']
mapping_candidates.loc[flag, 'consensus_id'] = replacements

replacements = mapping_candidates.loc[flag, 'lab_from_id']
mapping_candidates.loc[flag, 'consensus_lab'] = replacements

flag = mapping_candidates.consensus_id.isna()
antiflag = ~ flag
mapping_candidates['id_or_lab_ok'] = antiflag

mapping_candidates['assembled_consensus'] = mapping_candidates['consensus_lab'] + ' [' + \
    mapping_candidates['consensus_id'] + ']'

In [31]:
mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted,envo_tidy,string,extract,remaining_string,remaining_tidied,term_id,lab_from_id,lfi_cosine,term_id_from_lab,value,consensus_id,consensus_lab,id_or_lab_ok,assembled_consensus
0,small lake biome,3264,small lake biome,small lake biome,small lake biome,small lake biome,,small lake biome,small lake biome,,,1.0,,,,,False,
1,marine biome (ENVO:00000447),1382,marine biome (ENVO:00000447),marine biome (ENVO:00000447),marine biome (ENVO:00000447),marine biome (ENVO:00000447),ENVO:00000447,marine biome (),marine biome,ENVO:00000447,marine biome,0.0,ENVO:00000447,marine biome,ENVO:00000447,marine biome,True,marine biome [ENVO:00000447]
2,marine biome,1355,marine biome,marine biome,marine biome,marine biome,,marine biome,marine biome,,,1.0,ENVO:00000447,marine biome,ENVO:00000447,marine biome,True,marine biome [ENVO:00000447]
3,large lake biome,1198,large lake biome,large lake biome,large lake biome,large lake biome,,large lake biome,large lake biome,,,1.0,,,,,False,
4,freshwater biome,1051,freshwater biome,freshwater biome,freshwater biome,freshwater biome,,freshwater biome,freshwater biome,,,1.0,ENVO:00000873,freshwater biome,ENVO:00000873,freshwater biome,True,freshwater biome [ENVO:00000873]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258,Arctic,1,Arctic,Arctic,Arctic,Arctic,,Arctic,arctic,,,1.0,,,,,False,
259,Aquatic biome,1,Aquatic biome,Aquatic biome,Aquatic biome,Aquatic biome,,Aquatic biome,aquatic biome,,,1.0,ENVO:00002030,aquatic biome,ENVO:00002030,aquatic biome,True,aquatic biome [ENVO:00002030]
260,01000035,1,01000035,01000035,01000035,01000035,,01000035,01000035,,,1.0,,,,,False,
261,00000891,1,00000891,00000891,00000891,00000891,,00000891,00000891,,,1.0,,,,,False,


## Save these easy term mappings to SQLite

In [32]:
mapping_candidates.to_sql('mapping_scratch', biosample_cnx, if_exists='replace', index=False)

## For which Biosample annotations were no easy mappings found?
How many Biosamples uses those annotations?

In [33]:
flag = ~ mapping_candidates.id_or_lab_ok
needs_search = mapping_candidates.loc[flag]

needs_search = needs_search[['remaining_tidied', 'count']]

sum_by_needed = needs_search.groupby('remaining_tidied')['count'].sum()

sum_by_needed = sum_by_needed.to_frame()
sum_by_needed['remaining_tidied'] = sum_by_needed.index

sum_by_needed = sum_by_needed.sort_values('count', ascending=False)
sum_by_needed.reset_index(drop=True)

sum_by_needed.to_sql('sum_by_needed', biosample_cnx, if_exists='replace', index=False)

In [34]:
sum_by_needed

Unnamed: 0_level_0,count,remaining_tidied
remaining_tidied,Unnamed: 1_level_1,Unnamed: 2_level_1
small lake biome,3287,small lake biome
large lake biome,1198,large lake biome
15,879,15
1000686,850,1000686
surface seawater,480,surface seawater
...,...,...
hydrothermal vents,1,hydrothermal vents
hanford h 101,1,hanford h 101
hanford h 100,1,hanford h 100
gulf of eilat,1,gulf of eilat


## Extract the tidied strings

In [35]:
ebs_raw_list = list(sum_by_needed['remaining_tidied'])
ebs_raw_list.sort()

## Submit those tidied strings to a search engine

Specifically OLS search. This takes roughly one second per submission

_Turn logging back on to show status?_
_Print the count and pre- and post- datestamps_


In [36]:
ebs_search_res = scoped_mapping.search_get_annotations_wrapper(ebs_raw_list,
                                                               bad_chars=chars_to_whiteout,
                                                               cat_name=biosample_col_to_map,
                                                               ontoprefix='envo,gaz',
                                                               query_fields='',
                                                               rr=5,
                                                               string_dist_arg=my_string_dist_arg)

## Filter out the best of the acceptable mappings
From a string distance perspective

In [37]:
my_best_acceptable = scoped_mapping.get_best_acceptable(ebs_search_res, max_string_dist=my_max_string_dist)
my_best_acceptable

Unnamed: 0,category,raw,query,name,string_dist_rank,string_dist,obo_id,label,search_rank,ontology_prefix,scope,type,iri,ontology_name
0,env_broad_scale,,,,1,0.0,,,1,,,,,
48,env_broad_scale,arabian sea,arabian sea,Arabian Sea,1,0.0,GAZ:00002457,Arabian Sea,1,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_00002457,gaz
81,env_broad_scale,atlantic ocean,atlantic ocean,Atlantic Ocean,1,0.0,GAZ:00000344,Atlantic Ocean,1,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_00000344,gaz
101,env_broad_scale,boreal forest,boreal forest,boreal forest,1,0.0,ENVO:01000250,subpolar coniferous forest biome,1,ENVO,has_narrow_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_01000250,envo
133,env_broad_scale,caribbean sea,caribbean sea,Caribbean Sea,1,0.0,GAZ:00002820,Caribbean Sea,1,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_00002820,gaz
213,env_broad_scale,coastal water,coastal water,coastal water,1,0.0,ENVO:00002150,coastal sea water,1,ENVO,has_broad_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_00002150,envo
417,env_broad_scale,eukaryotes,eukaryotes,eukaryotes,1,0.0,NCBITaxon:2759,Eukaryota,1,ENVO,synonym,synonym,http://purl.obolibrary.org/obo/NCBITaxon_2759,envo
508,env_broad_scale,forest,forest,Forest,1,0.0,GAZ:00454366,Forest,1,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_00454366,gaz
520,env_broad_scale,freshwater,freshwater,freshwater,1,0.0,ENVO:00002011,fresh water,1,ENVO,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_00002011,envo
585,env_broad_scale,hot springs,hot springs,Hot Springs,1,0.0,GAZ:22224982,Hot Springs,1,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_22224982,gaz


----

## Filter out the submissions with no acceptable matches

In [38]:
no_acceptable_mappings = scoped_mapping.get_no_acceptable_mappings(ebs_search_res, my_best_acceptable)

no_acceptable_mappings.to_sql('no_acceptable_mappings', biosample_cnx, if_exists='replace', index=False)

no_acceptable_mappings

Unnamed: 0,category,raw,query,name,string_dist_rank,string_dist,obo_id,label,search_rank,ontology_prefix,scope,type,iri,ontology_name
1,env_broad_scale,0,0,Milecastle 0,1,1.000,GAZ:00456187,Milecastle 0,1,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_00456187,gaz
2,env_broad_scale,0,0,lead(0),2,1.000,CHEBI:27889,lead(0),2,ENVO,label,label,http://purl.obolibrary.org/obo/CHEBI_27889,envo
3,env_broad_scale,0,0,iron(0),3,1.000,CHEBI:82664,iron(0),3,ENVO,label,label,http://purl.obolibrary.org/obo/CHEBI_82664,envo
4,env_broad_scale,0,0,platinum(0),4,1.000,CHEBI:33400,platinum(0),4,ENVO,label,label,http://purl.obolibrary.org/obo/CHEBI_33400,envo
5,env_broad_scale,0,0,uranium(0),5,1.000,CHEBI:49936,uranium(0),5,ENVO,label,label,http://purl.obolibrary.org/obo/CHEBI_49936,envo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199,env_broad_scale,westerlies biome,westerlies biome,EcosytemType,8,0.927,ENVO:00000428,biome,1,ENVO,has_related_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_00000428,envo
1196,env_broad_scale,westerlies biome,westerlies biome,major habitat type,9,0.941,ENVO:00000428,biome,1,ENVO,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_00000428,envo
1197,env_broad_scale,westerlies biome,westerlies biome,major habitat type,10,0.941,ENVO:00000428,biome,1,ENVO,hasExactSynonym,,http://purl.obolibrary.org/obo/ENVO_00000428,envo
1201,env_broad_scale,westerlies biome,westerlies biome,marine realm,11,1.000,ENVO:00000447,marine biome,2,ENVO,has_related_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_00000447,envo


## Mapping failure patterns

- Some get a good hit if 'biome' is added


_How to manually review and then add back in?__


Add to biosample SQLite database:

    - mapping_candidates -> mapping_scratch (easy ID-based and exact-tidied-label-based)
    - ebs_search_results (no acceptable + all acceptable)?
    - my_best_acceptable
    - no_acceptable_mappings
    
----

## Try searching the failures against all ontologies in OLS

In [39]:
still_unmapped = list(set(list(no_acceptable_mappings['raw'])))
still_unmapped.sort()

salvage_search_res = scoped_mapping.search_get_annotations_wrapper(still_unmapped,
                                                               bad_chars='._-',
                                                               cat_name='salvage',
                                                               ontoprefix='',
                                                               query_fields='',
                                                               rr=5,
                                                               string_dist_arg=2)

salvage_best_acceptable = scoped_mapping.get_best_acceptable(salvage_search_res,
                                                             max_string_dist=0.2)


## We appear to be at a point of diminishing returns

In [40]:
salvage_best_acceptable

Unnamed: 0,category,raw,query,name,string_dist_rank,string_dist,obo_id,label,search_rank,ontology_prefix,scope,type,iri,ontology_name
17,salvage,0,0,0,1,0.0,NCIT:C159767,Zero Full Term Pregnancies,3,NCIT,hasExactSynonym,,http://purl.obolibrary.org/obo/NCIT_C159767,ncit
78,salvage,15,15,15,1,0.0,NCIT:C113429,Fifteen,5,NCIT,synonym,synonym,http://purl.obolibrary.org/obo/NCIT_C113429,ncit
111,salvage,arctic,arctic,Arctic,1,0.0,NCIT:C44738,Arctic,1,NCIT,label,label,http://purl.obolibrary.org/obo/NCIT_C44738,ncit
169,salvage,brackish water river,brackish water river,brackish water,1,0.153,ENVO:00002019,brackish water,1,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_00002019,envo
194,salvage,brine pool interface layer,brine pool interface layer,interface layer,1,0.192,ENVO:01001684,interface layer,3,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_01001684,envo
307,salvage,coastal ocean,coastal ocean,coastal ocean water,1,0.184,ENVO:00002150,coastal sea water,1,ENVO,has_exact_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_00002150,envo
527,salvage,deep mediterranean,deep mediterranean,Mediterranean,1,0.142,LBO:0001043,Mediterranean,2,LBO,label,label,http://purl.obolibrary.org/obo/LBO_0001043,lbo
801,salvage,envo marine biome,envo marine biome,marine biome,1,0.171,ENVO:00000447,marine biome,1,ENM,label,label,http://purl.obolibrary.org/obo/ENVO_00000447,enm
830,salvage,estuarine,estuarine,estuarine mud,1,0.184,ENVO:00002160,estuarine mud,1,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_00002160,envo
914,salvage,for marine biome,for marine biome,marine biome,1,0.144,ENVO:00000447,marine biome,4,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_00000447,envo


----

In [41]:
my_best_acceptable.columns = 'ols_' + my_best_acceptable.columns
my_best_acceptable.to_sql('best_acceptable', biosample_cnx, if_exists='replace', index=False)
my_best_acceptable

Unnamed: 0,ols_category,ols_raw,ols_query,ols_name,ols_string_dist_rank,ols_string_dist,ols_obo_id,ols_label,ols_search_rank,ols_ontology_prefix,ols_scope,ols_type,ols_iri,ols_ontology_name
0,env_broad_scale,,,,1,0.0,,,1,,,,,
48,env_broad_scale,arabian sea,arabian sea,Arabian Sea,1,0.0,GAZ:00002457,Arabian Sea,1,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_00002457,gaz
81,env_broad_scale,atlantic ocean,atlantic ocean,Atlantic Ocean,1,0.0,GAZ:00000344,Atlantic Ocean,1,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_00000344,gaz
101,env_broad_scale,boreal forest,boreal forest,boreal forest,1,0.0,ENVO:01000250,subpolar coniferous forest biome,1,ENVO,has_narrow_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_01000250,envo
133,env_broad_scale,caribbean sea,caribbean sea,Caribbean Sea,1,0.0,GAZ:00002820,Caribbean Sea,1,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_00002820,gaz
213,env_broad_scale,coastal water,coastal water,coastal water,1,0.0,ENVO:00002150,coastal sea water,1,ENVO,has_broad_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_00002150,envo
417,env_broad_scale,eukaryotes,eukaryotes,eukaryotes,1,0.0,NCBITaxon:2759,Eukaryota,1,ENVO,synonym,synonym,http://purl.obolibrary.org/obo/NCBITaxon_2759,envo
508,env_broad_scale,forest,forest,Forest,1,0.0,GAZ:00454366,Forest,1,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_00454366,gaz
520,env_broad_scale,freshwater,freshwater,freshwater,1,0.0,ENVO:00002011,fresh water,1,ENVO,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_00002011,envo
585,env_broad_scale,hot springs,hot springs,Hot Springs,1,0.0,GAZ:22224982,Hot Springs,1,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_22224982,gaz


In [46]:
big_merge = mapping_candidates.merge(my_best_acceptable, how='outer', left_on='remaining_tidied', right_on='ols_raw')

In [47]:
big_merge.to_sql('big_merge', biosample_cnx, if_exists='replace', index=False)

In [48]:
flag = ~ big_merge['id_or_lab_ok'] & ~ big_merge['ols_obo_id'].eq('') & ~ big_merge['ols_obo_id'].isna()

replacement = big_merge.loc[flag, 'ols_obo_id']

big_merge.loc[flag, 'consensus_id'] = replacement

replacement = big_merge.loc[flag, 'ols_label']

big_merge.loc[flag, 'consensus_lab'] = replacement

replacement = big_merge.loc[flag, 'consensus_lab'] + ' [' + big_merge.loc[flag, 'consensus_id'] + ']'

big_merge.loc[flag, 'assembled_consensus'] = replacement



In [49]:
big_merge.to_sql('big_merge', biosample_cnx, if_exists='replace', index=False)