- don't forget to consult "model" too
- uncultured vs metagenome tax id?
- what about "gold" or "path"? WON"T FIND IN NCBI BIOSAMPLE
- are there samples with a host_taxid and a (sample) taxonomy_id from "unclassified"?
- "x biome" vs "x"
- be careful with GAZ matches
- take note of non-local salvage hits
- what to do about hits against obsolete classes... does OLS provide a mechanism to avoid?
- is it worth trimming and concatenating "x1", "x2", "x3", where x is a mappable term? numbers may be low.


In [2]:
from datetime import datetime
from pkg_resources import get_distribution, DistributionNotFound
from strsimpy.cosine import Cosine
import pandas as pd
import re
import requests as requests
import sqlite3
import string
import urllib
import yaml
from xml.etree import ElementTree
from tdda import rexpy
import scoped_mapping

## User-provided data
See repo README for notes on setting up SQLite databases of OBO ontologies with semantic-sql, relation-graph and rdftab

In [3]:
# from https://www.ncbi.nlm.nih.gov/biosample/docs/packages/?format=xml
# see also https://www.ncbi.nlm.nih.gov/biosample/docs/packages/
biosample_packages_file = "../target/biosample_packages.xml"

# from ftp://ftp.ncbi.nlm.nih.gov//biosample/biosample_set.xml.gz
# via harmonized_table.db.gz
# in https://drive.google.com/drive/u/0/folders/1eL0v0stoduahjDpoDJIk3z2pJBAU4b2Y
biosample_sqlite_file   = "../target/harmonized_table.db"

# see readme
ncbitaxon_sqlite_file   = "../semantic-sql/db/ncbitaxon.db"
envo_sqlite_file        = "../semantic-sql/db/envo.db"

biosample_cnx = sqlite3.connect(biosample_sqlite_file)
ncbitaxon_cnx = sqlite3.connect(ncbitaxon_sqlite_file)
envo_cnx      = sqlite3.connect(envo_sqlite_file)

target_onto_prefix = "ENVO"
chars_to_whiteout  = "._-"
my_query_fields    = ""  # OLS weighted default
my_row_req         = 3
my_string_dist_arg = 2
my_max_string_dist = 0.1


## it nice to see everything accounted for
but we can prioritize NMDC for now:

Soil 15,777
Sediment 7,147
Plant-associated 3,142

Could some of these "no environmental package" mappings be losing important granularity?

map `None` and '' to "no environmental package"?

In [4]:
env_package_overrides = {
    "built environment": "built",
    "misc environment": "miscellaneous",
    "missing": "no environmental package",
    "unknown": "no environmental package",
    "default": "no environmental package",
    "unspecified": "no environmental package",
    "not available": "no environmental package",
    "not collected": "no environmental package",
    "miscellaneous natural or artificial environment": "miscellaneous",
    "not applicable": "no environmental package",
    "soil-associated": "soil",
    "soil associated": "soil"
}

## What Biosample field should be mapped to ontology classes?

In [5]:
biosample_col_to_map = "env_broad_scale"

## How should the mapping effort be scoped?
Additionally taxonomic filters may be applied below

In [6]:
# scoping_col = "env_package_normalized.rt_override"
scoping_col = "env_package_normalized.EnvPackage"
scoping_value = "soil"

## Settings for manual review of OLS search-based results
Results based on merging Biosample annotations to ontology classes by label or embedded ID are not exported for review at this time

In [7]:
ols_review_file = "../local/ols_review.tsv"
ols_review_seperator = "\t"
strategy_col = "strategy"
include_col = "include"
first_pass_include_val = True
first_pass_strategy_val = "env_braod_scale vs envo and gaz @ 0.1"
salvage_include_val = False
salvage_strategy_val = "env_braod_scale vs all of ols @ 0.2"

## Settings for SSSOM output

In [8]:
sssom_subject_prefix = "biosample_ebs"
sssom_file = "biosample_ebs_water_packages_unclassified_taxa_sssom.tsv"

## Print a sample of the data we're working with
Specifically, INSDC/NCBI Biosample metadata

In [9]:
q = """
select
    id,
    env_package,
    package,
    package_name,
    host_taxid,
    taxonomy_id,
    env_broad_scale,
    env_local_scale,
    env_medium
    from biosample b
limit 10
"""
biosample_first_ten = pd.read_sql(q, biosample_cnx)
biosample_first_ten

Unnamed: 0,id,env_package,package,package_name,host_taxid,taxonomy_id,env_broad_scale,env_local_scale,env_medium
0,BIOSAMPLE:SAMN00000002,missing,MIGS.ba.5.0,MIGS: cultured bacteria/archaea; version 5.0,9606.0,445970,terrestrial biome [ENVO:00000446],human-associated habitat [ENVO:00009003],biological product [ENVO:02000043]
1,BIOSAMPLE:SAMN00000003,missing,MIGS.ba.5.0,MIGS: cultured bacteria/archaea; version 5.0,9606.0,445972,terrestrial biome [ENVO:00000446],human-associated habitat [ENVO:00009003],biological product [ENVO:02000043]
2,BIOSAMPLE:SAMN00000004,missing,MIGS.ba.5.0,MIGS: cultured bacteria/archaea; version 5.0,9606.0,449673,terrestrial biome [ENVO:00000446],human-associated habitat [ENVO:00009003],biological product [ENVO:02000043]
3,BIOSAMPLE:SAMN00000005,,Generic.1.0,Generic,,6526,,,
4,BIOSAMPLE:SAMN00000006,,Generic.1.0,Generic,,9483,,,
5,BIOSAMPLE:SAMN00000007,missing,MIGS.ba.5.0,MIGS: cultured bacteria/archaea; version 5.0,9606.0,445974,terrestrial biome [ENVO:00000446],human-associated habitat [ENVO:00009003],biological product [ENVO:02000043]
6,BIOSAMPLE:SAMN00000008,missing,MIGS.ba.5.0,MIGS: cultured bacteria/archaea; version 5.0,9606.0,411461,terrestrial biome [ENVO:00000446],human-associated habitat [ENVO:00009003],biological product [ENVO:02000043]
7,BIOSAMPLE:SAMN00000009,,Generic.1.0,Generic,,13616,,,
8,BIOSAMPLE:SAMN00000010,,Generic.1.0,Generic,,451639,,,
9,BIOSAMPLE:SAMN00000011,,Generic.1.0,Generic,,451638,,,


## Get the canonical checklist and package terms from NCBI

Unfortunately it doesn't do a very good job of differentiating checklists (MIMAG, MIMARKS, etc.) from packages (soil, water, etc.)

_What about .ba, .euk, etc?_

In [10]:
package_dictionary = scoped_mapping.get_package_dictionary(biosample_packages_file)
package_dictionary.to_sql(
    "package_dictionary", biosample_cnx, if_exists="replace", index=False
)
package_dictionary

Unnamed: 0,Name,DisplayName,ShortName,EnvPackage,EnvPackageDisplay,NotAppropriateFor,Description,Example
0,Generic.1.0,Generic,,,,,Generic,
1,SARS-CoV-2.cl.1.0,SARS-CoV-2: clinical or host-associated; versi...,SARS-CoV-2: clinical or host-associated,,,wgs_single;wgs_batch;wgs_diploid,Use for SARS-CoV-2 samples that are relevant t...,
2,Pathogen.cl.1.0,Pathogen: clinical or host-associated; version...,Pathogen: clinical or host-associated,,,,Clinical or host-associated pathogen,SAMN02928182
3,Pathogen.env.1.0,Pathogen: environmental/food/other; version 1.0,Pathogen: environmental/food/other,,,,"Environmental, food or other pathogen",SAMN02730065
4,Microbe.1.0,Microbe; version 1.0,Microbe,,,,Use for bacteria or other unicellular microbes...,SAMN02911891
...,...,...,...,...,...,...,...,...
149,MIUVIG.plant-associated.5.0,"MIUVIG: uncultivated virus genome, plant-assoc...",MIUVIG Uncultivated Virus Genome,plant-associated,plant-associated,wgs_single;wgs_batch;wgs_diploid,Use for uncultivated virus genome identified i...,
150,MIUVIG.sediment.5.0,"MIUVIG: uncultivated virus genome, sediment; v...",MIUVIG Uncultivated Virus Genome,sediment,sediment,wgs_single;wgs_batch;wgs_diploid,Use for uncultivated virus genome identified i...,
151,MIUVIG.soil.5.0,"MIUVIG: uncultivated virus genome, soil; versi...",MIUVIG Uncultivated Virus Genome,soil,soil,wgs_single;wgs_batch;wgs_diploid,Use for uncultivated virus genome identified i...,
152,MIUVIG.wastewater.5.0,"MIUVIG: uncultivated virus genome, wastewater;...",MIUVIG Uncultivated Virus Genome,wastewater,wastewater,wgs_single;wgs_batch;wgs_diploid,Use for uncultivated virus genome identified i...,


## Do the Biosample checklist/package fields match any of the canonical values?

Start by counting the Biosample rows/records. See XXX notes on extracting this *harmonized* database

In [11]:
q = """
select count(*) as biosample_row_count
from biosample b
"""
[biosample_row_count, query_duration] = scoped_mapping.timed_query(
    q, biosample_cnx, print_timing=False
)

print(query_duration)
biosample_row_count

0:00:00.302293


Unnamed: 0,biosample_row_count
0,14300584


## How many of those rows can be inner-joined with the canonical checklists/packages?
Specifically, joining `biosample.package_name` with `package_dictionary.DisplayName`

_Note that indices are built as part of the makefile_


- create index biosample_package_name_idx on biosample(package_name);
- create index package_dictionary_DisplayName_idx on package_dictionary(DisplayName);
- create index biosample_package_idx on biosample(package);
- create index biosample_p_pn_idx on biosample(package, package_name);

In [12]:
q = """
select
    count(*) as canonical_package_name_count
from
    biosample b
inner join package_dictionary pd on
    b.package_name = pd.DisplayName
"""
[canonical_package_name_count, query_duration] = scoped_mapping.timed_query(
    q, biosample_cnx, print_timing=True
)

print(query_duration)

canonical_package_name_count

2021-06-09 09:52:57.256159
2021-06-09 09:52:58.825005
0:00:01.568846
0:00:01.568846


Unnamed: 0,canonical_package_name_count
0,14300584


## Combinations of `package` and `package_name` values in the Biosample dataset

In [13]:
q = """
select
    package,
    package_name,
    count(*) as count
from
    biosample b
group by
    package ,
    package_name
order by
    package ,
    package_name
"""
[package_name_combos, query_duration] = scoped_mapping.timed_query(
    q, biosample_cnx, print_timing=True
)

print(query_duration)

package_name_combos

2021-06-09 09:52:58.831158
2021-06-09 09:53:01.800620
0:00:02.969462
0:00:02.969462


Unnamed: 0,package,package_name,count
0,Beta-lactamase.1.0,Beta-lactamase; version 1.0,556
1,Generic.1.0,Generic,10186430
2,Human.1.0,Human; version 1.0,368893
3,Invertebrate.1.0,Invertebrate; version 1.0,131085
4,MIGS.ba.5.0,MIGS: cultured bacteria/archaea; version 5.0,28686
...,...,...,...
124,Model.organism.animal.1.0,Model organism or animal; version 1.0,382980
125,Pathogen.cl.1.0,Pathogen: clinical or host-associated; version...,511040
126,Pathogen.env.1.0,Pathogen: environmental/food/other; version 1.0,246497
127,Plant.1.0,Plant; version 1.0,385939


## What about the Biosample `env_package` values?
Are they also a small, highly regular set, like the `package` and `package_name` combinations?

In [14]:
q = """
select
    env_package,
    count(*) as count
from
    biosample b
group by
    env_package
order by
    count(*) desc
"""
[env_package_count, query_duration] = scoped_mapping.timed_query(q, biosample_cnx)

print(query_duration)

env_package_count

0:00:01.133042


Unnamed: 0,env_package,count
0,,14083847
1,host-associated,49254
2,human-gut,47921
3,water,16367
4,human-skin,13706
...,...,...
87,env_package,1
88,gut microbiome,1
89,marine sediment (ENVO:00002113),1
90,saliva,1


## `env_package` is a mixture of `ENVO` term ids and strings
Those strings may or may not be the term's labels. There are many redundancies due to small spelling and punctuation variations.

Start by tidying `env_package` etc. values recognized by INSDC

In [15]:
package_dictionary = scoped_mapping.make_tidy_col(
    package_dictionary, "EnvPackage", "eptidy"
)
package_dictionary = scoped_mapping.make_tidy_col(
    package_dictionary, "EnvPackageDisplay", "epdtidy"
)

# update in sqlite
package_dictionary.to_sql(
    "package_dictionary", biosample_cnx, if_exists="replace", index=False
)

## Determine ID patterns for common ontologies, like `ENVO`

In [16]:
q = """
select
    distinct stanza
    from statements s
where
    predicate = 'rdf:type'
    and "object" = 'owl:Class'
    and stanza = subject"""
# include non-envo IDs that come from envo?
[ids_from_envo, query_duration] = scoped_mapping.timed_query(q, envo_cnx)
print(query_duration)


0:00:00.043757


In [17]:
ids_from_envo = scoped_mapping.add_prefix_col(ids_from_envo, "stanza", "prefix")

In [18]:
id_patterns = scoped_mapping.get_multi_term_patterns(ids_from_envo, "stanza", "prefix")

id_patterns

{'BFO': 'BFO:\\d{7}',
 'CARO': 'CARO:\\d{7}',
 'CHEBI': 'CHEBI:\\d{4,6}',
 'ENVO': 'ENVO:\\d{7,8}',
 'FAO': 'FAO:0000001',
 'FOODON': 'FOODON:\\d{8}',
 'GO': 'GO:\\d{7}',
 'IAO': 'IAO:\\d{7}',
 'NCBITaxon': 'NCBITaxon:\\d+',
 'OBI': 'OBI:\\d{7}',
 'PATO': 'PATO:\\d{7}',
 'PCO': 'PCO:\\d{7}',
 'PO': 'PO:\\d{7}',
 'RO': 'RO:0002577',
 'UBERON': 'UBERON:\\d{7}'}

## Apply some normalization rules to the `env_package` values

In [19]:
env_package_normalized = scoped_mapping.env_package_nomralizastion(
    env_package_count, "env_package", id_patterns["ENVO"]
)

# getting rid of redundant? 'string' column
env_package_normalized = env_package_normalized[
    [
        "env_package",
        "count",
        "lhs",
        "rhs",
        "extract",
        "remaining_string",
        "remaining_tidied",
    ]
]

env_package_normalized

Unnamed: 0,env_package,count,lhs,rhs,extract,remaining_string,remaining_tidied
0,,14083847,,,,,
1,host-associated,49254,,host-associated,,host-associated,host associated
2,human-gut,47921,,human-gut,,human-gut,human gut
3,water,16367,,water,,water,water
4,human-skin,13706,,human-skin,,human-skin,human skin
...,...,...,...,...,...,...,...
87,env_package,1,,env_package,,env_package,env package
88,gut microbiome,1,,gut microbiome,,gut microbiome,gut microbiome
89,marine sediment (ENVO:00002113),1,,marine sediment (ENVO:00002113),ENVO:00002113,marine sediment (),marine sediment
90,saliva,1,,saliva,,saliva,saliva


In [20]:
# pick "EnvPackage"/"eptidy" from package dictionary as canonical,
# not "EnvPackageDisplay"/"epdtidy"
# but stillwant to support making XXX values from YYY canonical according to "EnvPackageDisplay"/"epdtidy"
# so make a mapping/override table

epd_to_ep = package_dictionary[["eptidy", "epdtidy"]]
# drop duplicates
epd_to_ep = epd_to_ep.drop_duplicates()

# drop blank eptidy rows
ep_blank_flag = epd_to_ep["eptidy"].eq("")
epd_to_ep = epd_to_ep.loc[~ep_blank_flag]

# drop rows where eptidy and epdtidy are the same
identical_flag = epd_to_ep["eptidy"] == epd_to_ep["epdtidy"]
epd_to_ep = epd_to_ep.loc[~identical_flag]

epd_to_ep

Unnamed: 0,eptidy,epdtidy
21,microbial,microbial mat biofilm
22,miscellaneous,miscellaneous or artificial
26,wastewater,wastewater sludge


In [21]:
# and add to manualy asserted overrides above
overrides_supplement = dict(zip(epd_to_ep["epdtidy"], epd_to_ep["eptidy"]))

overrides_supplement

{'microbial mat biofilm': 'microbial',
 'miscellaneous or artificial': 'miscellaneous',
 'wastewater sludge': 'wastewater'}

In [22]:
# not getting soil-associated
env_package_overrides.update(overrides_supplement)

env_package_overrides

{'built environment': 'built',
 'misc environment': 'miscellaneous',
 'missing': 'no environmental package',
 'unknown': 'no environmental package',
 'default': 'no environmental package',
 'unspecified': 'no environmental package',
 'not available': 'no environmental package',
 'not collected': 'no environmental package',
 'miscellaneous natural or artificial environment': 'miscellaneous',
 'not applicable': 'no environmental package',
 'soil-associated': 'soil',
 'soil associated': 'soil',
 'microbial mat biofilm': 'microbial',
 'miscellaneous or artificial': 'miscellaneous',
 'wastewater sludge': 'wastewater'}

In [23]:
env_package_normalized = scoped_mapping.add_overrides(
    env_package_normalized, "remaining_tidied", "rt_override", env_package_overrides
)

In [24]:
denorm_frame = package_dictionary[["EnvPackage", "eptidy"]]
denorm_frame = denorm_frame.drop_duplicates()
denorm_frame

Unnamed: 0,EnvPackage,eptidy
0,,
12,No environmental package,no environmental package
13,air,air
14,built,built
15,host-associated,host associated
16,human-associated,human associated
17,human-gut,human gut
18,human-oral,human oral
19,human-skin,human skin
20,human-vaginal,human vaginal


In [25]:
env_package_normalized = env_package_normalized.merge(
    denorm_frame, how="left", left_on="rt_override", right_on="eptidy"
)

env_package_normalized = env_package_normalized[
    [
        "env_package",
        "count",
        "lhs",
        "rhs",
        "extract",
        "remaining_string",
        "remaining_tidied",
        "rt_override",
        "EnvPackage",
    ]
]

non_canonical_flag = env_package_normalized["EnvPackage"].isna()
env_package_normalized["is_canonical"] = True
env_package_normalized.loc[non_canonical_flag, "is_canonical"] = False

# env_package = env_package annotation from NCBI Biosample file XXX
# count = number of biosamples using that env_package annotation
# lhs = checklist info
# rhs = potential package info
# extract = potential OBO ID from rhs column (currently harcoded and only looking for ENVO IDs)
# remaining_string = rhs/string, with potential OBO IDs removed
# remaining_tidied = remaining_string with case, whitespace and punctuation normailzastion
# rt_override = some remaining_tidied values can be replaced according to env_package_overrides
# EnvPackage = corresponding de-normalized value from package_dictionary
# is_canonical = false when EnvPackage is NaN

env_package_normalized.to_sql(
    "env_package_normalized", biosample_cnx, if_exists="replace", index=False
)

env_package_normalized.to_csv(
    "biosample_env_package_normalizastion.tsv", sep=ols_review_seperator, index=False
)

In [26]:
env_package_normalized

Unnamed: 0,env_package,count,lhs,rhs,extract,remaining_string,remaining_tidied,rt_override,EnvPackage,is_canonical
0,,14083847,,,,,,,,True
1,host-associated,49254,,host-associated,,host-associated,host associated,host associated,host-associated,True
2,human-gut,47921,,human-gut,,human-gut,human gut,human gut,human-gut,True
3,water,16367,,water,,water,water,water,water,True
4,human-skin,13706,,human-skin,,human-skin,human skin,human skin,human-skin,True
...,...,...,...,...,...,...,...,...,...,...
87,env_package,1,,env_package,,env_package,env package,env package,,False
88,gut microbiome,1,,gut microbiome,,gut microbiome,gut microbiome,gut microbiome,,False
89,marine sediment (ENVO:00002113),1,,marine sediment (ENVO:00002113),ENVO:00002113,marine sediment (),marine sediment,marine sediment,,False
90,saliva,1,,saliva,,saliva,saliva,saliva,,False


## What do the successful normalizations look like?

In [27]:
q = """
select
    env_package,
    count,
    lhs,
    extract,
    EnvPackage
from
    env_package_normalized
where
    is_canonical = 1
"""
[successful_normalizastions, query_duration] = scoped_mapping.timed_query(
    q, biosample_cnx
)

print(query_duration)

successful_normalizastions

0:00:00.002170


Unnamed: 0,env_package,count,lhs,extract,EnvPackage
0,,14083847,,,
1,host-associated,49254,,,host-associated
2,human-gut,47921,,,human-gut
3,water,16367,,,water
4,human-skin,13706,,,human-skin
...,...,...,...,...,...
57,not collected,3,,,No environmental package
58,host associated,2,,,host-associated
59,ENVO:00000016,1,,ENVO:00000016,
60,Sediment,1,,,sediment


# Are there any normalization failures?

In [28]:
q = """
select
    env_package,
    count,
    lhs,
    extract,
    EnvPackage
from
    env_package_normalized
where
    is_canonical = 0
"""
[normalizastion_failures, query_duration] = scoped_mapping.timed_query(q, biosample_cnx)

print(query_duration)

normalizastion_failures

0:00:00.002603


Unnamed: 0,env_package,count,lhs,extract,EnvPackage
0,mimarks,479,,,
1,mouse-gut,406,,,
2,gut,172,,,
3,biofilm,114,,,
4,human-not providedsopharyngeal,107,,,
5,mice gut,87,,,
6,CV,60,,,
7,"home, outdoor environment",44,,,
8,fermentation-associated,42,,,
9,sterile water,35,,,


# Utilizing taxonomy for broad subsetting


**This uses an SQLite database in which the transitive closure over subClassOf has already been materialized. See the README and Makefile.**

Specifically, flag the Biosamples whose `taxon_id` indicates they are an unclassified entity. Ignoring the others will throw out samples of multicellular organisms, like fruit flies.



## Get a listing of all taxa that are transitive subclasses of `NCBITaxon:2787823`

I.e. 'unclassified entities'

In [29]:
q = """
select
    distinct s.subject
from
    entailed_edge ee
join statements s on
    ee.subject = s.subject
where
    ee.predicate = 'rdfs:subClassOf'
    and ee.object = 'NCBITaxon:2787823'
    and s.predicate = 'rdfs:label'
"""
[unclassified_taxa, query_duration] = scoped_mapping.timed_query(q, ncbitaxon_cnx)
unclassified_taxa["unclassified"] = True

print(query_duration)

unclassified_taxa

0:00:13.252446


Unnamed: 0,subject,unclassified
0,NCBITaxon:1006967,True
1,NCBITaxon:1041057,True
2,NCBITaxon:1046002,True
3,NCBITaxon:1046003,True
4,NCBITaxon:1046004,True
...,...,...
989,NCBITaxon:939928,True
990,NCBITaxon:941420,True
991,NCBITaxon:941421,True
992,NCBITaxon:941422,True


## Get taxon counts from the Biosample metadata

In [30]:
q = """
select
    taxonomy_id biosample_taxid,
    count(*) as count
from
    biosample b
group by
    taxonomy_id
order by
    count(*) desc
"""
[biosample_tax_id_counts, query_duration] = scoped_mapping.timed_query(q, biosample_cnx)
biosample_tax_id_counts["curie"] = "NCBITaxon:" + biosample_tax_id_counts[
    "biosample_taxid"
].astype(str)

print(query_duration)

0:00:01.530258


## Merge the two taxonomy dataframes

I.e. flag the the Biosample records whose `taxonomy_id` field belongs to a subclass of 'unclassified entries'.

In [31]:
biosample_tax_id_counts = biosample_tax_id_counts.merge(
    unclassified_taxa, left_on="curie", right_on="subject", how="left"
)
biosample_tax_id_counts.unclassified.fillna(False, inplace=True)

biosample_tax_id_counts

Unnamed: 0,biosample_taxid,count,curie,subject,unclassified
0,9606,6819707,NCBITaxon:9606,,False
1,10090,964219,NCBITaxon:10090,,False
2,408170,290862,NCBITaxon:408170,NCBITaxon:408170,True
3,410658,280666,NCBITaxon:410658,NCBITaxon:410658,True
4,646099,208741,NCBITaxon:646099,NCBITaxon:646099,True
...,...,...,...,...,...
163372,999891,1,NCBITaxon:999891,,False
163373,999892,1,NCBITaxon:999892,,False
163374,999898,1,NCBITaxon:999898,,False
163375,999931,1,NCBITaxon:999931,,False


## Add labels to all taxa

In [32]:
q = """
select
    subject ,
    value
from statements
where
    predicate = 'rdfs:label' and subject = stanza
"""
[all_tax_labels, query_duration] = scoped_mapping.timed_query(q, ncbitaxon_cnx)

biosample_tax_id_counts = biosample_tax_id_counts.merge(
    all_tax_labels, left_on="curie", right_on="subject", how="left"
)

biosample_tax_id_counts = biosample_tax_id_counts[
    ["curie", "biosample_taxid", "count", "unclassified", "value"]
]
biosample_tax_id_counts.columns = [
    "curie",
    "biosample_taxid",
    "count",
    "unclassified",
    "label",
]

print(query_duration)
biosample_tax_id_counts.to_sql(
    "biobiosample_tax_id_counts", biosample_cnx, if_exists="replace", index=False
)

biosample_tax_id_counts

0:00:10.064933


Unnamed: 0,curie,biosample_taxid,count,unclassified,label
0,NCBITaxon:9606,9606,6819707,False,Homo sapiens
1,NCBITaxon:10090,10090,964219,False,Mus musculus
2,NCBITaxon:408170,408170,290862,True,human gut metagenome
3,NCBITaxon:410658,410658,280666,True,soil metagenome
4,NCBITaxon:646099,646099,208741,True,human metagenome
...,...,...,...,...,...
163372,NCBITaxon:999891,999891,1,False,Bacillus amyloliquefaciens TA208
163373,NCBITaxon:999892,999892,1,False,[Propionibacterium] humerusii P08
163374,NCBITaxon:999898,999898,1,False,Peptococcaceae bacterium CEB3
163375,NCBITaxon:999931,999931,1,False,Barrientosiimonas humi


**Almost all of the taxa that are common in the biosample collection are either unclassified/metagenomes or easily recognized cellular organisms. Cellular organism samples are de-prioritized in this exercise**

Exceptions include:
- 32630 = synthetic construct (other entries; other sequences; artificial sequences)
    - 'other entries' would add 16k rows on top of the 1k 'unclassified entities'
    - metagenomes account for 331 of the 'unclassified entities'
    - there are also a small number of uncultured/unclassified microorganisms in the biosample dataset
- 77133 = uncultured bacterium (cellular organisms; Bacteria; environmental samples)
    - 'cellular organisms' would add 2M rows on top of the 1k 'unclassified entities'
    - 'cellular organisms; Bacteria; environmental samples' adds 26k
    
----

## Get a table of MIxS annotations to be mapped to ontology classes.

Explicitly scope based on normalized package data. These values were set at the top of this notebook.


In [33]:
print(biosample_col_to_map)
print(scoping_col)
print(scoping_value)

env_broad_scale
env_package_normalized.EnvPackage
soil


**In this case, the scoping includes an inner join requirement for 'unclassified entities'**

In [34]:
q = (
    "select "
    + biosample_col_to_map
    + """, count(*) as count
from
    biosample b
join env_package_normalized on
    b.env_package = env_package_normalized.env_package
inner join biobiosample_tax_id_counts stic on
    b.taxonomy_id = stic.biosample_taxid
where """
    + scoping_col
    + " = '"
    + scoping_value
    + "' group by "
    + biosample_col_to_map
    + """
order by
    count(*) desc"""
)

[mapping_candidates, query_duration] = scoped_mapping.timed_query(q, biosample_cnx)

mapping_candidates

Unnamed: 0,env_broad_scale,count
0,ENVO:cropland biome,4856
1,cropland biome,1530
2,urban biome,973
3,tundra biome,516
4,terrestrial biome,483
...,...,...
239,terestrial,1
240,thermokarst ponds,1
241,urban boiome,1
242,wastewater treatment system,1


----

## The Biosample format allows for pipe-delimited environmental package lists. 

Separate those out into their components.

----


In [35]:

multi_frames = []
for row in mapping_candidates.itertuples(index=True, name="Pandas"):
    split_check = row.env_broad_scale
    if split_check is None:
        split_check = ""
    splitted = pd.Series(split_check.split("|"))
    splitted_count = len(splitted)
    repeated = [split_check] * splitted_count
    repeated = pd.Series(repeated)
    as_frame = pd.DataFrame(dict(repeated=repeated, splitted=splitted)).reset_index()
    seq_list = list(range(1, splitted_count + 1))
    as_frame["part_count"] = splitted_count
    as_frame["seq"] = seq_list
    multi_frames.append(as_frame)
concat_frame = pd.concat(multi_frames)
concat_frame = concat_frame[["repeated", "splitted", "part_count", "seq"]]
concat_frame

Unnamed: 0,repeated,splitted,part_count,seq
0,ENVO:cropland biome,ENVO:cropland biome,1,1
0,cropland biome,cropland biome,1,1
0,urban biome,urban biome,1,1
0,tundra biome,tundra biome,1,1
0,terrestrial biome,terrestrial biome,1,1
...,...,...,...,...
0,terestrial,terestrial,1,1
0,thermokarst ponds,thermokarst ponds,1,1
0,urban boiome,urban boiome,1,1
0,wastewater treatment system,wastewater treatment system,1,1


In [36]:
mapping_candidates = mapping_candidates.merge(
    concat_frame, left_on=biosample_col_to_map, right_on="repeated", how="left"
)

mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted,part_count,seq
0,ENVO:cropland biome,4856,ENVO:cropland biome,ENVO:cropland biome,1.0,1.0
1,cropland biome,1530,cropland biome,cropland biome,1.0,1.0
2,urban biome,973,urban biome,urban biome,1.0,1.0
3,tundra biome,516,tundra biome,tundra biome,1.0,1.0
4,terrestrial biome,483,terrestrial biome,terrestrial biome,1.0,1.0
...,...,...,...,...,...,...
239,terestrial,1,terestrial,terestrial,1.0,1.0
240,thermokarst ponds,1,thermokarst ponds,thermokarst ponds,1.0,1.0
241,urban boiome,1,urban boiome,urban boiome,1.0,1.0
242,wastewater treatment system,1,wastewater treatment system,wastewater treatment system,1.0,1.0


## Normalize a few different ways `ENVO` IDs have been entered
In the Biosample metadata

In [37]:
# HARCODED/SINGLE PREFIX
# this helps by standardizing what could be an ID
# but if the RHS is text, then envo should just be removed
mapping_candidates["envo_tidy"] = mapping_candidates.splitted.str.replace(
    "envo[:_ ]", "ENVO:", regex=True, case=False
)

mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted,part_count,seq,envo_tidy
0,ENVO:cropland biome,4856,ENVO:cropland biome,ENVO:cropland biome,1.0,1.0,ENVO:cropland biome
1,cropland biome,1530,cropland biome,cropland biome,1.0,1.0,cropland biome
2,urban biome,973,urban biome,urban biome,1.0,1.0,urban biome
3,tundra biome,516,tundra biome,tundra biome,1.0,1.0,tundra biome
4,terrestrial biome,483,terrestrial biome,terrestrial biome,1.0,1.0,terrestrial biome
...,...,...,...,...,...,...,...
239,terestrial,1,terestrial,terestrial,1.0,1.0,terestrial
240,thermokarst ponds,1,thermokarst ponds,thermokarst ponds,1.0,1.0,thermokarst ponds
241,urban boiome,1,urban boiome,urban boiome,1.0,1.0,urban boiome
242,wastewater treatment system,1,wastewater treatment system,wastewater treatment system,1.0,1.0,wastewater treatment system


# Now try to extract ontology terms that are already present

In [38]:
candidate_series_decomposition = scoped_mapping.decompose_series(
    mapping_candidates["envo_tidy"], id_patterns[target_onto_prefix]
)

mapping_candidates = pd.concat(
    [mapping_candidates, candidate_series_decomposition], axis=1
)

# SEE "HARCODED/SINGLE PREFIX" ABOVE
mapping_candidates["remaining_tidied"] = mapping_candidates.remaining_tidied.str.replace(
    "envo[:_ ]", "", regex=True, case=False
)

mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted,part_count,seq,envo_tidy,string,extract,remaining_string,remaining_tidied
0,ENVO:cropland biome,4856,ENVO:cropland biome,ENVO:cropland biome,1.0,1.0,ENVO:cropland biome,ENVO:cropland biome,,ENVO:cropland biome,cropland biome
1,cropland biome,1530,cropland biome,cropland biome,1.0,1.0,cropland biome,cropland biome,,cropland biome,cropland biome
2,urban biome,973,urban biome,urban biome,1.0,1.0,urban biome,urban biome,,urban biome,urban biome
3,tundra biome,516,tundra biome,tundra biome,1.0,1.0,tundra biome,tundra biome,,tundra biome,tundra biome
4,terrestrial biome,483,terrestrial biome,terrestrial biome,1.0,1.0,terrestrial biome,terrestrial biome,,terrestrial biome,terrestrial biome
...,...,...,...,...,...,...,...,...,...,...,...
239,terestrial,1,terestrial,terestrial,1.0,1.0,terestrial,terestrial,,terestrial,terestrial
240,thermokarst ponds,1,thermokarst ponds,thermokarst ponds,1.0,1.0,thermokarst ponds,thermokarst ponds,,thermokarst ponds,thermokarst ponds
241,urban boiome,1,urban boiome,urban boiome,1.0,1.0,urban boiome,urban boiome,,urban boiome,urban boiome
242,wastewater treatment system,1,wastewater treatment system,wastewater treatment system,1.0,1.0,wastewater treatment system,wastewater treatment system,,wastewater treatment system,wastewater treatment system


## Join the extracted IDs with their labels

Start by connecting to the rdftab database from which the terms and label-like annotations will be obtained

In [39]:
ontodb = "../semantic-sql/db/" + target_onto_prefix.lower() + ".db"
ontocon = sqlite3.connect(ontodb)

## extracting the labels

In [40]:
q = """
select
    subject ,
    value
from
    statements s
where
    predicate = 'rdfs:label'
"""
[onto_labels, query_duration] = scoped_mapping.timed_query(q, ontocon)

onto_labels

Unnamed: 0,subject,value
0,IAO:0000111,editor preferred term~editor preferred label
1,IAO:0000112,example of usage
2,IAO:0000114,has curation status
3,IAO:0000115,definition
4,IAO:0000116,editor note
...,...,...
6774,ENVO:01001862,Solar radiation
6775,<https://www.wikidata.org/wiki/Q2>,Earth
6776,<https://www.wikidata.org/wiki/Q2306597>,Suni
6777,<https://www.wikidata.org/wiki/Q525>,Sol


## and merging 

In [41]:
mapping_candidates = mapping_candidates.merge(
    onto_labels, left_on="extract", right_on="subject", how="left"
)
mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted,part_count,seq,envo_tidy,string,extract,remaining_string,remaining_tidied,subject,value
0,ENVO:cropland biome,4856,ENVO:cropland biome,ENVO:cropland biome,1.0,1.0,ENVO:cropland biome,ENVO:cropland biome,,ENVO:cropland biome,cropland biome,,
1,cropland biome,1530,cropland biome,cropland biome,1.0,1.0,cropland biome,cropland biome,,cropland biome,cropland biome,,
2,urban biome,973,urban biome,urban biome,1.0,1.0,urban biome,urban biome,,urban biome,urban biome,,
3,tundra biome,516,tundra biome,tundra biome,1.0,1.0,tundra biome,tundra biome,,tundra biome,tundra biome,,
4,terrestrial biome,483,terrestrial biome,terrestrial biome,1.0,1.0,terrestrial biome,terrestrial biome,,terrestrial biome,terrestrial biome,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,terestrial,1,terestrial,terestrial,1.0,1.0,terestrial,terestrial,,terestrial,terestrial,,
240,thermokarst ponds,1,thermokarst ponds,thermokarst ponds,1.0,1.0,thermokarst ponds,thermokarst ponds,,thermokarst ponds,thermokarst ponds,,
241,urban boiome,1,urban boiome,urban boiome,1.0,1.0,urban boiome,urban boiome,,urban boiome,urban boiome,,
242,wastewater treatment system,1,wastewater treatment system,wastewater treatment system,1.0,1.0,wastewater treatment system,wastewater treatment system,,wastewater treatment system,wastewater treatment system,,


## Use cosine string distance to see if the labels match closely enough

I.e. the labels claimed by the Biosample data set and the labels asserted in the ontology. if they're close enough, consider the assigned ID legit


_How close is close enough?_

In [42]:
my_cosine_obj = Cosine(my_string_dist_arg)
mapping_candidates["value"] = mapping_candidates["value"].fillna("")
mapping_candidates["remaining_tidied"] = mapping_candidates["remaining_tidied"].fillna("")
mapping_candidates["cosine"] = mapping_candidates.apply(
    lambda my_row: my_cosine_obj.distance(
        my_row["remaining_tidied"].lower(), my_row["value"].lower()
    ),
    axis=1,
)
mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted,part_count,seq,envo_tidy,string,extract,remaining_string,remaining_tidied,subject,value,cosine
0,ENVO:cropland biome,4856,ENVO:cropland biome,ENVO:cropland biome,1.0,1.0,ENVO:cropland biome,ENVO:cropland biome,,ENVO:cropland biome,cropland biome,,,1.0
1,cropland biome,1530,cropland biome,cropland biome,1.0,1.0,cropland biome,cropland biome,,cropland biome,cropland biome,,,1.0
2,urban biome,973,urban biome,urban biome,1.0,1.0,urban biome,urban biome,,urban biome,urban biome,,,1.0
3,tundra biome,516,tundra biome,tundra biome,1.0,1.0,tundra biome,tundra biome,,tundra biome,tundra biome,,,1.0
4,terrestrial biome,483,terrestrial biome,terrestrial biome,1.0,1.0,terrestrial biome,terrestrial biome,,terrestrial biome,terrestrial biome,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,terestrial,1,terestrial,terestrial,1.0,1.0,terestrial,terestrial,,terestrial,terestrial,,,1.0
240,thermokarst ponds,1,thermokarst ponds,thermokarst ponds,1.0,1.0,thermokarst ponds,thermokarst ponds,,thermokarst ponds,thermokarst ponds,,,1.0
241,urban boiome,1,urban boiome,urban boiome,1.0,1.0,urban boiome,urban boiome,,urban boiome,urban boiome,,,1.0
242,wastewater treatment system,1,wastewater treatment system,wastewater treatment system,1.0,1.0,wastewater treatment system,wastewater treatment system,,wastewater treatment system,wastewater treatment system,,,1.0


In [43]:
mapping_candidates.to_clipboard(index=False)

**Previously, we did a reality check on the claimed IDs and labels. If a label is claimed without any ID, that could still be a path to an ontology term.**

We'll be doing some merging, so make sure column names are meaningful and unique


In [44]:
mapping_candidates.columns = [
    "env_broad_scale",
    "count",
    "repeated",
    "splitted",
    "part_count",
    "seq",
    "envo_tidy",
    "string",
    "extract",
    "remaining_string",
    "remaining_tidied",
    "term_id",
    "lab_from_id",
    "lfi_cosine",
]
mapping_candidates = mapping_candidates.merge(
    onto_labels, left_on="remaining_tidied", right_on="value", how="left"
)

mapping_candidates.columns = [
    "env_broad_scale",
    "count",
    "repeated",
    "splitted",
    "part_count",
    "seq",
    "envo_tidy",
    "string",
    "extract",
    "remaining_string",
    "remaining_tidied",
    "term_id",
    "lab_from_id",
    "lfi_cosine",
    "term_id_from_lab",
    "value",
]


mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted,part_count,seq,envo_tidy,string,extract,remaining_string,remaining_tidied,term_id,lab_from_id,lfi_cosine,term_id_from_lab,value
0,ENVO:cropland biome,4856,ENVO:cropland biome,ENVO:cropland biome,1.0,1.0,ENVO:cropland biome,ENVO:cropland biome,,ENVO:cropland biome,cropland biome,,,1.0,ENVO:01000245,cropland biome
1,cropland biome,1530,cropland biome,cropland biome,1.0,1.0,cropland biome,cropland biome,,cropland biome,cropland biome,,,1.0,ENVO:01000245,cropland biome
2,urban biome,973,urban biome,urban biome,1.0,1.0,urban biome,urban biome,,urban biome,urban biome,,,1.0,ENVO:01000249,urban biome
3,tundra biome,516,tundra biome,tundra biome,1.0,1.0,tundra biome,tundra biome,,tundra biome,tundra biome,,,1.0,ENVO:01000180,tundra biome
4,terrestrial biome,483,terrestrial biome,terrestrial biome,1.0,1.0,terrestrial biome,terrestrial biome,,terrestrial biome,terrestrial biome,,,1.0,ENVO:00000446,terrestrial biome
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,terestrial,1,terestrial,terestrial,1.0,1.0,terestrial,terestrial,,terestrial,terestrial,,,1.0,,
240,thermokarst ponds,1,thermokarst ponds,thermokarst ponds,1.0,1.0,thermokarst ponds,thermokarst ponds,,thermokarst ponds,thermokarst ponds,,,1.0,,
241,urban boiome,1,urban boiome,urban boiome,1.0,1.0,urban boiome,urban boiome,,urban boiome,urban boiome,,,1.0,,
242,wastewater treatment system,1,wastewater treatment system,wastewater treatment system,1.0,1.0,wastewater treatment system,wastewater treatment system,,wastewater treatment system,wastewater treatment system,,,1.0,,


## Find consensus term IDs and labels


In [46]:
mapping_candidates["consensus_id"] = mapping_candidates["term_id_from_lab"]
mapping_candidates["consensus_lab"] = mapping_candidates["value"]

flag = mapping_candidates["consensus_id"].isnull() & (
    (
        ~mapping_candidates["term_id"].isnull()
        & mapping_candidates.lfi_cosine.le(my_max_string_dist)
    )
    | (
        ~mapping_candidates["term_id"].isnull()
        & mapping_candidates["remaining_tidied"].eq("")
    )
)

replacements = mapping_candidates.loc[flag, "term_id"]
mapping_candidates.loc[flag, "consensus_id"] = replacements

replacements = mapping_candidates.loc[flag, "lab_from_id"]
mapping_candidates.loc[flag, "consensus_lab"] = replacements

flag = mapping_candidates.consensus_id.isna()
antiflag = ~flag
mapping_candidates["id_or_lab_ok"] = antiflag

mapping_candidates["assembled_consensus"] = (
    mapping_candidates["consensus_lab"]
    + " ["
    + mapping_candidates["consensus_id"]
    + "]"
)

In [47]:
mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted,part_count,seq,envo_tidy,string,extract,remaining_string,remaining_tidied,term_id,lab_from_id,lfi_cosine,term_id_from_lab,value,consensus_id,consensus_lab,id_or_lab_ok,assembled_consensus
0,ENVO:cropland biome,4856,ENVO:cropland biome,ENVO:cropland biome,1.0,1.0,ENVO:cropland biome,ENVO:cropland biome,,ENVO:cropland biome,cropland biome,,,1.0,ENVO:01000245,cropland biome,ENVO:01000245,cropland biome,True,cropland biome [ENVO:01000245]
1,cropland biome,1530,cropland biome,cropland biome,1.0,1.0,cropland biome,cropland biome,,cropland biome,cropland biome,,,1.0,ENVO:01000245,cropland biome,ENVO:01000245,cropland biome,True,cropland biome [ENVO:01000245]
2,urban biome,973,urban biome,urban biome,1.0,1.0,urban biome,urban biome,,urban biome,urban biome,,,1.0,ENVO:01000249,urban biome,ENVO:01000249,urban biome,True,urban biome [ENVO:01000249]
3,tundra biome,516,tundra biome,tundra biome,1.0,1.0,tundra biome,tundra biome,,tundra biome,tundra biome,,,1.0,ENVO:01000180,tundra biome,ENVO:01000180,tundra biome,True,tundra biome [ENVO:01000180]
4,terrestrial biome,483,terrestrial biome,terrestrial biome,1.0,1.0,terrestrial biome,terrestrial biome,,terrestrial biome,terrestrial biome,,,1.0,ENVO:00000446,terrestrial biome,ENVO:00000446,terrestrial biome,True,terrestrial biome [ENVO:00000446]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,terestrial,1,terestrial,terestrial,1.0,1.0,terestrial,terestrial,,terestrial,terestrial,,,1.0,,,,,False,
240,thermokarst ponds,1,thermokarst ponds,thermokarst ponds,1.0,1.0,thermokarst ponds,thermokarst ponds,,thermokarst ponds,thermokarst ponds,,,1.0,,,,,False,
241,urban boiome,1,urban boiome,urban boiome,1.0,1.0,urban boiome,urban boiome,,urban boiome,urban boiome,,,1.0,,,,,False,
242,wastewater treatment system,1,wastewater treatment system,wastewater treatment system,1.0,1.0,wastewater treatment system,wastewater treatment system,,wastewater treatment system,wastewater treatment system,,,1.0,,,,,False,


In [48]:
# mapping_candidates.to_clipboard(index=False)

## Save these easy term mappings to SQLite

In [49]:
# mapping_candidates.to_sql('mapping_scratch', biosample_cnx, if_exists='replace', index=False)

## For which Biosample annotations were no easy mappings found?
How many Biosamples uses those annotations?

In [50]:
flag = ~mapping_candidates.id_or_lab_ok
needs_search = mapping_candidates.loc[flag]

needs_search = needs_search[["remaining_tidied", "count"]]

sum_by_needed = needs_search.groupby("remaining_tidied")["count"].sum()

sum_by_needed = sum_by_needed.to_frame()
sum_by_needed["remaining_tidied"] = sum_by_needed.index

sum_by_needed = sum_by_needed.sort_values("count", ascending=False)
sum_by_needed.reset_index(drop=True)

sum_by_needed.to_sql("sum_by_needed", biosample_cnx, if_exists="replace", index=False)

In [51]:
sum_by_needed

Unnamed: 0_level_0,count,remaining_tidied
remaining_tidied,Unnamed: 1_level_1,Unnamed: 2_level_1
forest,456,forest
tundra,347,tundra
hot springs,172,hot springs
tropical and subtropical moist broadleaf forest biome,170,tropical and subtropical moist broadleaf fores...
arctic,164,arctic
...,...,...
coastal saline 9,1,coastal saline 9
coastal saline 8,1,coastal saline 8
coastal saline 7,1,coastal saline 7
coastal saline 6,1,coastal saline 6


## Extract the tidied strings

In [52]:
ebs_raw_list = list(sum_by_needed["remaining_tidied"])
ebs_raw_list.sort()

## Submit those tidied strings to a search engine

Specifically OLS search. This takes roughly one second per unique post-tidied submission

_Turn logging back on to show status?_
_Print the count and pre- and post- datestamps_


In [53]:
ebs_search_res = scoped_mapping.search_get_annotations_wrapper(
    ebs_raw_list,
    bad_chars=chars_to_whiteout,
    cat_name=biosample_col_to_map,
    ontoprefix="envo,gaz",
    query_fields="",
    rr=5,
    string_dist_arg=my_string_dist_arg,
)
ebs_search_res.to_sql(
    "ols_search_results", biosample_cnx, if_exists="replace", index=False
)

## Filter out the best of the acceptable mappings
From a string distance perspective

In [54]:
my_best_acceptable = scoped_mapping.get_best_acceptable(
    ebs_search_res, max_string_dist=my_max_string_dist
)
my_best_acceptable

Unnamed: 0,category,raw,query,name,string_dist_rank,string_dist,obo_id,label,search_rank,ontology_prefix,scope,type,iri,ontology_name
197,env_broad_scale,bacteria,bacteria,Bacteria,1,0.0,NCBITaxon:2,Bacteria,1,ENVO,label,label,http://purl.obolibrary.org/obo/NCBITaxon_2,envo
240,env_broad_scale,basin,basin,Basin,1,0.0,GAZ:22223655,Basin,1,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_22223655,gaz
296,env_broad_scale,boreal forest,boreal forest,boreal forest,1,0.0,ENVO:01000250,subpolar coniferous forest biome,1,ENVO,has_narrow_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_01000250,envo
557,env_broad_scale,coniferous forest,coniferous forest,coniferous forest,1,0.0,ENVO:01000433,needleleaf forest,1,ENVO,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_01000433,envo
688,env_broad_scale,forest,forest,Forest,1,0.0,GAZ:00454366,Forest,1,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_00454366,gaz
773,env_broad_scale,grassland,grassland,Grassland,1,0.0,ENVO:01001206,grassland ecosystem,1,ENVO,has_related_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_01001206,envo
790,env_broad_scale,hot springs,hot springs,Hot Springs,1,0.0,GAZ:22224982,Hot Springs,1,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_22224982,gaz
826,env_broad_scale,leaf litter,leaf litter,leaf litter,1,0.0,ENVO:01000628,plant litter,1,ENVO,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_01000628,envo
1017,env_broad_scale,ria de aveiro,ria de aveiro,Ria de Aveiro,1,0.0,GAZ:00146341,Ria de Aveiro,1,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_00146341,gaz
1031,env_broad_scale,rice paddy,rice paddy,rice paddy,1,0.0,ENVO:00000296,rice field,3,ENVO,has_related_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_00000296,envo


----

## Filter out the submissions with no acceptable matches

In [55]:
no_acceptable_mappings = scoped_mapping.get_no_acceptable_mappings(
    ebs_search_res, my_best_acceptable
)

no_acceptable_mappings

Unnamed: 0,category,raw,query,name,string_dist_rank,string_dist,obo_id,label,search_rank,ontology_prefix,scope,type,iri,ontology_name
0,env_broad_scale,,,,1,0.000,,,1,,,,,
8,env_broad_scale,a semi arid grassland soil fungi,a semi arid grassland soil fungi,grassland soil,1,0.297,ENVO:00005750,grassland soil,2,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_00005750,envo
10,env_broad_scale,a semi arid grassland soil fungi,a semi arid grassland soil fungi,flooded grassland biome,2,0.551,ENVO:01000195,flooded grassland biome,4,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_01000195,envo
7,env_broad_scale,a semi arid grassland soil fungi,a semi arid grassland soil fungi,semi-arid climate,3,0.662,ENVO:01000378,arid subtropical,1,ENVO,has_related_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_01000378,envo
6,env_broad_scale,a semi arid grassland soil fungi,a semi arid grassland soil fungi,semi-arid climate,4,0.662,ENVO:01000378,arid subtropical,1,ENVO,hasRelatedSynonym,,http://purl.obolibrary.org/obo/ENVO_01000378,envo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1375,env_broad_scale,western amazon basin,western amazon basin,River Amazon,7,0.539,GAZ:00047130,River Amazon,4,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_00047130,gaz
1371,env_broad_scale,western amazon basin,western amazon basin,Amazonia,8,0.588,GAZ:00006844,Amazonia,2,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_00006844,gaz
1379,env_broad_scale,western amazon basin,western amazon basin,Guiana Basin,9,0.671,GAZ:00143796,Guiana Basin,5,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_00143796,gaz
1376,env_broad_scale,western amazon basin,western amazon basin,rio amazonas {language: portuguese},10,0.709,GAZ:00047130,River Amazon,4,GAZ,synonym,synonym,http://purl.obolibrary.org/obo/GAZ_00047130,gaz


## Try searching the failures against all ontologies in OLS

In [56]:
still_unmapped = list(set(list(no_acceptable_mappings["raw"])))
still_unmapped.sort()

salvage_search_res = scoped_mapping.search_get_annotations_wrapper(
    still_unmapped,
    bad_chars="._-",
    cat_name="salvage",
    ontoprefix="",
    query_fields="",
    rr=5,
    string_dist_arg=2,
)

salvage_search_res.to_sql(
    "salvage_search_res", biosample_cnx, if_exists="replace", index=False
)

## We appear to be at a point of diminishing returns
At the very least, will require some review

In [57]:
salvage_best_acceptable = scoped_mapping.get_best_acceptable(
    salvage_search_res, max_string_dist=0.2
)
salvage_best_acceptable

Unnamed: 0,category,raw,query,name,string_dist_rank,string_dist,obo_id,label,search_rank,ontology_prefix,scope,type,iri,ontology_name
106,salvage,arctic,arctic,Arctic,1,0.0,NCIT:C44738,Arctic,1,NCIT,label,label,http://purl.obolibrary.org/obo/NCIT_C44738,ncit
287,salvage,artificial soil,artificial soil,Artificial,1,0.198,NCIT:C61464,Artificial,1,NCIT,label,label,http://purl.obolibrary.org/obo/NCIT_C61464,ncit
319,salvage,bamboo,bamboo,Bamboo Dam,1,0.191,GAZ:00148513,Bamboo Dam,2,GAZ,label,label,http://purl.obolibrary.org/obo/GAZ_00148513,gaz
389,salvage,beech forest,beech forest,beech forest soil,1,0.171,ENVO:00005770,beech forest soil,1,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_00005770,envo
983,salvage,cold desert soil,cold desert soil,cold desert,1,0.184,ENVO:01000382,cold desert,1,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_01000382,envo
1026,salvage,cotton,cotton,cotton,1,0.0,NCIT:C77117,Cotton Rat,1,NCIT,synonym,synonym,http://purl.obolibrary.org/obo/NCIT_C77117,ncit
1072,salvage,crop,crop,Crop,1,0.0,NCIT:C111162,Crop,1,NCIT,label,label,http://purl.obolibrary.org/obo/NCIT_C111162,ncit
1095,salvage,cryoconite,cryoconite,cryoconite hole,1,0.198,ENVO:03000039,cryoconite hole,2,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_03000039,envo
1361,salvage,deciduous broadleaf forest,deciduous broadleaf forest,tropical deciduous broadleaf forest,1,0.133,ENVO:01000387,tropical deciduous broadleaf forest,4,MICRO,label,label,http://purl.obolibrary.org/obo/ENVO_01000387,micro
1390,salvage,deyeuxia arundinacea,deyeuxia arundinacea,Deyeuxia arundinacea,1,0.0,NCBITaxon:1767996,Deyeuxia arundinacea,1,NCBITAXON,label,label,http://purl.obolibrary.org/obo/NCBITaxon_1767996,ncbitaxon


## Save the mappings for review!

_Note that the merge on ID or label mappings are not included here_


In [59]:
my_best_acceptable[include_col] = first_pass_include_val
salvage_best_acceptable[include_col] = salvage_include_val
best_and_salvage = pd.concat([my_best_acceptable, salvage_best_acceptable])

best_and_salvage["ols_curation_notes"] = ""

best_and_salvage.to_csv(ols_review_file, sep=ols_review_seperator, index=False)


In [60]:
best_and_salvage.to_clipboard(index=False)

In [None]:
raise SystemExit("Don't skip reviewing and saving!")

## Do some review in a spreadsheet application!
Specifically, columns `include` and `ols_curation_notes`

Then Save

----

## Now read the reviewed spreadsheet back in

In [None]:
curated = pd.read_csv(ols_review_file, sep=ols_review_seperator)
curated

In [63]:
# OR
curated = pd.read_clipboard()
curated

Unnamed: 0,raw,query,name,obo_id,label,ontology_prefix,scope,type,ontology_name,include,ols_curation_notes
0,ectomycorrhizae,ectomycorrhizae,ectomycorrhiza,BTO:0002237,ectomycorrhiza,BTO,label,label,bto,False,Mam 20210609
1,tropical and subtropical moist broadleaf fores...,tropical and subtropical moist broadleaf fores...,subtropical broadleaf forest biome,ENVO:01000201,subtropical broadleaf forest biome,ENM,label,label,enm,False,Omits tropical and moose
2,beech forest,beech forest,beech forest soil,ENVO:00005770,beech forest soil,ENVO,label,label,envo,False,Adds soil
3,cold desert soil,cold desert soil,cold desert,ENVO:01000382,cold desert,ENVO,label,label,envo,False,Loses soil
4,cryoconite,cryoconite,cryoconite hole,ENVO:03000039,cryoconite hole,ENVO,label,label,envo,False,Adds hole
5,spruce forest,spruce forest,spruce forest soil,ENVO:00005784,spruce forest soil,ENVO,label,label,envo,False,Adds soil
6,temperate broadleaf and mixed forest biome,temperate broadleaf and mixed forest biome,temperate mixed broadleaf forest,ENVO:01000389,temperate mixed broadleaf forest,ENVO,label,label,envo,False,Pretty close
7,temperate deciduous forest,temperate deciduous forest,temperate deciduous broadleaf forest,ENVO:01000385,temperate deciduous broadleaf forest,ENVO,label,label,envo,False,Adds broadleaf
8,konjac,konjac,konjac,FOODON:03414400,konjac,FOODON,label,label,foodon,False,
9,bamboo,bamboo,Bamboo Dam,GAZ:00148513,Bamboo Dam,GAZ,label,label,gaz,False,


## Filter on `include`

In [64]:
flag = curated["include"]
curated = curated.loc[flag]

curated.columns = "ols_" + curated.columns
curated.to_sql("curated", biosample_cnx, if_exists="replace", index=False)
curated

Unnamed: 0,ols_raw,ols_query,ols_name,ols_obo_id,ols_label,ols_ontology_prefix,ols_scope,ols_type,ols_ontology_name,ols_include,ols_ols_curation_notes
24,montane shrubland,montane shrubland,montane shrubland biome,ENVO:01000216,montane shrubland biome,ENVO,label,label,envo,True,Mam 20210609
25,paddy soil,paddy soil,rice paddy soil,ENVO:00005740,paddy field soil,ENVO,has_related_synonym,annotation,envo,True,Mam 20210609
26,polar desert,polar desert,polar desert biome,ENVO:01000186,polar desert biome,ENVO,label,label,envo,True,Mam 20210609
27,temperate coniferous forests,temperate coniferous forests,temperate coniferous forest biome,ENVO:01000211,temperate coniferous forest biome,ENVO,label,label,envo,True,Mam 20210609
28,terrestrial,terrestrial,terrestrial realm,ENVO:00000446,terrestrial biome,ENVO,has_related_synonym,annotation,envo,True,Mam 20210609
29,wastewater treatment system,wastewater treatment system,wastewater treatment plant,ENVO:00002043,wastewater treatment plant,ENVO,label,label,envo,True,Mam 20210609
30,bacteria,bacteria,Bacteria,NCBITaxon:2,Bacteria,ENVO,label,label,envo,True,
31,boreal forest,boreal forest,boreal forest,ENVO:01000250,subpolar coniferous forest biome,ENVO,has_narrow_synonym,annotation,envo,True,
32,coniferous forest,coniferous forest,coniferous forest,ENVO:01000433,needleleaf forest,ENVO,synonym,synonym,envo,True,
33,grassland,grassland,Grassland,ENVO:01001206,grassland ecosystem,ENVO,has_related_synonym,annotation,envo,True,


In [65]:
merge_search_merged = mapping_candidates.merge(
    curated, how="outer", left_on="remaining_tidied", right_on="ols_raw"
)

In [66]:
flag = (
    ~merge_search_merged["id_or_lab_ok"]
    & ~merge_search_merged["ols_obo_id"].eq("")
    & ~merge_search_merged["ols_obo_id"].isna()
)

replacement = merge_search_merged.loc[flag, "ols_obo_id"]

merge_search_merged.loc[flag, "consensus_id"] = replacement

replacement = merge_search_merged.loc[flag, "ols_label"]

merge_search_merged.loc[flag, "consensus_lab"] = replacement

replacement = (
    merge_search_merged.loc[flag, "consensus_lab"]
    + " ["
    + merge_search_merged.loc[flag, "consensus_id"]
    + "]"
)

merge_search_merged.loc[flag, "assembled_consensus"] = replacement

In [67]:
merge_search_merged.to_sql(
    "merge_search_merged", biosample_cnx, if_exists="replace", index=False
)

In [68]:
summary = merge_search_merged[["env_broad_scale", "count", "assembled_consensus"]]
summary = summary.sort_values("count", ascending=False)
summary

Unnamed: 0,env_broad_scale,count,assembled_consensus
0,ENVO:cropland biome,4856,cropland biome [ENVO:01000245]
1,cropland biome,1530,cropland biome [ENVO:01000245]
2,urban biome,973,urban biome [ENVO:01000249]
3,tundra biome,516,tundra biome [ENVO:01000180]
4,terrestrial biome,483,terrestrial biome [ENVO:00000446]
...,...,...,...
186,coastal saline 12,1,
185,coastal saline 11,1,
184,coastal saline 10,1,
183,cellar mud,1,


## TODOs etc.

Advantages and disadvantages of including non-local annotations in salvage search? Example = ???
small/large (freshwater) lake

Include those native labels in the review frame?

Do more of that kind of thing (term ID/label joining) with rdftab, not OLS

What to save where? new column in biosample sqlite? or move content from original column and insert the new values into that original column?

SSSOM:
- add string distance/confidence... multiple columns?
- not all of these are OLS mapped


In [69]:
frame_for_sssom = merge_search_merged

na_flag = frame_for_sssom["splitted"].isna()
frame_for_sssom = frame_for_sssom[~na_flag]

na_flag = frame_for_sssom["consensus_id"].isna()
frame_for_sssom = frame_for_sssom[~na_flag]

raw_queries = list(frame_for_sssom["splitted"])
urlencodeds = list(map(urllib.parse.quote, raw_queries))
urlencodeds = [sssom_subject_prefix + ":" + item for item in urlencodeds]

results_rows = len(frame_for_sssom.index)
iso8601_stamp = datetime.now().replace(microsecond=0).isoformat()

sssom_frame = {
    "subject_category": [biosample_col_to_map] * results_rows,
    "subject_label": frame_for_sssom["splitted"],
    "predicate_id": ["skos:relatedMatch"] * results_rows,
    "object_id": frame_for_sssom["consensus_id"],
    "object_label": frame_for_sssom["consensus_lab"],
    "match_type": ["Lexical"] * results_rows,
    "creator_id": ["https://github.com/turbomam/scoped-mapping"] * results_rows,
    #     "mapping_tool": ["https://www.ebi.ac.uk/ols/docs/api"] * results_rows,
    "mapping_date": [iso8601_stamp] * results_rows,
    #     "confidence": 1 - frame_for_sssom["string_dist"],
    "subject_id": urlencodeds,
}

sssom_frame = pd.DataFrame(sssom_frame)

sssom_frame.to_csv(sssom_file, sep="\t", index=False)

sssom_frame


Unnamed: 0,subject_category,subject_label,predicate_id,object_id,object_label,match_type,creator_id,mapping_date,subject_id
0,env_broad_scale,ENVO:cropland biome,skos:relatedMatch,ENVO:01000245,cropland biome,Lexical,https://github.com/turbomam/scoped-mapping,2021-06-09T10:27:37,biosample_ebs:ENVO%3Acropland%20biome
1,env_broad_scale,cropland biome,skos:relatedMatch,ENVO:01000245,cropland biome,Lexical,https://github.com/turbomam/scoped-mapping,2021-06-09T10:27:37,biosample_ebs:cropland%20biome
2,env_broad_scale,urban biome,skos:relatedMatch,ENVO:01000249,urban biome,Lexical,https://github.com/turbomam/scoped-mapping,2021-06-09T10:27:37,biosample_ebs:urban%20biome
3,env_broad_scale,tundra biome,skos:relatedMatch,ENVO:01000180,tundra biome,Lexical,https://github.com/turbomam/scoped-mapping,2021-06-09T10:27:37,biosample_ebs:tundra%20biome
4,env_broad_scale,terrestrial biome,skos:relatedMatch,ENVO:00000446,terrestrial biome,Lexical,https://github.com/turbomam/scoped-mapping,2021-06-09T10:27:37,biosample_ebs:terrestrial%20biome
...,...,...,...,...,...,...,...,...,...
227,env_broad_scale,marsh,skos:relatedMatch,ENVO:00000035,marsh,Lexical,https://github.com/turbomam/scoped-mapping,2021-06-09T10:27:37,biosample_ebs:marsh
228,env_broad_scale,mixed forest biome,skos:relatedMatch,ENVO:01000198,mixed forest biome,Lexical,https://github.com/turbomam/scoped-mapping,2021-06-09T10:27:37,biosample_ebs:mixed%20forest%20biome
230,env_broad_scale,montane shrubland,skos:relatedMatch,ENVO:01000216,montane shrubland biome,Lexical,https://github.com/turbomam/scoped-mapping,2021-06-09T10:27:37,biosample_ebs:montane%20shrubland
232,env_broad_scale,paddy soil,skos:relatedMatch,ENVO:00005740,paddy field soil,Lexical,https://github.com/turbomam/scoped-mapping,2021-06-09T10:27:37,biosample_ebs:paddy%20soil


# From here down requires refactoring and generalization 

That may even be true for some of the blocks above this!

## join together pipe-delimited env_braod_scale mappings 
what other constraints? If part_count is > 1, do we need to check for empty or null env_broad_scale, repeated or splitted?

In [70]:
multi_part_flag = merge_search_merged["part_count"] > 1

multi_part_frame = merge_search_merged.loc[multi_part_flag]

multi_part_frame = multi_part_frame[
    [
        "env_broad_scale",
        "count",
        "repeated",
        "splitted",
        "part_count",
        "seq",
        "consensus_id",
        "consensus_lab",
    ]
]

multi_part_frame = multi_part_frame.sort_values(
    ["env_broad_scale", "seq"], ascending=(True, True)
)

part_count = multi_part_frame.groupby(["env_broad_scale"], sort=False)[
    "part_count"
].max()
seq_max = multi_part_frame.groupby(["env_broad_scale"], sort=False)["seq"].max()

parts_check = pd.DataFrame(dict(part_count=part_count, seq_max=seq_max)).reset_index()

all_parts_flag = parts_check["seq_max"] == parts_check["part_count"]
ebs_with_all_parts = parts_check.loc[all_parts_flag, "env_broad_scale"]
ebs_with_all_parts

all_parts_flag = multi_part_frame["env_broad_scale"].isin(list(ebs_with_all_parts))
all_parts_frame = multi_part_frame.loc[all_parts_flag]
all_parts_frame

Unnamed: 0,env_broad_scale,count,repeated,splitted,part_count,seq,consensus_id,consensus_lab


In [None]:
unique_pipesep_ebs = list(set(list(all_parts_frame["env_broad_scale"])))
unique_pipesep_ebs.sort()
repipe_dict_list = []
for one_pipesep in unique_pipesep_ebs:
    flag = all_parts_frame["env_broad_scale"] == one_pipesep
    temp_frame = all_parts_frame.loc[flag]
    ids_list = list(temp_frame["consensus_id"])
    lab_list = list(temp_frame["consensus_lab"])
    s = "|"
    ids_repipe = s.join(ids_list)
    lab_repipe = s.join(lab_list)
    repipe_dict = {
        "env_broad_scale": one_pipesep,
        "consensus_id": ids_repipe,
        "consensus_lab": lab_repipe,
    }
    repipe_dict_list.append(repipe_dict)
repipe_frame = pd.DataFrame(repipe_dict_list)
repipe_frame

In [None]:
temp = sssom_frame[["subject_label", "object_id", "object_label"]]
temp.columns = repipe_frame.columns

direct_and_repipe = pd.concat([temp, repipe_frame])
direct_and_repipe

In [None]:
direct_and_repipe.to_sql(
    "direct_and_repipe", biosample_cnx, if_exists="replace", index=False
)

## refactor this some more

In [None]:
q = (
    """
select
	b.id,
	b.env_broad_scale ,
	dar.consensus_id ,
	dar.consensus_lab
from
	direct_and_repipe dar
left join biosample b on
	b.env_broad_scale = dar.env_broad_scale
inner join biobiosample_tax_id_counts stic on
	b.taxonomy_id = stic.biosample_taxid
join env_package_normalized on
	b.env_package = env_package_normalized.env_package
where """
    + scoping_col
    + " = '"
    + scoping_value
    + "'"
)

[per_biosample_scoped_ebs_mapping_results, query_duration] = scoped_mapping.timed_query(
    q, biosample_cnx, print_timing=True
)

In [None]:
per_biosample_scoped_ebs_mapping_results

In [None]:
per_biosample_scoped_ebs_mapping_results.to_csv(
    "per_biosample_scoped_ebs_mapping_results.tsv",
    sep=ols_review_seperator,
    index=False,
)