In [1]:
from datetime import datetime
from pkg_resources import get_distribution, DistributionNotFound
from strsimpy.cosine import Cosine
import pandas as pd
import re
import requests as requests
import sqlite3
import string
import urllib
import yaml
from xml.etree import ElementTree
from tdda import rexpy
import scoped_mapping

In [2]:
biosample_sqlite_file = "../target/harmonized_table.db"
# TODO process these as a list?
ncbitaxon_sqlite_file = "../semantic-sql/db/ncbitaxon.db"
envo_sqlite_file = "../semantic-sql/db/envo.db"
ncbitaxon_cnx = sqlite3.connect(ncbitaxon_sqlite_file)
envo_cnx = sqlite3.connect(envo_sqlite_file)
target_onto_prefix = 'ENVO'
chars_to_whiteout = '._-'
my_query_fields = ''
my_row_req = 3

env_package_overrides = {
    'built environment': 'built',
    'misc environment': 'miscellaneous',
    'missing': 'no environmental package',
    'unknown': 'no environmental package',
    'default': 'no environmental package',
    'unspecified': 'no environmental package',
    'not available': 'no environmental package',
    'not collected': 'no environmental package'
}

In [3]:
biosample_cnx = sqlite3.connect(biosample_sqlite_file)

In [4]:
# Sample of the data we're working with
q = """
select
    id,
    env_package,
    package,
    package_name,
    host_taxid,
    taxonomy_id,
    env_broad_scale,
    env_local_scale,
    env_medium
    from biosample b
limit 10
"""
biosample_first_ten = pd.read_sql(q, biosample_cnx)
biosample_first_ten

Unnamed: 0,id,env_package,package,package_name,host_taxid,taxonomy_id,env_broad_scale,env_local_scale,env_medium
0,BIOSAMPLE:SAMN00000002,missing,MIGS.ba.5.0,MIGS: cultured bacteria/archaea; version 5.0,9606.0,445970,terrestrial biome [ENVO:00000446],human-associated habitat [ENVO:00009003],biological product [ENVO:02000043]
1,BIOSAMPLE:SAMN00000003,missing,MIGS.ba.5.0,MIGS: cultured bacteria/archaea; version 5.0,9606.0,445972,terrestrial biome [ENVO:00000446],human-associated habitat [ENVO:00009003],biological product [ENVO:02000043]
2,BIOSAMPLE:SAMN00000004,missing,MIGS.ba.5.0,MIGS: cultured bacteria/archaea; version 5.0,9606.0,449673,terrestrial biome [ENVO:00000446],human-associated habitat [ENVO:00009003],biological product [ENVO:02000043]
3,BIOSAMPLE:SAMN00000005,,Generic.1.0,Generic,,6526,,,
4,BIOSAMPLE:SAMN00000006,,Generic.1.0,Generic,,9483,,,
5,BIOSAMPLE:SAMN00000007,missing,MIGS.ba.5.0,MIGS: cultured bacteria/archaea; version 5.0,9606.0,445974,terrestrial biome [ENVO:00000446],human-associated habitat [ENVO:00009003],biological product [ENVO:02000043]
6,BIOSAMPLE:SAMN00000008,missing,MIGS.ba.5.0,MIGS: cultured bacteria/archaea; version 5.0,9606.0,411461,terrestrial biome [ENVO:00000446],human-associated habitat [ENVO:00009003],biological product [ENVO:02000043]
7,BIOSAMPLE:SAMN00000009,,Generic.1.0,Generic,,13616,,,
8,BIOSAMPLE:SAMN00000010,,Generic.1.0,Generic,,451639,,,
9,BIOSAMPLE:SAMN00000011,,Generic.1.0,Generic,,451638,,,


In [5]:
# Get the canonical checklist and package terms from NCBI
# Unfortunately it doesn't do a very good job of differentiating 
# checklists (MIMAG, MIMARKS, etc.) 
# from packages (soil, water, etc.)
# what about ba , euk, etc?
package_dictionary = scoped_mapping.get_package_dictionary()
package_dictionary.to_sql('package_dictionary', biosample_cnx, if_exists='replace', index=False)
package_dictionary

Unnamed: 0,Name,DisplayName,ShortName,EnvPackage,EnvPackageDisplay,NotAppropriateFor,Description,Example
0,Generic.1.0,Generic,,,,,Generic,
1,SARS-CoV-2.cl.1.0,SARS-CoV-2: clinical or host-associated; versi...,SARS-CoV-2: clinical or host-associated,,,wgs_single;wgs_batch;wgs_diploid,Use for SARS-CoV-2 samples that are relevant t...,
2,Pathogen.cl.1.0,Pathogen: clinical or host-associated; version...,Pathogen: clinical or host-associated,,,,Clinical or host-associated pathogen,SAMN02928182
3,Pathogen.env.1.0,Pathogen: environmental/food/other; version 1.0,Pathogen: environmental/food/other,,,,"Environmental, food or other pathogen",SAMN02730065
4,Microbe.1.0,Microbe; version 1.0,Microbe,,,,Use for bacteria or other unicellular microbes...,SAMN02911891
...,...,...,...,...,...,...,...,...
149,MIUVIG.plant-associated.5.0,"MIUVIG: uncultivated virus genome, plant-assoc...",MIUVIG Uncultivated Virus Genome,plant-associated,plant-associated,wgs_single;wgs_batch;wgs_diploid,Use for uncultivated virus genome identified i...,
150,MIUVIG.sediment.5.0,"MIUVIG: uncultivated virus genome, sediment; v...",MIUVIG Uncultivated Virus Genome,sediment,sediment,wgs_single;wgs_batch;wgs_diploid,Use for uncultivated virus genome identified i...,
151,MIUVIG.soil.5.0,"MIUVIG: uncultivated virus genome, soil; versi...",MIUVIG Uncultivated Virus Genome,soil,soil,wgs_single;wgs_batch;wgs_diploid,Use for uncultivated virus genome identified i...,
152,MIUVIG.wastewater.5.0,"MIUVIG: uncultivated virus genome, wastewater;...",MIUVIG Uncultivated Virus Genome,wastewater,wastewater,wgs_single;wgs_batch;wgs_diploid,Use for uncultivated virus genome identified i...,


In [6]:
# Do the Biosample checklist/package fields match any of the cannonical values?
# How many Biosample rows are there?
q = """
select count(*) as biosample_row_count
from biosample b
"""
[biosample_row_count, query_duration] = scoped_mapping.timed_query(q, biosample_cnx, print_timing=False)

print(query_duration)
biosample_row_count

0:00:00.466004


Unnamed: 0,biosample_row_count
0,14300584


In [7]:
# How many of those rows can be inner-joined with the canonical checklists/packages?
# Specifically, joining biosample.package_name = package_dictionary.DisplayName
# TODO add indexing to docs and or makefile
# create index biosample_package_name_idx on biosample(package_name);
# create index package_dictionary_DisplayName_idx on package_dictionary(DisplayName);
# create index biosample_package_idx on biosample(package);
# create index biosample_p_pn_idx on biosample(package, package_name);
q = """
select
    count(*) as cannonical_package_name_count
from
    biosample b
inner join package_dictionary pd on
    b.package_name = pd.DisplayName
"""
[cannonical_package_name_count, query_duration] = scoped_mapping.timed_query(q, biosample_cnx, print_timing=True)

print(query_duration)

cannonical_package_name_count

2021-05-24 21:13:36.208937
2021-05-24 21:13:37.977819
0:00:01.768882
0:00:01.768882


Unnamed: 0,cannonical_package_name_count
0,14300584


In [8]:
# What do the combinations of package and package_name look like in the Biosample dataset?
q = """
select
    package,
    package_name,
    count(*) as count
from
    biosample b
group by
    package ,
    package_name
order by
    package ,
    package_name
"""
[package_name_combos, query_duration] = scoped_mapping.timed_query(q, biosample_cnx, print_timing=True)

print(query_duration)

package_name_combos

2021-05-24 21:13:37.985821
2021-05-24 21:13:41.457979
0:00:03.472158
0:00:03.472158


Unnamed: 0,package,package_name,count
0,Beta-lactamase.1.0,Beta-lactamase; version 1.0,556
1,Generic.1.0,Generic,10186430
2,Human.1.0,Human; version 1.0,368893
3,Invertebrate.1.0,Invertebrate; version 1.0,131085
4,MIGS.ba.5.0,MIGS: cultured bacteria/archaea; version 5.0,28686
...,...,...,...
124,Model.organism.animal.1.0,Model organism or animal; version 1.0,382980
125,Pathogen.cl.1.0,Pathogen: clinical or host-associated; version...,511040
126,Pathogen.env.1.0,Pathogen: environmental/food/other; version 1.0,246497
127,Plant.1.0,Plant; version 1.0,385939


In [9]:
# What about the Biosample env_package values?
# Are they also a small, highly regular set?
q = """
select
    env_package,
    count(*) as count
from
    biosample b
group by
    env_package
order by
    count(*) desc
"""
[env_package_count, query_duration] = scoped_mapping.timed_query(q, biosample_cnx)

print(query_duration)

env_package_count

0:00:01.153799


Unnamed: 0,env_package,count
0,,14083847
1,host-associated,49254
2,human-gut,47921
3,water,16367
4,human-skin,13706
...,...,...
87,env_package,1
88,gut microbiome,1
89,marine sediment (ENVO:00002113),1
90,saliva,1


In [10]:
# env_package is going to need some cleanup
# First, get a set of all canonical env_package values
package_dictionary = scoped_mapping.make_tidy_col(package_dictionary, 'EnvPackage', 'eptidy')
package_dictionary =scoped_mapping.make_tidy_col(package_dictionary, 'EnvPackageDisplay', 'epdtidy')
# update in sqlite
package_dictionary.to_sql('package_dictionary', biosample_cnx, if_exists='replace', index=False)
valid_combo = []
valid_combo = scoped_mapping.add_unique_to_list(valid_combo, package_dictionary['eptidy'])
valid_combo = scoped_mapping.add_unique_to_list(valid_combo, package_dictionary['epdtidy'])

valid_combo

['',
 'air',
 'built',
 'host associated',
 'human associated',
 'human gut',
 'human oral',
 'human skin',
 'human vaginal',
 'microbial',
 'microbial mat biofilm',
 'miscellaneous',
 'miscellaneous or artificial',
 'no environmental package',
 'plant associated',
 'sediment',
 'soil',
 'wastewater',
 'wastewater sludge',
 'water']

In [11]:
# determine ID patterns
q = """
select
    distinct stanza
    from statements s
where
    predicate = 'rdf:type'
    and "object" = 'owl:Class'
    and stanza = subject"""
# include non-envo IDs that come from envo?
[ids_from_envo, query_duration] = scoped_mapping.timed_query(q, envo_cnx)
print(query_duration)
ids_from_envo = scoped_mapping.add_prefix_col(ids_from_envo, 'stanza', 'prefix')

id_patterns = scoped_mapping.get_multi_term_patterns(ids_from_envo, 'stanza', 'prefix')

env_package_normalized = scoped_mapping.env_package_nomralizastion(env_package_count, 'env_package',
                                                                target_onto_prefix, id_patterns['ENVO'])

env_package_normalized = scoped_mapping.add_overrides(env_package_normalized, 'remaining_tidied', 'ep_override',
                                                   env_package_overrides)

env_package_normalized = scoped_mapping.flag_canonical(env_package_normalized, 'ep_override', 'is_canonical',
                                                    valid_combo)

env_package_normalized.to_sql('env_package_normalized', biosample_cnx, if_exists='replace', index=False)


0:00:00.030432


In [12]:
# What do the successful normalizations look like?
q = """
select
    env_package,
    count,
    lhs,
    extract,
    ep_override
from
    env_package_normalized
where
    is_canonical = 1
"""
[successful_normalizastions, query_duration] = scoped_mapping.timed_query(q, biosample_cnx)

print(query_duration)

successful_normalizastions

0:00:00.002367


Unnamed: 0,env_package,count,lhs,extract,ep_override
0,,14083847,,,
1,host-associated,49254,,,host associated
2,human-gut,47921,,,human gut
3,water,16367,,,water
4,human-skin,13706,,,human skin
5,built environment,12391,,,built
6,soil,11974,,,soil
7,misc environment,11715,,,miscellaneous
8,missing,8453,,,no environmental package
9,human-oral,7882,,,human oral


In [13]:
# Are there any normalization failures?
q = """
select
    env_package,
    count,
    lhs,
    extract,
    ep_override
from
    env_package_normalized
where
    is_canonical = 0
"""
[normalizastion_failures, query_duration] = scoped_mapping.timed_query(q, biosample_cnx)

print(query_duration)

normalizastion_failures

0:00:00.002144


Unnamed: 0,env_package,count,lhs,extract,ep_override
0,miscellaneous natural or artificial environment,611,,,miscellaneous natural or artificial environment
1,mimarks,479,,,mimarks
2,mouse-gut,406,,,mouse gut
3,gut,172,,,gut
4,biofilm,114,,,biofilm
5,human-not providedsopharyngeal,107,,,human not providedsopharyngeal
6,mice gut,87,,,mice gut
7,CV,60,,,cv
8,"home, outdoor environment",44,,,home outdoor environment
9,fermentation-associated,42,,,fermentation associated


In [14]:
# utilizing ncbtitaxon for broad subsetting
# specifically, flag the biosamples whose taxon_id indicates they are an unclassified entity
# ignoring the others will throw out samples OF multicellular organisms, like fruit flies
# Add previous notes about what kinds of samples are missed by this bifurcation
# like bacteria.unclassified_bacteria

q = """
select
    distinct s.subject
from
    entailed_edge ee
join statements s on
    ee.subject = s.subject
where
    ee.predicate = 'rdfs:subClassOf'
    and ee.object = 'NCBITaxon:2787823'
    and s.predicate = 'rdfs:label'
"""
[unclassified_taxa, query_duration] = scoped_mapping.timed_query(q, ncbitaxon_cnx)
unclassified_taxa['unclassified'] = True

print(query_duration)

unclassified_taxa

0:00:10.028503


Unnamed: 0,subject,unclassified
0,NCBITaxon:1006967,True
1,NCBITaxon:1041057,True
2,NCBITaxon:1046002,True
3,NCBITaxon:1046003,True
4,NCBITaxon:1046004,True
...,...,...
989,NCBITaxon:939928,True
990,NCBITaxon:941420,True
991,NCBITaxon:941421,True
992,NCBITaxon:941422,True


In [15]:

q = """
select
    taxonomy_id biosample_taxid,
    count(*) as count
from
    biosample b
group by
    taxonomy_id
order by
    count(*) desc
"""
[biosample_tax_id_counts, query_duration] = scoped_mapping.timed_query(q, biosample_cnx)
biosample_tax_id_counts['curie'] = 'NCBITaxon:' + biosample_tax_id_counts['biosample_taxid'].astype(str)

print(query_duration)

0:00:01.352609


In [16]:
# Merge the two taxon id datasets
# I.e. flag the the Biosample records whose taxonomy_id field belongs to a subclass of 'unclassified entries'.
biosample_tax_id_counts = biosample_tax_id_counts.merge(unclassified_taxa, left_on='curie',
                                                        right_on='subject', how='left')
biosample_tax_id_counts.unclassified.fillna(False, inplace=True)

biosample_tax_id_counts

Unnamed: 0,biosample_taxid,count,curie,subject,unclassified
0,9606,6819707,NCBITaxon:9606,,False
1,10090,964219,NCBITaxon:10090,,False
2,408170,290862,NCBITaxon:408170,NCBITaxon:408170,True
3,410658,280666,NCBITaxon:410658,NCBITaxon:410658,True
4,646099,208741,NCBITaxon:646099,NCBITaxon:646099,True
...,...,...,...,...,...
163372,999891,1,NCBITaxon:999891,,False
163373,999892,1,NCBITaxon:999892,,False
163374,999898,1,NCBITaxon:999898,,False
163375,999931,1,NCBITaxon:999931,,False


In [17]:
# should really add labels to all of them
q = """
select
    subject ,
    value
from statements
where
    predicate = 'rdfs:label' and subject = stanza
"""
[all_tax_labels, query_duration] = scoped_mapping.timed_query(q, ncbitaxon_cnx)

biosample_tax_id_counts = biosample_tax_id_counts.merge(all_tax_labels, left_on='curie',
                                                        right_on='subject', how='left')

biosample_tax_id_counts = biosample_tax_id_counts[['curie', 'biosample_taxid', 'count', 'unclassified', 'value']]
biosample_tax_id_counts.columns = ['curie', 'biosample_taxid', 'count', 'unclassified', 'label']

print(query_duration)
biosample_tax_id_counts.to_sql('biobiosample_tax_id_counts', biosample_cnx, if_exists='replace', index=False)

biosample_tax_id_counts

0:00:05.466133


Unnamed: 0,curie,biosample_taxid,count,unclassified,label
0,NCBITaxon:9606,9606,6819707,False,Homo sapiens
1,NCBITaxon:10090,10090,964219,False,Mus musculus
2,NCBITaxon:408170,408170,290862,True,human gut metagenome
3,NCBITaxon:410658,410658,280666,True,soil metagenome
4,NCBITaxon:646099,646099,208741,True,human metagenome
...,...,...,...,...,...
163372,NCBITaxon:999891,999891,1,False,Bacillus amyloliquefaciens TA208
163373,NCBITaxon:999892,999892,1,False,[Propionibacterium] humerusii P08
163374,NCBITaxon:999898,999898,1,False,Peptococcaceae bacterium CEB3
163375,NCBITaxon:999931,999931,1,False,Barrientosiimonas humi


Almost all of the taxa that are common in the biosample collection are either unclassified/metagenomes or easily recognized cellular organisms

exceptions include:
- 32630 = synthetic construct (other entries; other sequences; artificial sequences)
    - 'other entries' would add 16k rows on top of the 1k 'unclassified entities'
    - metagenomes account for 331 of the 'unclassified entities'
    - there are also a small number of uncultured/unclassified microorganisms in the biosample dataset
- 77133 = uncultured bacterium (cellular organisms; Bacteria; environmental samples)
    - 'cellular organisms' would add 2M rows on top of the 1k 'unclassified entities'
    - 'cellular organisms; Bacteria; environmental samples' adds 26k
    
----

In [18]:
# Get a table of scoped mixs annotations to be mapped to ontology classes.
biosample_col_to_map = 'env_broad_scale'
scoping_col = 'env_package_normalized.ep_override'
scoping_value = 'water'
# In this case, the scoping includes an inner join requirement for 'unclassified entities'

q = 'select ' + biosample_col_to_map + """, count(*) as count
from
    biosample b
join env_package_normalized on
    b.env_package = env_package_normalized.env_package
inner join biobiosample_tax_id_counts stic on
    b.taxonomy_id = stic.biosample_taxid
where """ + scoping_col + " = '" + scoping_value + \
    "' group by " + biosample_col_to_map + """
order by
    count(*) desc"""
[mapping_candidates, query_duration] = scoped_mapping.timed_query(q, biosample_cnx)

mapping_candidates

Unnamed: 0,env_broad_scale,count
0,small lake biome,3264
1,marine biome (ENVO:00000447),1382
2,marine biome,1355
3,large lake biome,1198
4,freshwater biome,1051
...,...,...
252,Arctic,1
253,Aquatic biome,1
254,01000035,1
255,00000891,1


In [19]:
# The Biosample format allows for pipe-delimited environmental package lists
# Separate those out into their components
multi_frames = []
for row in mapping_candidates.itertuples(index=True, name='Pandas'):
    split_check = row.env_broad_scale
    if split_check is None:
        split_check = ''
    splitted = pd.Series(split_check.split("|"))
    splitted_count = len(splitted)
    repeated = [split_check] * splitted_count
    repeated = pd.Series(repeated)
    as_frame = pd.DataFrame(dict(repeated=repeated, splitted=splitted)).reset_index()
    multi_frames.append(as_frame)
concat_frame = pd.concat(multi_frames)
concat_frame = concat_frame[['repeated', 'splitted']]
mapping_candidates = mapping_candidates.merge(concat_frame, left_on=biosample_col_to_map,
                                              right_on='repeated', how='left')

mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted
0,small lake biome,3264,small lake biome,small lake biome
1,marine biome (ENVO:00000447),1382,marine biome (ENVO:00000447),marine biome (ENVO:00000447)
2,marine biome,1355,marine biome,marine biome
3,large lake biome,1198,large lake biome,large lake biome
4,freshwater biome,1051,freshwater biome,freshwater biome
...,...,...,...,...
258,Arctic,1,Arctic,Arctic
259,Aquatic biome,1,Aquatic biome,Aquatic biome
260,01000035,1,01000035,01000035
261,00000891,1,00000891,00000891


In [20]:
# do the spliting and extraction here

# Now try to extract ontology terms that are already present
candidate_series_decomposition = scoped_mapping.decompose_series(mapping_candidates['splitted'], id_patterns[target_onto_prefix])
mapping_candidates = pd.concat([mapping_candidates, candidate_series_decomposition], axis=1)

mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted,string,extract,remaining_string,remaining_tidied
0,small lake biome,3264,small lake biome,small lake biome,small lake biome,,small lake biome,small lake biome
1,marine biome (ENVO:00000447),1382,marine biome (ENVO:00000447),marine biome (ENVO:00000447),marine biome (ENVO:00000447),ENVO:00000447,marine biome (),marine biome
2,marine biome,1355,marine biome,marine biome,marine biome,,marine biome,marine biome
3,large lake biome,1198,large lake biome,large lake biome,large lake biome,,large lake biome,large lake biome
4,freshwater biome,1051,freshwater biome,freshwater biome,freshwater biome,,freshwater biome,freshwater biome
...,...,...,...,...,...,...,...,...
258,Arctic,1,Arctic,Arctic,Arctic,,Arctic,arctic
259,Aquatic biome,1,Aquatic biome,Aquatic biome,Aquatic biome,,Aquatic biome,aquatic biome
260,01000035,1,01000035,01000035,01000035,,01000035,01000035
261,00000891,1,00000891,00000891,00000891,,00000891,00000891


In [22]:
# And join the extracted IDs with their labels
# start by conencting to the rdftab database 
# from which the terms and label-like annotatiosn will be obtained
ontodb = '../semantic-sql/db/' + target_onto_prefix.lower() + '.db'
ontocon = sqlite3.connect(ontodb)

In [23]:
q = """
select
    subject ,
    value
from
    statements s
where
    predicate = 'rdfs:label'
"""
[onto_labels, query_duration] = scoped_mapping.timed_query(q, ontocon)

onto_labels

Unnamed: 0,subject,value
0,IAO:0000111,editor preferred term~editor preferred label
1,IAO:0000112,example of usage
2,IAO:0000114,has curation status
3,IAO:0000115,definition
4,IAO:0000116,editor note
...,...,...
6774,ENVO:01001862,Solar radiation
6775,<https://www.wikidata.org/wiki/Q2>,Earth
6776,<https://www.wikidata.org/wiki/Q2306597>,Suni
6777,<https://www.wikidata.org/wiki/Q525>,Sol


In [24]:
mapping_candidates = mapping_candidates.merge(onto_labels, left_on='extract', right_on='subject', how='left')
mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted,string,extract,remaining_string,remaining_tidied,subject,value
0,small lake biome,3264,small lake biome,small lake biome,small lake biome,,small lake biome,small lake biome,,
1,marine biome (ENVO:00000447),1382,marine biome (ENVO:00000447),marine biome (ENVO:00000447),marine biome (ENVO:00000447),ENVO:00000447,marine biome (),marine biome,ENVO:00000447,marine biome
2,marine biome,1355,marine biome,marine biome,marine biome,,marine biome,marine biome,,
3,large lake biome,1198,large lake biome,large lake biome,large lake biome,,large lake biome,large lake biome,,
4,freshwater biome,1051,freshwater biome,freshwater biome,freshwater biome,,freshwater biome,freshwater biome,,
...,...,...,...,...,...,...,...,...,...,...
258,Arctic,1,Arctic,Arctic,Arctic,,Arctic,arctic,,
259,Aquatic biome,1,Aquatic biome,Aquatic biome,Aquatic biome,,Aquatic biome,aquatic biome,,
260,01000035,1,01000035,01000035,01000035,,01000035,01000035,,
261,00000891,1,00000891,00000891,00000891,,00000891,00000891,,


In [25]:
# Use cosine string distance to see if the labels match
# I.e. the labels claimed by the Biosample data set and the labels asserted in the ontology
# if they're close enough, consider the assigned ID legit
# how close is close enough?
my_cosine_obj = Cosine(1)
mapping_candidates['value'] = mapping_candidates['value'].fillna('')
mapping_candidates['cosine'] = mapping_candidates.apply(
    lambda my_row: my_cosine_obj.distance(my_row['remaining_tidied'].lower(), my_row['value'].lower()), axis=1)
mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted,string,extract,remaining_string,remaining_tidied,subject,value,cosine
0,small lake biome,3264,small lake biome,small lake biome,small lake biome,,small lake biome,small lake biome,,,1.0
1,marine biome (ENVO:00000447),1382,marine biome (ENVO:00000447),marine biome (ENVO:00000447),marine biome (ENVO:00000447),ENVO:00000447,marine biome (),marine biome,ENVO:00000447,marine biome,0.0
2,marine biome,1355,marine biome,marine biome,marine biome,,marine biome,marine biome,,,1.0
3,large lake biome,1198,large lake biome,large lake biome,large lake biome,,large lake biome,large lake biome,,,1.0
4,freshwater biome,1051,freshwater biome,freshwater biome,freshwater biome,,freshwater biome,freshwater biome,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...
258,Arctic,1,Arctic,Arctic,Arctic,,Arctic,arctic,,,1.0
259,Aquatic biome,1,Aquatic biome,Aquatic biome,Aquatic biome,,Aquatic biome,aquatic biome,,,1.0
260,01000035,1,01000035,01000035,01000035,,01000035,01000035,,,1.0
261,00000891,1,00000891,00000891,00000891,,00000891,00000891,,,1.0


In [26]:
# Get ready to join in the other direction
# I.e. trying to find ontology term IDs based on perfect label matches. Be careful not to reuse column names.
mapping_candidates.columns = ['env_broad_scale', 'count', 'repeated', 'splitted', 'string', 'extract',
                              'remaining_string', 'remaining_tidied', 'term_id', 'lab_from_id', 'lfi_cosine']
mapping_candidates = mapping_candidates.merge(onto_labels, left_on='remaining_tidied', right_on='value', how='left')
mapping_candidates.columns = ['env_broad_scale', 'count', 'repeated', 'splitted', 'string', 'extract',
                              'remaining_string', 'remaining_tidied', 'term_id', 'lab_from_id',
                              'lfi_cosine', 'term_id_from_lab', 'value']
mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted,string,extract,remaining_string,remaining_tidied,term_id,lab_from_id,lfi_cosine,term_id_from_lab,value
0,small lake biome,3264,small lake biome,small lake biome,small lake biome,,small lake biome,small lake biome,,,1.0,,
1,marine biome (ENVO:00000447),1382,marine biome (ENVO:00000447),marine biome (ENVO:00000447),marine biome (ENVO:00000447),ENVO:00000447,marine biome (),marine biome,ENVO:00000447,marine biome,0.0,ENVO:00000447,marine biome
2,marine biome,1355,marine biome,marine biome,marine biome,,marine biome,marine biome,,,1.0,ENVO:00000447,marine biome
3,large lake biome,1198,large lake biome,large lake biome,large lake biome,,large lake biome,large lake biome,,,1.0,,
4,freshwater biome,1051,freshwater biome,freshwater biome,freshwater biome,,freshwater biome,freshwater biome,,,1.0,ENVO:00000873,freshwater biome
...,...,...,...,...,...,...,...,...,...,...,...,...,...
258,Arctic,1,Arctic,Arctic,Arctic,,Arctic,arctic,,,1.0,,
259,Aquatic biome,1,Aquatic biome,Aquatic biome,Aquatic biome,,Aquatic biome,aquatic biome,,,1.0,ENVO:00002030,aquatic biome
260,01000035,1,01000035,01000035,01000035,,01000035,01000035,,,1.0,,
261,00000891,1,00000891,00000891,00000891,,00000891,00000891,,,1.0,,


In [27]:
# Record a consensus
# If either merging on codes or labels was successful.
# cosines for first pass check on assigned IDs still haven't been filtered?
mapping_candidates['consensus_id'] = mapping_candidates['term_id']
mapping_candidates['consensus_id'][mapping_candidates['consensus_id'].isnull()] = \
    mapping_candidates['term_id_from_lab'][mapping_candidates['consensus_id'].isnull()]
mapping_candidates['consensus_lab'] = mapping_candidates['lab_from_id']
mapping_candidates['consensus_lab'][mapping_candidates['consensus_lab'] == ''] = \
    mapping_candidates['value'][mapping_candidates['consensus_lab'] == '']
# mapping_candidates.to_sql('mapping_scratch', biosample_cnx, if_exists='replace', index=False)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mapping_candidates['consensus_id'][mapping_candidates['consensus_id'].isnull()] = \
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mapping_candidates['consensus_lab'][mapping_candidates['consensus_lab'] == ''] = \


```
<ipython-input-49-3e62557cf6d9>:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mapping_candidates['consensus_id'][mapping_candidates['consensus_id'].isnull()] = \
<ipython-input-49-3e62557cf6d9>:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame
```

In [28]:
mapping_candidates

Unnamed: 0,env_broad_scale,count,repeated,splitted,string,extract,remaining_string,remaining_tidied,term_id,lab_from_id,lfi_cosine,term_id_from_lab,value,consensus_id,consensus_lab
0,small lake biome,3264,small lake biome,small lake biome,small lake biome,,small lake biome,small lake biome,,,1.0,,,,
1,marine biome (ENVO:00000447),1382,marine biome (ENVO:00000447),marine biome (ENVO:00000447),marine biome (ENVO:00000447),ENVO:00000447,marine biome (),marine biome,ENVO:00000447,marine biome,0.0,ENVO:00000447,marine biome,ENVO:00000447,marine biome
2,marine biome,1355,marine biome,marine biome,marine biome,,marine biome,marine biome,,,1.0,ENVO:00000447,marine biome,ENVO:00000447,marine biome
3,large lake biome,1198,large lake biome,large lake biome,large lake biome,,large lake biome,large lake biome,,,1.0,,,,
4,freshwater biome,1051,freshwater biome,freshwater biome,freshwater biome,,freshwater biome,freshwater biome,,,1.0,ENVO:00000873,freshwater biome,ENVO:00000873,freshwater biome
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258,Arctic,1,Arctic,Arctic,Arctic,,Arctic,arctic,,,1.0,,,,
259,Aquatic biome,1,Aquatic biome,Aquatic biome,Aquatic biome,,Aquatic biome,aquatic biome,,,1.0,ENVO:00002030,aquatic biome,ENVO:00002030,aquatic biome
260,01000035,1,01000035,01000035,01000035,,01000035,01000035,,,1.0,,,,
261,00000891,1,00000891,00000891,00000891,,00000891,00000891,,,1.0,,,,


In [29]:
# For which Biosample annotations were not mappings by merging found?
# It looks like remaining_tidied is retaining too much punctuation
# and loosing useful digits (relative to remaining_string)?
# Should try harder to parse not-quite-right embedded IDs like ...
needs_search = mapping_candidates.remaining_tidied[mapping_candidates.consensus_id.isna()]
needs_search_counts = needs_search.value_counts()

needs_search_counts

                                 3
polar biome envo 01000339        3
small lake biome                 2
atlantic ocean                   2
clams fluids                     2
                                ..
00000891                         1
temperate shelf and sea biome    1
cyanobacterial culture           1
deep mediterranean               1
protist                          1
Name: remaining_tidied, Length: 131, dtype: int64

In [30]:
# Use a search engine
# For the mixs annotations that didn't already have cannonical IDs or labels
ebs_raw_list = list(needs_search_counts.index)
ebs_raw_list

['',
 'polar biome envo 01000339',
 'small lake biome',
 'atlantic ocean',
 'clams fluids',
 'envo 01000023',
 'marine',
 'seawater',
 'freshwater',
 'marine pelagic',
 'oceanic',
 'shrmip culture water',
 'water river',
 'phragmites australis rhizosphere',
 'lake pond',
 'temperate glacier close to sea level',
 'temperate mediterannean sea biome',
 'deep near bottom layer of freshwater lake',
 'deep water coral reef',
 'marine biome coastal',
 'hanford h 100',
 'surface marine water',
 'saltwater',
 'coastal marine estuary',
 'plankton community',
 'envo 01000043',
 'env 00000447',
 'brackish water river',
 'ocean this term applies to plant growth forms',
 'coastal sea area',
 'snow transect pooled',
 'sponge symbiont',
 'tundra',
 'arctic ocean associated',
 'arabian sea',
 'marine biome 00000447',
 'envo mediterranean forests woodlands and shrub biome',
 'supraglacial lake water',
 'microorganism',
 'mediterranean sea',
 'large lake biome',
 'envo montane grasslands and shrubland bi

In [31]:
# get whiteout frame and relateds
ebs_wo_frame = scoped_mapping.get_whiteout_frame(ebs_raw_list, replaced_chars=chars_to_whiteout)

ebs_wo_frame

Unnamed: 0,raw,woed
,,
polar biome envo 01000339,polar biome envo 01000339,polar biome envo 01000339
small lake biome,small lake biome,small lake biome
atlantic ocean,atlantic ocean,atlantic ocean
clams fluids,clams fluids,clams fluids
...,...,...
00000891,00000891,00000891
temperate shelf and sea biome,temperate shelf and sea biome,temperate shelf and sea biome
cyanobacterial culture,cyanobacterial culture,cyanobacterial culture
deep mediterranean,deep mediterranean,deep mediterranean


In [32]:
ebs_wo_list = scoped_mapping.get_wo_list(ebs_wo_frame)
ebs_wo_list

['',
 '0',
 '00000891',
 '01000035',
 '1000686',
 '15',
 'antarctic coastal associated',
 'antarctic peninsula associated',
 'aquatic',
 'aquatic bacteria',
 'arabian sea',
 'arctic',
 'arctic ocean associated',
 'atlantic ocean',
 'atolls of the maldives',
 'bacterioplankton',
 'boreal forest',
 'brackish amp fresh water mixed',
 'brackish water river',
 'brine pool interface layer',
 'caribbean sea',
 'clams fluids',
 'clean river',
 'coastal',
 'coastal marine estuary',
 'coastal ocean',
 'coastal sea area',
 'coastal water',
 'coral lagoon',
 'costal lagoon periodically connected to the atlantic ocean',
 'costal lagoon periodicay coneccted to the atlantic ocean',
 'cyanobacterial culture',
 'deep mediterranean',
 'deep mediterranean thetis halocline',
 'deep near bottom layer of freshwater lake',
 'deep ocean',
 'deep sea',
 'deep sea marine',
 'deep water coral reef',
 'diatom culture',
 'drinking water distribution system',
 'enriched culture in triptone casein soja medium',
 'en

In [33]:
# slow... turn logging back on to show status?
ebs_search_res = scoped_mapping.search_get_annotations_wrapper(ebs_wo_list, bad_chars=chars_to_whiteout, cat_name=biosample_col_to_map,
                                                ontoprefix=target_onto_prefix.lower(), query_fields='', rr=5)
my_best_acceptable = scoped_mapping.get_best_acceptable(ebs_search_res)

my_best_acceptable

Unnamed: 0,category,raw,query,name,cosine_rank,cosine_dist,obo_id,label,search_rank,ontology_prefix,scope,type,iri,ontology_name
0,env_broad_scale,,,,1,0.0,,,1,,,,,
234,env_broad_scale,boreal forest,boreal forest,boreal forest,1,0.0,ENVO:01000250,subpolar coniferous forest biome,1,ENVO,has_narrow_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_01000250,envo
337,env_broad_scale,coastal marine estuary,coastal marine estuary,estuarine coastal surface layer,1,0.041,ENVO:01001302,estuarine coastal surface layer,2,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_01001302,envo
369,env_broad_scale,coastal sea area,coastal sea area,coastal area,1,0.024,ENVO:00000303,sea coast,4,ENVO,has_related_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_00000303,envo
377,env_broad_scale,coastal water,coastal water,coastal water,1,0.0,ENVO:00002150,coastal sea water,1,ENVO,has_broad_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_00002150,envo
598,env_broad_scale,deep water coral reef,deep water coral reef,marine coral reef deep fore reef,1,0.047,ENVO:01000149,marine coral reef deep fore reef,2,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_01000149,envo
663,env_broad_scale,eukaryotes,eukaryotes,eukaryotes,1,0.0,NCBITaxon:2759,Eukaryota,1,ENVO,synonym,synonym,http://purl.obolibrary.org/obo/NCBITaxon_2759,envo
744,env_broad_scale,freshwater,freshwater,freshwater,1,0.0,ENVO:00002011,fresh water,1,ENVO,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_00002011,envo
770,env_broad_scale,freshwater great lakes,freshwater great lakes,freshwater lake,1,0.029,ENVO:00000021,freshwater lake,5,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_00000021,envo
827,env_broad_scale,hot springs,hot springs,hot spring,1,0.035,ENVO:00000051,hot spring,2,ENVO,hasExactSynonym,,http://purl.obolibrary.org/obo/ENVO_00000051,envo


In [None]:
no_acceptable_mappings = scoped_mapping.get_no_acceptable_mappings(ebs_search_res, my_best_acceptable)

no_acceptable_mappings

- Some broad scales look like place names
- Some get a good hit if 'biome' is added
- how to manually review and then add back in?
- add to biosample SQLite database:
    - no_acceptable_mappings
    - my_best_acceptable
    - ebs_search_results (no acceptable + all acceptable)?
    - mapping_candidates -> mapping_scratch (ID-based and exact-tidied-label-based)