In [1]:
from datetime import datetime
from pkg_resources import get_distribution, DistributionNotFound
from strsimpy.cosine import Cosine
import pandas as pd
import re
import requests as requests
import sqlite3
import string
import urllib
import yaml
from xml.etree import ElementTree
from tdda import rexpy
import scoped_mapping

In [2]:
# Locations of the local SQLite databases this notebook reads.
# NOTE(review): these are absolute paths on one developer's machine; the
# notebook cannot be re-run elsewhere until they are made configurable.
biosample_sqlite_file = "/Users/MAM/Documents/gitrepos/biosample-analysis/target/harmonized_table.db"
# TODO process these as a list?
ncbitaxon_sqlite_file = "/Users/MAM/Documents/gitrepos/semantic-sql/db/ncbitaxon.db"
envo_sqlite_file = "/Users/MAM/Documents/gitrepos/semantic-sql/db/envo.db"
# Open the two ontology databases. Gotcha: sqlite3.connect() creates a new,
# empty database file when the path does not exist (as long as the directory
# does), so a wrong path only surfaces later as a "no such table" error.
ncbitaxon_cnx = sqlite3.connect(ncbitaxon_sqlite_file)
envo_cnx = sqlite3.connect(envo_sqlite_file)
# Ontology prefix handed to the env_package normalisation call further down.
target_onto_prefix = 'ENVO'
# The three settings below are not read by any cell visible in this part of the
# notebook. NOTE(review): presumably chars_to_whiteout lists characters to be
# replaced by whitespace when tidying strings -- confirm against later cells.
chars_to_whiteout = '._-'
my_query_fields = ''
my_row_req = 3

# Manual corrections applied to tidied env_package strings: each key is a
# spelling seen in the data and each value is the label it is overridden with.
# All of the "nothing was recorded" placeholders share one replacement label.
env_package_overrides = {
    'built environment': 'built',
    'misc environment': 'miscellaneous',
    **dict.fromkeys(
        ('missing', 'unknown', 'default', 'unspecified', 'not available', 'not collected'),
        'no environmental package',
    ),
}

In [3]:
# Connection to the harmonized Biosample database. The cells below both query
# it and write helper tables into it with DataFrame.to_sql().
biosample_cnx = sqlite3.connect(biosample_sqlite_file)

In [4]:
# Sample of the data we're working with:
# the first ten rows of the biosample table, restricted to the identifier,
# package, taxon and environment columns selected in the query below.
q = """
select
    id,
    env_package,
    package,
    package_name,
    host_taxid,
    taxonomy_id,
    env_broad_scale,
    env_local_scale,
    env_medium
    from biosample b
limit 10
"""
biosample_first_ten = pd.read_sql(q, biosample_cnx)
# print() emits the plain-text rendering of the frame, wrapped across column groups.
print(biosample_first_ten)

                       id env_package      package  \
0  BIOSAMPLE:SAMN00000002     missing  MIGS.ba.5.0   
1  BIOSAMPLE:SAMN00000003     missing  MIGS.ba.5.0   
2  BIOSAMPLE:SAMN00000004     missing  MIGS.ba.5.0   
3  BIOSAMPLE:SAMN00000005        None  Generic.1.0   
4  BIOSAMPLE:SAMN00000006        None  Generic.1.0   
5  BIOSAMPLE:SAMN00000007     missing  MIGS.ba.5.0   
6  BIOSAMPLE:SAMN00000008     missing  MIGS.ba.5.0   
7  BIOSAMPLE:SAMN00000009        None  Generic.1.0   
8  BIOSAMPLE:SAMN00000010        None  Generic.1.0   
9  BIOSAMPLE:SAMN00000011        None  Generic.1.0   

                                   package_name host_taxid taxonomy_id  \
0  MIGS: cultured bacteria/archaea; version 5.0       9606      445970   
1  MIGS: cultured bacteria/archaea; version 5.0       9606      445972   
2  MIGS: cultured bacteria/archaea; version 5.0       9606      449673   
3                                       Generic       None        6526   
4                                  

In [5]:
# Get the canonical checklist and package terms from NCBI
# Unfortunately it doesn't do a very good job of differentiating
# checklists (MIMAG, MIMARKS, etc.)
# from packages (soil, water, etc.)
# what about ba , euk, etc?
# NOTE(review): presumably get_package_dictionary() fetches this list over the
# network -- confirm, since that would make the cell depend on connectivity.
package_dictionary = scoped_mapping.get_package_dictionary()
print(package_dictionary)
# Store the dictionary next to the biosample table so the two can be joined in
# SQL. if_exists='replace' drops any package_dictionary table left by an
# earlier run; index=False keeps the pandas index out of the table.
package_dictionary.to_sql('package_dictionary', biosample_cnx, if_exists='replace', index=False)

                            Name  \
0                    Generic.1.0   
1              SARS-CoV-2.cl.1.0   
2                Pathogen.cl.1.0   
3               Pathogen.env.1.0   
4                    Microbe.1.0   
..                           ...   
149  MIUVIG.plant-associated.5.0   
150          MIUVIG.sediment.5.0   
151              MIUVIG.soil.5.0   
152        MIUVIG.wastewater.5.0   
153             MIUVIG.water.5.0   

                                           DisplayName  \
0                                              Generic   
1    SARS-CoV-2: clinical or host-associated; versi...   
2    Pathogen: clinical or host-associated; version...   
3      Pathogen: environmental/food/other; version 1.0   
4                                 Microbe; version 1.0   
..                                                 ...   
149  MIUVIG: uncultivated virus genome, plant-assoc...   
150  MIUVIG: uncultivated virus genome, sediment; v...   
151  MIUVIG: uncultivated virus genome, soil;

In [6]:
# Do the Biosample checklist/package fields match any of the canonical values?
# How many Biosample rows are there?
# This total is the baseline that the join count in the next cell is compared against.
q = """
select count(*) as biosample_row_count
from biosample b
"""
# timed_query returns a two-item result that is unpacked into the query result
# and the elapsed time of the query.
[biosample_row_count, query_duration] = scoped_mapping.timed_query(q, biosample_cnx, print_timing=False)
print(biosample_row_count)
print(query_duration)

   biosample_row_count
0             14300584
0:00:00.049769


In [7]:
# How many of those rows can be inner-joined with the canonical checklists/packages?
# Specifically, joining biosample.package_name = package_dictionary.DisplayName
# If this count equals the total row count, every Biosample row carries a
# canonical package_name.
# TODO add indexing to docs and or makefile
# create index biosample_package_name_idx on biosample(package_name);
# create index package_dictionary_DisplayName_idx on package_dictionary(DisplayName);
# create index biosample_package_idx on biosample(package);
# create index biosample_p_pn_idx on biosample(package, package_name);
# Note: "pd" inside the SQL text is only a table alias for package_dictionary;
# it is unrelated to the pandas module imported as pd.
q = """
select
    count(*) as cannonical_package_name_count
from
    biosample b
inner join package_dictionary pd on
    b.package_name = pd.DisplayName
"""
# NOTE(review): "cannonical" is a misspelling of "canonical". It is left as is
# because it is both a variable name and a result column name that other cells
# may reference.
[cannonical_package_name_count, query_duration] = scoped_mapping.timed_query(q, biosample_cnx, print_timing=True)
print(cannonical_package_name_count)
print(query_duration)

2021-05-24 14:19:24.990680
2021-05-24 14:19:26.123880
0:00:01.133200
   cannonical_package_name_count
0                       14300584
0:00:01.133200


In [8]:
# What do the combinations of package and package_name look like in the Biosample dataset?
# One output row per distinct (package, package_name) pair with its row count,
# sorted by package and then package_name.
q = """
select
    package,
    package_name,
    count(*) as count
from
    biosample b
group by
    package ,
    package_name
order by
    package ,
    package_name
"""
# print_timing=True makes timed_query print its own timing lines before the
# result and duration are printed below.
[package_name_combos, query_duration] = scoped_mapping.timed_query(q, biosample_cnx, print_timing=True)
print(package_name_combos)
print(query_duration)

2021-05-24 14:19:26.580389
2021-05-24 14:19:28.497415
0:00:01.917026
                       package  \
0           Beta-lactamase.1.0   
1                  Generic.1.0   
2                    Human.1.0   
3             Invertebrate.1.0   
4                  MIGS.ba.5.0   
..                         ...   
124  Model.organism.animal.1.0   
125            Pathogen.cl.1.0   
126           Pathogen.env.1.0   
127                  Plant.1.0   
128                  Virus.1.0   

                                          package_name     count  
0                          Beta-lactamase; version 1.0       556  
1                                              Generic  10186430  
2                                   Human; version 1.0    368893  
3                            Invertebrate; version 1.0    131085  
4         MIGS: cultured bacteria/archaea; version 5.0     28686  
..                                                 ...       ...  
124              Model organism or animal; version 1.

In [9]:
# What about the Biosample env_package values?
# Are they also a small, highly regular set?
# One output row per distinct env_package value (NULL included), most frequent first.
q = """
select
    env_package,
    count(*) as count
from
    biosample b
group by
    env_package
order by
    count(*) desc
"""
# env_package_count is reused later as the input of the env_package
# normalisation step, so it must be kept as returned here.
[env_package_count, query_duration] = scoped_mapping.timed_query(q, biosample_cnx)
print(env_package_count)
print(query_duration)

                        env_package     count
0                              None  14083847
1                   host-associated     49254
2                         human-gut     47921
3                             water     16367
4                        human-skin     13706
..                              ...       ...
87                      env_package         1
88                   gut microbiome         1
89  marine sediment (ENVO:00002113)         1
90                           saliva         1
91       sea water, [ENVO:00002149]         1

[92 rows x 2 columns]
0:00:01.018992


In [10]:
# The env_package values need cleaning up, so start by collecting every
# canonical spelling to compare against.
# Add tidied copies of both package-name columns of the NCBI dictionary:
# EnvPackage -> eptidy, then EnvPackageDisplay -> epdtidy.
package_dictionary = scoped_mapping.make_tidy_col(
    scoped_mapping.make_tidy_col(package_dictionary, 'EnvPackage', 'eptidy'),
    'EnvPackageDisplay',
    'epdtidy',
)
# Refresh the copy held in SQLite so it carries the two new columns.
package_dictionary.to_sql('package_dictionary', biosample_cnx, if_exists='replace', index=False)
# Pool the values of both tidied columns, starting from an empty list.
valid_combo = scoped_mapping.add_unique_to_list(
    scoped_mapping.add_unique_to_list([], package_dictionary['eptidy']),
    package_dictionary['epdtidy'],
)
print(valid_combo)

['', 'air', 'built', 'host associated', 'human associated', 'human gut', 'human oral', 'human skin', 'human vaginal', 'microbial', 'microbial mat biofilm', 'miscellaneous', 'miscellaneous or artificial', 'no environmental package', 'plant associated', 'sediment', 'soil', 'wastewater', 'wastewater sludge', 'water']


In [20]:
# determine ID patterns
# List every stanza in the EnvO database that is typed as an owl:Class and is
# its own subject.
q = """
select
    distinct stanza
    from statements s
where
    predicate = 'rdf:type'
    and "object" = 'owl:Class'
    and stanza = subject"""
# include non-envo IDs that come from envo?
[ids_from_envo, query_duration] = scoped_mapping.timed_query(q, envo_cnx)
print(query_duration)
# Add a 'prefix' column derived from the 'stanza' column.
ids_from_envo = scoped_mapping.add_prefix_col(ids_from_envo, 'stanza', 'prefix')
print(ids_from_envo)

# Build a lookup keyed by prefix; the normalisation call below reads the ENVO
# entry from it. The printed result shows one ID regular expression per prefix.
id_patterns = scoped_mapping.get_multi_term_patterns(ids_from_envo, 'stanza', 'prefix')
print(id_patterns)

def env_package_nomralizastion(dataframe, col_to_normalize, pattern_name, id_replacement_rule):
    """Split a dotted column into prefix and suffix, then decompose the suffix.

    Each value of ``col_to_normalize`` is split at its first ``.``: the text
    before it is stored in ``lhs`` and everything after it in ``rhs``. A value
    without a ``.`` goes entirely into ``rhs`` and gets ``lhs == ''``. Missing
    values stay missing in both columns. ``rhs`` is then handed to
    ``scoped_mapping.decompose_series`` and whatever that returns is appended
    column-wise.

    The misspelled function name is kept because existing cells call it.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame. It is copied, so the caller's object is left untouched.
    col_to_normalize : str
        Name of the column of strings to split.
    pattern_name
        Not used by this function; retained so existing calls keep working.
    id_replacement_rule
        Forwarded unchanged as the second argument of
        ``scoped_mapping.decompose_series`` (the caller in this notebook passes
        an ID regular expression).

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` with ``lhs`` and ``rhs`` added, followed by the
        columns produced by ``scoped_mapping.decompose_series``.
    """
    # Work on a copy: adding columns to the caller's frame is a hidden side effect.
    normalized = dataframe.copy()
    raw_values = normalized[col_to_normalize]

    # n=1 caps the split at two pieces, so values with several dots no longer
    # produce extra columns; reindex guarantees both columns exist even when no
    # value contains a dot. astype(object) lets string data be written into a
    # column that reindex created as all-NaN.
    parts = (
        raw_values.str.split('.', n=1, expand=True)
        .reindex(columns=[0, 1])
        .astype(object)
    )

    # Rows that hold a value but no dot. Derived from the data itself rather
    # than from the filler object pandas puts into short rows; na=True keeps
    # missing values out of this group.
    no_prefix = ~raw_values.str.contains('.', regex=False, na=True).astype(bool)

    # For undotted values the whole text is the suffix and the prefix is empty.
    normalized['lhs'] = parts[0].mask(no_prefix, '')
    normalized['rhs'] = parts[1].mask(no_prefix, parts[0])

    series_decomposition = scoped_mapping.decompose_series(normalized['rhs'], id_replacement_rule)
    return pd.concat([normalized, series_decomposition], axis=1)

# Normalise the observed env_package values in four steps.
# 1. Split each value into lhs/rhs and decompose the rhs. The ID pattern is
#    looked up with target_onto_prefix so that the prefix passed in and the
#    pattern used can never disagree if the configured prefix changes.
env_package_normalized = env_package_nomralizastion(
    env_package_count,
    'env_package',
    target_onto_prefix,
    id_patterns[target_onto_prefix],
)

# 2. Apply the manual overrides to the 'remaining_tidied' column, writing the
#    result to 'ep_override'.
env_package_normalized = scoped_mapping.add_overrides(
    env_package_normalized, 'remaining_tidied', 'ep_override', env_package_overrides
)

# 3. Record in 'is_canonical' how each overridden value compares with the
#    canonical list collected in valid_combo.
env_package_normalized = scoped_mapping.flag_canonical(
    env_package_normalized, 'ep_override', 'is_canonical', valid_combo
)

# 4. Persist the result next to the biosample table, replacing any earlier copy.
env_package_normalized.to_sql('env_package_normalized', biosample_cnx, if_exists='replace', index=False)



0:00:00.026371
              stanza  prefix
0        BFO:0000001     BFO
1        BFO:0000002     BFO
2        BFO:0000003     BFO
3        BFO:0000004     BFO
4        BFO:0000006     BFO
...              ...     ...
6561  UBERON:3000972  UBERON
6562  UBERON:3000977  UBERON
6563  UBERON:3010200  UBERON
6564  UBERON:4100000  UBERON
6565  UBERON:8410024  UBERON

[6566 rows x 2 columns]
{'BFO': 'BFO:\\d{7}', 'CARO': 'CARO:\\d{7}', 'CHEBI': 'CHEBI:\\d{4,6}', 'ENVO': 'ENVO:\\d{7,8}', 'FAO': 'FAO:0000001', 'FOODON': 'FOODON:\\d{8}', 'GO': 'GO:\\d{7}', 'IAO': 'IAO:\\d{7}', 'NCBITaxon': 'NCBITaxon:\\d+', 'OBI': 'OBI:\\d{7}', 'PATO': 'PATO:\\d{7}', 'PCO': 'PCO:\\d{7}', 'PO': 'PO:\\d{7}', 'RO': 'RO:0002577', 'UBERON': 'UBERON:\\d{7}'}
