In [3]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /data/wmanuel3/mapping/mcon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /data/wmanuel3/mapping/mcon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords
from string import punctuation

stopwords = stopwords.words('english')
stopwords.append('vaccine')
punctuation = list(punctuation)

def preprocessing(words, stpw, punct):
    tokens = word_tokenize(words.lower())
    out = set([t for t in tokens if t not in stpw and t not in punct])
    return out

In [160]:
from owlready2 import *
import re
def load_vo(ontology_url, stopwords, punctuation):
    """
    Load the Vaccine Ontology (VO) from the given URL and return a dictionary of the concepts in the ontology.
    The dictionary has the following structure:
    {concept_id: {'Label': label, #.label annotation
                  'Vaccine Name': vaccine_name, #.VO_0003158 annotation
                  'Definition': definition, #.IAO_0000115 annotation
                  'Editor Note': editor_note, #.IAO_0000116 annotation
                  'FDA Vaccine Indications': fda_vaccine_indications, #.VO_0003160 annotation
                  'Tokens': tokens, #preprocessed tokens from the label and vaccine name
                  'CVX_Codes': cvx_codes #list of CVX codes from .seeAlso and .VO_0005438 annotations
                  }
    }
    """
    vo = owlready2.get_ontology(ontology_url)
    vo.load()
    vo_dict = {}
    for i, concept in enumerate(vo.classes()):
        cvx_codes = []
        vo_id = re.search(r"\.(.*)", str(concept)).group(1)
        if vo_id == 'Thing':
            continue
        else:
            lab = ''.join(concept.label)
            vac_prop = ''.join(concept.VO_0003158)
            defn = ''.join(concept.IAO_0000115)
            edt_note = ''.join(concept.IAO_0000116)
            vac_fda = ''.join(concept.VO_0003160)
            if len(concept.VO_0005438) > 0:
                for cvx in concept.VO_0005438:
                    cvx_codes.append(cvx)
            for cvx in concept.seeAlso:
                if cvx.startswith('CVX'):
                    code = re.search(r"^CVX\D*:\D*(\d*)", cvx).group(1)
                    cvx_codes.append(code)
            
            if vac_prop != '':
                tokens = preprocessing(vac_prop,stopwords,punctuation)
            else:
                tokens = preprocessing(lab, stopwords, punctuation)
            vo_dict[vo_id] = {'Label':lab,
                              'Vaccine Name':vac_prop,
                              'Definition':defn,
                              'Editor Note':edt_note,
                              'FDA Vaccine Indications':vac_fda,
                              'Tokens':tokens,
                              'CVX_Codes':cvx_codes}
        if i % 100 == 0:
            print(f'Processed {i} concepts')
    return vo_dict

In [161]:
vo_04_24 = 'https://data.bioontology.org/ontologies/VO/submissions/291/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb'
vo_dict_04_24 = load_vo(vo_04_24,stopwords, punctuation)

Processed 0 concepts
Processed 100 concepts
Processed 200 concepts
Processed 300 concepts
Processed 400 concepts
Processed 500 concepts
Processed 600 concepts
Processed 700 concepts
Processed 800 concepts
Processed 900 concepts
Processed 1000 concepts
Processed 1100 concepts
Processed 1200 concepts
Processed 1300 concepts
Processed 1400 concepts
Processed 1500 concepts
Processed 1600 concepts
Processed 1700 concepts
Processed 1800 concepts
Processed 1900 concepts
Processed 2000 concepts
Processed 2100 concepts
Processed 2200 concepts
Processed 2300 concepts
Processed 2400 concepts
Processed 2500 concepts
Processed 2600 concepts
Processed 2700 concepts
Processed 2800 concepts
Processed 2900 concepts
Processed 3000 concepts
Processed 3100 concepts
Processed 3200 concepts
Processed 3300 concepts
Processed 3400 concepts
Processed 3500 concepts
Processed 3600 concepts
Processed 3700 concepts
Processed 3800 concepts
Processed 3900 concepts
Processed 4000 concepts
Processed 4100 concepts
Proc

In [129]:
# 1.1.218 (02-26-2023)
vo_02_23 = 'https://data.bioontology.org/ontologies/VO/submissions/282/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb'
vo_dict_02_23 = load_vo(vo_02_23,stopwords, punctuation)

Processed 100 concepts
Processed 200 concepts
Processed 300 concepts
Processed 400 concepts
Processed 500 concepts
Processed 600 concepts
Processed 700 concepts
Processed 800 concepts
Processed 900 concepts
Processed 1000 concepts
Processed 1100 concepts
Processed 1200 concepts
Processed 1300 concepts
Processed 1400 concepts
Processed 1500 concepts
Processed 1600 concepts
Processed 1700 concepts
Processed 1800 concepts
Processed 1900 concepts
Processed 2000 concepts
Processed 2100 concepts
Processed 2200 concepts
Processed 2300 concepts
Processed 2400 concepts
Processed 2500 concepts
Processed 2600 concepts
Processed 2700 concepts
Processed 2800 concepts
Processed 2900 concepts
Processed 3000 concepts
Processed 3100 concepts
Processed 3200 concepts
Processed 3300 concepts
Processed 3400 concepts
Processed 3500 concepts
Processed 3600 concepts
Processed 3700 concepts
Processed 3800 concepts
Processed 3900 concepts
Processed 4000 concepts
Processed 4100 concepts
Processed 4200 concepts
P

In [130]:
# 1.1.218 (10-05-2022)
vo_10_22 = 'https://data.bioontology.org/ontologies/VO/submissions/280/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb'
vo_dict_10_22 = load_vo(vo_10_22,stopwords, punctuation)

Processed 100 concepts
Processed 200 concepts
Processed 300 concepts
Processed 400 concepts
Processed 500 concepts
Processed 600 concepts
Processed 700 concepts
Processed 800 concepts
Processed 900 concepts
Processed 1000 concepts
Processed 1100 concepts
Processed 1200 concepts
Processed 1300 concepts
Processed 1400 concepts
Processed 1500 concepts
Processed 1600 concepts
Processed 1700 concepts
Processed 1800 concepts
Processed 1900 concepts
Processed 2000 concepts
Processed 2100 concepts
Processed 2200 concepts
Processed 2300 concepts
Processed 2400 concepts
Processed 2500 concepts
Processed 2600 concepts
Processed 2700 concepts
Processed 2800 concepts
Processed 2900 concepts
Processed 3000 concepts
Processed 3100 concepts
Processed 3200 concepts
Processed 3300 concepts
Processed 3400 concepts
Processed 3500 concepts
Processed 3600 concepts
Processed 3700 concepts
Processed 3800 concepts
Processed 3900 concepts
Processed 4000 concepts
Processed 4100 concepts
Processed 4200 concepts
P

In [131]:
import pandas as pd
vo_df_04_24 = pd.DataFrame.from_records(vo_dict_04_24).T
vo_df_04_24['VO_ID'] = vo_df_04_24.index
# Keep vo_df rows where VO_ID begins with 'VO'
vo_df_04_24 = vo_df_04_24[vo_df_04_24['VO_ID'].str.startswith('VO')]
len(vo_dict_04_24)

vo_df_02_23 = pd.DataFrame.from_records(vo_dict_02_23).T
vo_df_02_23['VO_ID'] = vo_df_02_23.index
vo_df_02_23 = vo_df_02_23[vo_df_02_23['VO_ID'].str.startswith('VO')]

vo_df_10_22 = pd.DataFrame.from_records(vo_dict_10_22).T
vo_df_10_22['VO_ID'] = vo_df_10_22.index
vo_df_10_22 = vo_df_10_22[vo_df_10_22['VO_ID'].str.startswith('VO')]
# Check the number of concepts in each version
print(f'VO 04-24: {len(vo_df_04_24)}, VO 02-23: {len(vo_df_02_23)}, VO 10-22: {len(vo_df_10_22)}')

VO 04-24: 7176, VO 02-23: 6238, VO 10-22: 6225


In [132]:
# Get vo_df['CVX_Codes'] counts for empty []
vo_df['CVX_Codes'].apply(lambda x: len(x)).value_counts()
# vo_df['CVX_Codes'].value_counts()

0    6985
1     181
2       7
3       3
Name: CVX_Codes, dtype: int64

In [39]:
# vo_df_old['CVX_Codes'].value_counts()
vo_df_old['CVX_Codes'].apply(lambda x: len(x)).value_counts()

0    6167
1      61
2       7
3       3
Name: CVX_Codes, dtype: int64

In [162]:
# vo_dict['VO_0005440']
# vo_dict['VO_0006011']
vo_dict_04_24['VO_0005435']

{'Label': 'SARS-COV-2 COVID-19 mRNA, bivalent, original/Omicron BA.1, Non-US Vaccine Product (Comirnaty Bivalent), Pfizer-BioNTech',
 'Vaccine Name': '',
 'Definition': 'A Comirnaty COVID-19 mRNA vaccine produced by Pfizer-BioNTech,which is a bivalent vaccine that is effective against the original strain of COVID-19 as well as the Omicron BA.1 variant.A non-US vaccine product.',
 'Editor Note': '',
 'FDA Vaccine Indications': '',
 'Tokens': {'ba.1',
  'bivalent',
  'comirnaty',
  'covid-19',
  'mrna',
  'non-us',
  'original/omicron',
  'pfizer-biontech',
  'product',
  'sars-cov-2'},
 'CVX_Codes': ['520']}

In [144]:
# vo_dict_old['VO_0005440']
# vo_dict_old['VO_0006011']
vo_dict_02_23['VO_0006067']

KeyError: 'VO_0006067'

In [145]:
vo_dict_10_22['VO_0006067']

KeyError: 'VO_0006067'

In [181]:
#New dataframe joining vo_df and vo_df_old on VO_ID. 
vo_df_all = vo_df_04_24[['VO_ID','CVX_Codes']].merge(vo_df_02_23[['VO_ID','CVX_Codes']], on='VO_ID', how='outer', suffixes=('_new', '_old'))
vo_df_all['Identical']= vo_df_all['CVX_Codes_new'] == vo_df_all['CVX_Codes_old']
# vo_df_all[vo_df_all['Identical'] == False].to_csv('vo_cvx_codes_diff_2.csv', index=False)

# Drop null rows
vo_df_all = vo_df_all.dropna()

In [184]:
vo_df_all[(vo_df_all['Identical'] == True) & (vo_df_all['CVX_Codes_new'].apply(lambda x: len(x)) > 0)]

Unnamed: 0,VO_ID,CVX_Codes_new,CVX_Codes_old,Identical
3,VO_0000003,[75],[75],True
4,VO_0000004,[48],[48],True
5,VO_0000005,"[115, 20]","[115, 20]",True
6,VO_0000006,"[140, 141]","[140, 141]",True
14,VO_0000014,[24],[24],True
...,...,...,...,...
6171,VO_0010715,[48],[48],True
6178,VO_0010722,[120],[120],True
6193,VO_0010737,"[08, 43, 44]","[08, 43, 44]",True
7004,VO_0011558,[134],[134],True


Potential Issue for Evaluation: 
    
    Cannot use newer version for mapping evaluation. All new CVX annotations are for NEW CONCEPTS 

In [None]:
yuanyi_results = ['VO_0000738','VO_0003092','VO_0000731','VO_0000945','VO_0000654','VO_0010637','VO_0000656','VO_0003834','VO_0000765','VO_0000771','VO_0003442','VO_0003122','VO_0000648','VO_0010727','VO:0003418','VO_0000660','VO_0000659','VO_0010715','VO_0003168','VO_0004872','VO_0004877','VO_0012167','VO_0000295','VO_0000762','VO_0000764','VO_0000761','VO_0000422','VO_0000451','VO_0000463','VO_0012166','VO_0000433','VO_0000152','VO_0004112','VO_0000743','VO_0000102','VO_0010738','VO_0004175','VO_0010736','VO_0004877','VO_0003439','VO_0000031','VO_0000980','VO_0000978','VO_0004281','VO_0010710','VO_0000094','VO_0003139','VO_0003067','VO_0003152','VO_0010743','VO_0003095','VO_0003155','VO_0005158','VO_0005155','VO_0005159','VO_0005093','VO_0004991','VO_0005187','VO_0005163','VO_0005088','VO_0005166','VO_0005141','VO_0004992','VO_0005162','VO_0005192','VO_0005084','VO_0005081','VO_0005099','VO_0001320']

In [187]:

cvx_file = 'src_data/cvx_code.csv'
cvx_df = pd.read_csv(cvx_file, sep='|', names = ['Short Description','Full Vaccine name','CVX Code','Vaccine Status','Last Updated Date','Notes'])
cvx_df

FileNotFoundError: [Errno 2] No such file or directory: 'src_data/cvx_code.csv'

In [80]:
# From vo_df_all, get rows where CVX_Codes_new is not empty and CVX_Codes_old is empty
vo_df_all[vo_df_all['VO_ID']=='VO_0000003']['CVX_Codes_new']

3    [75]
Name: CVX_Codes_new, dtype: object

In [180]:
vo_df_all[vo_df_all['VO_ID']=='VO_0000003']['CVX_Codes_old']

3    [75]
Name: CVX_Codes_old, dtype: object

In [51]:
vo_df_all.dtypes

VO_ID            object
CVX_Codes_new    object
CVX_Codes_old    object
dtype: object

In [33]:
vo_df[['VO_ID','CVX_Codes']]

Unnamed: 0,VO_ID,CVX_Codes
BFO_0000001,BFO_0000001,[]
BFO_0000002,BFO_0000002,[]
BFO_0000003,BFO_0000003,[]
BFO_0000004,BFO_0000004,[]
BFO_0000006,BFO_0000006,[]
...,...,...
VO_0015475,VO_0015475,[]
VO_0015476,VO_0015476,[]
VO_0015478,VO_0015478,[]
VO_0015483,VO_0015483,[]
