In [None]:
import pandas as pd
import numpy as np
import re
import math
import datetime
import os 

from pubchempy import get_compounds, Compound
from molvs import validate_smiles
# import cirpy
# from chembl_webresource_client.new_client import new_client

## Read MIMIC-Extract Data

In [None]:
# Directory (relative to the working directory) holding the MIMIC-Extract pickle files.
mimic_extract_path = "mimic-extract/"

In [None]:
# Feature tables for the train / dev / test splits (presumably the imputed
# level-2 features, judging by the file names).
# NOTE(review): pd.read_pickle can execute code embedded in the file; only load
# pickles that come from a trusted source.
x_train, x_dev, x_test = (
    pd.read_pickle(os.path.join(mimic_extract_path, file_name_))
    for file_name_ in (
        "lvl2_imputer_train.pkl",
        "lvl2_imputer_dev.pkl",
        "lvl2_imputer_test.pkl",
    )
)

# Outcome tables for the same three splits.
y_train, y_dev, y_test = (
    pd.read_pickle(os.path.join(mimic_extract_path, file_name_))
    for file_name_ in ("Ys_train.pkl", "Ys_dev.pkl", "Ys_test.pkl")
)

# Outcome table whose index is used later to collect the cohort's ICU stay ids.
ys = pd.read_pickle(os.path.join(mimic_extract_path, "Ys.pkl"))

## Read MIMIC Data

In [None]:
# Directory (relative to the working directory) holding the raw MIMIC-III CSV tables.
mimic_path = "mimic-iii/"

In [None]:
# Raw MIMIC-III tables, each read from its CSV file under `mimic_path`.
icustays_df, admission_df, prescription_df = (
    pd.read_csv(os.path.join(mimic_path, csv_name_))
    for csv_name_ in ("ICUSTAYS.csv", "ADMISSIONS.csv", "PRESCRIPTIONS.csv")
)

## Filter the Prescription Table

1. Filter by ICU stay ID (keep only stays that belong to the cohort).
2. Filter by time (first 24 hours of the ICU stay).

In [None]:
# Collect the third index level of every row of `ys`.
# NOTE(review): presumably this level is the ICU stay id — confirm against the
# index layout of Ys.pkl.
icu_stay_ids = {index_entry_[2] for index_entry_ in ys.index}

In [None]:
# Keep only the prescriptions whose ICUSTAY_ID belongs to the cohort;
# the two prints show the table shape before and after the filter.
print(prescription_df.shape)
in_cohort_mask_ = prescription_df.ICUSTAY_ID.isin(list(icu_stay_ids))
sub_prescription_df = prescription_df[in_cohort_mask_]
print(sub_prescription_df.shape)

In [None]:
# # based on icu time (first 24h)
# sub_icu_df = icustays_df[icustays_df.ICUSTAY_ID.isin(list(icu_stay_ids))]

# patient_intime_dict = {}
# for i in sub_icu_df.itertuples():

#     patient_id_ = i.SUBJECT_ID
#     icustay_id_ = i.ICUSTAY_ID
#     intime_ = i.INTIME
#     if patient_id_ in patient_intime_dict:
#         print("ERROR", patient_id_)
#     else:
#         patient_intime_dict[patient_id_] = intime_

# print("We have ", len(patient_intime_dict), " patients in the beginning before each preprocessing step")

# drug_names_set = set()
# drug_generic_names_set = set()
# drug_ndc_set = set()
# drug_index_set = set()

# for patient_, intime_ in patient_intime_dict.items():
        
    
#     intime_date_ = datetime.datetime.strptime(intime_, '%Y-%m-%d %H:%M:%S')
#     next_date_ = (intime_date_ + datetime.timedelta(days=1))
    
#     result_ = sub_prescription_df[sub_prescription_df.SUBJECT_ID == patient_]
    
#     if len(result_) == 0: 
#         # This patients have not any drug in sub prescription df
#         continue 
    
#     for drug_ in result_.itertuples():
        
#         #if isinstance(drug.STARTDATE, float): continue
            
#         drug_start_day_ = datetime.datetime.strptime(drug_.STARTDATE, '%Y-%m-%d %H:%M:%S').day
        
#         intime_day_ = intime_date_.day
#         next_day_ = next_date_.day
        
#         if drug_start_day_ == intime_day_ or drug_start_day_ == next_day_:
#             drug_names_set.add(drug_.DRUG)
#             drug_generic_names_set.add(drug_.DRUG_NAME_GENERIC)
#             drug_ndc_set.add(drug_.NDC)
#             drug_index_set.add(drug_.Index)
#         else:
#             continue
            
## This filtering step is slow, so its results are saved to pickle files
# pd.to_pickle(drug_names_set, "drug_names_set.p")
# pd.to_pickle(drug_generic_names_set, "drug_generic_names_set.p")
# pd.to_pickle(drug_ndc_set, "drug_ndc_set.p")
# pd.to_pickle(drug_index_set, "drug_index_set.p")

In [None]:
# Reload the cached sets: drug names, generic names, NDC codes and row index labels.
drug_names_set, drug_generic_names_set, drug_ndc_set, drug_index_set = (
    pd.read_pickle(cache_file_)
    for cache_file_ in (
        "drug_names_set.p",
        "drug_generic_names_set.p",
        "drug_ndc_set.p",
        "drug_index_set.p",
    )
)

# Sizes of the four sets that drive the rest of the search.
cached_sets_ = (drug_names_set, drug_generic_names_set, drug_ndc_set, drug_index_set)
print(*(len(cached_set_) for cached_set_ in cached_sets_))

## Preprocessing Drug Names

In [None]:
def preprocess(text):
    """Normalize a raw prescription drug name before searching for it by name.

    Steps, applied in this order:
      1. strip surrounding whitespace;
      2. delete a fixed list of literal markers (case-sensitive);
      3. delete percentages, decimal numbers, "digits followed by a dot"
         fragments and innermost parenthesised groups;
      4. collapse every whitespace run to a single space.

    Parameters
    ----------
    text : str
        Raw drug name.

    Returns
    -------
    str
        The cleaned name; it may be an empty string.
    """
    # The markers are removed one after another, so their order matters.
    # NOTE(review): two quirks are deliberately preserved here, because the cached
    # lookup results in this notebook appear to be keyed by this function's output:
    #   * "~<IND>" can never match, since "<IND>" is removed right before it
    #     (a lone "~" is left behind);
    #   * "NS" is removed anywhere, even inside a longer word
    #     ("INSULIN" becomes "IULIN").
    literal_markers = (
        "*NF*",
        "NF*",
        "NEO*IV*",
        "NEO*PO*",
        "*nf*",
        "*nf",
        "Neo*Nasal*",
        "<IND>",
        "~<IND>",
        "NEO*IM*",
        "< IND>",
        "NS",
        "P.f.",
        "*NF",
    )
    cleaned = text.strip()
    for marker in literal_markers:
        cleaned = cleaned.replace(marker, "")

    # Percentages first, then full decimals, then dangling "12." fragments,
    # then innermost "(...)" groups.
    removal_patterns = (
        r'\d*%',
        r'\d+\.\d+',
        r'\d+\.',
        r"\([^()]*\)",
    )
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    # split()/join collapses whitespace runs and trims both ends in one go.
    return " ".join(cleaned.split())

In [None]:
# Two mappings between raw and preprocessed drug names:
#   drug_to_preprocessed_dict : raw name          -> preprocessed name
#   preprocessed_to_drug_dict : preprocessed name -> list of raw names sharing it
preprocessed_to_drug_dict = {}
drug_to_preprocessed_dict = {}

for raw_name_ in drug_names_set:
    clean_name_ = preprocess(raw_name_)
    drug_to_preprocessed_dict[raw_name_] = clean_name_
    # setdefault creates the list the first time a preprocessed name is seen.
    preprocessed_to_drug_dict.setdefault(clean_name_, []).append(raw_name_)

In [None]:
# Report how many names exist before and after preprocessing.
n_raw_names_ = len(drug_to_preprocessed_dict)
n_clean_names_ = len(preprocessed_to_drug_dict)
print("There are", n_raw_names_, "unique drugs")
print("There are", n_clean_names_, "unique preproccesed drugs")

## Search All Drugs in Pubchem and other sources

### Search Based on Original Drug Name in Pubchem

In [None]:
## This search is slow, so its results are saved to pickle files

# all_drugs_pubchem_dict = {}
# all_drugs_missing_pubchem_set = set()

# for drug_name_ in prescription_df.DRUG.unique():
    
#     comps_ = get_compounds(drug_name_, 'name')
    
#     if len(comps_) != 0:
#         all_drugs_pubchem_dict[drug_name_] = comps_
#     else: 
#         all_drugs_missing_pubchem_set.add(drug_name_)

# pd.to_pickle(all_drugs_pubchem_dict, "all_drugs_pubchem_dict.p")
# pd.to_pickle(all_drugs_missing_pubchem_set, "all_drugs_missing_pubchem_set.p")

In [None]:
# Cached results of the search by original drug name:
#   all_drugs_pubchem_dict        - drugs that we can find in PubChem
#   all_drugs_missing_pubchem_set - drugs that we can NOT find in PubChem
all_drugs_pubchem_dict, all_drugs_missing_pubchem_set = (
    pd.read_pickle(cache_file_)
    for cache_file_ in ("all_drugs_pubchem_dict.p", "all_drugs_missing_pubchem_set.p")
)

In [None]:
# Coverage of the original-name search: first over every drug in the
# prescription table, then over the names in drug_names_set.
n_unique_mimic_drugs_ = len(prescription_df.DRUG.unique())
n_cohort_drugs_found_ = len(drug_names_set.intersection(all_drugs_pubchem_dict))

print("There are", n_unique_mimic_drugs_, "unique drugs in MIMIC-III")
print("There are", len(all_drugs_pubchem_dict), "drugs that we found in pubchem for all drugs in MIMIC-III")
print("There are still", len(all_drugs_missing_pubchem_set), "drugs that we need to find for all drugs in MIMIC-III")
print("For us, we find", n_cohort_drugs_found_, "drugs in", len(drug_names_set))

In [None]:
# Collect the prescription rows whose DRUG name has no PubChem match yet.

# `.loc` rejects a set indexer in pandas >= 2.0, so the labels are passed as a
# list (row order is the set's iteration order, as before).
final_prescription_df = sub_prescription_df.loc[list(drug_index_set)]
drug_names_that_found_in_pubchem = set(all_drugs_pubchem_dict.keys())

# Index labels of the rows that are still unresolved after the original-name search.
missing_index_set = {
    row_.Index
    for row_ in final_prescription_df.itertuples()
    if row_.DRUG not in drug_names_that_found_in_pubchem
}

In [None]:
# Drug names in drug_names_set that the original-name search did not resolve.
remain_missing_drugs = drug_names_set.difference(all_drugs_pubchem_dict)

n_missing_rows_ = len(missing_index_set)
print("After search all drug names in Pubchem, we have still", n_missing_rows_, "rows that we need to find")
print("Number of drugs that we need to find", len(remain_missing_drugs))

### Search Based on Preprocessed Drug Name in Pubchem

In [None]:
# ## This search is slow, so its result is saved to a pickle file

# preprocessed_drugs_pubchem_dict = {}

# for drug_name_ in remain_missing_drugs:

#     pre_drug_name_ = preprocess(drug_name_)
    
#     if pre_drug_name_ == "" or pre_drug_name_ == " " or pre_drug_name_ == None:
#         continue

#     comps_ = get_compounds(pre_drug_name_, 'name')
    
#     if len(comps_) != 0:
#         if pre_drug_name_ not in preprocessed_drugs_pubchem_dict:
#             preprocessed_drugs_pubchem_dict[pre_drug_name_] = comps_            

# pd.to_pickle(preprocessed_drugs_pubchem_dict, "preprocessed_drugs_pubchem_dict.p")

In [None]:
# Cached result of the search by preprocessed drug name; its keys are looked up
# in preprocessed_to_drug_dict further down, so they must be preprocess() outputs.
preprocessed_drugs_pubchem_dict = pd.read_pickle("preprocessed_drugs_pubchem_dict.p")

In [None]:
# Number of extra names resolved thanks to preprocessing.
n_preprocessed_hits_ = len(preprocessed_drugs_pubchem_dict)
print("We find", n_preprocessed_hits_, "more drugs in preprocessed drug names")

In [None]:
# Collect the prescription rows still unresolved after the original-name and
# preprocessed-name searches.

# `.loc` rejects a set indexer in pandas >= 2.0, so the labels are passed as a
# list (row order is the set's iteration order, as before).
final_prescription_df = sub_prescription_df.loc[list(drug_index_set)]

drug_names_that_found_in_pubchem = set(all_drugs_pubchem_dict.keys())

# Raw drug names whose preprocessed form was found in PubChem.
original_names_of_preproccesed_drugs = {
    original_drug_
    for pre_drug_ in preprocessed_drugs_pubchem_dict
    for original_drug_ in preprocessed_to_drug_dict[pre_drug_]
}

# A row is resolved as soon as one lookup recognises it. The per-row
# preprocess() call of the earlier version was never used and is dropped.
missing_index_set = set()
for row_ in final_prescription_df.itertuples():
    is_resolved_ = (
        row_.DRUG in drug_names_that_found_in_pubchem
        or row_.DRUG in original_names_of_preproccesed_drugs
    )
    if not is_resolved_:
        missing_index_set.add(row_.Index)

In [None]:
# Rows still unresolved after the preprocessed-name search.
n_missing_rows_ = len(missing_index_set)
print("After searching preproccesed drug names in Pubchem, we have still", n_missing_rows_, "rows that we need to find")

### Search Based on Original Generic Drug Name in Pubchem

In [None]:
# Generic names of the prescription rows that are still unresolved.
generic_drug_names_for_search_set = {
    row_.DRUG_NAME_GENERIC
    for row_ in sub_prescription_df.itertuples()
    if row_.Index in missing_index_set
}

In [None]:
# Generic names already known to be missing from PubChem need no second query.
already_checked_ = generic_drug_names_for_search_set.intersection(all_drugs_missing_pubchem_set)
still_to_check_ = generic_drug_names_for_search_set.difference(all_drugs_missing_pubchem_set)

print("There are", len(generic_drug_names_for_search_set), "generic drugs name that we need to check")
print("Hovewer because we have already check", len(already_checked_), "we need to just check", len(still_to_check_))

In [None]:
# # ## This search is slow, so its result is saved to a pickle file

# missing_drugs_ = generic_drug_names_for_search_set.difference(all_drugs_missing_pubchem_set)

# generic_drugs_pubchem_dict = {}

# for generic_drug_name_ in missing_drugs_:
    
#     if isinstance(generic_drug_name_, float): continue
    
#     comps_ = get_compounds(generic_drug_name_, 'name')
    
#     if len(comps_) != 0:
#         if generic_drug_name_ not in generic_drugs_pubchem_dict:
#             generic_drugs_pubchem_dict[generic_drug_name_] = comps_
# pd.to_pickle(generic_drugs_pubchem_dict, "generic_drugs_pubchem_dict.p")

In [None]:
# Cached result of the search by generic drug name (keyed by DRUG_NAME_GENERIC values).
generic_drugs_pubchem_dict = pd.read_pickle("generic_drugs_pubchem_dict.p")

In [None]:
# Number of generic names that were resolved in PubChem.
n_generic_hits_ = len(generic_drugs_pubchem_dict)
print("We find", n_generic_hits_, "drugs in pubchem with original generic drug name")

In [None]:
# Collect the prescription rows still unresolved after the original-name,
# preprocessed-name and generic-name searches.

# `.loc` rejects a set indexer in pandas >= 2.0, so the labels are passed as a
# list (row order is the set's iteration order, as before).
final_prescription_df = sub_prescription_df.loc[list(drug_index_set)]

drug_names_that_found_in_pubchem = set(all_drugs_pubchem_dict.keys())

# Raw drug names whose preprocessed form was found in PubChem.
original_names_of_preproccesed_drugs = {
    original_drug_
    for pre_drug_ in preprocessed_drugs_pubchem_dict
    for original_drug_ in preprocessed_to_drug_dict[pre_drug_]
}

# A row is resolved as soon as one lookup recognises it. The per-row
# preprocess() call of the earlier version was never used and is dropped.
missing_index_set = set()
for row_ in final_prescription_df.itertuples():
    is_resolved_ = (
        row_.DRUG in drug_names_that_found_in_pubchem
        or row_.DRUG in original_names_of_preproccesed_drugs
        or row_.DRUG_NAME_GENERIC in generic_drugs_pubchem_dict
    )
    if not is_resolved_:
        missing_index_set.add(row_.Index)

In [None]:
# Rows still unresolved after the generic-name search.
n_missing_rows_ = len(missing_index_set)
print("After searching generic drug names in Pubchem, we have still", n_missing_rows_, "rows that we need to find")

In [None]:
# from chembl_webresource_client.new_client import new_client
# molecule = new_client.molecule
# res = molecule.search('Albumin')

# import cirpy
# _ = cirpy.resolve('Albumin (Human)', 'smiles')

### Fixing Drug Names Manually

In [None]:
# Hand-written mapping: raw DRUG value -> name to search for in PubChem.
# Keys must match the DRUG column exactly (including repeated / trailing spaces).
#
# "Aspirin EC" and "Piperacillin-Tazobactam Na" used to appear twice in this
# literal; Python silently keeps the last value of a repeated key, so only that
# effective value ("Aspirin" / "Piperacillin") is listed now.
#
# NOTE(review): combination products are mapped to a single component, and not
# always the same one ("Oxycodone-Acetaminophen (5mg-325mg)" -> "Acetaminophen"
# but "Oxycodone-Acetaminophen" -> "Oxycodone") — confirm this is intended.
drug_lookup = {
    "Metoprolol XL": "Metoprolol",
    "NIFEdipine CR": "NIFEdipine",
    "HYDROmorphone P.F.": "HYDROmorphone",
    "Nicotine Patch": "Nicotine",
    "Lidocaine 5% Patch": "Lidocaine",
    "Lidocaine Jelly 2% (Urojet)": "Lidocaine",
    "Miconazole Powder 2%": "Miconazole",
    "Lactated Ringers": "Ringer's lactate",
    "Piperacillin-Tazobactam Na": "Piperacillin",
    "Albuterol 0.083% Neb Soln": "Albuterol",
    "Albuterol Inhaler": "Albuterol",
    "Albuterol MDI": "Albuterol",
    "Albuterol Neb Soln": "Albuterol",
    "MethylPREDNISolone Sodium Succ": "MethylPREDNISolone",
    "Methylprednisolone Na Succ": "MethylPREDNISolone",
    "Methylprednisolone Na Succ.": "MethylPREDNISolone",
    "Methylprednisolone Sodium Succ": "MethylPREDNISolone",
    "Lansoprazole Oral Disintegrating Tab": "Lansoprazole",
    "Isotonic Sodium Chloride": "Sodium Chloride",
    "Vancomycin Oral Liquid": "Vancomycin",
    "Mupirocin Nasal Ointment 2%": "Mupirocin",
    "Sodium Chloride Nasal": "Sodium Chloride",
    "Ciprofloxacin IV": "Ciprofloxacin",
    "Brimonidine Tartrate 0.15% Ophth.": "Brimonidine Tartrate",
    "NiCARdipine IV": "NiCARdipine",
    "Nitroglycerin SL": "Nitroglycerin",
    "Acetaminophen IV": "Acetaminophen",
    "Acetaminophen-Caff-Butalbital": "Acetaminophen",
    "Oxycodone-Acetaminophen (5mg-325mg)": "Acetaminophen",
    "Latanoprost 0.005% Ophth. Soln.": "Latanoprost",
    "Vasopressin": "argipressin",
    "Albuterol-Ipratropium": "Albuterol",
    "Aspirin EC": "Aspirin",
    "Hydrocortisone Na Succ.": "Hydrocortisone",
    "Iso-Osmotic Dextrose": "Dextrose",
    "Iso-Osmotic Sodium Chloride": "Sodium Chloride",
    "D5W": "anhydrous dextrose",
    "D5W (Glass Bottle)": "anhydrous dextrose",
    "D5 1/2NS": "Sodium Chloride",
    "D5NS": "Sodium Chloride",
    "D5W (EXCEL BAG)": "anhydrous dextrose",
    "D10W": "anhydrous dextrose",
    "D7.5W": "anhydrous dextrose",
    "sw": "water",
    "*sw*": "water",
    "steril water": "water",
    "*ns*": "Sodium Chloride",
    "Humulin-R Insulin": "insulin",
    "Insulin Human Regular": "insulin",
    "Sodium Chloride 0.9%  Flush": "Sodium Chloride",
    "Oxycodone-Acetaminophen": "Oxycodone",
    "NS": "Sodium Chloride",
    "1/2 NS": "Sodium Chloride",
    "NS (Glass Bottle)": "Sodium Chloride",
    "Ipratropium Bromide Neb": "Ipratropium Bromide",
    "NS (Mini Bag Plus)": "Sodium Chloride",
    "NS Epidural Bag (0.9% NaCl)": "Sodium Chloride",
    "NS        (Glass Bottle)": "Sodium Chloride",
    "NS Epidural Bag ": "Sodium Chloride",
    "Heparin Flush (10 units/ml)": "Heparin",
    "Nitroprusside Sodium": "sodium nitroprusside",
    "Eucerin": "Ensulizole",
    "Heparin Flush PICC (100 units/ml)": "Heparin",
    "Dorzolamide 2%/Timolol 0.5% Ophth.": "Dorzolamide",
    "Heparin Flush (100 units/ml)": "Heparin",
    "Heparin Flush CVL  (100 units/ml)": "Heparin",
}

# raw DRUG value -> list of PubChem compounds found for its replacement name.
# Many raw names share one replacement name, so each distinct name is queried
# only once and the response is reused.
manuel_fix_dict = {}
pubchem_hits_by_query_ = {}
for each_drug_, query_name_ in drug_lookup.items():
    if query_name_ not in pubchem_hits_by_query_:
        pubchem_hits_by_query_[query_name_] = get_compounds(query_name_, 'name')
    comps_ = pubchem_hits_by_query_[query_name_]
    # Names without any PubChem hit are left out, so membership means "resolved".
    if len(comps_) != 0:
        manuel_fix_dict[each_drug_] = comps_

In [None]:
# Number of manually renamed drugs that were resolved in PubChem.
n_manual_hits_ = len(manuel_fix_dict)
print("We find", n_manual_hits_, "drugs in pubchem after change some drugs in manuely")

In [None]:
# Collect the prescription rows still unresolved after the original-name,
# preprocessed-name, generic-name and manual-rename lookups.

# `.loc` rejects a set indexer in pandas >= 2.0, so the labels are passed as a
# list (row order is the set's iteration order, as before).
final_prescription_df = sub_prescription_df.loc[list(drug_index_set)]

drug_names_that_found_in_pubchem = set(all_drugs_pubchem_dict.keys())

# Raw drug names whose preprocessed form was found in PubChem.
original_names_of_preproccesed_drugs = {
    original_drug_
    for pre_drug_ in preprocessed_drugs_pubchem_dict
    for original_drug_ in preprocessed_to_drug_dict[pre_drug_]
}

# A row is resolved as soon as one lookup recognises it. The per-row
# preprocess() call of the earlier version was never used and is dropped.
missing_index_set = set()
for row_ in final_prescription_df.itertuples():
    is_resolved_ = (
        row_.DRUG in drug_names_that_found_in_pubchem
        or row_.DRUG in original_names_of_preproccesed_drugs
        or row_.DRUG_NAME_GENERIC in generic_drugs_pubchem_dict
        or row_.DRUG in manuel_fix_dict
    )
    if not is_resolved_:
        missing_index_set.add(row_.Index)

In [None]:
# Rows still unresolved after the manual renames.
n_missing_rows_ = len(missing_index_set)
print("After fixing some drug names and search in Pubchem, we have still", n_missing_rows_, "rows that we need to find")

### Analyze the remaining rows

In [None]:
# Rows that no lookup has resolved so far.
# `.loc` rejects a set indexer in pandas >= 2.0, so the labels are passed as a list.
analyze_prescription_df_ = sub_prescription_df.loc[list(missing_index_set)]

# Distinct values left to resolve, per identifying column (a missing value counts as one).
print("We need to find", len(analyze_prescription_df_.DRUG.unique()), "drug name or",
      len(analyze_prescription_df_.DRUG_NAME_GENERIC.unique()), " generic drug name or",
      len(analyze_prescription_df_.NDC.unique()), "ndc code")

### Search Based on NDC Code

In [None]:
# Directory (relative to the working directory) holding the NDC product CSV files.
ndc_path = "ndcxls/"

In [None]:
# NDC product listings (finished and unfinished products), read from `ndc_path`.
product_df, unfinished_product_df = (
    pd.read_csv(os.path.join(ndc_path, relative_csv_), sep=",")
    for relative_csv_ in ("product.csv", "ndc_unfinished/unfinished_product.csv")
)

In [None]:
def ndc_converter(raw_ndc):
    """Build two candidate hyphenated NDC strings from a numeric NDC code.

    The code is converted to text and everything from the first "." onwards is
    dropped (264751020.0 -> "264751020"). Two candidates are then derived:

    * ``temp1`` - the digits split by a hyphen after the first 3, 4 or 5
      characters when there are 7, 8 or 9 digits respectively; an empty string
      for any other length.
    * ``temp2`` - the digits with one leading "0" added, formatted as the first
      four characters, a hyphen, and the next four characters
      (264751020.0 -> "0264-7510").

    Parameters
    ----------
    raw_ndc : float, int or str
        NDC code as stored in the prescription table.

    Returns
    -------
    tuple of (str, str)
        ``(temp1, temp2)``. Always a 2-tuple; a candidate that cannot be built
        is returned as a string that will simply not match any product code.
    """
    ndc_ = str(raw_ndc).split(".")[0]

    # First candidate: where the hyphen goes depends on how many digits there are.
    prefix_len_by_total_len_ = {7: 3, 8: 4, 9: 5}
    prefix_len_ = prefix_len_by_total_len_.get(len(ndc_))
    if prefix_len_ is None:
        temp1 = ""
    else:
        temp1 = ndc_[:prefix_len_] + "-" + ndc_[prefix_len_:]

    # Second candidate: identical for every length, so it is computed once.
    new_ndc_ = "0" + ndc_
    temp2 = new_ndc_[:4] + "-" + new_ndc_[4:8]

    return temp1, temp2

In [None]:
# original_ndc_list_ = list(product_df.PRODUCTNDC.unique())
# ndc_list_need_to_find = list(analyze_prescription_df_.NDC.unique())

#product_df[product_df.PRODUCTNDC == '0264-7510']
#final_prescription_df[final_prescription_df.NDC == 9076502.0]

In [None]:
# modified_ndc_list = set()
# for i in original_ndc_list_:
#     q_ = i.replace("-","")
#     modified_ndc_list.add(q_)

In [None]:
# len(original_ndc_list_), len(modified_ndc_list)

# from thefuzz import fuzz
# from thefuzz import process

# for i in ndc_list_need_to_find:
#     q_ = str(i)
#     print(q_)
#     conv = ndc_converter(q_)
#     for each_mod_ in modified_ndc_list:
#         rat = fuzz.ratio(q_, each_mod_)
#         rat2 = fuzz.ratio(conv, each_mod_)
#         if rat > 75:
#             print("1. ", rat, q_, each_mod_)
#         if rat2 > 90:
#             print("2. ", rat2, conv, each_mod_)
#     print("##########")

In [None]:
# NDC code -> list of PubChem compounds found for the product's non-proprietary name.
ndc_dict = {}
# NDC code -> last exception raised while querying PubChem for it (best effort).
ndc_lookup_errors_ = {}

ndc_list_ = list(analyze_prescription_df_.NDC.unique())
for ndc_code_ in ndc_list_:
    # Try both candidate spellings of the code, in order. A candidate is skipped
    # when the product table has no such code, when the product has no usable
    # name, or when the PubChem query fails; the first candidate that gets a
    # PubChem response (even an empty one) ends the search for this code.
    for candidate_ndc_ in ndc_converter(ndc_code_):
        product_names_ = product_df.loc[product_df.PRODUCTNDC == candidate_ndc_, 'NONPROPRIETARYNAME']
        if product_names_.empty:
            continue

        temp_drug_name = product_names_.iloc[0]
        if not isinstance(temp_drug_name, str):
            # Missing name in the product table: nothing to search for.
            continue

        try:
            comps_ = get_compounds(temp_drug_name, 'name')
        except Exception as error_:
            # Keep going, but remember the failure instead of hiding it.
            # (`Exception` lets KeyboardInterrupt stop the cell.)
            ndc_lookup_errors_[ndc_code_] = error_
            continue

        if len(comps_) != 0:
            ndc_dict[ndc_code_] = comps_
        break

if ndc_lookup_errors_:
    print("PubChem lookup failed for", len(ndc_lookup_errors_), "NDC codes; see ndc_lookup_errors_")

In [None]:
# Number of NDC codes that were resolved to at least one PubChem compound.
len(ndc_dict)

In [None]:
# Collect the prescription rows still unresolved after every lookup:
# original name, preprocessed name, generic name, manual rename and NDC code.

# `.loc` rejects a set indexer in pandas >= 2.0, so the labels are passed as a
# list (row order is the set's iteration order, as before).
final_prescription_df = sub_prescription_df.loc[list(drug_index_set)]

drug_names_that_found_in_pubchem = set(all_drugs_pubchem_dict.keys())

# Raw drug names whose preprocessed form was found in PubChem.
original_names_of_preproccesed_drugs = {
    original_drug_
    for pre_drug_ in preprocessed_drugs_pubchem_dict
    for original_drug_ in preprocessed_to_drug_dict[pre_drug_]
}

# A row is resolved as soon as one lookup recognises it. The per-row
# preprocess() call of the earlier version was never used and is dropped.
missing_index_set = set()
for row_ in final_prescription_df.itertuples():
    is_resolved_ = (
        row_.DRUG in drug_names_that_found_in_pubchem
        or row_.DRUG in original_names_of_preproccesed_drugs
        or row_.DRUG_NAME_GENERIC in generic_drugs_pubchem_dict
        or row_.DRUG in manuel_fix_dict
        or row_.NDC in ndc_dict
    )
    if not is_resolved_:
        missing_index_set.add(row_.Index)

In [None]:
# Rows still unresolved after the NDC-code search. The message used to repeat
# the one printed after the manual renames, which described the wrong step.
print("After searching NDC codes in Pubchem, we have still", len(missing_index_set), "rows that we need to find")

In [None]:
# Rows that no lookup (including the NDC-code search) has resolved.
# `.loc` rejects a set indexer in pandas >= 2.0, so the labels are passed as a list.
analyze_prescription_df_ = sub_prescription_df.loc[list(missing_index_set)]

# Distinct values left to resolve, per identifying column (a missing value counts as one).
print("We need to find", len(analyze_prescription_df_.DRUG.unique()), "drug name or",
      len(analyze_prescription_df_.DRUG_NAME_GENERIC.unique()), " generic drug name or",
      len(analyze_prescription_df_.NDC.unique()), "ndc code")

In [None]:
# Shape of the working prescription table and the number of its rows still unresolved.
final_prescription_df.shape, len(missing_index_set)

In [None]:
# Fraction of rows in final_prescription_df that are still unresolved.
# This used to be the literal `32882 / 592946` — presumably copied from an
# earlier run's output (TODO confirm); computing it keeps it in sync with the data.
len(missing_index_set) / len(final_prescription_df)

### Create patient Dict based on pubchem results

In [None]:
# Size of every lookup table used to resolve prescription rows. Listing the
# bare names showed only the last one (a full dict dump), so the entry counts
# are displayed instead.
{
    "all_drugs_pubchem_dict": len(all_drugs_pubchem_dict),
    "preprocessed_drugs_pubchem_dict": len(preprocessed_drugs_pubchem_dict),
    "generic_drugs_pubchem_dict": len(generic_drugs_pubchem_dict),
    "manuel_fix_dict": len(manuel_fix_dict),
    "ndc_dict": len(ndc_dict),
}

In [None]:
# Build SUBJECT_ID -> list holding the first PubChem compound of every resolved
# prescription row of that patient.
patient_pubchem_dict = {}
# Index labels of the rows that no lookup table could resolve.
final_missed_index_set = set()

for row_ in final_prescription_df.itertuples():
    # Register the patient first, so a patient whose rows are all unresolved
    # still appears in the result (with an empty list).
    patient_compounds_ = patient_pubchem_dict.setdefault(row_.SUBJECT_ID, [])

    drug_ = row_.DRUG
    pre_drug_ = preprocess(drug_)
    generic_drug_ = row_.DRUG_NAME_GENERIC
    ndc_ = row_.NDC

    # Lookup order: original name, preprocessed name, generic name,
    # manual rename, NDC code.
    if drug_ in all_drugs_pubchem_dict:
        res_ = all_drugs_pubchem_dict[drug_]
    elif pre_drug_ in preprocessed_drugs_pubchem_dict:
        res_ = preprocessed_drugs_pubchem_dict[pre_drug_]
    elif generic_drug_ in generic_drugs_pubchem_dict:
        res_ = generic_drugs_pubchem_dict[generic_drug_]
    elif drug_ in manuel_fix_dict:
        res_ = manuel_fix_dict[drug_]
    elif ndc_ in ndc_dict:
        res_ = ndc_dict[ndc_]
    else:
        # Unresolved row: record it and move on. Without this `continue` the
        # append below ran with the previous row's result (or raised NameError
        # on the very first row), attributing a wrong compound to the patient.
        final_missed_index_set.add(row_.Index)
        continue

    patient_compounds_.append(res_[0])
    

In [None]:
# Number of patients in the result and number of prescription rows that no lookup table matched.
len(patient_pubchem_dict), len(final_missed_index_set)

In [None]:
# Print every patient whose compound list is empty.
for patient_, drugs_ in patient_pubchem_dict.items():
    n_drugs_ = len(drugs_)
    if n_drugs_ != 0:
        continue
    print(patient_, n_drugs_)

In [None]:
# Persist the per-patient compound lists so they can be reloaded without repeating the lookups.
pd.to_pickle(patient_pubchem_dict, "patient_pubchem_dict.p")

In [None]:
# import pandas as pd
# q_ = pd.read_pickle("patient_pubchem_dict.p")
# all_unique_smiles = set()

# for k,v in q_.items():
#     for i in v:
#         all_unique_smiles.add(i.canonical_smiles)