In [1]:
import os
import pandas as pd
import json
import shutil
import numpy as np
from tqdm import tqdm

In [2]:
### LOAD DATA
# Current working directory; the relative read paths below are resolved against it.
cwd = os.getcwd()

# Release snapshot to load. All four tables are read from the same release folder.
release = '20240312'
release_dir = f'../releases/v2.0.0/{release}'

# Every table is stored as a gzip-compressed CSV.
labels = pd.read_csv(f'{release_dir}/adverse_reactions_active_labels.csv.gz', compression='gzip')
meta_data = pd.read_csv(f'{release_dir}/dm_spl_zip_files_meta_data.csv.gz', compression='gzip')
ingredients = pd.read_csv(f'{release_dir}/ingredients.csv.gz', compression='gzip')
rx_map = pd.read_csv(f'{release_dir}/rxnorm_mappings.csv.gz', compression='gzip')

In [3]:
# Attach the SPL zip-file metadata to the label rows and to the ingredient rows,
# matching set_id on the left with SETID on the right.
latest_labels = pd.merge(labels, meta_data, left_on='set_id', right_on='SETID')
latest_ingredients = pd.merge(ingredients, meta_data, left_on='set_id', right_on='SETID')

# Keep only ingredient rows whose set id appears among label rows with num_ingredients == 1.
one_ingredient = labels.loc[labels['num_ingredients'] == 1]
one_labels = one_ingredient['set_id'].to_list()
latest_ingredients = latest_ingredients.loc[latest_ingredients['set_id'].isin(one_labels)]

# Summary counts before any "latest version" filtering is applied.
print('Number of unique set ids: ', len(latest_labels['set_id'].unique()))
print("Number of unique ingredients for all labels:", len(ingredients['ingredient_rx_cui'].unique()))
print('Labels with one ingredient: ', len(latest_ingredients['set_id'].unique()))
print('Number of unique ingredients for one-ingredient labels: ', len(latest_ingredients['ingredient_rx_cui'].unique()))

Number of unique set ids:  32839
Number of unique ingredients for all labels: 2795
Labels with one ingredient:  29057
Number of unique ingredients for one-ingredient labels:  1468


In [4]:
# Parse UPLOAD_DATE into datetimes so that "latest" comparisons are chronological.
# assign() returns new frames instead of writing a column into the existing objects:
# latest_ingredients was produced by boolean filtering, and an in-place column
# assignment on such a selection can raise pandas' SettingWithCopyWarning.
latest_labels = latest_labels.assign(UPLOAD_DATE=pd.to_datetime(latest_labels['UPLOAD_DATE']))
latest_ingredients = latest_ingredients.assign(UPLOAD_DATE=pd.to_datetime(latest_ingredients['UPLOAD_DATE']))

In [5]:
def latest_filter(df, groupby_col):
    """Keep, for every group, only the rows carrying that group's latest UPLOAD_DATE.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'UPLOAD_DATE' column (datetimes or date strings) and the
        column(s) named by ``groupby_col``.
    groupby_col : str or list of str
        Column(s) defining the groups.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` whose UPLOAD_DATE equals the maximum of their group, ordered
        by group key with the original row order kept inside each group, and with a
        fresh 0..n-1 index. All original columns, including the grouping column(s),
        are kept and the 'UPLOAD_DATE' column itself is returned unmodified.
    """
    # Compare real dates: taking max() of 'MM/DD/YYYY' strings is lexicographic and
    # would rank '12/01/2020' above '01/12/2024'.
    upload_dates = pd.to_datetime(df['UPLOAD_DATE'])

    # transform('max') broadcasts each group's latest date back onto its rows. Unlike
    # groupby().apply(), it does not depend on the grouping column being handed to the
    # applied function, so the grouping column is always present in the result.
    latest_per_group = (
        df.assign(_upload_date=upload_dates)
        .groupby(groupby_col)['_upload_date']
        .transform('max')
    )

    # Rows with a missing group key or a missing date compare unequal and are dropped.
    keep_mask = (upload_dates == latest_per_group).to_numpy()
    result_df = df[keep_mask]

    # Stable sort reproduces the "grouped by key" row order of the groupby/apply form.
    result_df = result_df.sort_values(groupby_col, kind='stable').reset_index(drop=True)

    return result_df

In [6]:
# Per ingredient (and, for labels, per set id) keep only the rows with the group's
# latest UPLOAD_DATE.
ingredients_df = latest_filter(df=latest_ingredients, groupby_col='ingredient_rx_cui')
labels_df = latest_filter(df=latest_labels, groupby_col='set_id')

In [7]:
# Distinct SETID values among the per-ingredient latest rows.
ingredient_ids = set(ingredients_df['SETID'].unique())

In [8]:
# Seed NumPy's global random state; intended to make the .sample(1) picks below repeatable.
np.random.seed(42)

# Keep rows whose set_id is in ingredient_ids, then drop repeated (set_id, TITLE) pairs.
# NOTE(review): ingredient_ids was built from this frame's SETID column, so on a fresh
# top-to-bottom run this filter presumably keeps every row -- confirm before removing.
ingredients_df = ingredients_df[ingredients_df['set_id'].isin(ingredient_ids)]
ingredients_df = ingredients_df.drop_duplicates(subset=['set_id', 'TITLE'])

# Per ingredient_rx_cui: keep the rows at the highest SPL_VERSION and randomly pick one
# of them, so each ingredient ends up with exactly one row. The group index added by
# groupby/apply is discarded by reset_index(drop=True).
ingredients_df = ingredients_df.groupby('ingredient_rx_cui').apply(lambda x: x[x['SPL_VERSION'] == x['SPL_VERSION'].max()].sample(1)).reset_index(drop=True)

# ingredients_df.ingredient_rx_cui.value_counts()


In [9]:
# Collapse the label rows to one row per (set_id, spl_version), gathering the MedDRA
# preferred terms and their ids into Python lists.
meddra_columns = ['pt_meddra_term', 'pt_meddra_id']
labels_df = (
    labels_df
    .groupby(['set_id', 'spl_version'])
    .agg({column: list for column in meddra_columns})
    .reset_index()
)

In [10]:
def set_spl_dict(df, spl_col):
    """Map every set_id in ``df`` to the list of its ``spl_col`` values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'set_id' column and the column named by ``spl_col``.
    spl_col : str
        Name of the column holding the SPL version values.

    Returns
    -------
    dict
        ``{set_id: [value, ...]}`` with values in row order within each set_id.
    """
    return {
        set_id: list(versions)
        for set_id, versions in df.groupby('set_id')[spl_col]
    }

In [11]:
# Replace ingredient_ids with a {set_id: [SPL versions]} lookup and build the same
# lookup for the latest labels (whose version column is lower-case).
ingredient_ids = set_spl_dict(df=ingredients_df, spl_col='SPL_VERSION')
label_ids = set_spl_dict(df=labels_df, spl_col='spl_version')

In [12]:
def select_json_files(directory, filtered_set_ids):
    """Return the names of JSON files in ``directory`` that match a wanted label version.

    A file is selected when its top-level JSON object has both 'set_id' and
    'spl_version', the set id is a key of ``filtered_set_ids`` and the integer value
    of 'spl_version' is among the versions stored for that set id.

    Parameters
    ----------
    directory : str
        Directory to scan (non-recursively) for ``*.json`` files.
    filtered_set_ids : dict
        Mapping ``{set_id: collection of integer SPL versions}``.

    Returns
    -------
    list of str
        Matching file names (not full paths), sorted by name so the result does not
        depend on the order in which the filesystem lists the directory.

    Notes
    -----
    Files that cannot be decoded as JSON, or whose 'spl_version' is not an integer,
    are reported with a printed message and skipped instead of aborting the scan.
    """
    selected_files = []

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(directory, filename)

        # JSON is UTF-8 by specification; do not rely on the platform default encoding.
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except (json.JSONDecodeError, UnicodeDecodeError):
            print(f"Error decoding JSON in file: {filename}")
            continue

        # Only JSON objects can carry the two keys we need.
        if not isinstance(data, dict):
            continue
        if 'set_id' not in data or 'spl_version' not in data:
            continue

        set_id = data['set_id']
        if set_id not in filtered_set_ids:
            continue

        # A malformed version must not stop the scan of the remaining files.
        try:
            spl_version = int(data['spl_version'])
        except (TypeError, ValueError):
            print(f"Skipping file with non-integer spl_version: {filename}")
            continue

        if spl_version in filtered_set_ids[set_id]:
            selected_files.append(filename)

    return selected_files

In [13]:
def select_labels(ids):
    """Run ``select_json_files`` over the five prescription release parts.

    Parameters
    ----------
    ids : dict
        ``{set_id: [SPL versions]}`` lookup forwarded to ``select_json_files``.

    Returns
    -------
    list of list of str
        One list of matching file names per release part, in part order 1..5.
    """
    per_part_files = []
    for part_number in range(1, 6):
        part_directory = (
            f'../../onsides-v4/data/spl/rx/dm_spl_release_human_rx_part{part_number}/prescription'
        )
        per_part_files.append(select_json_files(part_directory, ids))
    return per_part_files

In [14]:
# Find the JSON files for the one-per-ingredient labels and for all latest labels.
# Each result is a list of per-part file-name lists.
selected_ingredients = select_labels(ids=ingredient_ids)
selected_labels = select_labels(ids=label_ids)

In [15]:
def flatten_list(lst, label_type):
    """Flatten a list of lists by one level and report how many items it holds.

    Parameters
    ----------
    lst : list of list
        Nested list to flatten.
    label_type : str
        Word inserted into the printed summary line.

    Returns
    -------
    list
        All items of the inner lists, in order.
    """
    flattened = []
    for inner in lst:
        flattened.extend(inner)
    print(f'number of {label_type} labels:', len(flattened))
    return flattened

In [16]:
# Merge the per-part results into flat file-name lists; each call prints its count.
selected_ingredients = flatten_list(lst=selected_ingredients, label_type="ingredient")
selected_labels = flatten_list(lst=selected_labels, label_type="latest")

number of ingredient labels: 1468
number of latest labels: 32839


In [17]:
def allocate_jsons(directory, ids_list):
    """Copy the named JSON files from the release parts into a fresh data folder.

    Parameters
    ----------
    directory : str
        Name of the destination folder under ``../../onsides-v4/data/``. Any existing
        folder of that name is deleted, with all its contents, before copying.
    ids_list : list
        File names to copy. Entries that are not str, bytes or path-like are reported
        and skipped.

    Returns
    -------
    None
    """
    destination_directory = f'../../onsides-v4/data/{directory}/'

    # Start from an empty destination folder.
    if os.path.exists(destination_directory):
        shutil.rmtree(destination_directory)
    os.makedirs(destination_directory, exist_ok=True)

    # A file name is looked up in every release part and copied wherever it exists.
    for part_number in tqdm(range(1, 6)):
        source_directory = (
            f'../../onsides-v4/data/spl/rx/dm_spl_release_human_rx_part{part_number}/prescription'
        )

        for file in ids_list:
            if not isinstance(file, (str, bytes, os.PathLike)):
                print(f"Skipping invalid file: {file}")
                continue

            file_path = os.path.join(source_directory, file)
            if os.path.exists(file_path):
                shutil.copy(file_path, destination_directory)

In [18]:
# Materialise both selections on disk; each call wipes and refills its destination folder.
allocate_jsons(directory="latest_labels_ingredients", ids_list=selected_ingredients)
allocate_jsons(directory="latest_labels", ids_list=selected_labels)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  7.74it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:15<00:00,  3.16s/it]


### COLLECT LATEST TRAIN_XML AND GOLD_XML FILES

In [20]:
def tac_filenames(directory_path):
    """List the part of every entry name in ``directory_path`` that precedes the first '_'.

    Parameters
    ----------
    directory_path : str
        Directory whose entries are listed.

    Returns
    -------
    list of str
        One prefix per directory entry, in directory-listing order. An entry without
        an underscore contributes its whole name.
    """
    return [entry.partition('_')[0] for entry in os.listdir(directory_path)]

In [21]:
# File-name prefixes of the existing train and gold XML files.
train_titles = tac_filenames(directory_path='../data/train_xml')
gold_titles = tac_filenames(directory_path='../data/gold_xml')

In [22]:
def tac_df(label_list):
    """Collect the ``meta_data`` rows whose TITLE contains any of the given names.

    Parameters
    ----------
    label_list : iterable of str
        Names to look for inside the 'TITLE' column of the module-level ``meta_data``.

    Returns
    -------
    pandas.DataFrame
        The matching rows for each name, concatenated in ``label_list`` order with a
        fresh index. A row matching several names appears once per matching name.
        An empty ``label_list`` yields an empty DataFrame.
    """
    # regex=False makes this a literal substring test: a name containing regex
    # metacharacters ('.', '+', '(' ...) is then matched as written, which is also
    # how match_title() compares names against titles.
    # Rows with a missing TITLE never match (na=False).
    matches = [
        meta_data[meta_data['TITLE'].str.contains(label_name, na=False, regex=False)]
        for label_name in label_list
    ]

    if not matches:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(matches, ignore_index=True)

In [23]:
# Metadata rows whose TITLE mentions one of the train / gold names.
train_df = tac_df(label_list=train_titles)
gold_df = tac_df(label_list=gold_titles)

In [24]:
def match_title(title, title_list):
    """Return the first entry of ``title_list`` that occurs inside ``title``.

    Parameters
    ----------
    title : str
        Text to search in.
    title_list : iterable of str
        Candidate substrings, tried in order.

    Returns
    -------
    str or None
        The first candidate found in ``title``; ``None`` when none of them occurs.
    """
    return next((candidate for candidate in title_list if candidate in title), None)

# Tag every row with the first train / gold name found in its TITLE.
train_df['xml_title'] = train_df['TITLE'].apply(match_title, args=(train_titles,))
gold_df['xml_title'] = gold_df['TITLE'].apply(match_title, args=(gold_titles,))

In [25]:
def latest_filter(df, groupby_col):
    """Keep, for every group, only the rows carrying that group's latest UPLOAD_DATE.

    This redefines the function of the same name declared earlier in the notebook;
    from this cell onwards this definition is the one in effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'UPLOAD_DATE' column (datetimes or date strings) and the
        column(s) named by ``groupby_col``.
    groupby_col : str or list of str
        Column(s) defining the groups.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` whose UPLOAD_DATE equals the maximum of their group, ordered
        by group key with the original row order kept inside each group, and with a
        fresh 0..n-1 index. All original columns, including the grouping column(s),
        are kept and the 'UPLOAD_DATE' column itself is returned unmodified.
    """
    # Compare real dates: taking max() of 'MM/DD/YYYY' strings is lexicographic and
    # would rank '12/01/2020' above '01/12/2024'.
    upload_dates = pd.to_datetime(df['UPLOAD_DATE'])

    # transform('max') broadcasts each group's latest date back onto its rows. Unlike
    # groupby().apply(), it does not depend on the grouping column being handed to the
    # applied function, so the grouping column is always present in the result.
    latest_per_group = (
        df.assign(_upload_date=upload_dates)
        .groupby(groupby_col)['_upload_date']
        .transform('max')
    )

    # Rows with a missing group key or a missing date compare unequal and are dropped.
    keep_mask = (upload_dates == latest_per_group).to_numpy()
    result_df = df[keep_mask]

    # Stable sort reproduces the "grouped by key" row order of the groupby/apply form.
    result_df = result_df.sort_values(groupby_col, kind='stable').reset_index(drop=True)

    return result_df

In [27]:
# For every matched name keep only the rows with that name's latest UPLOAD_DATE.
train_df = latest_filter(df=train_df, groupby_col='xml_title')
gold_df = latest_filter(df=gold_df, groupby_col='xml_title')

In [28]:
# For every xml_title, find the highest SPL_VERSION among its remaining rows.
latest_train = train_df.groupby('xml_title', as_index=False)['SPL_VERSION'].max()
latest_gold = gold_df.groupby('xml_title', as_index=False)['SPL_VERSION'].max()

# An inner merge on both columns keeps only the full rows sitting at that highest version.
train_df = train_df.merge(latest_train, on=['xml_title', 'SPL_VERSION'], how='inner')
gold_df = gold_df.merge(latest_gold, on=['xml_title', 'SPL_VERSION'], how='inner')

In [29]:
# Number of distinct matched names left in the train frame, then in the gold frame.
for frame in (train_df, gold_df):
    print(frame['xml_title'].nunique())

91
84


In [30]:
# Keep the first row for every (xml_title, SPL_VERSION) pair.
dedupe_keys = ['xml_title', 'SPL_VERSION']
gold_df = gold_df.drop_duplicates(subset=dedupe_keys, keep='first')
train_df = train_df.drop_duplicates(subset=dedupe_keys, keep='first')

In [31]:
# {SETID: [SPL versions]} lookups in the shape select_json_files() expects.
train_ids = {
    setid: train_df.loc[train_df['SETID'] == setid, 'SPL_VERSION'].tolist()
    for setid in train_df['SETID'].unique()
}

gold_ids = {
    setid: gold_df.loc[gold_df['SETID'] == setid, 'SPL_VERSION'].tolist()
    for setid in gold_df['SETID'].unique()
}

In [32]:
# Scan the five release parts for JSON files matching the train set ids and versions.
# select_labels() performs the same per-part loop and returns one file-name list per part.
train_select = select_labels(train_ids)

In [33]:
# Merge the per-part lists into one flat list of file names and show how many were found.
train_select = [
    file_name
    for part_files in train_select
    for file_name in part_files
]
len(train_select)

90

In [34]:
# Scan the five release parts for JSON files matching the gold set ids and versions.
# select_labels() performs the same per-part loop and returns one file-name list per part.
gold_select = select_labels(gold_ids)

In [35]:
# Merge the per-part lists into one flat list of file names and show how many were found.
gold_select = [
    file_name
    for part_files in gold_select
    for file_name in part_files
]
len(gold_select)

82

In [36]:
# Preview the first five rows of the final train frame.
train_df.head(5)

Unnamed: 0,SETID,ZIP_FILE_NAME,UPLOAD_DATE,SPL_VERSION,TITLE,xml_title
0,3904f8dd-1aef-3490-e48f-bd55f32ed67f,20230627_3904f8dd-1aef-3490-e48f-bd55f32ed67f.zip,06/27/2023,34,"ADCETRIS (BRENTUXIMAB VEDOTIN) INJECTION, POWD...",ADCETRIS
1,c89d3ecc-4f4c-4566-8808-79152344194d,20240112_c89d3ecc-4f4c-4566-8808-79152344194d.zip,01/12/2024,9,ADREVIEW (IOBENGUANE I-123) INJECTION [MEDI-PH...,ADREVIEW
2,2150f73a-179b-4afc-b8ce-67c85cc72f04,20230802_2150f73a-179b-4afc-b8ce-67c85cc72f04.zip,08/02/2023,60,AFINITOR (EVEROLIMUS) TABLET AFINITOR DISPERZ ...,AFINITOR
3,550eb76a-e4a6-4fa1-ad65-c0fd8b0ce783,20230124_550eb76a-e4a6-4fa1-ad65-c0fd8b0ce783.zip,01/24/2023,17,"AMPYRA (DALFAMPRIDINE) TABLET, FILM COATED, EX...",AMPYRA
4,bb5a5043-0f51-11df-8a39-0800200c9a66,20231114_bb5a5043-0f51-11df-8a39-0800200c9a66.zip,11/14/2023,20,"AMYVID (FLORBETAPIR F 18) INJECTION, SOLUTION ...",AMYVID


In [37]:
destination_directory = '../../onsides-v4/data/train_xml/'

# Start from an empty destination folder: delete it with all its contents, then recreate it.
if os.path.exists(destination_directory):
    shutil.rmtree(destination_directory)
os.makedirs(destination_directory, exist_ok=True)

# Every selected file name is looked up in each of the five release parts.
for num in range(1, 6):
    directory = f'../../onsides-v4/data/spl/rx/dm_spl_release_human_rx_part{num}/prescription'

    for file in train_select:
        # Report and skip anything that cannot be used as a path component.
        if not isinstance(file, (str, bytes, os.PathLike)):
            print(f"Skipping invalid file: {file}")
            continue

        file_path = os.path.join(directory, file)
        if not os.path.exists(file_path):
            continue

        try:
            # The set id stored inside the JSON drives both the lookup and the new name.
            with open(file_path, 'r') as handle:
                setid = json.load(handle)['set_id']

            # Name recorded for this set id in train_df; 'unknown' when there is no row.
            title_matches = train_df.loc[train_df['SETID'] == setid, 'xml_title']
            xml_title = 'unknown' if title_matches.empty else title_matches.iloc[0]

            # Copy under "<xml_title>_<set id>.json".
            new_filename = f"{xml_title}_{setid}.json"
            shutil.copy(file_path, os.path.join(destination_directory, new_filename))

        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error processing file {file_path}: {e}")

In [41]:
destination_directory = '../../onsides-v4/data/gold_xml/'

# Start from an empty destination folder: delete it with all its contents, then recreate it.
if os.path.exists(destination_directory):
    shutil.rmtree(destination_directory)
os.makedirs(destination_directory, exist_ok=True)

# Every selected file name is looked up in each of the five release parts.
for num in range(1, 6):
    directory = f'../../onsides-v4/data/spl/rx/dm_spl_release_human_rx_part{num}/prescription'

    for file in gold_select:
        # Report and skip anything that cannot be used as a path component.
        if not isinstance(file, (str, bytes, os.PathLike)):
            print(f"Skipping invalid file: {file}")
            continue

        file_path = os.path.join(directory, file)
        if not os.path.exists(file_path):
            continue

        try:
            # The set id stored inside the JSON drives both the lookup and the new name.
            with open(file_path, 'r') as handle:
                setid = json.load(handle)['set_id']

            # Name recorded for this set id in gold_df; 'unknown' when there is no row.
            title_matches = gold_df.loc[gold_df['SETID'] == setid, 'xml_title']
            xml_title = 'unknown' if title_matches.empty else title_matches.iloc[0]

            # Copy under "<xml_title>_<set id>.json".
            new_filename = f"{xml_title}_{setid}.json"
            shutil.copy(file_path, os.path.join(destination_directory, new_filename))

        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error processing file {file_path}: {e}")
