In [2]:
import pathlib
import re

import numpy as np
import pandas as pd
import scipy.io
import scipy.sparse

# Build drug exposure array

Currently, there are smaller files for chunks of reports

In [3]:
aeolus_path = pathlib.Path('../data/aeolus/')
drug_files = list(aeolus_path.glob('*.npy'))

# Sort by integer, not string, so that sorted order is 1, 2, ... not 1, 10, 11, ...
drug_files = sorted(drug_files, key=lambda p: int(re.match('(?:.+_IN_)([0-9]+)(?:\.npy)', p.name).group(1)))

all_drugs = None

for drug_file in drug_files:
    try:
        sparse_arr = np.load(drug_file).item()
    except OSError:
        print(drug_file)
        pass
    
    if all_drugs is None:
        all_drugs = sparse_arr
    else:
        all_drugs = scipy.sparse.vstack([all_drugs, sparse_arr])

../data/aeolus/AEOLUS_all_reports_IN_50.npy
../data/aeolus/AEOLUS_all_reports_IN_51.npy
../data/aeolus/AEOLUS_all_reports_IN_52.npy
../data/aeolus/AEOLUS_all_reports_IN_53.npy
../data/aeolus/AEOLUS_all_reports_IN_54.npy


### Check drug exposure array

In [4]:
# Array is too big. (Too many records)
all_drugs.shape

(5338588, 4396)

In [5]:
# But the excess rows are all empty
assert all_drugs.tocsc()[4694086:].nnz == 0

# So use only real rows
sliced_drugs_array = all_drugs.tocsc()[:4694086]

# Save the drug exposure matrix to a file
scipy.sparse.save_npz('../data/meta/all_drug_exposures.npz', sliced_drugs_array)

### Rebuild and save drugs vector

In [6]:
original_drugs_vector = np.load('../data/meta/unique_ingredients.npy')

np.save('../data/meta/drugs_vector.npy', original_drugs_vector.astype(int))

# Reconstruct outcomes array

Need to use MedDRA instead of SNOMED for outcome concepts.

In [6]:
report_ids = np.load('../data/meta/all_reportids_IN.npy')

outcomes_df = pd.read_csv('../data/meta/outcomes_table.csv.xz')

outcomes_df.head(2)

Unnamed: 0,primaryid,isr,pt,outcome_concept_id,snomed_outcome_concept_id
0,,8310062.0,0,35607122,
1,,7794268.0,0,36718526,377091.0


In [7]:
n_unique_meddra = len(set(outcomes_df['outcome_concept_id']))
n_unique_snomed = len(set(outcomes_df
                          .dropna(subset=['snomed_outcome_concept_id'])
                          ['snomed_outcome_concept_id']))

print(f'SNOMED: {n_unique_snomed}, MedDRA: {n_unique_meddra}')

SNOMED: 7091, MedDRA: 17710


In [8]:
# There are no entries with both primaryid and isr or neither. Always exactly one of the two
assert outcomes_df[~(outcomes_df['isr'].isnull() | outcomes_df['primaryid'].isnull())].shape[0] == 0
assert outcomes_df[~outcomes_df['isr'].isnull() & ~outcomes_df['primaryid'].isnull()].shape[0] == 0

In [9]:
# Create a "report_id" which is either "primaryid" or "isr", whichever the report has
report_outcomes_df = (
    pd.concat([
        outcomes_df
        .loc[~outcomes_df['primaryid'].isnull()]
        .rename(columns={'primaryid': 'report_id'})
        .drop('isr', axis=1),
        outcomes_df
        .loc[~outcomes_df['isr'].isnull()]
        .rename(columns={'isr': 'report_id'})
        .drop('primaryid', axis=1),
    ], ignore_index=True, sort=False)
    .assign(
        report_id=lambda df: df['report_id'].astype(int),
    )
    # More report ids in the new dataframe than in already computed files
    .merge(pd.DataFrame(report_ids, columns=['report_id']).reset_index(),
           on='report_id', how='right')
)

report_outcomes_df.head(2)

Unnamed: 0,report_id,pt,outcome_concept_id,snomed_outcome_concept_id,index
0,100033001,0,36516812,77074.0,4394326
1,100033001,0,35708093,196523.0,4394326


In [10]:
meddra_id_to_index = {meddra_id: i for i, meddra_id
                      in enumerate(sorted(set(report_outcomes_df['outcome_concept_id'])))}

report_outcomes_df = (
    report_outcomes_df
    .assign(
        outcome_index=lambda df: df['outcome_concept_id'].map(meddra_id_to_index)
    )
    .rename(columns={'index': 'report_index'})
    .drop('pt', axis=1)
)

report_outcomes_df.to_csv('../data/meta/reports_outcomes.csv.xz', index=False, compression='xz')

report_outcomes_df.head(2)

Unnamed: 0,report_id,outcome_concept_id,snomed_outcome_concept_id,report_index,outcome_index
0,100033001,36516812,77074.0,4394326,10544
1,100033001,35708093,196523.0,4394326,3612


In [11]:
# Generate and save matrix
n_edges = report_outcomes_df.shape[0]
edges = report_outcomes_df[['report_index', 'outcome_index']].values.tolist()

outcomes_matrix = scipy.sparse.coo_matrix(
    (np.ones(n_edges), (zip(*edges)))
)
scipy.sparse.save_npz('../data/meta/all_outcomes_meddra.npz', outcomes_matrix.tocsc())

# Generate and save vector of ids
np.save(
    '../data/meta/outcomes_vector_meddra.npy',
    report_outcomes_df
    .loc[:, ['outcome_concept_id', 'outcome_index']]
    .drop_duplicates()
    .sort_values(by='outcome_index')
    ['outcome_concept_id']
    .values
)