# Format drug exposures data

This notebook formats the drug exposures for each report as a matrix of reports by drug exposures.
To make this mapping transparent, it also produces vectors indicating what IDs correspond to each index in the matrix.

This notebook uses the following files:

* `data/aeolus/*.npy`
    * This is a set of 55 files across which the reports are split, in order.
* `data/meta_unformatted/unique_ingredients.npy`

to produce the following files:

* `data/meta_formatted/drug_exposure_matrix.npz`
* `data/meta_formatted/drug_id_vector.npy`

In [1]:
import pathlib
import re

import numpy as np
import scipy.sparse

## Find all potential drug exposure files

In [2]:
aeolus_path = pathlib.Path('../../data/aeolus/')
drug_files = list(aeolus_path.glob('*.npy'))
print(len(drug_files))

# Sort by integer, not string, so that sorted order is 1, 2, ... not 1, 10, 11, ...
drug_files = sorted(drug_files, key=lambda p: int(re.match('(?:.+_IN_)([0-9]+)(?:\.npy)', p.name).group(1)))

drug_files[:11]

55


[PosixPath('../../data/aeolus/AEOLUS_all_reports_IN_0.npy'),
 PosixPath('../../data/aeolus/AEOLUS_all_reports_IN_1.npy'),
 PosixPath('../../data/aeolus/AEOLUS_all_reports_IN_2.npy'),
 PosixPath('../../data/aeolus/AEOLUS_all_reports_IN_3.npy'),
 PosixPath('../../data/aeolus/AEOLUS_all_reports_IN_4.npy'),
 PosixPath('../../data/aeolus/AEOLUS_all_reports_IN_5.npy'),
 PosixPath('../../data/aeolus/AEOLUS_all_reports_IN_6.npy'),
 PosixPath('../../data/aeolus/AEOLUS_all_reports_IN_7.npy'),
 PosixPath('../../data/aeolus/AEOLUS_all_reports_IN_8.npy'),
 PosixPath('../../data/aeolus/AEOLUS_all_reports_IN_9.npy'),
 PosixPath('../../data/aeolus/AEOLUS_all_reports_IN_10.npy')]

## Combine all drug exposure files

In [3]:
# Combine the split drug exposure matrices into a single sparse matrix
drug_exposure_matrix = None

for drug_file in drug_files:
    try:
        exposure_chunk_matrix = np.load(drug_file, allow_pickle=True).item()
    except OSError:
        print(drug_file)
        pass
    
    if drug_exposure_matrix is None:
        drug_exposure_matrix = exposure_chunk_matrix
    else:
        drug_exposure_matrix = scipy.sparse.vstack([drug_exposure_matrix, 
                                                    exposure_chunk_matrix])
        
# Convert to compressed sparse column format (allows slicing)
drug_exposure_matrix = drug_exposure_matrix.tocsc()

../../data/aeolus/AEOLUS_all_reports_IN_50.npy
../../data/aeolus/AEOLUS_all_reports_IN_51.npy
../../data/aeolus/AEOLUS_all_reports_IN_52.npy
../../data/aeolus/AEOLUS_all_reports_IN_53.npy
../../data/aeolus/AEOLUS_all_reports_IN_54.npy


## Check resulting matrix

In [4]:
# Array is too big. (Too many records - should be 4_694_086)
print(drug_exposure_matrix.shape)

# But the excess rows are all empty
assert drug_exposure_matrix[4694086:].nnz == 0

# So use only real rows
drug_exposure_matrix = drug_exposure_matrix[:4694086]

(5338588, 4396)


## Save formatted files

In [5]:
# Save the drug exposure matrix to a file
scipy.sparse.save_npz('../../data/meta_formatted/drug_exposure_matrix.npz', drug_exposure_matrix)

# Load, reformat, and save drug ID vector
drug_id_vector = np.load('../../data/meta_unformatted/unique_ingredients.npy').astype(int)
np.save('../../data/meta_formatted/drug_id_vector.npy', drug_id_vector)