# Format outcomes data

This notebook formats the MedDRA outcomes of each report as a sparse matrix with one row per report and one column per outcome.
To make this mapping transparent, it also produces vectors indicating which report ID corresponds to each row index and which outcome ID corresponds to each column index of the matrix.

This notebook uses the following files:

* `data/meta_unformatted/all_reportids_IN.npy`
* `data/meta_unformatted/outcomes_table.csv.xz`

to produce the following files:

* `data/meta_formatted/report_id_vector.npy`
* `data/meta_formatted/outcome_id_vector.npy`
* `data/meta_formatted/outcome_matrix.npz`
* `data/meta_formatted/outcomes_table.csv.xz`

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse

## Load unformatted files

In [2]:
# Load the unformatted report IDs and cast them to integers.
raw_report_ids = np.load('../../data/meta_unformatted/all_reportids_IN.npy')
report_id_vector = raw_report_ids.astype(int)

# Persist the integer vector alongside the other formatted outputs.
np.save('../../data/meta_formatted/report_id_vector.npy', report_id_vector)

print(report_id_vector.shape)
report_id_vector

(4694086,)


array([  4572294,   4440060,   4456349, ...,  87896661, 101667231,
       100823751])

In [3]:
# Read the unformatted outcomes table; pandas infers the xz compression from
# the file extension.
unformatted_outcomes_path = '../../data/meta_unformatted/outcomes_table.csv.xz'
outcomes_df = pd.read_csv(unformatted_outcomes_path)

outcomes_df.head(n=2)

Unnamed: 0,primaryid,isr,pt,outcome_concept_id,snomed_outcome_concept_id
0,,8310062.0,0,35607122,
1,,7794268.0,0,36718526,377091.0


## Produce formatted outputs

In [4]:
# Count the distinct outcome concepts under each vocabulary.
# `Series.nunique` is vectorized and ignores missing values by default, so no
# explicit `dropna` is needed for the sparsely populated SNOMED column and a
# stray NaN can never be counted as a concept.
n_unique_meddra = outcomes_df['outcome_concept_id'].nunique()
n_unique_snomed = outcomes_df['snomed_outcome_concept_id'].nunique()

print(f'SNOMED: {n_unique_snomed}, MedDRA: {n_unique_meddra}')

SNOMED: 7091, MedDRA: 17710


In [5]:
# Every row must identify its report through exactly one of `primaryid` or `isr`:
# never both, and never neither.
has_primaryid = outcomes_df['primaryid'].notnull()
has_isr = outcomes_df['isr'].notnull()

# No row carries both identifiers ...
assert not (has_primaryid & has_isr).any(), 'Found rows with both primaryid and isr'
# ... and no row is missing both (such rows would be silently dropped when the
# two identifier columns are combined into a single report id).
assert not (~has_primaryid & ~has_isr).any(), 'Found rows with neither primaryid nor isr'

In [6]:
# Build a single "report_id" column holding "primaryid" or "isr", whichever the
# report has, then attach each report's position in `report_id_vector`.

# Rows identified by "primaryid"
primaryid_rows = (
    outcomes_df[outcomes_df['primaryid'].notnull()]
    .drop('isr', axis=1)
    .rename(columns={'primaryid': 'report_id'})
)

# Rows identified by "isr"
isr_rows = (
    outcomes_df[outcomes_df['isr'].notnull()]
    .drop('primaryid', axis=1)
    .rename(columns={'isr': 'report_id'})
)

# One row per known report; after `reset_index` the "index" column holds the
# report's position in `report_id_vector`.
report_positions_df = pd.DataFrame(report_id_vector, columns=['report_id']).reset_index()

stacked_outcomes_df = pd.concat([primaryid_rows, isr_rows], ignore_index=True, sort=False)
stacked_outcomes_df = stacked_outcomes_df.assign(
    report_id=stacked_outcomes_df['report_id'].astype(int)
)

# The outcomes table has more report ids than the already computed files, so
# the right-join keeps only (and all of) the reports in `report_id_vector`.
report_outcomes_df = (
    stacked_outcomes_df
    .filter(items=['report_id', 'outcome_concept_id', 'snomed_outcome_concept_id'])
    .merge(report_positions_df, on='report_id', how='right')
)

report_outcomes_df.head(2)

Unnamed: 0,report_id,outcome_concept_id,snomed_outcome_concept_id,index
0,100033001,36516812,77074.0,4394326
1,100033001,35708093,196523.0,4394326


In [7]:
# Create an index of MedDRA IDs, in order to assign each to a position in the matrix.
# Missing IDs are excluded first: the right-join that built `report_outcomes_df`
# leaves NaN for any report without outcomes, and NaN neither sorts reliably nor
# should it be given a column of its own.
unique_meddra_ids = report_outcomes_df['outcome_concept_id'].dropna().unique().tolist()
meddra_id_to_index = {meddra_id: i for i, meddra_id
                      in enumerate(sorted(unique_meddra_ids))}

# Map the new index to the MedDRA concept IDs.
# Rows without an outcome ID (if any) get a missing `outcome_index`.
report_outcomes_df = (
    report_outcomes_df
    .assign(
        outcome_index=lambda df: df['outcome_concept_id'].map(meddra_id_to_index)
    )
    .rename(columns={'index': 'report_index'})
)

report_outcomes_df.head(2)

Unnamed: 0,report_id,outcome_concept_id,snomed_outcome_concept_id,report_index,outcome_index
0,100033001,36516812,77074.0,4394326,10544
1,100033001,35708093,196523.0,4394326,3612


In [8]:
# Save outcomes table
report_outcomes_df.to_csv('../../data/meta_formatted/outcomes_table.csv.xz', index=False,
                          compression='xz')

# Generate and save vector of ids (ordered by their assigned matrix column)
outcome_id_vector = np.array(sorted(meddra_id_to_index.keys(), key=meddra_id_to_index.get))
np.save('../../data/meta_formatted/outcome_id_vector.npy', outcome_id_vector)

# Generate and save matrix.
# Each (report_index, outcome_index) row is one non-zero entry. Rows lacking an
# outcome index (reports without outcomes) contribute no entry.
edges_df = report_outcomes_df[['report_index', 'outcome_index']].dropna()
n_edges = edges_df.shape[0]
report_indices = edges_df['report_index'].to_numpy(dtype=np.int64)
outcome_indices = edges_df['outcome_index'].to_numpy(dtype=np.int64)

# The shape is given explicitly so that the matrix always lines up with
# `report_id_vector` (rows) and `outcome_id_vector` (columns), even when the
# highest-numbered reports have no outcomes; otherwise scipy infers the shape
# from the largest index present.
# NOTE(review): duplicate (report, outcome) pairs, if any exist in the table, are
# summed when converting to CSC and yield values > 1 — confirm whether downstream
# consumers expect a strictly binary matrix.
outcomes_matrix = scipy.sparse.coo_matrix(
    (np.ones(n_edges), (report_indices, outcome_indices)),
    shape=(len(report_id_vector), len(outcome_id_vector)),
)
scipy.sparse.save_npz('../../data/meta_formatted/outcome_matrix.npz', outcomes_matrix.tocsc())