In [11]:
import os
import sys
import csv
import json

import numpy as np
import scipy as sp
import pandas as pd

from tqdm.notebook import tqdm

In [13]:
data_path = '../data'
dataset_path = '../data/faers/drug/datasets/drug_2005-2005'
print(os.path.exists(dataset_path))

dataset_info = json.load(open(os.path.join(dataset_path, 'dataset.json')))
num_reports = dataset_info['reports_by_drugs']['shape'][0]
num_reports

True


219850

|               | Reaction | No Reaction |
|---------------|----------|-------------|
| Indication    | A        | B           |
| No Indication | C        | D           |

PRR = (A/B) / (C/D)


In [34]:
# indications
indication2report = json.load(open(os.path.join(dataset_path, 'indication2report.json')))
print(f"Num indications: {len(indication2report)}")

reaction2report = json.load(open(os.path.join(dataset_path, 'reaction2report.json')))
print(f"Num Reactions: {len(reaction2report)}")

indication_prrs_data = list()
for indication, ind_reports in tqdm(indication2report.items()):
    for reaction, rxn_reports in reaction2report.items():
        A = float(len(set(ind_reports) & set(rxn_reports)))
        B = float(len(ind_reports)-A)
        C = float(len(rxn_reports)-A)
        D = float(num_reports-(A+B+C))
        if B == 0:
            B = 1
        if C == 0:
            C = 1
        PRR = (A/B)/(C/D)
        indication_prrs_data.append([indication, reaction, A, B, C, D, PRR])

indication_prrs = pd.DataFrame(indication_prrs_data, columns=['indication', 'reaction', 'A', 'B', 'C', 'D', 'PRR'])

Num indications: 4196
Num Reactions: 9242


  0%|          | 0/4196 [00:00<?, ?it/s]

In [53]:
indication_prrs[indication_prrs['A'] >= 50].sort_values(by='PRR', ascending=False).head()
# indication_prrs[(indication_prrs['reaction']=='35506628')].sort_values(by='A', ascending=False).head(10)

Unnamed: 0,indication,reaction,A,B,C,D,PRR
1044818,ABORTION INDUCED,36818629,51.0,168.0,7.0,219624.0,9524.510204
5039133,VULVOVAGINAL MYCOTIC INFECTION,42890298,78.0,235.0,14.0,219523.0,5204.496657
5038030,VULVOVAGINAL MYCOTIC INFECTION,37119780,91.0,222.0,45.0,219492.0,1999.376577
6642,ACCIDENTAL EXPOSURE,43053921,55.0,76.0,119.0,219600.0,1335.471031
1221230,CONTRACEPTION,42889673,89.0,1571.0,19.0,218171.0,650.514892


In [17]:
# load the meddra labels
reaction2label = dict()
with open(os.path.join(data_path, 'meddra_llt_pt_map.txt')) as fh:
    reader = csv.reader(fh, delimiter='|')
    header = next(reader)
    #print(header)
    for row in tqdm(reader):
        data = dict(zip(header, row))
        reaction2label[data['llt_concept_id']] = data['llt_concept_name']
        reaction2label[data['pt_concept_id']] = data['pt_concept_name']

0it [00:00, ?it/s]

In [62]:
reaction2index = json.load(open(os.path.join(dataset_path, 'reaction2index.json')))
ordered_reactions = [key for key, index in sorted(reaction2index.items(), key=lambda item: item[1])]

indication2index = json.load(open(os.path.join(dataset_path, 'indication2index.json')))
ordered_indications = [key for key, index in sorted(indication2index.items(), key=lambda item: item[1])]

In [63]:
reactions = sp.sparse.load_npz(os.path.join(dataset_path, 'reports_by_reactions.npz'))
indications = sp.sparse.load_npz(os.path.join(dataset_path, 'reports_by_indications.npz'))

A = (indications.T @ reactions).toarray()
B = (indications.sum(0).T - A).A
# B[B==0] = 1
C = (reactions.sum(0) - A).A
# C[C==0] = 1
D = (num_reports- (A+B+C))

PRRs = ((A/B)/(C/D))

  PRRs = ((A/B)/(C/D))


In [64]:
mask = A>=50
indices = np.where(mask)
inds = [ordered_indications[i] for i in indices[0]]
rxns = [ordered_reactions[i] for i in indices[1]]
rxn_labels = [reaction2label[ordered_reactions[i]] for i in indices[1]]

ind_prrs = pd.DataFrame({
    'indication': inds,
    'reaction': rxns,
    'reaction_label': rxn_labels,
    'A': A[mask],
    'B': B[mask],
    'C': C[mask],
    'D': D[mask],
    'PRR': PRRs[mask],
})

In [66]:
ind_prrs[ind_prrs['A'] >= 100].sort_values(by='PRR', ascending=False).head(10)
# ind_prrs[(ind_prrs['reaction']=='35506628')].sort_values(by='A', ascending=False).head(10)

Unnamed: 0,indication,reaction,reaction_label,A,B,C,D,PRR
2227,PROPHYLAXIS AGAINST TRANSPLANT REJECTION,36009897,Kidney transplant rejection,111,553,74,219112,594.336347
1905,MULTIPLE SCLEROSIS,36718112,Multiple sclerosis relapse,337,6285,26,213202,439.685907
2123,CONTRACEPTION,36818844,Unintended pregnancy,231,1429,87,218103,405.249174
1989,HORMONE REPLACEMENT THERAPY,36312432,Progesterone receptor assay positive,156,1207,89,218398,317.158225
2178,METASTASES TO BONE,36516749,Osteonecrosis,391,231,1247,217981,295.880923
1988,HORMONE REPLACEMENT THERAPY,36312409,Oestrogen receptor assay positive,182,1181,122,218365,275.832026
1985,HORMONE REPLACEMENT THERAPY,36617163,Breast cancer female,534,829,562,217925,249.779887
2236,HAEMOPHILIA,35809320,Adverse event,174,145,1071,218460,244.773109
932,DIABETES MELLITUS NON-INSULIN-DEPENDENT,35708172,Early satiety,112,4266,27,215445,209.49315
2179,METASTASES TO BONE,37521200,Tooth extraction,175,447,414,218814,206.921344


In [51]:
indices[0]

array([  0,   1,   1, ..., 726, 726, 726])

In [59]:
np.where(PRRs==PRRs[mask].max())

(array([113]), array([472]))

In [61]:
ordered_indications[113], ordered_reactions[472]

('ABORTION INDUCED', '36312034')