# Exploratory Data Analysis

FAERS drug and natural products data extracted after standardization processes to find number of mapped and unmapped strings.

In [1]:
import pandas as pd
import numpy as np

In [2]:
##raw files from FAERS and LAERS
#Importing from only one file resulted in corrupted data with missing columns
#primaryid - FAERS reports
#isr - LAERS reports
faers = pd.read_csv('data/standard_combined_drug_mapping_primaryid_202201241744.csv', low_memory=False)
faers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38618311 entries, 0 to 38618310
Data columns (total 9 columns):
 #   Column               Dtype  
---  ------               -----  
 0   primaryid            int64  
 1   isr                  float64
 2   drug_seq             int64  
 3   role_cod             object 
 4   drug_name_original   object 
 5   lookup_value         object 
 6   concept_id           float64
 7   update_method        object 
 8   standard_concept_id  float64
dtypes: float64(3), int64(2), object(4)
memory usage: 2.6+ GB


In [3]:
laers = pd.read_csv('data/standard_combined_drug_mapping_isr_202201241741.csv', low_memory=False)
laers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10190238 entries, 0 to 10190237
Data columns (total 9 columns):
 #   Column               Dtype  
---  ------               -----  
 0   primaryid            float64
 1   isr                  int64  
 2   drug_seq             int64  
 3   role_cod             object 
 4   drug_name_original   object 
 5   lookup_value         object 
 6   concept_id           float64
 7   update_method        object 
 8   standard_concept_id  float64
dtypes: float64(3), int64(2), object(4)
memory usage: 699.7+ MB


In [4]:
#check if rows misplaced
faers.loc[faers['isr'].notna()]

Unnamed: 0,primaryid,isr,drug_seq,role_cod,drug_name_original,lookup_value,concept_id,update_method,standard_concept_id


In [5]:
laers.loc[laers['primaryid'].notna()]

Unnamed: 0,primaryid,isr,drug_seq,role_cod,drug_name_original,lookup_value,concept_id,update_method,standard_concept_id


In [None]:
##Find coverage of reports and unique drug strings (mapped and unmapped) - FAERS + LAERS
##Combine for analysis

In [6]:
id_faers = faers['primaryid'].unique()
len(id_faers)

10590419

In [7]:
id_laers = laers['isr'].unique()
len(id_laers)

2996985

In [8]:
##count unique drug strings
df = pd.concat([faers, laers], ignore_index=True)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48808549 entries, 0 to 48808548
Data columns (total 9 columns):
 #   Column               Dtype  
---  ------               -----  
 0   primaryid            float64
 1   isr                  float64
 2   drug_seq             int64  
 3   role_cod             object 
 4   drug_name_original   object 
 5   lookup_value         object 
 6   concept_id           float64
 7   update_method        object 
 8   standard_concept_id  float64
dtypes: float64(4), int64(1), object(4)
memory usage: 3.3+ GB


In [12]:
drugs_all = df['drug_name_original'].tolist()
len(drugs_all)

48808549

In [14]:
drugs_all_unique = list(set(drugs_all))
len(drugs_all_unique)

832031

In [11]:
drugs_all_unique[1:50]

[nan,
 'FLUOXETINE --/--/2007',
 'CHLOPROTHIXENE',
 'HYDROCHLOROT 12.5 MG',
 'PUMPKIN SEED OIL                   /01517303/',
 'EXACIN [ISEPAMICIN SULFATE]',
 'CALCIUM 500 TAB + D',
 'Depo-Teststerone',
 'Rozavel',
 'LEVEMIR PEN INSULIN',
 'apixaban 2.5mg PO BID',
 'zolendronate',
 'RIBAPAK (1600 MG, 1200 MG) (NOT SPECIFIED)',
 'NOVOLOG N',
 'Felodipin Actavis',
 ' D AND CALCIUM',
 'ADVAIR DISCUS 50/250',
 'lilly basal',
 'PROLAIR (BECLOMETASONE)',
 'ETHICAL NUTRIENTS IBS SUPPORT',
 'BUSCOPAN(hyoscine butylbromide)',
 'CLOPIDOGREL TABLET FO  75MG (WATERSTOFSULFAAT)',
 'DICLOFENAC?NATRIUM / Brand name not specified',
 'ZYCLORAN',
 'Hydecodone',
 'SELECTIVE SEROTONIN REUPTAKE INHIBITOR',
 'SIFROL (PRAMIPEXOLE DIHYCHLORIDE) (TA) (PRAMIPEXOLE)',
 'MIRTAZAPINE Tablet, 30 mg (milligram',
 'MEGESTROL AC SUS 40MG/ML',
 'DRAMIN (DIMENHYDRINATE)',
 'Proctozone-HC',
 'A LOT OF UNKNOWN MEDICATIONS',
 'ARGENTUM NITRICUM 9 CH (OTHER THERAPEUTIC PRODUCTS) (NULL)',
 'oxycodone 30mg',
 'TAVASTIN',
 'VI

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48808549 entries, 0 to 48808548
Data columns (total 9 columns):
 #   Column               Dtype  
---  ------               -----  
 0   primaryid            float64
 1   isr                  float64
 2   drug_seq             int64  
 3   role_cod             object 
 4   drug_name_original   object 
 5   lookup_value         object 
 6   concept_id           float64
 7   update_method        object 
 8   standard_concept_id  float64
dtypes: float64(4), int64(1), object(4)
memory usage: 3.3+ GB


In [16]:
#divide into mapped and unmapped strings
df_map = df.loc[df['concept_id'].notna()]
df_unmap = df.loc[df['concept_id'].isna()]

In [17]:
df_map = df_map.reset_index(drop=True)
df_map.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46164530 entries, 0 to 46164529
Data columns (total 9 columns):
 #   Column               Dtype  
---  ------               -----  
 0   primaryid            float64
 1   isr                  float64
 2   drug_seq             int64  
 3   role_cod             object 
 4   drug_name_original   object 
 5   lookup_value         object 
 6   concept_id           float64
 7   update_method        object 
 8   standard_concept_id  float64
dtypes: float64(4), int64(1), object(4)
memory usage: 3.1+ GB


In [18]:
df_unmap = df_unmap.reset_index(drop=True)
df_unmap.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2644019 entries, 0 to 2644018
Data columns (total 9 columns):
 #   Column               Dtype  
---  ------               -----  
 0   primaryid            float64
 1   isr                  float64
 2   drug_seq             int64  
 3   role_cod             object 
 4   drug_name_original   object 
 5   lookup_value         object 
 6   concept_id           float64
 7   update_method        object 
 8   standard_concept_id  float64
dtypes: float64(4), int64(1), object(4)
memory usage: 181.6+ MB


In [19]:
df_unmap.head()

Unnamed: 0,primaryid,isr,drug_seq,role_cod,drug_name_original,lookup_value,concept_id,update_method,standard_concept_id
0,100033200.0,,11,C,Baktar,,,,
1,100033200.0,,13,C,chinese medicine,,,,
2,1000333000.0,,10,C,IDROCLOROTIAZ,,,,
3,1000333000.0,,4,C,INSULINA GLULISINA,,,,
4,1000333000.0,,9,C,IDROCLOROTIAZ,,,,


In [20]:
df_map.head()

Unnamed: 0,primaryid,isr,drug_seq,role_cod,drug_name_original,lookup_value,concept_id,update_method,standard_concept_id
0,147383531.0,,11,C,0.0375 ESTRADIOL TRANSDERMAL SYSTEM,ESTRADIOL,1548195.0,single ingredient match,1548195.0
1,182024081.0,,1,PS,0.09% NORMAL SALINE,NORMAL SALINE,19011130.0,single ingredient match,19011130.0
2,110049491.0,,1,PS,0.12% CHLORHEXIDINE,CHLORHEXIDINE,1790812.0,single ingredient match,1790812.0
3,143307352.0,,3,C,0.15% brimonidine,BRIMONIDINE,915542.0,single ingredient match,915542.0
4,131627711.0,,1,PS,.01 MG CLONIDINE,CLONIDINE,1398937.0,single ingredient match,1398937.0


In [21]:
##count unique drug strings in both
drug_map = df_map['drug_name_original'].unique()
len(drug_map)

379079

In [22]:
drug_unmap = df_unmap['drug_name_original'].unique()
len(drug_unmap)

452953

In [35]:
df_map.shape

(46164530, 9)

In [37]:
df_unmap.shape

(2644019, 9)

In [29]:
##find coverage of mapped and unmapped reports - or complete vs incomplete where
#complete = all drug strings in report are mapped
#incomplete = one or more drug strings in report are unmapped
pid_map = df_map['primaryid'].unique()
len(pid_map)

10423131

In [31]:
pid_unmap = df_unmap['primaryid'].unique()
len(pid_unmap)

1199302

In [30]:
isr_map = df_map['isr'].unique()
len(isr_map)

2938434

In [32]:
isr_unmap = df_unmap['isr'].unique()
len(isr_unmap)

524406

In [33]:
#find intersection of mapped and unmapped reports
pid_intersect = np.intersect1d(pid_map, pid_unmap)
len(pid_intersect)

1032012

In [34]:
isr_intersect = np.intersect1d(isr_map, isr_unmap)
len(isr_intersect)

465853

In [4]:
dfup = pd.read_csv('data/upper_unmap_orig_drug_names_202201201812.csv')
dfup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569525 entries, 0 to 569524
Data columns (total 1 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   drug_name_original  412264 non-null  object
dtypes: object(1)
memory usage: 4.3+ MB


In [5]:
dfnp = pd.read_csv('data/np_names_clean_202201201810.csv')
dfnp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6195 entries, 0 to 6194
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   concept_id  6195 non-null   int64 
 1   np_name     6195 non-null   object
dtypes: int64(1), object(1)
memory usage: 96.9+ KB
