# Pipeline for publications

The data pipeline outputs a JSON file representing a link graph between each drug and its mentions in PubMed publications, in scientific publications (clinical trials) and in journals, together with the date associated with each mention.

## Import libraries

In [1]:
import json
import sys

import numpy as np
import pandas as pd
from dotenv import dotenv_values

# `sys` must be imported before it is used; the parent directory is added to
# the module search path so the project-local `src` package below can be found.
sys.path.append("../")

import src.tasks.extract as extract
import src.utils as utils

## Read the files into dataframes

In [2]:
# Load the key/value settings stored in the .env file one directory up.
config = dotenv_values("../.env")

# Each configured source path is relative to the parent directory; read them
# in a fixed order and bind one dataframe per source.
pubmed_csv_df, pubmed_json_df, trials_df, drugs_df = (
    extract.read_file("../" + config[source_key])
    for source_key in (
        "SOURCE_FILE_PUBMED",
        "SOURCE_FILE_JOURNALS",
        "SOURCE_FILE_TRIALS",
        "SOURCE_FILE_DRUGS",
    )
)

## Check the data 

In [3]:
# Preview the PubMed CSV records (at most the first 20 rows).
pubmed_csv_df.head(20)

Unnamed: 0,id,title,date,journal
0,1,A 44-year-old man with erythema of the face di...,01/01/2019,Journal of emergency nursing
1,2,"An evaluation of benadryl, pyribenzamine, and ...",01/01/2019,Journal of emergency nursing
2,3,Diphenhydramine hydrochloride helps symptoms o...,02/01/2019,The Journal of pediatrics
3,4,Tetracycline Resistance Patterns of Lactobacil...,01/01/2020,Journal of food protection
4,5,Appositional Tetracycline bone formation rates...,02/01/2020,American journal of veterinary research
5,6,Rapid reacquisition of contextual fear followi...,2020-01-01,Psychopharmacology
6,7,The High Cost of Epinephrine Autoinjectors and...,01/02/2020,The journal of allergy and clinical immunology...
7,8,Time to epinephrine treatment is associated wi...,01/03/2020,The journal of allergy and clinical immunology...


In [4]:
# Preview the frame read from SOURCE_FILE_JOURNALS (at most the first 20 rows).
pubmed_json_df.head(20)

Unnamed: 0,id,title,date,journal
0,9.0,Gold nanoparticles synthesized from Euphorbia ...,2020-01-01,"Journal of photochemistry and photobiology. B,..."
1,10.0,Clinical implications of umbilical artery Dopp...,2020-01-01,The journal of maternal-fetal & neonatal medicine
2,11.0,Effects of Topical Application of Betamethason...,2020-01-01,Journal of back and musculoskeletal rehabilita...
3,12.0,"Comparison of pressure release, phonophoresis ...",2020-01-03,Journal of back and musculoskeletal rehabilita...
4,,"Comparison of pressure BETAMETHASONE release, ...",2020-01-03,The journal of maternal-fetal & neonatal medicine


In [5]:
# Preview the trials records (at most the first 20 rows).
trials_df.head(20)

Unnamed: 0,id,scientific_title,date,journal
0,NCT01967433,Use of Diphenhydramine as an Adjunctive Sedati...,1 January 2020,Journal of emergency nursing
1,NCT04189588,Phase 2 Study IV QUZYTTIR™ (Cetirizine Hydroch...,1 January 2020,Journal of emergency nursing
2,NCT04237090,,1 January 2020,Journal of emergency nursing
3,NCT04237091,Feasibility of a Randomized Controlled Clinica...,1 January 2020,Journal of emergency nursing
4,NCT04153396,Preemptive Infiltration With Betamethasone and...,1 January 2020,Hôpitaux Universitaires de Genève
5,NCT03490942,Glucagon Infusion in T1D Patients With Recurre...,25/05/2020,
6,,Glucagon Infusion in T1D Patients With Recurre...,25/05/2020,Journal of emergency nursing
7,NCT04188184,Tranexamic Acid Versus Epinephrine During Expl...,27 April 2020,Journal of emergency nursing\xc3\x28


In [6]:
# Preview the first rows of the drugs reference table.
drugs_df.head()

Unnamed: 0,atccode,drug
0,A04AD,DIPHENHYDRAMINE
1,S03AA,TETRACYCLINE
2,V03AB,ETHANOL
3,A03BA,ATROPINE
4,A01AD,EPINEPHRINE


## Merge the publications dataframes

Merge the publication dataframes into a single dataframe for easier handling and manipulation of the data.

In [7]:
# Label each source frame with the kind of publication it contains.
pubmed_csv_df["publication_type"] = "pubmed"
pubmed_json_df["publication_type"] = "pubmed"
trials_df["publication_type"] = "trial"

# Keep only the first 10 characters of the stringified JSON dates.
# NOTE(review): presumably this trims a time component so only YYYY-MM-DD
# remains — confirm against the raw file. Also note that astype(str) turns a
# missing date into a non-empty string, which a later dropna() will not catch.
pubmed_json_df["date"] = pubmed_json_df["date"].astype(str).str.slice(stop=10)

# Trials name their title column "scientific_title"; align it with the PubMed frames.
trials_df.rename(columns={"scientific_title": "title"}, inplace=True)

# Stack the frames (PubMed CSV, trials, PubMed JSON) under a fresh 0..n-1 index.
all_publications_df = pd.concat(
    objs=[pubmed_csv_df, trials_df, pubmed_json_df],
    ignore_index=True,
)
all_publications_df.head()

Unnamed: 0,id,title,date,journal,publication_type
0,1,A 44-year-old man with erythema of the face di...,01/01/2019,Journal of emergency nursing,pubmed
1,2,"An evaluation of benadryl, pyribenzamine, and ...",01/01/2019,Journal of emergency nursing,pubmed
2,3,Diphenhydramine hydrochloride helps symptoms o...,02/01/2019,The Journal of pediatrics,pubmed
3,4,Tetracycline Resistance Patterns of Lactobacil...,01/01/2020,Journal of food protection,pubmed
4,5,Appositional Tetracycline bone formation rates...,02/01/2020,American journal of veterinary research,pubmed
5,6,Rapid reacquisition of contextual fear followi...,2020-01-01,Psychopharmacology,pubmed
6,7,The High Cost of Epinephrine Autoinjectors and...,01/02/2020,The journal of allergy and clinical immunology...,pubmed
7,8,Time to epinephrine treatment is associated wi...,01/03/2020,The journal of allergy and clinical immunology...,pubmed
8,NCT01967433,Use of Diphenhydramine as an Adjunctive Sedati...,1 January 2020,Journal of emergency nursing,trial
9,NCT04189588,Phase 2 Study IV QUZYTTIR™ (Cetirizine Hydroch...,1 January 2020,Journal of emergency nursing,trial


## Clean the data

In [8]:
import re
def decode_ascii_chars(input: str) -> str:
    """Return ``input`` with every U+00C3 character replaced by the text "APA".

    NOTE(review): this helper does not appear to be called anywhere in the
    visible notebook — confirm whether it is still needed.
    """
    return re.compile(r'[\xc3]').sub('APA', input)

# Clean the merged publications in one pass, producing a new frame instead of
# mutating in place:
#   1. cells that are empty or contain only whitespace become NaN;
#   2. every record with a missing value in any column is dropped. Only `how`
#      is passed to dropna: recent pandas versions raise TypeError when `how`
#      and `thresh` are both supplied, even if `thresh` is None;
#   3. the journal and title text is passed through the project helper.
#      NOTE(review): presumably it strips stray byte-escape sequences such as
#      the "\xc3\x28" seen in the trials journal column — confirm in src/utils.
all_publications_df = (
    all_publications_df
    .replace(r'^\s*$', np.nan, regex=True)
    .dropna(axis=0, how='any')
    .assign(
        journal=lambda frame: frame["journal"].apply(utils.remove_ascii_chars),
        title=lambda frame: frame["title"].apply(utils.remove_ascii_chars),
    )
)

all_publications_df.head()

Unnamed: 0,id,title,date,journal,publication_type
0,1,A 44-year-old man with erythema of the face di...,01/01/2019,Journal of emergency nursing,pubmed
1,2,"An evaluation of benadryl, pyribenzamine, and ...",01/01/2019,Journal of emergency nursing,pubmed
2,3,Diphenhydramine hydrochloride helps symptoms o...,02/01/2019,The Journal of pediatrics,pubmed
3,4,Tetracycline Resistance Patterns of Lactobacil...,01/01/2020,Journal of food protection,pubmed
4,5,Appositional Tetracycline bone formation rates...,02/01/2020,American journal of veterinary research,pubmed
5,6,Rapid reacquisition of contextual fear followi...,2020-01-01,Psychopharmacology,pubmed
6,7,The High Cost of Epinephrine Autoinjectors and...,01/02/2020,The journal of allergy and clinical immunology...,pubmed
7,8,Time to epinephrine treatment is associated wi...,01/03/2020,The journal of allergy and clinical immunology...,pubmed
8,NCT01967433,Use of Diphenhydramine as an Adjunctive Sedati...,1 January 2020,Journal of emergency nursing,trial
9,NCT04189588,Phase 2 Study IV QUZYTTIR™ (Cetirizine Hydroch...,1 January 2020,Journal of emergency nursing,trial


In [9]:
# List the distinct raw date strings to see which date formats occur.
# Inspecting unique values by eye is not a robust way to discover date formats,
# but it works here because the dataset is small.
all_publications_df.date.unique()

array(['01/01/2019', '02/01/2019', '01/01/2020', '02/01/2020',
       '2020-01-01', '01/02/2020', '01/03/2020', '1 January 2020',
       '27 April 2020', '2020-01-03'], dtype=object)

## Clean and normalise the data in the dataframes

### Normalise the dates

In [10]:
# Run every date through the project helper (presumably converting the mixed
# input formats to one common format — confirm in src/utils), then renumber the
# rows 0..n-1 and discard the old index.
all_publications_df = (
    all_publications_df
    .assign(date=all_publications_df["date"].apply(utils.convert_date_format))
    .reset_index(drop=True)
)

all_publications_df.head()

## Create output data

In [12]:
# Build one entry per drug that is mentioned in at least one publication title:
#   {"Drug": <name>, "Publications": [{date, journal, title, publication_type}, ...]}
output = []

for drug in drugs_df["drug"]:
    # Case-insensitive *literal* substring match. regex=False prevents a drug
    # name containing regex metacharacters from being interpreted as a pattern.
    # The mask is computed once and reused for both the check and the selection.
    mentions_drug = all_publications_df["title"].str.contains(drug, case=False, regex=False)
    if not mentions_drug.any():
        continue

    # Selecting with the mask returns a dataframe; keep only the exported
    # columns and turn each row into a plain dict.
    publications = (
        all_publications_df
        .loc[mentions_drug, ["date", "journal", "title", "publication_type"]]
        .to_dict(orient="records")
    )
    output.append({"Drug": drug, "Publications": publications})

# Serialize the whole graph as pretty-printed JSON text.
json_object = json.dumps(output, indent=4)

## Write to destination

In [13]:
# Persist the serialized JSON text at the configured output location
# (the path is relative to the parent directory).
with open("../" + config["OUTPUT_FILE"], "w+") as outfile:
    print(json_object, end="", file=outfile)