In [3]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

#Importing necessary packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import datetime
# Allow up to 100 characters per cell before pandas truncates the text shown
pd.set_option("display.max_colwidth", 100)



## Data info

I downloaded **xml** files that contain info regarding all publications in 2017 (up to October, with full text freely available) from [PubMed](https://www.ncbi.nlm.nih.gov/pubmed) for this project. The data are saved in three files (ncbi.xml, ncbip2.xml and ncbip3.xml). This notebook contains the code to extract the **pubmed ID, journal name, main keyword, title, and abstract** of each paper and save the data to a **csv** file. 
*Note: The xml, csv and model files are large (100s of MB to GBs) and not uploaded to the repository.* 


In [2]:
# xml.etree.cElementTree was deprecated in Python 3.3 and removed in 3.9;
# ElementTree picks up the C accelerator automatically, so the alias `et`
# keeps working unchanged for the cells below.
import xml.etree.ElementTree as et

In [3]:
# I downloaded all publications in 2017 (up to October) that are freely available for this study. The data are
# saved in three files (ncbi.xml, ncbip2.xml and ncbip3.xml). 
 
def getvalueofnode(node):
    """Return the full text content of an XML element, or None.

    Parameters
    ----------
    node : xml.etree.ElementTree.Element or None
        Element returned by ``find`` (``None`` when nothing matched).

    Returns
    -------
    str or None
        The concatenated text of the element and all of its descendants,
        or ``None`` when ``node`` is ``None`` or the element holds no text.
    """
    if node is None:
        return None
    # ``node.text`` alone stops at the first child element, which would cut a
    # title/abstract short at inline markup such as <i> or <sup>; itertext()
    # walks the whole subtree in document order instead.
    text = "".join(node.itertext())
    # An empty element used to yield None (its .text is None); keep that.
    return text if text else None
 
# Process one export per run: point this at ncbi.xml, ncbip2.xml and ncbip3.xml in turn
xml_path = "data/NCBI/ncbip3.xml"
parsedXML = et.parse(xml_path)

# Target column layout, and an empty frame that already carries those columns
dfcols = ['pmid','key', 'journal', 'title', 'abstract']
df_xml = pd.DataFrame(columns=dfcols)
 




In [4]:
# Path (relative to each record element under the XML root) of the element
# that feeds each output column. ``find`` returns only the FIRST match, so a
# record with several keywords or several AbstractText sections contributes
# just the first one.
field_paths = {
    'pmid': 'MedlineCitation/PMID',
    'key': 'MedlineCitation/KeywordList/Keyword',
    'journal': 'MedlineCitation/Article/Journal/Title',
    'title': 'MedlineCitation/Article/ArticleTitle',
    'abstract': 'MedlineCitation/Article/Abstract/AbstractText',
}

# Collect one plain dict per record and build the frame in a single call.
# Appending to a DataFrame inside the loop copies the whole frame on every
# iteration (quadratic time), and DataFrame.append no longer exists in
# pandas >= 2.0. Missing elements come through as None via getvalueofnode.
records = [
    {col: getvalueofnode(node.find(path)) for col, path in field_paths.items()}
    for node in parsedXML.getroot()
]

# Rebuilding df_xml from scratch (0..n-1 index) also makes this cell safe to
# re-run: rows are no longer appended a second time.
df_xml = pd.DataFrame(records, columns=dfcols)
 

In [5]:

# Work on an independent (deep) copy so the freshly parsed frame stays untouched
ncbi2 = df_xml.copy(deep=True)
ncbi2.tail()

Unnamed: 0,pmid,key,journal,title,abstract
88055,28290033,Hydroxypropyl-β-cyclodextrin,AAPS PharmSciTech,Improving cyclodextrin complexation of a new antihepatitis drug with glacial acetic acid.,The purpose of this study was to develop and evaluate a solid nonaqueous oral dosage form for a ...
88056,28290032,factorial design,AAPS PharmSciTech,Intragastric floating drug delivery system of cefuroxime axetil: In vitro evaluation.,This investigation describes the development of an intragastric drug-delivery system for cefurox...
88057,28290025,antioxidant,AAPS PharmSciTech,Evaluation of functional stability of quercetin as a raw material and in different topical formu...,"The present study evaluates the antioxidant activity of the flavonol quercetin, and its function..."
88058,28290023,brain targeting,AAPS PharmSciTech,Preliminary brain-targeting studies on intranasal mucoadhesive microemulsions of sumatriptan.,The aim of this investigation was to prepare microemulsions containing sumatriptan (ST) and suma...
88059,28290022,gloss,AAPS PharmSciTech,Local and average gloss from flat-faced sodium chloride tablets.,The purpose of this study was to detect local gloss and surface structure changes of sodium chlo...


In [7]:
#Save separately as ncbi_data.csv, ncbi_datap2.csv, ncbi_datap3.csv 
# ncbi2.to_csv('ncbi_datap3.csv', sep=',')

In [4]:
# Reload the three per-file extracts (presumably written by the commented-out
# to_csv step). The first CSV column holds the saved row index -> index_col=0.
# NOTE(review): the files are decoded as latin1; in the preview after reload
# 'β' shows up as '?', and pmid is parsed as float because the column has
# missing values — confirm the encoding against how the CSVs were written.
csv_paths = (
    'data/NCBI/ncbi_data.csv',
    'data/NCBI/ncbi_datap2.csv',
    'data/NCBI/ncbi_datap3.csv',
)
ncbip1, ncbip2, ncbip3 = [
    pd.read_csv(csv_path, encoding='latin1', index_col=0) for csv_path in csv_paths
]


In [5]:
# Stack the three parts into a single dataframe.
# Each part keeps its own row labels (no ignore_index), so labels repeat
# across parts; compare len(ncbi2) with len(ncbip1) + len(ncbip2) + len(ncbip3)
# when a sanity check is needed.
parts = [ncbip1, ncbip2, ncbip3]
ncbi2 = pd.concat(parts)
ncbi2.tail()

Unnamed: 0,pmid,key,journal,title,abstract
88055,28290033.0,Hydroxypropyl-?-cyclodextrin,AAPS PharmSciTech,Improving cyclodextrin complexation of a new antihepatitis drug with glacial acetic acid.,The purpose of this study was to develop and evaluate a solid nonaqueous oral dosage form for a ...
88056,28290032.0,factorial design,AAPS PharmSciTech,Intragastric floating drug delivery system of cefuroxime axetil: In vitro evaluation.,This investigation describes the development of an intragastric drug-delivery system for cefurox...
88057,28290025.0,antioxidant,AAPS PharmSciTech,Evaluation of functional stability of quercetin as a raw material and in different topical formu...,"The present study evaluates the antioxidant activity of the flavonol quercetin, and its function..."
88058,28290023.0,brain targeting,AAPS PharmSciTech,Preliminary brain-targeting studies on intranasal mucoadhesive microemulsions of sumatriptan.,The aim of this investigation was to prepare microemulsions containing sumatriptan (ST) and suma...
88059,28290022.0,gloss,AAPS PharmSciTech,Local and average gloss from flat-faced sodium chloride tablets.,The purpose of this study was to detect local gloss and surface structure changes of sodium chlo...


In [6]:
# New column `data` = title, then '. ', then abstract.
# A missing title or abstract propagates, leaving `data` null for that row.
title_with_sep = ncbi2['title'] + '. '
ncbi2['data'] = title_with_sep + ncbi2['abstract']

In [7]:
# Number of missing values in each column
ncbi2.isnull().sum(axis=0)

pmid           884
key         105088
journal        884
title          884
abstract     14481
data         14481
dtype: int64

In [8]:
# The keyword column is mostly NaN and adds little information, so remove it
# (in place: ncbi2 itself is modified).
ncbi2.drop('key', axis='columns', inplace=True)

In [9]:
# Remove, in place, every row that still has a null in any remaining column
ncbi2.dropna(axis='index', how='any', inplace=True)

In [10]:
# Independent (deep) copy of the cleaned frame
ncbi3 = ncbi2.copy(deep=True)

In [12]:
# Per-column count of missing values in the cleaned copy
ncbi3.isnull().sum(axis=0)

pmid        0
journal     0
title       0
abstract    0
data        0
dtype: int64

In [None]:
#Save csv file to disk
# ncbi3.to_csv('ncbi_full.csv', sep=',')