# Scrape PubMed

[PubMed](http://www.ncbi.nlm.nih.gov/pubmed/)

In [9]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import string

import pandas as pd
from collections import Counter,defaultdict
import urllib.request as urllib2
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sbs
import re
from nltk.corpus import stopwords
from sklearn.decomposition import NMF
import lda
from sklearn.feature_extraction.text import CountVectorizer
from IPython.core.display import HTML
from nltk import word_tokenize
from operator import itemgetter
import string
%matplotlib inline

HTML("<style>.container { width:95% !important; }</style>")

### Load dataframe

In [3]:
df = pd.read_csv('data/labeled_df.csv', na_values=['?'])
df.head()

Unnamed: 0,boolean_relationship,relationship_info,pmid,chemical_name,disease_name
0,True,describe yearold woman preexisting mitral valv...,11419773,caffeine,ventricular fibrillation
1,False,laxation critically ill patients lactulose pol...,17893628,lactulose,critically ill
2,False,methotrexate mtx sulfasalazine ssz cyclosporin...,12463452,sulfasalazine/SSZ,PsA
3,True,polychlorinated biphenyls pcbs persistent envi...,24812009,Polychlorinated biphenyls/PCBs,NAFLD/non-alcoholic fatty liver disease
4,True,although pcp classified human carcinogen epide...,7904464,PCP,sarcoma


In [4]:
df.shape

(1928, 5)

### Scrape Abstracts from PubMed

In [10]:
#extract abstracts for each pmid in the dataframe:
i = 0
abstracts = []
for pm in df.pmid: 
    soup = BeautifulSoup(urllib2.urlopen('http://www.ncbi.nlm.nih.gov/pubmed/%s' %pm).read(), "lxml")
    #abstract
    abstract = soup.find_all(attrs={"class": "abstr"})
    abstracts.append(abstract[0].p.string)
    i += 1
    if( (i) % 100 == 0 ):
        print("Review %d of %d\n" % (i, df.shape[0]))
df['abstracts'] = abstracts 

Review 100 of 1928

Review 200 of 1928

Review 300 of 1928

Review 400 of 1928

Review 500 of 1928

Review 600 of 1928

Review 700 of 1928

Review 800 of 1928

Review 900 of 1928

Review 1000 of 1928

Review 1100 of 1928

Review 1200 of 1928

Review 1300 of 1928

Review 1400 of 1928

Review 1500 of 1928

Review 1600 of 1928

Review 1700 of 1928

Review 1800 of 1928

Review 1900 of 1928



In [11]:
df.head()

Unnamed: 0,boolean_relationship,relationship_info,pmid,chemical_name,disease_name,abstracts
0,True,describe yearold woman preexisting mitral valv...,11419773,caffeine,ventricular fibrillation,We describe a 25-year-old woman with pre-exist...
1,False,laxation critically ill patients lactulose pol...,17893628,lactulose,critically ill,To study whether lactulose or polyethylene gly...
2,False,methotrexate mtx sulfasalazine ssz cyclosporin...,12463452,sulfasalazine/SSZ,PsA,As erosive and deforming arthritis is present ...
3,True,polychlorinated biphenyls pcbs persistent envi...,24812009,Polychlorinated biphenyls/PCBs,NAFLD/non-alcoholic fatty liver disease,Polychlorinated biphenyls (PCBs) are persisten...
4,True,although pcp classified human carcinogen epide...,7904464,PCP,sarcoma,"Pentachlorophenol (PCP) was, and still is, one..."


In [12]:
df.to_csv( "data/df_with_abstracts.csv", index=False)

### Cleaning abstracts

In [13]:
df_with_abstracts = pd.read_csv('data/df_with_abstracts.csv', na_values=['?'])
df_with_abstracts.head()

Unnamed: 0,boolean_relationship,relationship_info,pmid,chemical_name,disease_name,abstracts
0,True,describe yearold woman preexisting mitral valv...,11419773,caffeine,ventricular fibrillation,We describe a 25-year-old woman with pre-exist...
1,False,laxation critically ill patients lactulose pol...,17893628,lactulose,critically ill,To study whether lactulose or polyethylene gly...
2,False,methotrexate mtx sulfasalazine ssz cyclosporin...,12463452,sulfasalazine/SSZ,PsA,As erosive and deforming arthritis is present ...
3,True,polychlorinated biphenyls pcbs persistent envi...,24812009,Polychlorinated biphenyls/PCBs,NAFLD/non-alcoholic fatty liver disease,Polychlorinated biphenyls (PCBs) are persisten...
4,True,although pcp classified human carcinogen epide...,7904464,PCP,sarcoma,"Pentachlorophenol (PCP) was, and still is, one..."


In [14]:
df['abstracts'] = df_with_abstracts['abstracts'].copy()

In [16]:
df.head()

Unnamed: 0,boolean_relationship,relationship_info,pmid,chemical_name,disease_name,abstracts
0,True,describe yearold woman preexisting mitral valv...,11419773,caffeine,ventricular fibrillation,We describe a 25-year-old woman with pre-exist...
1,False,laxation critically ill patients lactulose pol...,17893628,lactulose,critically ill,To study whether lactulose or polyethylene gly...
2,False,methotrexate mtx sulfasalazine ssz cyclosporin...,12463452,sulfasalazine/SSZ,PsA,As erosive and deforming arthritis is present ...
3,True,polychlorinated biphenyls pcbs persistent envi...,24812009,Polychlorinated biphenyls/PCBs,NAFLD/non-alcoholic fatty liver disease,Polychlorinated biphenyls (PCBs) are persisten...
4,True,although pcp classified human carcinogen epide...,7904464,PCP,sarcoma,"Pentachlorophenol (PCP) was, and still is, one..."


In [17]:
cleaned_names = list(df.chemical_name) + list(df.disease_name)

In [18]:
#porter_stemmer = PorterStemmer()

def abstract_to_words( raw_review ):
    #Remove HTML
    raw_review = raw_review.replace(">","> ")
    review_text = BeautifulSoup(raw_review, "lxml").get_text().split()
    
    #if word is not a disease or chemical, remove non-alpha. else, retain word. 
    cleaned_sentence = []
    for word in review_text:
        if word not in cleaned_names:
            cleaned_sentence.append(re.sub("[^a-zA-Z]", "", word.lower()))
        else:
            cleaned_sentence.append(word.lower())
    
    #remove stops, extra spaces, and single-letter words 
    stops = set(stopwords.words("english"))  
    spaces = set(" ")
    single_letter_words = set(string.ascii_lowercase)
    unimportant_words = stops | spaces | single_letter_words
    
    return (" ".join([word for word in cleaned_sentence if not word in list(unimportant_words)])).replace("  "," ")

In [19]:
clean_abstracts = []
for i in range( 0, df.shape[0] ):
    clean_abstracts.append( abstract_to_words( df["abstracts"][i] ) )

In [20]:
df.abstracts = clean_abstracts

In [21]:
df.head()

Unnamed: 0,boolean_relationship,relationship_info,pmid,chemical_name,disease_name,abstracts
0,True,describe yearold woman preexisting mitral valv...,11419773,caffeine,ventricular fibrillation,describe yearold woman preexisting mitral valv...
1,False,laxation critically ill patients lactulose pol...,17893628,lactulose,critically ill,study whether lactulose polyethylene glycol ef...
2,False,methotrexate mtx sulfasalazine ssz cyclosporin...,12463452,sulfasalazine/SSZ,PsA,erosive deforming arthritis present patients p...
3,True,polychlorinated biphenyls pcbs persistent envi...,24812009,Polychlorinated biphenyls/PCBs,NAFLD/non-alcoholic fatty liver disease,polychlorinated biphenyls pcbs persistent envi...
4,True,although pcp classified human carcinogen epide...,7904464,PCP,sarcoma,pentachlorophenol pcp still one frequently use...


In [22]:
df.abstracts[100]

'thrombosis rare wellrecognized potential complication factor viii inhibitor bypass activity feiba infusion recombinant factor viia rfviia increasingly used alternative feiba however thrombotic safety profile rfviia remains incompletely characterized determine incidence rates thrombotic adverse events aes infusion rfviia feiba data medwatch pharmacovigilance program us food drug administration supplemented published case reports used conjunction estimated numbers infusions available manufacturers assess comparative incidence thrombotic aes patients receiving rfviia feiba period april june reported thrombotic aes rare incidence rates per infusions ci per infusions rfviia per infusions ci per infusions feiba thrombotic aes significantly frequent rfviia feiba recipients incidence rate ratio ci commonly documented single type thrombotic ae rfviia infusion cerebrovascular thrombosis myocardial infarction frequent type patients receiving feiba contrasting ae reporting patterns rfviia feiba m

In [27]:
df.to_csv( "data/cleaned_abstracts.csv", index=False)

In [28]:
df_cleaned_abstracts = pd.read_csv('cleaned_abstracts.csv', na_values=['?'])
df_cleaned_abstracts.head()

Unnamed: 0,boolean_relationship,relationship_info,pmid,chemical_name,disease_name,abstracts
0,True,describe yearold woman preexisting mitral valv...,11419773,caffeine,ventricular fibrillation,describe yearold woman preexisting mitral valv...
1,False,laxation critically ill patients lactulose pol...,17893628,lactulose,critically ill,study whether lactulose polyethylene glycol ef...
2,False,methotrexate mtx sulfasalazine ssz cyclosporin...,12463452,sulfasalazine/SSZ,PsA,erosive deforming arthritis present patients p...
3,True,polychlorinated biphenyls pcbs persistent envi...,24812009,Polychlorinated biphenyls/PCBs,NAFLD/non-alcoholic fatty liver disease,polychlorinated biphenyls pcbs persistent envi...
4,True,although pcp classified human carcinogen epide...,7904464,PCP,sarcoma,pentachlorophenol pcp still one frequently use...


### Merge relationship_info and abstracts into 1 column: all_info

In [29]:
#merge all wordy columns into 1 column: abstracts_and_info
df_cleaned_abstracts['all_info'] = (df_cleaned_abstracts["relationship_info"].map(str) + df_cleaned_abstracts["abstracts"])
df_cleaned_abstracts.head()

Unnamed: 0,boolean_relationship,relationship_info,pmid,chemical_name,disease_name,abstracts,all_info
0,True,describe yearold woman preexisting mitral valv...,11419773,caffeine,ventricular fibrillation,describe yearold woman preexisting mitral valv...,describe yearold woman preexisting mitral valv...
1,False,laxation critically ill patients lactulose pol...,17893628,lactulose,critically ill,study whether lactulose polyethylene glycol ef...,laxation critically ill patients lactulose pol...
2,False,methotrexate mtx sulfasalazine ssz cyclosporin...,12463452,sulfasalazine/SSZ,PsA,erosive deforming arthritis present patients p...,methotrexate mtx sulfasalazine ssz cyclosporin...
3,True,polychlorinated biphenyls pcbs persistent envi...,24812009,Polychlorinated biphenyls/PCBs,NAFLD/non-alcoholic fatty liver disease,polychlorinated biphenyls pcbs persistent envi...,polychlorinated biphenyls pcbs persistent envi...
4,True,although pcp classified human carcinogen epide...,7904464,PCP,sarcoma,pentachlorophenol pcp still one frequently use...,although pcp classified human carcinogen epide...


In [30]:
df_cleaned_abstracts.all_info[100]

'commonly documented single type thrombotic ae rfviia infusion cerebrovascular thrombosis myocardial infarction frequent type patients receiving feiba nanthrombosis rare wellrecognized potential complication factor viii inhibitor bypass activity feiba infusion recombinant factor viia rfviia increasingly used alternative feiba however thrombotic safety profile rfviia remains incompletely characterized determine incidence rates thrombotic adverse events aes infusion rfviia feiba data medwatch pharmacovigilance program us food drug administration supplemented published case reports used conjunction estimated numbers infusions available manufacturers assess comparative incidence thrombotic aes patients receiving rfviia feiba period april june reported thrombotic aes rare incidence rates per infusions ci per infusions rfviia per infusions ci per infusions feiba thrombotic aes significantly frequent rfviia feiba recipients incidence rate ratio ci commonly documented single type thrombotic ae

In [31]:
df_final = df_cleaned_abstracts.drop(['abstracts','relationship_info','pmid'], axis=1).copy()
df_final.head()

Unnamed: 0,boolean_relationship,chemical_name,disease_name,all_info
0,True,caffeine,ventricular fibrillation,describe yearold woman preexisting mitral valv...
1,False,lactulose,critically ill,laxation critically ill patients lactulose pol...
2,False,sulfasalazine/SSZ,PsA,methotrexate mtx sulfasalazine ssz cyclosporin...
3,True,Polychlorinated biphenyls/PCBs,NAFLD/non-alcoholic fatty liver disease,polychlorinated biphenyls pcbs persistent envi...
4,True,PCP,sarcoma,although pcp classified human carcinogen epide...


In [32]:
df_final.to_csv( "data/Final_df.csv", index=False)

### Data Visualization

In [33]:
df_final.boolean_relationship.value_counts()

True     964
False    964
Name: boolean_relationship, dtype: int64

In [34]:
true_vocab = (' '.join(df_final[df_final['boolean_relationship']==True]['all_info'])).replace("  "," ")
false_vocab = (' '.join(df_final[df_final['boolean_relationship']==False]['all_info'])).replace("  "," ")

In [35]:
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
true_stem = [porter_stemmer.stem(w) for w in true_vocab.split(' ')]
false_stem = [porter_stemmer.stem(w) for w in false_vocab.split(' ')]

#### Most common true words

In [36]:
true_word_freq_stem = Counter(true_stem)
false_word_freq_stem = Counter(false_stem)

true_word_freq_stem.most_common(10)

[('patient', 1134),
 ('effect', 710),
 ('use', 654),
 ('studi', 616),
 ('treatment', 600),
 ('increas', 591),
 ('cell', 563),
 ('associ', 531),
 ('toxic', 438),
 ('may', 426)]

#### Most common false words

In [37]:
false_word_freq_stem.most_common(10)

[('patient', 1635),
 ('effect', 966),
 ('treatment', 921),
 ('cell', 877),
 ('studi', 844),
 ('use', 576),
 ('therapi', 473),
 ('cancer', 462),
 ('activ', 453),
 ('diseas', 439)]

#### How many words are there in each vocab?

In [38]:
print("length of true vocab: ", len(true_vocab))
print("length of false vocab: ", len(false_vocab))

length of true vocab:  796634
length of false vocab:  822422
