# Scrape PubMed

[PubMed](http://www.ncbi.nlm.nih.gov/pubmed/)

In [7]:
import pandas as pd, urllib.request as urllib2, re, time, string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
from IPython.core.display import HTML
# Inject a CSS rule that sets the notebook's .container element to 95% width.
HTML("<style>.container { width:95% !important; }</style>")

### Load dataframe

In [2]:
# Read the labelled chemical/disease pairs; a literal '?' in the file is parsed as missing.
df = pd.read_csv(
    filepath_or_buffer='data/before_removing_rows_df.csv',
    na_values=['?'],
)
df.head()

Unnamed: 0,verify_relationship,chemical_name,disease_name,pmid,relationship_info
0,yes_direct,5-HT,psychotic disorders,20705401,animal models considered reflect positive symp...
1,no_relation,D-penicillamine,localized scleroderma,2334179,case reports patients severe extensive localiz...
2,no_relation,yohimbine,affective disorders,1535072,method six patients either obsessive compulsiv...
3,no_relation,calcium,muscle spasms,8492347,severe hypokalemia may cause muscle weakness s...
4,no_relation,dexrazoxane,hematologic toxicity,15897593,clinical trials patients brain metastases comb...


In [3]:
# (rows, columns) of the freshly loaded frame.
df.shape

(5713, 5)

### Scrape Abstracts from PubMed

In [8]:
# Extract the abstract text for each pmid in the dataframe.
#
# One HTTP request is made per row, so this cell is slow; progress is printed every
# 250 records and the loop then pauses for 3 seconds. Network errors are not caught
# here and will abort the run.
abstracts = []
for i, pm in enumerate(df.pmid, start=1):
    # Use the response as a context manager so the connection is closed after each
    # request instead of being left to the garbage collector.
    with urllib2.urlopen('http://www.ncbi.nlm.nih.gov/pubmed/%s' % pm) as response:
        soup = BeautifulSoup(response.read(), "lxml")
    abstract = soup.find_all(attrs={"class": "abstr"})
    # Some records have no "abstr" block (or no <p> inside it). Store an empty string
    # for those instead of raising IndexError/AttributeError part-way through the run.
    first_paragraph = abstract[0].p if abstract else None
    # get_text() rather than .string: .string is None as soon as the <p> contains
    # nested markup, which would put None into the column.
    # NOTE(review): only the first <p> of the block is kept, as before — abstracts
    # split over several paragraphs are truncated; confirm this is intended.
    abstracts.append(first_paragraph.get_text() if first_paragraph is not None else "")
    if i % 250 == 0:
        print("Review %d of %d\n" % (i, df.shape[0]))
        time.sleep(3)
df['abstracts'] = abstracts

Review 250 of 5713

Review 500 of 5713

Review 750 of 5713

Review 1000 of 5713

Review 1250 of 5713

Review 1500 of 5713

Review 1750 of 5713

Review 2000 of 5713

Review 2250 of 5713

Review 2500 of 5713

Review 2750 of 5713

Review 3000 of 5713

Review 3250 of 5713

Review 3500 of 5713

Review 3750 of 5713

Review 4000 of 5713

Review 4250 of 5713

Review 4500 of 5713

Review 4750 of 5713

Review 5000 of 5713

Review 5250 of 5713

Review 5500 of 5713



In [9]:
# Persist the frame with the scraped abstracts; the row index is not written.
df.to_csv(path_or_buf="data/df_with_all_abstracts.csv", index=False)

In [10]:
# Preview the first rows now that the 'abstracts' column has been added.
df.head()

Unnamed: 0,verify_relationship,chemical_name,disease_name,pmid,relationship_info,abstracts
0,yes_direct,5-HT,psychotic disorders,20705401,animal models considered reflect positive symp...,Altered serotonergic neural transmission is hy...
1,no_relation,D-penicillamine,localized scleroderma,2334179,case reports patients severe extensive localiz...,Localized scleroderma has no recognized intern...
2,no_relation,yohimbine,affective disorders,1535072,method six patients either obsessive compulsiv...,Preclinical and clinical studies suggest that ...
3,no_relation,calcium,muscle spasms,8492347,severe hypokalemia may cause muscle weakness s...,"Diuretics may induce hypokalemia, hypocalcemia..."
4,no_relation,dexrazoxane,hematologic toxicity,15897593,clinical trials patients brain metastases comb...,The anthracyclines daunorubicin and doxorubici...


### Cleaning abstracts

In [11]:
# Every chemical name followed by every disease name, in row order.
# abstract_to_words() consults this list to decide which tokens keep their punctuation.
cleaned_names = [*df["chemical_name"], *df["disease_name"]]

In [12]:
def abstract_to_words( raw_review ):
    """Normalise one raw abstract into a single-space-separated string of lower-case tokens.

    Steps:
      1. Strip HTML markup and split the text on whitespace.
      2. Lower-case every token. Tokens that exactly match an entry of the
         module-level ``cleaned_names`` list are kept intact; all other tokens
         have every non-letter character removed.
      3. Drop English stop words, single-letter words and tokens that became
         empty in step 2.

    Parameters
    ----------
    raw_review : str
        Raw abstract text, possibly containing HTML. Any non-string value
        (e.g. None or NaN for a missing abstract) is treated as empty.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, with no leading or trailing
        space; "" when nothing survives.
    """
    # A missing abstract has no .replace(); return an empty result instead of crashing.
    if not isinstance(raw_review, str):
        return ""

    # Remove HTML. Padding '>' keeps the text on either side of a tag from being glued together.
    raw_review = raw_review.replace(">", "> ")
    review_text = BeautifulSoup(raw_review, "lxml").get_text().split()

    # Set membership instead of scanning the full name list once per token.
    # NOTE(review): tokens are whitespace-split, so multi-word names in cleaned_names
    # (e.g. "psychotic disorders") can never match here — confirm whether that matters.
    protected_names = set(cleaned_names)

    # If a word is not a disease or chemical, remove non-alpha; else retain the word.
    cleaned_sentence = []
    for word in review_text:
        if word not in protected_names:
            cleaned_sentence.append(re.sub("[^a-zA-Z]", "", word.lower()))
        else:
            cleaned_sentence.append(word.lower())

    # Remove stop words and single-letter words.
    stops = set(stopwords.words("english"))
    single_letter_words = set(string.ascii_lowercase)
    unimportant_words = stops | single_letter_words

    # `if word` drops tokens that were purely digits/punctuation; keeping them used to
    # leave stray leading and doubled spaces in the result.
    return " ".join(word for word in cleaned_sentence if word and word not in unimportant_words)

In [13]:
# Clean every scraped abstract, in row order.
# Iterating the column directly (instead of df["abstracts"][i] for i in range(n)) does
# not depend on the frame having a default 0..n-1 index.
clean_abstracts = [abstract_to_words(raw_abstract) for raw_abstract in df["abstracts"]]

In [14]:
# Replace the raw abstracts with their cleaned versions (the raw text is overwritten).
df["abstracts"] = clean_abstracts

In [15]:
# Spot-check one cleaned abstract (row label 100).
df.loc[100, "abstracts"]

' lobundwistar lw strain rats developed large palpable prostate adenocarcinomas pas following treatments nnitrosonmethylurea cas testosterone propionate tp cas tumorbearing rats manifested metastatic lesions incubation periods averaged months within timeframe lw rat developed similar palpable pa treated tp lw rats tp acted tumor enhancement agent primary emphasis development prostate cancer'

In [20]:
# The relationship_info text of the same row, for comparison with its abstract.
df.loc[100, "relationship_info"]

'lw rats tp acted tumor enhancement agent primary emphasis development prostate cancer tp enhances tumors especially development prostate cancer therefore conclude tp cause prostate cancer'

In [16]:
# Persist the frame with cleaned abstracts; the row index is not written.
df.to_csv(path_or_buf="data/cleaned_all_abstracts.csv", index=False)

### Merge relationship_info and abstracts into 1 column: all_info

In [17]:
# Merge all wordy columns into 1 column: all_info.
# The two texts are joined with an explicit space so the last word of relationship_info
# is not glued to the first word of the abstract. Missing values become empty text
# instead of the literal token "nan", and surrounding whitespace is stripped so the
# separator never doubles up.
relationship_text = df["relationship_info"].fillna("").map(str).str.strip()
abstract_text = df["abstracts"].fillna("").map(str).str.strip()
df['all_info'] = (relationship_text + " " + abstract_text).str.strip()
df.head()

Unnamed: 0,verify_relationship,chemical_name,disease_name,pmid,relationship_info,abstracts,all_info
0,yes_direct,5-HT,psychotic disorders,20705401,animal models considered reflect positive symp...,altered serotonergic neural transmission hypot...,animal models considered reflect positive symp...
1,no_relation,D-penicillamine,localized scleroderma,2334179,case reports patients severe extensive localiz...,localized scleroderma recognized internal orga...,case reports patients severe extensive localiz...
2,no_relation,yohimbine,affective disorders,1535072,method six patients either obsessive compulsiv...,preclinical clinical studies suggest yohimbine...,method six patients either obsessive compulsiv...
3,no_relation,calcium,muscle spasms,8492347,severe hypokalemia may cause muscle weakness s...,diuretics may induce hypokalemia hypocalcemia ...,severe hypokalemia may cause muscle weakness s...
4,no_relation,dexrazoxane,hematologic toxicity,15897593,clinical trials patients brain metastases comb...,anthracyclines daunorubicin doxorubicin epipod...,clinical trials patients brain metastases comb...


In [19]:
# Spot-check the merged text of one row (row label 100).
df.loc[100, "all_info"]

'lw rats tp acted tumor enhancement agent primary emphasis development prostate cancer tp enhances tumors especially development prostate cancer therefore conclude tp cause prostate cancer lobundwistar lw strain rats developed large palpable prostate adenocarcinomas pas following treatments nnitrosonmethylurea cas testosterone propionate tp cas tumorbearing rats manifested metastatic lesions incubation periods averaged months within timeframe lw rat developed similar palpable pa treated tp lw rats tp acted tumor enhancement agent primary emphasis development prostate cancer'

In [21]:
# Independent copy of the frame without the columns that were merged into all_info
# and without the pmid identifier.
df_all = df.drop(labels=['abstracts','relationship_info','pmid'], axis="columns").copy()
df_all.head()

Unnamed: 0,verify_relationship,chemical_name,disease_name,all_info
0,yes_direct,5-HT,psychotic disorders,animal models considered reflect positive symp...
1,no_relation,D-penicillamine,localized scleroderma,case reports patients severe extensive localiz...
2,no_relation,yohimbine,affective disorders,method six patients either obsessive compulsiv...
3,no_relation,calcium,muscle spasms,severe hypokalemia may cause muscle weakness s...
4,no_relation,dexrazoxane,hematologic toxicity,clinical trials patients brain metastases comb...


In [22]:
# Persist the reduced frame; the row index is not written.
df_all.to_csv(path_or_buf="data/df_all.csv", index=False)

In [23]:
# (rows, columns) of the reduced frame.
df_all.shape

(5713, 4)