In [1]:
## Imports

#Web Data Packages
#import lxml         
import requests         
from bs4 import BeautifulSoup

# Other Packages
import numpy as np
from __future__ import division

In [2]:
## Search terms: every entry of `erps` is paired with every entry of `cogs`
erps = [
    'P50', 'P100', 'P200', 'P300', 'P3a', 'P3b', 'P600',
    'N100', 'N170', 'N200', 'N2pc', 'N400',
    'MMN', 'LPC', 'CNV', 'ERN', 'ELAN', 'CPS', 'LRP',
]
cogs = [
    'language', 'memory', 'attention', 'motor', 'decision making', 'vision',
    'auditory', 'emotion', 'categorization', 'reward', 'spatial',
]

In [3]:
## Initialize Things

# Front part of the e-search url; the search term is appended directly after 'term='.
# HTTPS is used because NCBI documents the E-utilities base URL as https://.
url_front = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&field=word&term='

# Get number of ERPs & COGs to search
nERPs = len(erps)
nCOGs = len(cogs)

# Initialize vectors to store number of papers for ERPs and COGs
# (one slot per term, zero until a search fills it in)
ERP_papers = np.zeros([nERPs])
COG_papers = np.zeros([nCOGs])

# Initialize ERP x COG matrices (rows follow `erps`, columns follow `cogs`):
#   dat_numbers - number of papers matching both terms
#   dat_percent - that number divided by the ERP's own paper total
dat_numbers = np.zeros([nERPs, nCOGs])
dat_percent = np.zeros([nERPs, nCOGs])

In [4]:
## Query PubMed for every ERP x COG pair and record the paper counts

for erp_ind, erp in enumerate(erps):
    for cog_ind, cog in enumerate(cogs):

        # Exact term version
        url = url_front + '"' + erp + '"AND"' + cog + '"'

        # Non-exact term version
        #url = url_front + erp + ' erp ' + cog

        # Pull the page; stop on an HTTP error rather than parsing an error page
        page = requests.get(url)
        page.raise_for_status()

        # Parse with an explicitly named parser so the result does not depend
        # on which optional parsers happen to be installed
        page_soup = BeautifulSoup(page.content, 'html.parser')

        # Extract every 'count' tag as an integer
        vec = [int(count.text) for count in page_soup.find_all('count')]

        # Three counts are read below: vec[0] (both terms), vec[1] (erp), vec[2] (cog).
        # If the response carries fewer, leave this pair at its initial zeros.
        if len(vec) < 3:
            print('Skipping "{}" AND "{}": expected 3 counts, got {}'.format(erp, cog, len(vec)))
            continue

        # Add the total number of papers for erp & cog
        ERP_papers[erp_ind] = vec[1]
        COG_papers[cog_ind] = vec[2]

        # Add the number of overlapping papers, and the overlap as a fraction
        # of the erp's papers (0 when the erp itself has no papers)
        dat_numbers[erp_ind, cog_ind] = vec[0]
        dat_percent[erp_ind, cog_ind] = float(vec[0]) / vec[1] if vec[1] else 0.0

In [5]:
# Print strongest association for each ERP
for erp_ind, erp in enumerate(erps):

    # Column of the largest dat_percent value in this ERP's row
    cog_ind = np.argmax(dat_percent[erp_ind, :])

    # Single-argument print(...) behaves the same on Python 2 and Python 3
    print('For the  {:5} the most common association is  {:10} with %{:05.2f}'.format(
        erp, cogs[cog_ind], dat_percent[erp_ind, cog_ind] * 100))

For the  P50   the most common association is  auditory   with %06.75
For the  P100  the most common association is  vision     with %09.64
For the  P200  the most common association is  auditory   with %32.81
For the  P300  the most common association is  auditory   with %21.08
For the  P3a   the most common association is  auditory   with %52.21
For the  P3b   the most common association is  attention  with %41.28
For the  P600  the most common association is  language   with %55.66
For the  N100  the most common association is  auditory   with %63.83
For the  N170  the most common association is  attention  with %19.37
For the  N200  the most common association is  auditory   with %35.27
For the  N2pc  the most common association is  attention  with %87.37
For the  N400  the most common association is  language   with %43.10
For the  MMN   the most common association is  auditory   with %68.75
For the  LPC   the most common association is  memory     with %04.44
For the  CNV   the m

In [6]:
# Print strongest associated ERP for each COG
for cog_ind, cog in enumerate(cogs):

    # Row of the largest dat_percent value in this COG's column
    erp_ind = np.argmax(dat_percent[:, cog_ind])

    # Single-argument print(...) behaves the same on Python 2 and Python 3
    print('For  {:20} the strongest associated ERP is   {:5} with   %{:05.2f}'.format(
        cog, erps[erp_ind], dat_percent[erp_ind, cog_ind] * 100))

For  language             the strongest associated ERP is   P600  with   %55.66
For  memory               the strongest associated ERP is   N400  with   %22.27
For  attention            the strongest associated ERP is   N2pc  with   %87.37
For  motor                the strongest associated ERP is   MMN   with   %14.37
For  decision making      the strongest associated ERP is   ERN   with   %05.15
For  vision               the strongest associated ERP is   CNV   with   %10.18
For  auditory             the strongest associated ERP is   MMN   with   %68.75
For  emotion              the strongest associated ERP is   N170  with   %15.54
For  categorization       the strongest associated ERP is   N170  with   %07.55
For  reward               the strongest associated ERP is   ERN   with   %06.41
For  spatial              the strongest associated ERP is   N2pc  with   %36.52


In [7]:
# Report the ERP and the COG with the largest recorded paper totals
top_erp_ind = np.argmax(ERP_papers)
top_cog_ind = np.argmax(COG_papers)

# Single-argument print(...) behaves the same on Python 2 and Python 3
print('The most studied ERP is  {:6}  with  {:8.0f}  papers'.format(erps[top_erp_ind], ERP_papers[top_erp_ind]))
print('The most studied COG is  {:6}  with  {:8.0f}  papers'.format(cogs[top_cog_ind], COG_papers[top_cog_ind]))

The most studied ERP is  P300    with      9865  papers
The most studied COG is  motor   with    345239  papers


In [8]:
# List the recorded paper total for each ERP, in the order of `erps`
for erp, n_papers in zip(erps, ERP_papers):
    # Single-argument print(...) behaves the same on Python 2 and Python 3
    print('{:5} - {:8.0f}'.format(erp, n_papers))

P50   -     8032
P100  -     2137
P200  -      835
P300  -     9865
P3a   -      768
P3b   -      751
P600  -      521
N100  -      882
N170  -      914
N200  -      567
N2pc  -      293
N400  -     1805
MMN   -     2192
LPC   -     2366
CNV   -     5970
ERN   -      796
ELAN  -      224
CPS   -     6014
LRP   -     3223


In [9]:
# List the recorded paper total for each COG, in the order of `cogs`
for cog, n_papers in zip(cogs, COG_papers):
    # Single-argument print(...) behaves the same on Python 2 and Python 3
    print('{:18} - {:10.0f}'.format(cog, n_papers))

language           -     131880
memory             -     207980
attention          -     327160
motor              -     345239
decision making    -     144828
vision             -     133321
auditory           -     112603
emotion            -      25577
categorization     -      11075
reward             -      30520
spatial            -     217492


In [None]:
# Test Code: run one exact-phrase query and print every count it returns

# Straight ASCII double quotes are used (not typographic quotes) so the query
# has the same "term"AND"term" form as the main search loop.
page = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&field=word&term="N400"AND"Language"')

# Name the parser explicitly so the result does not depend on installed parsers
page_soup = BeautifulSoup(page.content, 'html.parser')

counts = page_soup.find_all('count')

# Print each count as an integer, in document order
for count in counts:
    print(int(count.text))