### Scraping abstract information
March 4, 2018
This notebook scrapes abstract text from:
- Proceedings of the Annual Cognitive Science Society meeting archive (html)
- Proceedings of the Cognitive Neuroscience Society annual meeting (text converted from pdf)

Abstracts are then stored in a spreadsheet, containing information such as year, authors, title, and abstract.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import csv
from bs4 import BeautifulSoup
import urllib
import re
import string
import pandas as pd
import os
import sys

In [2]:
def scrape_CS(home_url, data_file):
    """Scrape every paper page linked from one proceedings home page into a CSV.

    The home page is searched for anchors whose ``href`` matches ``papers/*``.
    Each linked page is downloaded and parsed for its title (first ``<h1>``),
    abstract, authors and affiliations (one pair per ``<em>`` element), and one
    header-less row is appended to ``data_file`` with the columns
    Year, Title, Abstract, Authors, Affiliations, URL.

    Pages whose ``home_url`` contains '2014' use a different layout: the
    abstract is read from the second ``<blockquote>`` and the author name sits
    one element further back from the affiliation ``<em>``.

    Parameters
    ----------
    home_url : str
        Proceedings home page, e.g. 'https://mindmodeling.org/cogsci2017/'.
        Relative paper links are appended to it verbatim, so it should end
        with a slash.
    data_file : str
        CSV path opened in append mode; calling this twice for the same year
        therefore writes the same rows twice.

    Returns
    -------
    None

    Raises
    ------
    urllib.error.URLError
        If the home page or a paper page cannot be fetched.
    IndexError
        If a paper page lacks the expected title or abstract element.
    """
    home_soup = _fetch_soup(home_url)
    all_links = home_soup.find_all('a', attrs={'href': re.compile("papers/*")})

    # Year = the four digits that end the URL (trailing slash optional); fall
    # back to the original fixed slice if the URL does not end that way.
    year_match = re.search(r'(\d{4})/?$', home_url)
    year = year_match.group(1) if year_match else home_url[-5:-1]

    is_2014 = '2014' in home_url
    columns = ['Year', 'Title', 'Abstract', 'Authors', 'Affiliations', 'URL']

    for link in all_links:
        # links may be relative to the home page or already absolute
        href = str(link['href'])
        url_text = href if home_url in href else home_url + href

        soup = _fetch_soup(url_text)

        # split()/join collapses runs of whitespace and newlines to one space
        title = ' '.join(soup.find_all('h1')[0].text.split())
        if is_2014:
            # 2014 pages keep the abstract in the second <blockquote>
            abstr = ' '.join(soup.find_all('blockquote')[1].text.split())
        else:
            abstr = ' '.join(soup.find_all('p', {"id": "abstract"})[0].text.split())

        # each <em> holds an affiliation; the text node just before it holds
        # the author name, followed by a comma
        authors = []
        affl = []
        for ana in soup.find_all('em'):
            affl.append('>' + ana.text)
            if is_2014:
                # 2014 markup has one extra element between name and <em>
                name_node = ana.previous_element.previous_element
            else:
                name_node = ana.previous_element
            authors.append('>' + name_node.split(',')[0])

        # append a single header-less row; '>' prefixes act as list separators
        new_row = {'Year': str(year), 'Title': title, 'Abstract': abstr,
                   'Authors': ''.join(authors), 'Affiliations': ''.join(affl),
                   'URL': url_text}
        pd.DataFrame([new_row], columns=columns).to_csv(
            data_file, mode='a', header=False, index=False)


def _fetch_soup(page_url):
    """Download ``page_url`` and return it parsed with html.parser."""
    # `import urllib` alone does not load the `request` submodule, so import it
    # here instead of relying on some other package having loaded it.
    import urllib.request

    # the context manager closes the HTTP response once the body is read
    with urllib.request.urlopen(page_url) as response:
        return BeautifulSoup(response.read(), 'html.parser')


In [10]:
# CogSci proceedings home pages on mindmodeling.org, newest (2017) first
# down to 2010
home_urls = ['https://mindmodeling.org/cogsci%i/' % conf_year
             for conf_year in range(2017, 2009, -1)]

# scrape each year in turn, appending everything to one shared CSV
for year in home_urls:  # note: `year` holds the full home-page URL
    print(year)
    scrape_CS(home_url=year, data_file='../data/cogsci_abstracts.csv')

https://mindmodeling.org/cogsci2017/
https://mindmodeling.org/cogsci2016/
https://mindmodeling.org/cogsci2015/
https://mindmodeling.org/cogsci2014/
https://mindmodeling.org/cogsci2013/
https://mindmodeling.org/cogsci2012/
https://mindmodeling.org/cogsci2011/
https://mindmodeling.org/cogsci2010/


### Gather CNS abstracts from text files into a CSV

In [92]:
# folder with the CNS program booklets as plain-text files
data_folder = '../data/CNS_programs/'

# keep names containing both 'CNS' and '.txt', in sorted order
CNS_files = sorted(filter(lambda fname: 'CNS' in fname and '.txt' in fname,
                          os.listdir(data_folder)))
# the last name in sorted order is deliberately left out
# NOTE(review): presumably a file handled separately -- confirm why
CNS_files = CNS_files[:-1]
print(CNS_files)

['CNS_2007_Program.txt', 'CNS_2008_Program.txt', 'CNS_2009_Program.txt', 'CNS_2010_Program.txt', 'CNS_2011_Program.txt', 'CNS_2012_Program.txt', 'CNS_2013_Program.txt', 'CNS_2014_Program.txt', 'CNS_2015_Program.txt', 'CNS_2016_Program.txt']


In [119]:
# For every CNS program, report where the poster abstracts start and end.
# Prints (a) every line index mentioning the Session A page header and
# (b) the begin/end indices relative to the first such line.
for i, cns_name in enumerate(CNS_files):
    # context manager closes each file; previously every handle stayed open
    with open(data_folder + cns_name, 'r') as file:
        data = file.read()
    data_list = data.split('\n')

    # lines containing the form-feed-prefixed 'Poster Session A' header;
    # computed once and reused for both the slice start and the printout
    session_a_inds = [ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d]
    abs_start = session_a_inds[0]
    print(session_a_inds)

    # everything from the first Session A header onwards
    abs_list = data_list[abs_start:]
    # exact-match header lines; None when a header is not found
    poster_beg_ind = next((ind for ind, s in enumerate(abs_list) if '\x0cPoster Session A' == s), None)
    poster_end_ind = next((ind for ind, s in enumerate(abs_list) if '\x0cAuthor Index' == s), None)
    print(poster_beg_ind, poster_end_ind)

[1456]
0 29493
[1538, 1748, 1969, 2189, 2414, 2647, 2884, 3114, 3341, 3577, 3814, 4062, 4289, 4522, 4743]
0 30363
[2181, 2395, 2613, 2834, 3076, 3310, 3545, 3791, 4016, 4257, 4478, 4703]
0 23882
[3236, 3472, 3718, 3974, 4229, 4460, 4694, 4956, 5207, 5453, 5700, 5963, 6217, 6466, 6724]
0 31628
[3326, 3420, 3667, 3907, 4148, 4394, 4628, 4877, 5128, 5377, 5626, 5856, 6086]
0 24580
[3516, 3610, 3848, 4113, 4353, 4587, 4836, 5074, 5310, 5560, 5789, 6021, 6256]
0 24793
[2881, 3128, 3372, 3626, 3852, 4106, 4348, 4593, 4844, 5088, 5333, 5583, 5827, 6078, 6313]
0 28381
[3137, 3250, 3505, 3753, 4009, 4253, 4493, 4754, 5007, 5251, 5508, 5758, 6009, 6245, 6501, 6761, 7012]
0 28084
[3350, 3588, 3829, 4068, 4322, 4566, 4823, 5068, 5298, 5530, 5787, 6028, 6274, 6511]
0 23909
[3593, 3823, 4072, 4313, 4566, 4827, 5087, 5337, 5585, 5836, 6077, 6325, 6588, 6859, 7099, 7349, 7613, 7877, 8129]
0 28305


In [126]:
# Preview the first lines of the poster section of every CNS program, and
# locate the section end: the poster topic index when the program has one,
# otherwise the author index.
for i, cns_name in enumerate(CNS_files):
    # context manager closes each file; previously every handle stayed open
    with open(data_folder + cns_name, 'r') as file:
        print(cns_name)
        data = file.read()
    data_list = data.split('\n')

    # index() raises ValueError if a program lacks the expected header line
    poster_beg_ind = data_list.index('\x0cPoster Session A')
    if '\x0cPoster Topic Index' in data_list:
        poster_end_ind = data_list.index('\x0cPoster Topic Index')
    else:
        poster_end_ind = data_list.index('\x0cAuthor Index')

    # header line plus the six lines that follow it
    print(data_list[poster_beg_ind:poster_beg_ind+7])
    print('---')

CNS_2007_Program.txt
['\x0cPoster Session A', 'Attentional Processes: Auditory', 'A1', 'ERP MEASURES OF AUDITORY SELECTIVE ATTENTION IN', 'CHILDREN WITH AD/HD Hilary', 'Gomes1, Martin Duff1, Virginia', 'Wolfson1, Walter Ritter2, Jeffrey Halperin3; 1City College of New York, 2Nathan']
---
CNS_2008_Program.txt
['\x0cPoster Session A', 'Higher level cognition: Executive functions', 'A1', 'EFFECTS OF PRIOR PROBABILITY ON THE DECISION', 'CRITERION: AN FMRI STUDY Kathleen Hansen1, Sarah Hillenbrand1,', 'Leslie Ungerleider1; 1NIMH/NIH — Most models of decision-making', 'include several theoretical steps: sensory data are analyzed to yield evidence supporting one or more alternatives; biases are introduced to reflect']
---
CNS_2009_Program.txt
['\x0cPoster Session A', 'Attentional processes: Auditory', 'A1', 'INDEPENDENT FACILITATION AND INHIBITION MECHANISMS', 'IN AUDITORY SELECTIVE ATTENTION Constanze Mikyska1, Aurelie', 'Bidet-Caulet1, Robert T. Knight1,2; 1Helen Wills Neuroscience Institut

In [88]:
# Inspect a single program (the second entry of CNS_files): print where
# Session A starts and display everything from the Session H header onwards.
# The context manager closes the file; previously the handle stayed open.
with open(data_folder + CNS_files[1], 'r') as file:
    data = file.read()
data_list = data.split('\n')
print(data_list.index('\x0cPoster Session A'))
data_list[data_list.index('\x0cPoster Session H'):]


1538


['\x0cPoster Session H',
 'Higher level cognition: Executive functions',
 'H1',
 'ADJUSTMENTS IN ATTENTIONAL CONTROL BY CONGRUENT',
 'INFORMATION IN ANTERIOR CINGULATE CORTEX Esther',
 'Aarts1,2, Ardi Roelofs1,2; 1F.C. Donders Centre for Cognitive Neuroimaging at',
 'Radboud University Nijmegen, 2Nijmegen Institute for Cognition and',
 'Information at Radboud University Nijmegen — According to a dominant',
 'theory of attentional control (Miller & Cohen, 2001), adjustments in control',
 'are made upon detection of response conflict by the anterior cingulate cortex (ACC). In line with this view, previous research has demonstrated',
 'reduced conflict effects in response times and ACC activity to targets after',
 'incongruent as compared to congruent trials. It is unclear, however,',
 'whether this sequential effect is induced by expected incongruency (Miller',
 '& Cohen, 2001), congruency (Gratton et al., 1992), or both. To investigate',
 'this, we ran an fMRI study using a Stroop-like 

In [94]:
def find_num_sess(abs_list, poster_beg_ind):
    """Return the letters of the poster sessions found in the schedule section.

    The schedule is taken to be ``abs_list[:poster_beg_ind]``. Letters are
    checked in alphabetical order and the scan stops at the first letter that
    is not an element of the schedule, so the result is always a prefix of the
    alphabet ('' when 'A' is missing, 'ABC' when A-C are present but D is not).

    Membership is an exact comparison against each schedule entry: a letter
    counts only when some entry equals that single letter.
    NOTE(review): if the schedule lines read like 'Poster Session A', they will
    never match -- confirm the intended matching rule.

    Parameters
    ----------
    abs_list : list of str
        Lines of the program text.
    poster_beg_ind : int
        Index in ``abs_list`` where the poster section begins.

    Returns
    -------
    str
        Consecutive session letters starting at 'A'.
    """
    sched = abs_list[:poster_beg_ind]
    for ind, sess in enumerate(string.ascii_uppercase):
        if sess not in sched:
            return string.ascii_uppercase[:ind]
    # all 26 letters present: return them all rather than falling through
    # to an implicit None
    return string.ascii_uppercase

# Uses `abs_list` and `poster_beg_ind` left in the kernel by an earlier cell
# (hidden state: this cell fails on a fresh kernel unless that cell ran first).
# NOTE(review): when poster_beg_ind is 0 the schedule slice is empty, so the
# result is '' -- confirm this is the intended input.
sess = find_num_sess(abs_list, poster_beg_ind)
sess
# unfinished idea: build the header text of the first session from the result
#'Poster Session ' + sess[0]



''

In [41]:
# Pull one abstract out of abs_list: it spans from its own label line
# (section prefix + number) up to, but excluding, the label of the next number.
cur_section = 'E '
cur_abs = 129
abs_beg_ind, abs_end_ind = (
    abs_list.index(cur_section + str(num)) for num in (cur_abs, cur_abs + 1)
)

# glue the abstract's lines into one string for display
' '.join(abs_list[abs_beg_ind:abs_end_ind])

'E 129 TEMPORAL PROPERTIES OF MONOCULAR AND DICHOPTIC CROWDING Paul F. Bulakowski, Robert B. Post, David Whitney; University of California, Davis — Identification of an object in the peripheral visual field is impaired when it is crowded by surrounding, nonoverlapping objects. Several studies have reported two unique characteristics of crowding. First, optimal crowding for binocularly presented stimuli occurs with very brief stimulus onset asynchronies (SOAs); thus  \x0cPerceptual Processes: Other  the crowding effect is temporally dependent. Second, crowding occurs when a target and its flankers are both presented to one eye (monocular presentation) or when each is presented to a different eye (dichoptic presentation). Thus, crowding may occur at a single stage in the visual system. To test this, we measured the temporal dependence of crowding when the target and flanker stimuli were presented monocularly or dichoptically. A method of constant stimuli task revealed that crowding varie