### Scraping abstract information
March 4, 2018
This notebook scrapes abstract text from:
- Proceedings of the Annual Meeting of the Cognitive Science Society, from the online archive (HTML)
- Proceedings of the Cognitive Neuroscience Society annual meeting (text converted from PDF)

Abstracts are then stored in a spreadsheet (CSV file) with fields such as year, authors, title, and abstract text.

In [2]:
%matplotlib inline
import csv
import os
import re
import string
import sys
import urllib
import urllib.parse
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [9]:
def scrape_CS(home_url, data_file):
    """Scrape every paper page linked from one CogSci proceedings home page.

    Each link on ``home_url`` whose href matches the pattern ``papers/*`` is
    fetched; the paper's title, abstract, authors and affiliations are parsed
    out of the page and appended to ``data_file`` as one CSV row.

    Parameters
    ----------
    home_url : str
        Home page (directory) URL for one conference year, e.g.
        ``'https://mindmodeling.org/cogsci2017/'``. The year is read from the
        four digits at the end of the URL (a trailing slash is optional).
    data_file : str
        Path of the CSV file to append to. Rows are written without a header,
        in the column order Year, Title, Abstract, Authors, Affiliations, URL.

    Returns
    -------
    None

    Raises
    ------
    urllib.error.URLError
        If the home page or a paper page cannot be fetched.
    IndexError
        If a paper page lacks the expected title or abstract element.
    """
    columns = ['Year', 'Title', 'Abstract', 'Authors', 'Affiliations', 'URL']
    # 2014 pages use a different layout and need their own parsing rules
    is_2014 = '2014' in home_url

    # Relative hrefs must be resolved against the directory URL, so make sure
    # the base ends with a slash before joining.
    base_url = home_url if home_url.endswith('/') else home_url + '/'

    # Take the year from the trailing four digits rather than a fixed slice,
    # so a missing trailing slash does not shift the result. Fall back to the
    # historical slice if the URL does not end in a year.
    year_match = re.search(r'(\d{4})/?$', home_url)
    year = year_match.group(1) if year_match else home_url[-5:-1]

    # connect to home page url for that year; the context manager closes the
    # HTTP response once it has been read
    with urllib.request.urlopen(home_url) as response:
        home_soup = BeautifulSoup(response.read(), 'html.parser')

    # NOTE: "papers/*" matches 'papers' followed by zero or more slashes, i.e.
    # any href containing 'papers'. Left unchanged so the set of scraped links
    # stays the same.
    all_links = home_soup.find_all('a', attrs={'href': re.compile("papers/*")})

    for link in all_links:
        # urljoin handles relative, root-relative and absolute hrefs alike
        url_text = urllib.parse.urljoin(base_url, str(link['href']))

        with urllib.request.urlopen(url_text) as response:
            soup = BeautifulSoup(response.read(), 'html.parser')

        # scrape & parse; ' '.join(x.split()) collapses runs of whitespace
        title = ' '.join(soup.find_all('h1')[0].text.split())
        if is_2014:
            # exception rule for 2014 abstracts: text is in the second blockquote
            abstr = ' '.join(soup.find_all('blockquote')[1].text.split())
        else:
            abstr = ' '.join(soup.find_all('p', {"id": "abstract"})[0].text.split())

        authors = []
        affl = []
        for ana in soup.find_all('em'):
            # each <em> holds an affiliation; the author name precedes it
            affl.append('>' + ana.text)
            if is_2014:
                # 2014 pages: the name sits two elements before the <em>
                # (presumably extra markup in between -- verify on a sample page)
                name_node = ana.previous_element.previous_element
            else:
                name_node = ana.previous_element
            authors.append('>' + name_node.split(',')[0])

        # authors / affiliations are stored as '>'-prefixed, concatenated strings
        new_row = {'Year': str(year), 'Title': title, 'Abstract': abstr,
                   'Authors': ''.join(authors), 'Affiliations': ''.join(affl),
                   'URL': url_text}
        # one-row frame with a fixed column order, appended to the CSV
        df_cur = pd.DataFrame([new_row], columns=columns)
        df_cur.to_csv(data_file, mode='a', header=False, index=False)


In [10]:
# get all paper links from cogsci conference
# CogSci proceedings home pages, one per conference year, newest (2017) first
home_urls = ['https://mindmodeling.org/cogsci%d/' % conf_year
             for conf_year in range(2017, 2009, -1)]

# Scrape every year in turn; all rows are appended to the same CSV file.
# Note: despite its name, `year` holds the full home-page URL.
for year in home_urls:
    print(year)  # progress marker
    scrape_CS(year, '../data/cogsci_abstracts.csv')

https://mindmodeling.org/cogsci2017/
https://mindmodeling.org/cogsci2016/
https://mindmodeling.org/cogsci2015/
https://mindmodeling.org/cogsci2014/
https://mindmodeling.org/cogsci2013/
https://mindmodeling.org/cogsci2012/
https://mindmodeling.org/cogsci2011/
https://mindmodeling.org/cogsci2010/


### Gather CNS abstracts from text into CSV

In [71]:
# Folder holding the CNS program booklets as plain text
data_folder = '../data/CNS_programs/'
# Program text files (name contains both 'CNS' and '.txt'), sorted by file name
CNS_files = sorted([f for f in os.listdir(data_folder) if ('CNS' in f) and ('.txt' in f)])

# Read the first program in full; the context manager closes the file handle.
# NOTE(review): uses the platform default encoding -- confirm the files' encoding.
with open(data_folder+CNS_files[0], 'r') as file:
    data = file.read()

In [159]:
# Break the program text into individual lines
data_list = data.split('\n')

# The section of interest begins at the LAST line that mentions this marker
abs_start = [pos for pos, line in enumerate(data_list)
             if 'Graduate Students Present Abstracts' in line][-1]
abs_list = data_list[abs_start:]

# Positions (relative to abs_list) of the first lines that are exactly these
# headings once surrounding whitespace is stripped; None when a heading is absent
poster_beg_ind = next(
    (pos for pos, line in enumerate(abs_list) if line.strip() == 'Poster Session A'),
    None,
)
poster_end_ind = next(
    (pos for pos, line in enumerate(abs_list) if line.strip() == 'Author Index'),
    None,
)
print(poster_beg_ind, poster_end_ind)

130 29623


In [160]:
def find_num_sess(abs_list, poster_beg_ind):
    """Return the letters of the poster sessions listed in the schedule section.

    The schedule section is ``abs_list[:poster_beg_ind]``. Uppercase letters
    are checked in alphabetical order, and the run of letters from 'A' up to
    (but not including) the first one missing from the schedule is returned.
    A letter counts as present only when it is an element of the schedule,
    i.e. when some schedule line equals that letter exactly.

    Parameters
    ----------
    abs_list : list of str
        Lines of the program text.
    poster_beg_ind : int or None
        Index in ``abs_list`` where the schedule section ends; ``None`` means
        the whole list is treated as the schedule.

    Returns
    -------
    str
        Consecutive session letters starting at 'A'; '' when 'A' is absent,
        and the full alphabet when all 26 letters are present.
    """
    # find the number of poster sessions from the schedule section
    sched = abs_list[:poster_beg_ind]
    for ind, letter in enumerate(string.ascii_uppercase):
        if letter not in sched:
            return string.ascii_uppercase[:ind]
    # Every letter was found: return them all instead of falling through to None
    return string.ascii_uppercase

# Letters of the poster sessions found in the schedule section
sess = find_num_sess(abs_list=abs_list, poster_beg_ind=poster_beg_ind)

# Heading text of the first poster session, shown as the cell output
'Poster Session ' + sess[0]



'Poster Session A'

In [116]:
# Peek at the ten lines of abs_list starting at index A1[0].
# NOTE(review): A1 is not defined in any cell visible here, and this cell's
# execution count (In [116]) is lower than that of the cells above it --
# confirm this still runs after Restart Kernel -> Run All.
abs_list[A1[0]:A1[0]+10]

['A2',
 'BENEFITS OF OPTOKINETIC STIMULATION IN PATIENTS WITH',
 'AUDITORY AND VISUAL NEGLECT: TRANSIENT AND',
 'PERMANENT EFFECTS Georg Kerkhoff1,2, Christian Groh-Bordin1, Ingo',
 'Keller3, Vera Ritter2, Frank Artinger4, Wolfram Ziegler2; 1Saarland University,',
 'Saarbruecken, Germany, 2Clinical Neuropsychology Research Group, MunichBogenhausen, Germany, 3Neurological Clinic Bad Aibling, Germany,',
 '4University of Applied Sciences, Karlsruhe, Germany — Unilateral',
 'neglect',
 'after right cerebral stroke involves visual and auditory impairments in orientation and exploration of contralesional stimuli. Several treatments –',
 'mostly focussing on visual neglect - have been proposed: prism adaptation, pharmacological treatments and optokinetic stimulation (OKS).']