# Scraping CogSci abstracts from the web

This notebook scrapes abstracts from the online (HTML) archive of the Proceedings of the Annual Meeting of the Cognitive Science Society. Each abstract is then stored as one row of a spreadsheet, containing the year, title, abstract text, authors, affiliations, and URL.

In [5]:
import os
import re
import urllib
import urllib.parse
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup
from numpy import sort

In [33]:
def _fetch_soup(url):
    """Download ``url`` and return it parsed by BeautifulSoup's 'html.parser'."""
    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    return BeautifulSoup(html, 'html.parser')


def _parse_paper(soup, is_2014):
    """Return (title, abstract, authors, affiliations) strings for one paper page.

    ``is_2014`` selects the alternative page layout used by the 2014 proceedings.
    """
    # ' '.join(text.split()) collapses every run of whitespace to a single space.
    title = ' '.join(soup.find_all('h1')[0].text.split())

    if is_2014:
        # 2014 pages: the abstract is taken from the second <blockquote>.
        abstract_tag = soup.find_all('blockquote')[1]
    else:
        abstract_tag = soup.find_all('p', {"id": "abstract"})[0]
    abstr = ' '.join(abstract_tag.text.split())

    authors = []
    affl = []
    # Each <em> holds an affiliation; the author name is read from the node(s)
    # immediately preceding it in document order.
    for ana in soup.find_all('em'):
        affl.append('>' + ana.text)
        name_node = ana.previous_element
        if is_2014:
            # 2014 pages need one extra step back to reach the author name.
            name_node = name_node.previous_element
        # NOTE(review): assumes name_node is a text node (str-like); a Tag here
        # would make .split fail -- confirm against the archive's markup.
        authors.append('>' + name_node.split(',')[0])

    return title, abstr, ''.join(authors), ''.join(affl)


def scrape_CS(home_url):
    """Scrape every paper linked from one year's proceedings home page.

    Parameters
    ----------
    home_url : str
        URL of the proceedings home page for one year, e.g.
        'https://mindmodeling.org/cogsci2010/'. The year is read from the
        four digits at the end of the URL (a trailing slash is optional).

    Returns
    -------
    pandas.DataFrame
        One row per paper with the columns 'Year', 'Title', 'Abstract',
        'Authors', 'Affiliations' and 'URL'. Authors and affiliations are
        each a single string in which every entry is prefixed with '>'.

    Raises
    ------
    urllib.error.URLError
        If the home page or a paper page cannot be downloaded.
    IndexError
        If a paper page lacks the expected title or abstract element.
    """
    columns = ['Year', 'Title', 'Abstract', 'Authors', 'Affiliations', 'URL']

    # Relative paper links must be resolved against a directory-style base URL.
    base_url = home_url if home_url.endswith('/') else home_url + '/'

    # Take the trailing four-digit year; fall back to the fixed slice if the
    # URL does not end in four digits.
    year_match = re.search(r'(\d{4})/?$', home_url)
    year = year_match.group(1) if year_match else home_url[-5:-1]

    is_2014 = '2014' in home_url

    index_soup = _fetch_soup(home_url)
    # As a regex, "papers/*" means 'papers' followed by zero or more slashes,
    # so this selects every link whose href contains 'papers'.
    all_links = index_soup.find_all('a', attrs={'href': re.compile("papers/*")})

    # Collect plain dict records and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    records = []
    for link in all_links:
        # urljoin leaves absolute hrefs untouched and resolves relative ones.
        url_text = urllib.parse.urljoin(base_url, str(link['href']))
        title, abstr, authors, affl = _parse_paper(_fetch_soup(url_text), is_2014)
        records.append({
            'Year': str(year),
            'Title': title,
            'Abstract': abstr,
            'Authors': authors,
            'Affiliations': affl,
            'URL': url_text,
        })

    return pd.DataFrame(records, columns=columns)

# Iterate over years
Scrape each year and save it to its own CSV file, then combine all years into a single CSV.

In [2]:
# Folder holding the raw per-year CogSci CSV files (a relative path, so it is
# resolved against the notebook's working directory).
cogsci_rawfolder = '../data/raw/CogSci/'

In [37]:
# Scrape each proceedings year (2010-2017) and write it to its own CSV file.
years = range(2010,2018)
base_url = 'https://mindmodeling.org/cogsci'

# Create the output folder up front: otherwise to_csv would fail only after
# a whole year had already been scraped, throwing that work away.
os.makedirs(cogsci_rawfolder, exist_ok=True)

for year in years:
    home_url = base_url+str(year)+'/'
    print(home_url)
    df_year = scrape_CS(home_url)
    df_year.to_csv(os.path.join(cogsci_rawfolder, 'cogsci_'+str(year)+'.csv'), index=False)
    
    

https://mindmodeling.org/cogsci2010/


KeyboardInterrupt: 

In [8]:
# Combine the per-year CSV files into one table, in sorted file-name order.
year_frames = []
for csv_name in sorted(os.listdir(cogsci_rawfolder)):
    if '.csv' in csv_name:
        print(csv_name)
        year_frames.append(pd.read_csv(os.path.join(cogsci_rawfolder, csv_name)))

# Concatenate once: DataFrame.append was removed in pandas 2.0, and growing a
# frame inside a loop copies all accumulated rows on every iteration.
# pd.concat raises on an empty list, so fall back to an empty frame.
df_cogsci = pd.concat(year_frames, ignore_index=True) if year_frames else pd.DataFrame()

# The row index is written too (pandas' default), as in the original output file.
df_cogsci.to_csv('../data/abstracts_cogsci_combined.csv')

cogsci_2010.csv
cogsci_2011.csv
cogsci_2012.csv
cogsci_2013.csv
cogsci_2014.csv
cogsci_2015.csv
cogsci_2016.csv
cogsci_2017.csv
