# Scraping NIPS abstracts from the web

This notebook scrapes abstract text from the HTML archive of the NIPS proceedings. The abstracts are then stored in CSV files, one row per paper, together with information such as year, authors, title, and URL.

https://papers.nips.cc/

There is also a Kaggle dataset of the NIPS proceedings, but for some reason the CSV looked badly garbled after I downloaded it: https://www.kaggle.com/benhamner/nips-papers#papers.csv

So I ended up scraping from NIPS myself anyway. Abstracts were all missing before 2008 so data starts from 2008.

In [8]:
import os
import re
import urllib
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup
from numpy import sort

In [94]:
def _fetch_soup(url):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree."""
    # The context manager closes the HTTP response even if read() raises.
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response.read(), 'html.parser')


def scrape_NIPS(home_url):
    """Scrape every paper linked from one proceedings index page.

    Parameters
    ----------
    home_url : str
        URL of the index page for a single year. Its last four characters
        (ignoring any trailing slash) are recorded as the year.

    Returns
    -------
    pandas.DataFrame
        One row per paper link, with the columns 'Year', 'Title', 'Abstract',
        'Authors', 'Affiliations' and 'URL'. The frame has these columns even
        when the index page contains no paper links.

    Raises
    ------
    urllib.error.URLError
        If the index page or any paper page cannot be fetched.
    """
    columns = ['Year', 'Title', 'Abstract', 'Authors', 'Affiliations', 'URL']
    # Tolerate a trailing slash so the year is still the final four digits.
    year = home_url.rstrip('/')[-4:]

    index_soup = _fetch_soup(home_url)
    # Require the slash after "paper"; the previous pattern "/paper/*" made it
    # optional and therefore matched any href merely containing "/paper".
    paper_links = index_soup.find_all('a', attrs={'href': re.compile(r"/paper/")})

    # Collect plain dicts and build the frame once at the end; growing a
    # DataFrame row by row copies it on every iteration.
    rows = []
    for link in paper_links:
        url_text = 'https://papers.nips.cc' + str(link['href'])
        paper_soup = _fetch_soup(url_text)

        # Collapse all runs of whitespace (including newlines) to one space.
        title_tag = paper_soup.find('title')
        title = ' '.join(title_tag.text.split()) if title_tag is not None else ''

        # A page without an abstract paragraph is treated like the explicit
        # 'Abstract Missing' placeholder: the abstract is stored as ''.
        abstr_tag = paper_soup.find('p', {"class": "abstract"})
        abstr = ' '.join(abstr_tag.text.split()) if abstr_tag is not None else ''
        if abstr == 'Abstract Missing':
            abstr = ''

        # Each author name is prefixed with '>' so the joined string can be
        # split back into individual names.
        authors = ['>' + au['content']
                   for au in paper_soup.find_all('meta', {"name": "citation_author"})]

        rows.append({
            'Year': str(year),
            'Title': title,
            'Abstract': abstr,
            'Authors': ''.join(authors),
            'Affiliations': ' ',  # placeholder; nothing is scraped for this column
            'URL': url_text,
        })

    return pd.DataFrame(rows, columns=columns)

# Iterate over years
Scrape and save to individual csvs, then combine.

In [2]:
# Folder holding the per-year CSV files; the path is relative to the working directory.
nips_rawfolder = '../data/raw/NIPS/'

In [95]:
# Scrape each year's proceedings and save the result as its own CSV file.
years = range(2008,2019) # abstracts are in html starting from 2008
base_url = 'https://papers.nips.cc/book/advances-in-neural-information-processing-systems-%i-%i'

# to_csv does not create missing directories, so make sure the folder exists.
os.makedirs(nips_rawfolder, exist_ok=True)

for y_i, year in enumerate(years):
    # The volume number in the URL advances with the year: volume 21 is 2008.
    home_url = base_url%(y_i+21, year)
    print(home_url)
    df_year = scrape_NIPS(home_url)
    # os.path.join works whether or not nips_rawfolder ends with a slash.
    df_year.to_csv(os.path.join(nips_rawfolder, 'nips_%i.csv' % year), index=False)

https://papers.nips.cc/book/advances-in-neural-information-processing-systems-21-2008
https://papers.nips.cc/book/advances-in-neural-information-processing-systems-22-2009
https://papers.nips.cc/book/advances-in-neural-information-processing-systems-23-2010
https://papers.nips.cc/book/advances-in-neural-information-processing-systems-24-2011
https://papers.nips.cc/book/advances-in-neural-information-processing-systems-25-2012
https://papers.nips.cc/book/advances-in-neural-information-processing-systems-26-2013
https://papers.nips.cc/book/advances-in-neural-information-processing-systems-27-2014
https://papers.nips.cc/book/advances-in-neural-information-processing-systems-28-2015
https://papers.nips.cc/book/advances-in-neural-information-processing-systems-29-2016
https://papers.nips.cc/book/advances-in-neural-information-processing-systems-30-2017
https://papers.nips.cc/book/advances-in-neural-information-processing-systems-31-2018


In [12]:
# Combine the per-year CSV files into a single table, in filename order.
yearly_frames = []
for csv_name in sorted(os.listdir(nips_rawfolder)):
    # Match on the extension only; a substring test would also pick up
    # names such as 'nips_2008.csv.bak'.
    if csv_name.endswith('.csv'):
        print(csv_name)
        df_year = pd.read_csv(os.path.join(nips_rawfolder, csv_name))
        yearly_frames.append(df_year)

# Concatenate once instead of appending inside the loop; pd.concat raises on
# an empty list, so fall back to an empty frame when no CSV files were found.
if yearly_frames:
    df_nips = pd.concat(yearly_frames, ignore_index=True)
else:
    df_nips = pd.DataFrame()

# The row index is written as the first (unnamed) column of the combined file.
df_nips.to_csv('../data/abstracts_nips_combined.csv')

nips_2008.csv
nips_2009.csv
nips_2010.csv
nips_2011.csv
nips_2012.csv
nips_2013.csv
nips_2014.csv
nips_2015.csv
nips_2016.csv
nips_2017.csv
nips_2018.csv
