### Scraping abstract information
March 4, 2018
This notebook scrapes abstract text from:
- Proceedings of the Annual Cognitive Science Society meeting archive (html)
- Proceedings of Cognitive Neuroscience Society annual meeting (text converted from pdf)

Abstracts are then stored in a spreadsheet, containing information such as year, authors, title, and abstract.

# Scraping CNS abstracts from pdf-converted textfiles

This notebook scrapes abstract text from the Proceedings of the Annual Cognitive Science Society meeting archive (html). Abstracts are then stored in a spreadsheet, containing information such as year, authors, title, and abstract.

In [2]:
from bs4 import BeautifulSoup
import urllib
import re
import pandas as pd
import os

In [3]:
def scrape_CS(home_url, data_file):
    #connect to home page url for that year
    CSurl = urllib.request.urlopen(home_url).read()
    soup = BeautifulSoup(CSurl, 'html.parser')
    all_links = soup.find_all('a', attrs={'href': re.compile("papers/*")})    
    year = home_url[-5:-1]    
    
    # enumerate through all paper links
    for link_idx, link in enumerate(all_links):
        # get soup from paper url
        if home_url not in str(link['href']):
            url_text = home_url + str(link['href'])
        else:
            url_text = str(link['href'])
    
        url = urllib.request.urlopen(url_text).read()
        soup = BeautifulSoup(url, 'html.parser')
    
        # scrape & parse
        authors = []
        affl = []
        title = ' '.join(soup.find_all('h1')[0].text.split())
        # exception rule for 2014 abstracts
        if '2014' in home_url:            
            abstr = ' '.join(soup.find_all('blockquote')[1].text.split())
        else:            
            abstr = ' '.join(soup.find_all('p', {"id": "abstract"})[0].text.split())            
        
        soup.find_all('ul')
        for ana in soup.find_all('em'):
            affl.append('>'+ana.text)
            if '2014' in home_url:
                # somebody fucked something up in 2014
                authors.append('>' + ana.previous_element.previous_element.split(',')[0])
            else:            
                authors.append('>' + ana.previous_element.split(',')[0])
        
        # do some gymnastics to get it into a pandas df and add as a row to CSV
        new_row = {'Year': str(year), 'Title': title,'Abstract': abstr,'Authors': ''.join(authors),'Affiliations': ''.join(affl), 'URL': url_text}
        df_cur = pd.Series(data=new_row).to_frame().T[['Year','Title','Abstract','Authors','Affiliations','URL']]
        df_cur.to_csv(data_file, mode='a', header=False, index=False)


### gather CNS abstracts from text to csv

In [5]:
data_folder = '../data/raw/CNS_programs_txt/'
os.listdir(data_folder)
CNS_files = sorted([f for f in os.listdir(data_folder) if ('CNS' in f) and ('.txt' in f)])[:-1]#not include 2017 forget why though
print(CNS_files)

['CNS_2007_Program.txt', 'CNS_2008_Program.txt', 'CNS_2009_Program.txt', 'CNS_2010_Program.txt', 'CNS_2011_Program.txt', 'CNS_2012_Program.txt', 'CNS_2013_Program.txt', 'CNS_2014_Program.txt', 'CNS_2015_Program.txt', 'CNS_2016_Program.txt']


In [6]:
CNS_file=[]
start_sect=[]
beg_ind=[]
end_ind=[]

for i in range(len(CNS_files)):
    
    CNS_file.append(CNS_files[i])
    
    #print(CNS_files[i])
    file = open(data_folder+CNS_files[i], 'r')
    data = file.read()
    data_list = data.split('\n')
    abs_start = [ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d][0]#tells you the ind where the first abstract is?
    
    start_sect.append([ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d])
    #print([ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d])
    
    abs_list = data_list[abs_start:]
    poster_beg_ind = next((ind for ind,s in enumerate(abs_list) if '\x0cPoster Session A' == s), None)    
    poster_end_ind = next((ind for ind,s in enumerate(abs_list) if '\x0cAuthor Index' == s), None)
    
    
    beg_ind.append(poster_beg_ind)
    
    end_ind.append(poster_end_ind)
    
    
    #print(poster_beg_ind, poster_end_ind)

In [7]:
file = open(data_folder+CNS_files[0], 'r')
data = file.read()
data_list = data.split('\n')
abs_start = [ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d][0]#tells you the ind where the first abstract is?

In [8]:
cns_frame=pd.DataFrame({'CNS_file':CNS_file,'start_sect':start_sect,'beg_ind':beg_ind,'end_ind':end_ind})
cns_frame

Unnamed: 0,CNS_file,beg_ind,end_ind,start_sect
0,CNS_2007_Program.txt,0,29493,[1456]
1,CNS_2008_Program.txt,0,30363,"[1538, 1748, 1969, 2189, 2414, 2647, 2884, 311..."
2,CNS_2009_Program.txt,0,23882,"[2181, 2395, 2613, 2834, 3076, 3310, 3545, 379..."
3,CNS_2010_Program.txt,0,31628,"[3236, 3472, 3718, 3974, 4229, 4460, 4694, 495..."
4,CNS_2011_Program.txt,0,24580,"[3326, 3420, 3667, 3907, 4148, 4394, 4628, 487..."
5,CNS_2012_Program.txt,0,24793,"[3516, 3610, 3848, 4113, 4353, 4587, 4836, 507..."
6,CNS_2013_Program.txt,0,28381,"[2881, 3128, 3372, 3626, 3852, 4106, 4348, 459..."
7,CNS_2014_Program.txt,0,28084,"[3137, 3250, 3505, 3753, 4009, 4253, 4493, 475..."
8,CNS_2015_Program.txt,0,23909,"[3350, 3588, 3829, 4068, 4322, 4566, 4823, 506..."
9,CNS_2016_Program.txt,0,28305,"[3593, 3823, 4072, 4313, 4566, 4827, 5087, 533..."


So it looks like the sections in 2007 are messed up until F for numbering the 10's

EX:
A 10(doesnt work)
B 10(doesnt work)
...
F10(works)

and it's all of the numbers until F


In [9]:
sect_abs=[]
abst=[]
title=[]
auth=[]
CNS_Year=[]

let_vec = ['A','B','C','D','E','F','G','H','I','J']
for x in range(len(CNS_files)):

    
    file = open(data_folder+CNS_files[x], 'r')# looks like this is the place where we call the specific year to parse
    data = file.read()
    data_list = data.split('\n')
    abs_start = [ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d][0]#tells you the ind where the first abstract is?
    abs_list = data_list[abs_start:]
    poster_beg_ind = data_list.index('\x0cPoster Session A')

    



    for j in range(0,len(let_vec)):

        #print(j)

        for i in range(0,200):

            try:
                cur_section= let_vec[j]
                cur_abs=i
                abs_beg_ind = abs_list.index(cur_section+'%i'%cur_abs)
                abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+1))
                last_index = cur_section+'%i'%(cur_abs+1)
                #print(cur_abs)

                section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
                
                start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
                start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed

                whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
                title_sect=section_abst[0:start_abst]#the title and author combined
                title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
                title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
                length_title_lst=' '.join(title_lst)
                remove_start_string=length_title_lst.replace(start_string,"")
                auth_sect=section_abst[len(length_title_lst):start_abst]



                sect_abs.append(cur_section+'%i'%cur_abs)
                abst.append(whole_abs)
                title.append(remove_start_string)
                
                auth.append(auth_sect)
                CNS_Year.append(CNS_file[x])


                last_good = i
            except ValueError:

                for k in range(2,51):
                    try:
                        #print('k = ',k)
                        #print(cur_section+'%i'%(cur_abs+k))
                        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+k))

                        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
                        #     print(section_abst)
                        #     print(cur_section+'%i'%(cur_abs+1))
                        #     print(i, abs_beg_ind, abs_end_ind)
                        start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
                        start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed

                        whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
                        title_sect=section_abst[0:start_abst]#the title and author combined
                        title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
                        title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
                        length_title_lst=' '.join(title_lst)
                        remove_start_string=length_title_lst.replace(start_string,"")
                        auth_sect=section_abst[len(length_title_lst):start_abst]


                        sect_abs.append(cur_section+'%i'%cur_abs)
                        abst.append(whole_abs)
                        title.append(remove_start_string)
                        
                        auth.append(auth_sect)
                        CNS_Year.append(CNS_file[x])


                        break
                    except:
                        #print('except')
                        pass


                    else:
                        #print('reached else')
                        pass

                #print('last index =',last_index)



    
            except:

                                abs_beg_ind = abs_list.index(last_index)
                                nxt = cur_section + 1
                                abs_end_ind = abs_list.index(let_vec[j+1]+'%i'%nxt)

                                section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
                                #     print(section_abst)
                                #     print(cur_section+'%i'%(cur_abs+1))
                                #     print(i, abs_beg_ind, abs_end_ind)
                                start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
                                start_string=last_index #the F4 like number at the beg of each title that isnt needed

                                whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
                                title_sect=section_abst[0:start_abst]#the title and author combined
                                title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
                                title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
                                length_title_lst=' '.join(title_lst)
                                remove_start_string=length_title_lst.replace(start_string,"")
                                auth_sect=section_abst[len(length_title_lst):start_abst]


                                sect_abs.append(last_index)
                                abst.append(whole_abs)
                                title.append(remove_start_string)
                                #auth.append(title_sect) old wrong way of getting author
                                auth.append(auth_sect)
                                CNS_Year.append(CNS_file[x])
                                last_index
                        



            else:
                    pass



    print(last_good)

147
131
103
126
126
99
135
150
131
174


In [10]:
try_table=pd.DataFrame({'sect_abs':sect_abs,'abst':abst,'title':title,'auth':auth, 'CNS_Year':CNS_Year})
try_table


Unnamed: 0,CNS_Year,abst,auth,sect_abs,title
0,CNS_2007_Program.txt,Recent models of attention in typically develo...,"lary Gomes1, Martin Duff1, Virginia Wolfson1, ...",A1,ERP MEASURES OF AUDITORY SELECTIVE ATTENTION ...
1,CNS_2007_Program.txt,Unilateral neglect after right cerebral stroke...,"Georg Kerkhoff1,2, Christian Groh-Bordin1, In...",A2,BENEFITS OF OPTOKINETIC STIMULATION IN PATIEN...
2,CNS_2007_Program.txt,People easily extract the embedded metrical st...,"ung1, Denise H. Wu2, Daisy L. Hung1,2, Ovid J....",A3,WHEN ABSTRACT KNOWLEDGE MEETS PERCEPTUAL GROU...
3,CNS_2007_Program.txt,To investigate the impact of attention on proc...,"Yuliya Yoncheva, Jason Zevin, Urs Maurer, Bru...",A4,LEFT-LATERALIZED MODULATIONS OF INFERIOR FRON...
4,CNS_2007_Program.txt,Abstract stimulus features are encoded in sens...,"Alexandra Bendixen, Erich Schröger; Universit...",A5,EXTRACTION OF ABSTRACT REGULARITIES FROM DYNA...
5,CNS_2007_Program.txt,This experiment investigated brain responses i...,"ChunYu Tse, Kathy Low, Jason Agran, Guadalupe...",A6,SEQUENTIAL TONE DISCRIMINATION WITH IMPLICIT ...
6,CNS_2007_Program.txt,Spatially selective attention allows individua...,"Lisa Sanders, Lori Astheimer; University of M...",A7,EVENT RELATED POTENTIAL EVIDENCE OF RAPIDLY M...
7,CNS_2007_Program.txt,"In crossmodal spatial cueing (Posner, M., 1980...","Maja Trenner1, Markus Bauer2, Rüdiger Wenzel1...",A8,WHAT HAPPENS IN BETWEEN? MODULATIONS OF CORTI...
8,CNS_2007_Program.txt,Disruption of attention is a hallmark symptom ...,"n Wood1, Geoffrey Potts1, Laura Martin1, Delia...",B0,A9 DISRUPTION OF AUDITORY AND VISUAL ATTENTION...
9,CNS_2007_Program.txt,Decision-making is a fundamental capacity whic...,"Bruno Kopp, Sandra Tabeling, Carsten Moschner...",B1,NEUROCOGNITIVE MECHANISMS OF PERCEPTUAL DECIS...


In [9]:
try_table.to_csv("CNS_SCRAPED_DI.csv")