### Scraping abstract information
March 4, 2018
This notebook scrapes abstract text from:
- Proceedings of the Annual Cognitive Science Society meeting archive (html)
- Proceedings of Cognitive Neuroscience Society annual meeting (text converted from pdf)

Abstracts are then stored in a spreadsheet, containing information such as year, authors, title, and abstract.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import csv
from bs4 import BeautifulSoup
import urllib
import re
import string
import pandas as pd
import os
import sys

In [2]:
def scrape_CS(home_url, data_file):
    #connect to home page url for that year
    CSurl = urllib.request.urlopen(home_url).read()
    soup = BeautifulSoup(CSurl, 'html.parser')
    all_links = soup.find_all('a', attrs={'href': re.compile("papers/*")})    
    year = home_url[-5:-1]    
    
    # enumerate through all paper links
    for link_idx, link in enumerate(all_links):
        # get soup from paper url
        if home_url not in str(link['href']):
            url_text = home_url + str(link['href'])
        else:
            url_text = str(link['href'])
    
        url = urllib.request.urlopen(url_text).read()
        soup = BeautifulSoup(url, 'html.parser')
    
        # scrape & parse
        authors = []
        affl = []
        title = ' '.join(soup.find_all('h1')[0].text.split())
        # exception rule for 2014 abstracts
        if '2014' in home_url:            
            abstr = ' '.join(soup.find_all('blockquote')[1].text.split())
        else:            
            abstr = ' '.join(soup.find_all('p', {"id": "abstract"})[0].text.split())            
        
        soup.find_all('ul')
        for ana in soup.find_all('em'):
            affl.append('>'+ana.text)
            if '2014' in home_url:
                # somebody fucked something up in 2014
                authors.append('>' + ana.previous_element.previous_element.split(',')[0])
            else:            
                authors.append('>' + ana.previous_element.split(',')[0])
        
        # do some gymnastics to get it into a pandas df and add as a row to CSV
        new_row = {'Year': str(year), 'Title': title,'Abstract': abstr,'Authors': ''.join(authors),'Affiliations': ''.join(affl), 'URL': url_text}
        df_cur = pd.Series(data=new_row).to_frame().T[['Year','Title','Abstract','Authors','Affiliations','URL']]
        df_cur.to_csv(data_file, mode='a', header=False, index=False)


In [None]:
# get all paper links from cogsci conference
home_urls = ['https://mindmodeling.org/cogsci2017/',
             'https://mindmodeling.org/cogsci2016/',
             'https://mindmodeling.org/cogsci2015/',
             'https://mindmodeling.org/cogsci2014/',
             'https://mindmodeling.org/cogsci2013/',
             'https://mindmodeling.org/cogsci2012/',
             'https://mindmodeling.org/cogsci2011/',
             'https://mindmodeling.org/cogsci2010/']

for year in home_urls:
    # scrape all
    print(year)
    scrape_CS(home_url=year, data_file='../data/cogsci_abstracts.csv')

### gather CNS abstracts from text to csv

In [3]:
data_folder = '../data/CNS_programs/'
os.listdir(data_folder)
CNS_files = sorted([f for f in os.listdir(data_folder) if ('CNS' in f) and ('.txt' in f)])[:-1]
print(CNS_files)

['CNS_2007_Program.txt', 'CNS_2008_Program.txt', 'CNS_2009_Program.txt', 'CNS_2010_Program.txt', 'CNS_2011_Program.txt', 'CNS_2012_Program.txt', 'CNS_2013_Program.txt', 'CNS_2014_Program.txt', 'CNS_2015_Program.txt', 'CNS_2016_Program.txt']


In [4]:
#col_names = ['CNS_file','start_sect','beg_ind','end_ind']

#reffering to the abs_list problem encountered in journal on 8/2/2018. I think this needs to be on the outside of the loop so that it can catch each abs_list year change


CNS_file=[]
start_sect=[]
beg_ind=[]
end_ind=[]

for i in range(len(CNS_files)):
    
    CNS_file.append(CNS_files[i])
    
    #print(CNS_files[i])
    file = open(data_folder+CNS_files[i], 'r')#the 8 is supposed to be "i" but I think that it is grabbing the last one and using that to make abs list each time 
    data = file.read()
    data_list = data.split('\n')
    abs_start = [ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d][0]#tells you the ind where the first abstract is?
    
    start_sect.append([ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d])
    #print([ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d])
    
    abs_list = data_list[abs_start:]
    poster_beg_ind = next((ind for ind,s in enumerate(abs_list) if '\x0cPoster Session A' == s), None)    
    poster_end_ind = next((ind for ind,s in enumerate(abs_list) if '\x0cAuthor Index' == s), None)
    
    
    beg_ind.append(poster_beg_ind)
    
    end_ind.append(poster_end_ind)
    
    
    #print(poster_beg_ind, poster_end_ind)

In [5]:
cns_frame=pd.DataFrame({'CNS_file':CNS_file,'start_sect':start_sect,'beg_ind':beg_ind,'end_ind':end_ind})
cns_frame

Unnamed: 0,CNS_file,beg_ind,end_ind,start_sect
0,CNS_2007_Program.txt,0,29493,[1456]
1,CNS_2008_Program.txt,0,30363,"[1538, 1748, 1969, 2189, 2414, 2647, 2884, 311..."
2,CNS_2009_Program.txt,0,23882,"[2181, 2395, 2613, 2834, 3076, 3310, 3545, 379..."
3,CNS_2010_Program.txt,0,31628,"[3236, 3472, 3718, 3974, 4229, 4460, 4694, 495..."
4,CNS_2011_Program.txt,0,24580,"[3326, 3420, 3667, 3907, 4148, 4394, 4628, 487..."
5,CNS_2012_Program.txt,0,24793,"[3516, 3610, 3848, 4113, 4353, 4587, 4836, 507..."
6,CNS_2013_Program.txt,0,28381,"[2881, 3128, 3372, 3626, 3852, 4106, 4348, 459..."
7,CNS_2014_Program.txt,0,28084,"[3137, 3250, 3505, 3753, 4009, 4253, 4493, 475..."
8,CNS_2015_Program.txt,0,23909,"[3350, 3588, 3829, 4068, 4322, 4566, 4823, 506..."
9,CNS_2016_Program.txt,0,28305,"[3593, 3823, 4072, 4313, 4566, 4827, 5087, 533..."


In [33]:
#this cell specifically needs to be copied and pasted for every journal and needs to go first because all the rest of the cells use it to initialze abs_list
sect_abs=[]
abst=[]
title=[]
auth=[]
i=1
letter_start_ind = data_list.index('\x0cPoster Session A')
letter_end_ind = data_list.index('\x0cPoster Session B')
a_b_range= range(letter_start_ind,letter_end_ind)
a_b_range=list(a_b_range)
file = open(data_folder+CNS_files[5], 'r')# looks like this is the place where we call the specific year to parse
data = file.read()
data_list = data.split('\n')
abs_start = [ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d][0]#tells you the ind where the first abstract is?
abs_list = data_list[abs_start:]
poster_beg_ind = data_list.index('\x0cPoster Session A')
#while cur_section = 'A':#dont know how this cur_section string being A will work. thinking 
    #better idea being from poster_beg_ind = data_list.index('\x0cPoster Session A') 
    #to poster_beg_ind = data_list.index('\x0cPoster Session B') make this a range
#['A' 'B']
#for cur_section in 

last_index = 0
let_vec = ['A','B','C']

for j in let_vec:
    
    print(j)

    for i in range(0,200):

        try:
            cur_section= j
            cur_abs=i+1
            abs_beg_ind = abs_list.index(cur_section+'%i'%cur_abs)

            for k in range(2,51):
                try:
                    abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+1))
                    last_index = cur_section+'%i'%(cur_abs+1)
                    #print('index found')
                    
                    section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
                    #     print(section_abst)
                    #     print(cur_section+'%i'%(cur_abs+1))
                    #     print(i, abs_beg_ind, abs_end_ind)
                    start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
                    start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed

                    whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
                    title_sect=section_abst[0:start_abst]#the title and author combined
                    title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
                    title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
                    length_title_lst=' '.join(title_lst)
                    remove_start_string=length_title_lst.replace(start_string,"")
                    auth_sect=section_abst[len(length_title_lst):start_abst]



                    sect_abs.append(cur_section+'%i'%cur_abs)
                    abst.append(whole_abs)
                    title.append(remove_start_string)
                    #auth.append(title_sect) old wrong way of getting author
                    auth.append(auth_sect)
                    
                    
                    break

                except ValueError:
                    
                    print('k = ',k)
                    print(cur_section+'%i'%(cur_abs+k))
                    abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+k))
                    
                    section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
                    #     print(section_abst)
                    #     print(cur_section+'%i'%(cur_abs+1))
                    #     print(i, abs_beg_ind, abs_end_ind)
                    start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
                    start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed

                    whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
                    title_sect=section_abst[0:start_abst]#the title and author combined
                    title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
                    title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
                    length_title_lst=' '.join(title_lst)
                    remove_start_string=length_title_lst.replace(start_string,"")
                    auth_sect=section_abst[len(length_title_lst):start_abst]


                    sect_abs.append(cur_section+'%i'%cur_abs)
                    abst.append(whole_abs)
                    title.append(remove_start_string)
                    #auth.append(title_sect) old wrong way of getting author
                    auth.append(auth_sect)
                    
                    
                    break
                    
                except: 
                    pass
                    
                    
                else:
                    
                    pass

                print(last_index)



#         except:
#                     abs_beg_ind = abs_list.index(last_index)
#                     abs_end_ind = abs_list.index((j+1[1]))
                    
#                     section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
#                     #     print(section_abst)
#                     #     print(cur_section+'%i'%(cur_abs+1))
#                     #     print(i, abs_beg_ind, abs_end_ind)
#                     start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
#                     start_string=last_index #the F4 like number at the beg of each title that isnt needed

#                     whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
#                     title_sect=section_abst[0:start_abst]#the title and author combined
#                     title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
#                     title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
#                     length_title_lst=' '.join(title_lst)
#                     remove_start_string=length_title_lst.replace(start_string,"")
#                     auth_sect=section_abst[len(length_title_lst):start_abst]


#                     sect_abs.append(cur_section+'%i'%cur_abs)
#                     abst.append(whole_abs)
#                     title.append(remove_start_string)
#                     #auth.append(title_sect) old wrong way of getting author
#                     auth.append(auth_sect)
#             last_index
#             # letter + last index up to B1 
#             # section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract

#             pass

#         else:
#             pass


SyntaxError: unexpected EOF while parsing (<ipython-input-33-81cd90e6f368>, line 144)

In [34]:
#this cell specifically needs to be copied and pasted for every journal and needs to go first because all the rest of the cells use it to initialze abs_list
sect_abs=[]
abst=[]
title=[]
auth=[]
i=1
letter_start_ind = data_list.index('\x0cPoster Session A')
letter_end_ind = data_list.index('\x0cPoster Session B')
a_b_range= range(letter_start_ind,letter_end_ind)
a_b_range=list(a_b_range)
file = open(data_folder+CNS_files[5], 'r')# looks like this is the place where we call the specific year to parse
data = file.read()
data_list = data.split('\n')
abs_start = [ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d][0]#tells you the ind where the first abstract is?
abs_list = data_list[abs_start:]
poster_beg_ind = data_list.index('\x0cPoster Session A')
#while cur_section = 'A':#dont know how this cur_section string being A will work. thinking 
    #better idea being from poster_beg_ind = data_list.index('\x0cPoster Session A') 
    #to poster_beg_ind = data_list.index('\x0cPoster Session B') make this a range
#['A' 'B']
#for cur_section in 

last_index = 0
let_vec = ['A','B','C']

for j in let_vec:
    
    print(j)

    for i in range(0,200):

        try:
            cur_section= j
            cur_abs=i+1
            abs_beg_ind = abs_list.index(cur_section+'%i'%cur_abs)

            for k in range(2,51):
                try:
                    abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+1))
                    last_index = cur_section+'%i'%(cur_abs+1)
                    #print('index found')
                    
                    section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
                    #     print(section_abst)
                    #     print(cur_section+'%i'%(cur_abs+1))
                    #     print(i, abs_beg_ind, abs_end_ind)
                    start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
                    start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed

                    whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
                    title_sect=section_abst[0:start_abst]#the title and author combined
                    title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
                    title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
                    length_title_lst=' '.join(title_lst)
                    remove_start_string=length_title_lst.replace(start_string,"")
                    auth_sect=section_abst[len(length_title_lst):start_abst]



                    sect_abs.append(cur_section+'%i'%cur_abs)
                    abst.append(whole_abs)
                    title.append(remove_start_string)
                    #auth.append(title_sect) old wrong way of getting author
                    auth.append(auth_sect)
                    
                    
                    break

                except ValueError:
                    
                    print('k = ',k)
                    print(cur_section+'%i'%(cur_abs+k))
                    abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+k))
                    
                    section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
                    #     print(section_abst)
                    #     print(cur_section+'%i'%(cur_abs+1))
                    #     print(i, abs_beg_ind, abs_end_ind)
                    start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
                    start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed

                    whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
                    title_sect=section_abst[0:start_abst]#the title and author combined
                    title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
                    title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
                    length_title_lst=' '.join(title_lst)
                    remove_start_string=length_title_lst.replace(start_string,"")
                    auth_sect=section_abst[len(length_title_lst):start_abst]


                    sect_abs.append(cur_section+'%i'%cur_abs)
                    abst.append(whole_abs)
                    title.append(remove_start_string)
                    #auth.append(title_sect) old wrong way of getting author
                    auth.append(auth_sect)
                    
                    
                    break
                    
                except: 
                    pass
                    
                    
                else:
                    
                    pass

                print(last_index)

SyntaxError: unexpected EOF while parsing (<ipython-input-34-d0c9b7cf8724>, line 108)

In [None]:
abs_end_ind = abs_list.index((cur_section+1'%i'%(z)))

In [None]:
while true 

In [None]:
data_list

In [None]:
i=1
data_list = data.split('\n')
letter_start_ind = data_list.index('\x0cPoster Session B')
letter_end_ind = data_list.index('\x0cPoster Session C')
a_b_range= range(letter_start_ind,letter_end_ind)
a_b_range=list(a_b_range)
file = open(data_folder+CNS_files[5], 'r')
data = file.read()
#data_list = data.split('\n')
poster_beg_ind = data_list.index('\x0cPoster Session B')
#while cur_section = 'A':#dont know how this cur_section string being A will work. thinking 
    #better idea being from poster_beg_ind = data_list.index('\x0cPoster Session A') 
    #to poster_beg_ind = data_list.index('\x0cPoster Session B') make this a range
#['A' 'B']
#for cur_section in 
for i in range(len(a_b_range)):
    try:
        cur_section='B'
        cur_abs=i+1
        abs_beg_ind = abs_list.index(cur_section+'%i'%cur_abs)
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+1))

        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
    #     print(section_abst)
    #     print(cur_section+'%i'%(cur_abs+1))
    #     print(i, abs_beg_ind, abs_end_ind)
        start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
        start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed
        
        whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
        title_sect=section_abst[0:start_abst]#the title and author combined
        title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
        title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
        length_title_lst=' '.join(title_lst)
        remove_start_string=length_title_lst.replace(start_string,"")
        auth_sect=section_abst[len(length_title_lst):start_abst]
        
        

        sect_abs.append(cur_section+'%i'%cur_abs)
        abst.append(whole_abs)
        title.append(remove_start_string)
        #auth.append(title_sect) old wrong way of getting author
        auth.append(auth_sect)
    except :
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+2)) #works!!
        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 

In [None]:
i=1
letter_start_ind = data_list.index('\x0cPoster Session C')
letter_end_ind = data_list.index('\x0cPoster Session D')
a_b_range= range(letter_start_ind,letter_end_ind)
a_b_range=list(a_b_range)
file = open(data_folder+CNS_files[5], 'r')
data = file.read()
data_list = data.split('\n')
poster_beg_ind = data_list.index('\x0cPoster Session C')
#while cur_section = 'A':#dont know how this cur_section string being A will work. thinking 
    #better idea being from poster_beg_ind = data_list.index('\x0cPoster Session A') 
    #to poster_beg_ind = data_list.index('\x0cPoster Session B') make this a range
#['A' 'B']
#for cur_section in 
for i in range(len(a_b_range)):
    try:
        cur_section='C'
        cur_abs=i+1
        abs_beg_ind = abs_list.index(cur_section+'%i'%cur_abs)
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+1))

        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
    #     print(section_abst)
    #     print(cur_section+'%i'%(cur_abs+1))
    #     print(i, abs_beg_ind, abs_end_ind)
        start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
        start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed
        
        whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
        title_sect=section_abst[0:start_abst]#the title and author combined
        title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
        title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
        length_title_lst=' '.join(title_lst)
        remove_start_string=length_title_lst.replace(start_string,"")
        auth_sect=section_abst[len(length_title_lst):start_abst]
        
        

        sect_abs.append(cur_section+'%i'%cur_abs)
        abst.append(whole_abs)
        title.append(remove_start_string)
        #auth.append(title_sect) old wrong way of getting author
        auth.append(auth_sect)
    except :
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+2)) #works!!
        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 

In [None]:
i=1
letter_start_ind = data_list.index('\x0cPoster Session D')
letter_end_ind = data_list.index('\x0cPoster Session E')
a_b_range= range(letter_start_ind,letter_end_ind)
a_b_range=list(a_b_range)
file = open(data_folder+CNS_files[5], 'r')
data = file.read()
data_list = data.split('\n')
poster_beg_ind = data_list.index('\x0cPoster Session D')
#while cur_section = 'A':#dont know how this cur_section string being A will work. thinking 
    #better idea being from poster_beg_ind = data_list.index('\x0cPoster Session A') 
    #to poster_beg_ind = data_list.index('\x0cPoster Session B') make this a range
#['A' 'B']
#for cur_section in 
for i in range(len(a_b_range)):
    try:
        cur_section='D'
        cur_abs=i+1
        abs_beg_ind = abs_list.index(cur_section+'%i'%cur_abs)
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+1))

        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
    #     print(section_abst)
    #     print(cur_section+'%i'%(cur_abs+1))
    #     print(i, abs_beg_ind, abs_end_ind)
        start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
        start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed
        
        whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
        title_sect=section_abst[0:start_abst]#the title and author combined
        title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
        title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
        length_title_lst=' '.join(title_lst)
        remove_start_string=length_title_lst.replace(start_string,"")
        auth_sect=section_abst[len(length_title_lst):start_abst]
        
        

        sect_abs.append(cur_section+'%i'%cur_abs)
        abst.append(whole_abs)
        title.append(remove_start_string)
        #auth.append(title_sect) old wrong way of getting author
        auth.append(auth_sect)
    except :
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+2)) #works!!
        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 

In [None]:
i=1
letter_start_ind = data_list.index('\x0cPoster Session E')
letter_end_ind = data_list.index('\x0cPoster Session F')
a_b_range= range(letter_start_ind,letter_end_ind)
a_b_range=list(a_b_range)
file = open(data_folder+CNS_files[5], 'r')
data = file.read()
data_list = data.split('\n')
poster_beg_ind = data_list.index('\x0cPoster Session E')
#while cur_section = 'A':#dont know how this cur_section string being A will work. thinking 
    #better idea being from poster_beg_ind = data_list.index('\x0cPoster Session A') 
    #to poster_beg_ind = data_list.index('\x0cPoster Session B') make this a range
#['A' 'B']
#for cur_section in 
for i in range(len(a_b_range)):
    try:
        cur_section='E'
        cur_abs=i+1
        abs_beg_ind = abs_list.index(cur_section+'%i'%cur_abs)
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+1))

        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
    #     print(section_abst)
    #     print(cur_section+'%i'%(cur_abs+1))
    #     print(i, abs_beg_ind, abs_end_ind)
        start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
        start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed
        
        whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
        title_sect=section_abst[0:start_abst]#the title and author combined
        title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
        title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
        length_title_lst=' '.join(title_lst)
        remove_start_string=length_title_lst.replace(start_string,"")
        auth_sect=section_abst[len(length_title_lst):start_abst]
        
        

        sect_abs.append(cur_section+'%i'%cur_abs)
        abst.append(whole_abs)
        title.append(remove_start_string)
        #auth.append(title_sect) old wrong way of getting author
        auth.append(auth_sect)
    except :
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+2)) #works!!
        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 

In [None]:
i=1
letter_start_ind = data_list.index('\x0cPoster Session F')
letter_end_ind = data_list.index('\x0cPoster Session G')
a_b_range= range(letter_start_ind,letter_end_ind)
a_b_range=list(a_b_range)
file = open(data_folder+CNS_files[5], 'r')
data = file.read()
data_list = data.split('\n')
poster_beg_ind = data_list.index('\x0cPoster Session F')
#while cur_section = 'A':#dont know how this cur_section string being A will work. thinking 
    #better idea being from poster_beg_ind = data_list.index('\x0cPoster Session A') 
    #to poster_beg_ind = data_list.index('\x0cPoster Session B') make this a range
#['A' 'B']
#for cur_section in 
for i in range(len(a_b_range)):
    try:
        cur_section='F'
        cur_abs=i+1
        abs_beg_ind = abs_list.index(cur_section+'%i'%cur_abs)
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+1))

        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
    #     print(section_abst)
    #     print(cur_section+'%i'%(cur_abs+1))
    #     print(i, abs_beg_ind, abs_end_ind)
        start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
        start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed
        
        whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
        title_sect=section_abst[0:start_abst]#the title and author combined
        title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
        title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
        length_title_lst=' '.join(title_lst)
        remove_start_string=length_title_lst.replace(start_string,"")
        auth_sect=section_abst[len(length_title_lst):start_abst]
        
        

        sect_abs.append(cur_section+'%i'%cur_abs)
        abst.append(whole_abs)
        title.append(remove_start_string)
        #auth.append(title_sect) old wrong way of getting author
        auth.append(auth_sect)
    except :
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+2)) #works!!
        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 

In [None]:
i=1
letter_start_ind = data_list.index('\x0cPoster Session G')
letter_end_ind = data_list.index('\x0cPoster Session H')
a_b_range= range(letter_start_ind,letter_end_ind)
a_b_range=list(a_b_range)
file = open(data_folder+CNS_files[5], 'r')
data = file.read()
data_list = data.split('\n')
poster_beg_ind = data_list.index('\x0cPoster Session G')
#while cur_section = 'A':#dont know how this cur_section string being A will work. thinking 
    #better idea being from poster_beg_ind = data_list.index('\x0cPoster Session A') 
    #to poster_beg_ind = data_list.index('\x0cPoster Session B') make this a range
#['A' 'B']
#for cur_section in 
for i in range(len(a_b_range)):
    try:
        cur_section='G'
        cur_abs=i+1
        abs_beg_ind = abs_list.index(cur_section+'%i'%cur_abs)
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+1))

        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
    #     print(section_abst)
    #     print(cur_section+'%i'%(cur_abs+1))
    #     print(i, abs_beg_ind, abs_end_ind)
        start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
        start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed
        
        whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
        title_sect=section_abst[0:start_abst]#the title and author combined
        title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
        title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
        length_title_lst=' '.join(title_lst)
        remove_start_string=length_title_lst.replace(start_string,"")
        auth_sect=section_abst[len(length_title_lst):start_abst]
        
        

        sect_abs.append(cur_section+'%i'%cur_abs)
        abst.append(whole_abs)
        title.append(remove_start_string)
        #auth.append(title_sect) old wrong way of getting author
        auth.append(auth_sect)
    except :
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+2)) #works!!
        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 

In [None]:
i=1
letter_start_ind = data_list.index('\x0cPoster Session H')
letter_end_ind = data_list.index('\x0cPoster Session I')
a_b_range= range(letter_start_ind,letter_end_ind)
a_b_range=list(a_b_range)
file = open(data_folder+CNS_files[5], 'r')
data = file.read()
data_list = data.split('\n')
poster_beg_ind = data_list.index('\x0cPoster Session H')
#while cur_section = 'A':#dont know how this cur_section string being A will work. thinking 
    #better idea being from poster_beg_ind = data_list.index('\x0cPoster Session A') 
    #to poster_beg_ind = data_list.index('\x0cPoster Session B') make this a range
#['A' 'B']
#for cur_section in 
for i in range(len(a_b_range)):
    try:
        cur_section='H'
        cur_abs=i+1
        abs_beg_ind = abs_list.index(cur_section+'%i'%cur_abs)
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+1))

        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
    #     print(section_abst)
    #     print(cur_section+'%i'%(cur_abs+1))
    #     print(i, abs_beg_ind, abs_end_ind)
        start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
        start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed
        
        whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
        title_sect=section_abst[0:start_abst]#the title and author combined
        title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
        title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
        length_title_lst=' '.join(title_lst)
        remove_start_string=length_title_lst.replace(start_string,"")
        auth_sect=section_abst[len(length_title_lst):start_abst]
        
        

        sect_abs.append(cur_section+'%i'%cur_abs)
        abst.append(whole_abs)
        title.append(remove_start_string)
        #auth.append(title_sect) old wrong way of getting author
        auth.append(auth_sect)
    except :
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+2)) #works!!
        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 

In [None]:
i=1
letter_start_ind = data_list.index('\x0cPoster Session I')
letter_end_ind = data_list.index('\x0cPoster Topic Index')
a_b_range= range(letter_start_ind,letter_end_ind)
a_b_range=list(a_b_range)
file = open(data_folder+CNS_files[5], 'r')
data = file.read()
data_list = data.split('\n')
poster_beg_ind = data_list.index('\x0cPoster Session I')
#while cur_section = 'A':#dont know how this cur_section string being A will work. thinking 
    #better idea being from poster_beg_ind = data_list.index('\x0cPoster Session A') 
    #to poster_beg_ind = data_list.index('\x0cPoster Session B') make this a range
#['A' 'B']
#for cur_section in 
for i in range(len(a_b_range)):
    try:
        cur_section='I'
        cur_abs=i+1
        abs_beg_ind = abs_list.index(cur_section+'%i'%cur_abs)
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+1))

        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
    #     print(section_abst)
    #     print(cur_section+'%i'%(cur_abs+1))
    #     print(i, abs_beg_ind, abs_end_ind)
        start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
        
        
        start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed
        
        whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
        title_sect=section_abst[0:start_abst]#the title and author combined
        title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
        title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
        length_title_lst=' '.join(title_lst)
        remove_start_string=length_title_lst.replace(start_string,"")
        auth_sect=section_abst[len(length_title_lst):start_abst]
        
        

        sect_abs.append(cur_section+'%i'%cur_abs)
        abst.append(whole_abs)
        title.append(remove_start_string)
        #auth.append(title_sect) old wrong way of getting author
        auth.append(auth_sect)
    except :
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+2)) #works!!
        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 

In [None]:
#making a datatable to see how the data fills the table which will help better understand the format i need to put the data into
try_table=pd.DataFrame({'sect_abs':sect_abs,'abst':abst,'title':title,'auth':auth})
try_table


In [None]:
try_table.to_csv("CNS_2012_SCRAPED.csv")

Testing another year

In [None]:
#this cell specifically needs to be copied and pasted for every journal and needs to go first because all the rest of the cells use it to initialze abs_list
sect_abs=[]
abst=[]
title=[]
auth=[]
i=1
letter_start_ind = data_list.index('\x0cPoster Session A')
letter_end_ind = data_list.index('\x0cPoster Session B')
a_b_range= range(letter_start_ind,letter_end_ind)
a_b_range=list(a_b_range)
file = open(data_folder+CNS_files[6], 'r')# looks like this is the place where we call the specific year to parse
data = file.read()
data_list = data.split('\n')
abs_start = [ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d][0]#tells you the ind where the first abstract is?
abs_list = data_list[abs_start:]
poster_beg_ind = data_list.index('\x0cPoster Session A')
#while cur_section = 'A':#dont know how this cur_section string being A will work. thinking 
    #better idea being from poster_beg_ind = data_list.index('\x0cPoster Session A') 
    #to poster_beg_ind = data_list.index('\x0cPoster Session B') make this a range
#['A' 'B']
#for cur_section in 
for i in range(len(a_b_range)):
    try:
        cur_section='A'
        cur_abs=i+1
        abs_beg_ind = abs_list.index(cur_section+'%i'%cur_abs)
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+1))

        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
    #     print(section_abst)
    #     print(cur_section+'%i'%(cur_abs+1))
    #     print(i, abs_beg_ind, abs_end_ind)
        start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
        start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed
        
        whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
        title_sect=section_abst[0:start_abst]#the title and author combined
        title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
        title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
        length_title_lst=' '.join(title_lst)
        remove_start_string=length_title_lst.replace(start_string,"")
        auth_sect=section_abst[len(length_title_lst):start_abst]
        
        

        sect_abs.append(cur_section+'%i'%cur_abs)
        abst.append(whole_abs)
        title.append(remove_start_string)
        #auth.append(title_sect) old wrong way of getting author
        auth.append(auth_sect)
    except :
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+2)) #works!!
        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 

In [None]:
i=1
data_list = data.split('\n')
letter_start_ind = data_list.index('\x0cPoster Session B')
letter_end_ind = data_list.index('\x0cPoster Session C')
a_b_range= range(letter_start_ind,letter_end_ind)
a_b_range=list(a_b_range)
file = open(data_folder+CNS_files[6], 'r')
data = file.read()
#data_list = data.split('\n')
poster_beg_ind = data_list.index('\x0cPoster Session B')
#while cur_section = 'A':#dont know how this cur_section string being A will work. thinking 
    #better idea being from poster_beg_ind = data_list.index('\x0cPoster Session A') 
    #to poster_beg_ind = data_list.index('\x0cPoster Session B') make this a range
#['A' 'B']
#for cur_section in 
for i in range(len(a_b_range)):
    try:
        cur_section='B'
        cur_abs=i+1
        abs_beg_ind = abs_list.index(cur_section+'%i'%cur_abs)
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+1))

        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
    #     print(section_abst)
    #     print(cur_section+'%i'%(cur_abs+1))
    #     print(i, abs_beg_ind, abs_end_ind)
        start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
        start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed
        
        whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
        title_sect=section_abst[0:start_abst]#the title and author combined
        title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
        title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
        length_title_lst=' '.join(title_lst)
        remove_start_string=length_title_lst.replace(start_string,"")
        auth_sect=section_abst[len(length_title_lst):start_abst]
        
        

        sect_abs.append(cur_section+'%i'%cur_abs)
        abst.append(whole_abs)
        title.append(remove_start_string)
        #auth.append(title_sect) old wrong way of getting author
        auth.append(auth_sect)
    except :
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+2)) #works!!
        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 

In [None]:
i=1
letter_start_ind = data_list.index('\x0cPoster Session C')
letter_end_ind = data_list.index('\x0cPoster Session D')
a_b_range= range(letter_start_ind,letter_end_ind)
a_b_range=list(a_b_range)
file = open(data_folder+CNS_files[5], 'r')
data = file.read()
data_list = data.split('\n')
poster_beg_ind = data_list.index('\x0cPoster Session C')
#while cur_section = 'A':#dont know how this cur_section string being A will work. thinking 
    #better idea being from poster_beg_ind = data_list.index('\x0cPoster Session A') 
    #to poster_beg_ind = data_list.index('\x0cPoster Session B') make this a range
#['A' 'B']
#for cur_section in 
for i in range(len(a_b_range)):
    try:
        cur_section='C'
        cur_abs=i+1
        abs_beg_ind = abs_list.index(cur_section+'%i'%cur_abs)
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+1))

        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
    #     print(section_abst)
    #     print(cur_section+'%i'%(cur_abs+1))
    #     print(i, abs_beg_ind, abs_end_ind)
        start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
        start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed
        
        whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
        title_sect=section_abst[0:start_abst]#the title and author combined
        title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
        title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
        length_title_lst=' '.join(title_lst)
        remove_start_string=length_title_lst.replace(start_string,"")
        auth_sect=section_abst[len(length_title_lst):start_abst]
        
        

        sect_abs.append(cur_section+'%i'%cur_abs)
        abst.append(whole_abs)
        title.append(remove_start_string)
        #auth.append(title_sect) old wrong way of getting author
        auth.append(auth_sect)
    except :
        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+2)) #works!!
        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 