### Scraping abstract information
March 4, 2018
This notebook scrapes abstract text from:
- Proceedings of the Annual Cognitive Science Society meeting archive (html)
- Proceedings of Cognitive Neuroscience Society annual meeting (text converted from pdf)

Abstracts are then stored in a spreadsheet, containing information such as year, authors, title, and abstract.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import csv
from bs4 import BeautifulSoup
import urllib
import re
import string
import pandas as pd
import os
import sys

In [2]:
def scrape_CS(home_url, data_file):
    #connect to home page url for that year
    CSurl = urllib.request.urlopen(home_url).read()
    soup = BeautifulSoup(CSurl, 'html.parser')
    all_links = soup.find_all('a', attrs={'href': re.compile("papers/*")})    
    year = home_url[-5:-1]    
    
    # enumerate through all paper links
    for link_idx, link in enumerate(all_links):
        # get soup from paper url
        if home_url not in str(link['href']):
            url_text = home_url + str(link['href'])
        else:
            url_text = str(link['href'])
    
        url = urllib.request.urlopen(url_text).read()
        soup = BeautifulSoup(url, 'html.parser')
    
        # scrape & parse
        authors = []
        affl = []
        title = ' '.join(soup.find_all('h1')[0].text.split())
        # exception rule for 2014 abstracts
        if '2014' in home_url:            
            abstr = ' '.join(soup.find_all('blockquote')[1].text.split())
        else:            
            abstr = ' '.join(soup.find_all('p', {"id": "abstract"})[0].text.split())            
        
        soup.find_all('ul')
        for ana in soup.find_all('em'):
            affl.append('>'+ana.text)
            if '2014' in home_url:
                # somebody fucked something up in 2014
                authors.append('>' + ana.previous_element.previous_element.split(',')[0])
            else:            
                authors.append('>' + ana.previous_element.split(',')[0])
        
        # do some gymnastics to get it into a pandas df and add as a row to CSV
        new_row = {'Year': str(year), 'Title': title,'Abstract': abstr,'Authors': ''.join(authors),'Affiliations': ''.join(affl), 'URL': url_text}
        df_cur = pd.Series(data=new_row).to_frame().T[['Year','Title','Abstract','Authors','Affiliations','URL']]
        df_cur.to_csv(data_file, mode='a', header=False, index=False)


In [None]:
# get all paper links from cogsci conference
home_urls = ['https://mindmodeling.org/cogsci2017/',
             'https://mindmodeling.org/cogsci2016/',
             'https://mindmodeling.org/cogsci2015/',
             'https://mindmodeling.org/cogsci2014/',
             'https://mindmodeling.org/cogsci2013/',
             'https://mindmodeling.org/cogsci2012/',
             'https://mindmodeling.org/cogsci2011/',
             'https://mindmodeling.org/cogsci2010/']

for year in home_urls:
    # scrape all
    print(year)
    scrape_CS(home_url=year, data_file='../data/cogsci_abstracts.csv')

### gather CNS abstracts from text to csv

In [4]:
data_folder = '../data/CNS_programs_txt/'
os.listdir(data_folder)
CNS_files = sorted([f for f in os.listdir(data_folder) if ('CNS' in f) and ('.txt' in f)])[:-1]#not include 2017 forget why though
print(CNS_files)

['CNS_2007_Program.txt', 'CNS_2008_Program.txt', 'CNS_2009_Program.txt', 'CNS_2010_Program.txt', 'CNS_2011_Program.txt', 'CNS_2012_Program.txt', 'CNS_2013_Program.txt', 'CNS_2014_Program.txt', 'CNS_2015_Program.txt', 'CNS_2016_Program.txt']


In [5]:
#col_names = ['CNS_file','start_sect','beg_ind','end_ind']

#reffering to the abs_list problem encountered in journal on 8/2/2018. I think this needs to be on the outside of the loop so that it can catch each abs_list year change


CNS_file=[]
start_sect=[]
beg_ind=[]
end_ind=[]

for i in range(len(CNS_files)):
    
    CNS_file.append(CNS_files[i])
    
    #print(CNS_files[i])
    file = open(data_folder+CNS_files[i], 'r')
    data = file.read()
    data_list = data.split('\n')
    abs_start = [ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d][0]#tells you the ind where the first abstract is?
    
    start_sect.append([ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d])
    #print([ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d])
    
    abs_list = data_list[abs_start:]
    poster_beg_ind = next((ind for ind,s in enumerate(abs_list) if '\x0cPoster Session A' == s), None)    
    poster_end_ind = next((ind for ind,s in enumerate(abs_list) if '\x0cAuthor Index' == s), None)
    
    
    beg_ind.append(poster_beg_ind)
    
    end_ind.append(poster_end_ind)
    
    
    #print(poster_beg_ind, poster_end_ind)

In [8]:
file = open(data_folder+CNS_files[0], 'r')
data = file.read()
data_list = data.split('\n')
abs_start = [ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d][0]#tells you the ind where the first abstract is?

In [9]:
data_list

['Cognitive Neuroscience Society',
 'Annual Meeting Program 2007',
 'A supplement of the Journal of Cognitive Neuroscience',
 '',
 'ISSN 1096-8857',
 '© CNS',
 'Cognitive Neuroscience Society',
 'c/o Center for Mind and Brain',
 'University of California, Davis',
 'One Shields Avenue',
 'Davis, CA 95616',
 'www.cogneurosociety.org',
 '',
 '\x0cCognitive Neuroscience Society 2007 Committees',
 'Governing Board',
 'Carol Colby, Ph.D., University of Pittsburgh',
 'Marta Kutas, Ph.D., University of California, San Diego',
 'Helen Neville, Ph.D., University of Oregon',
 'Michael I. Posner, Ph.D., University of Oregon',
 'Daniel Schacter, Ph.D., Harvard University',
 'Michael S. Gazzaniga, Ph.D., University of California, Santa Barbara (ex officio)',
 'George R. Mangun, Ph.D., University of California, Davis (ex officio)',
 'Patti Reuter-Lorenz, Ph.D., University of Michigan (ex officio)',
 '',
 'Program Committee 2007 Meeting',
 'Patti Reuter-Lorenz, Ph.D., University of Michigan, Chair',
 

In [5]:
cns_frame=pd.DataFrame({'CNS_file':CNS_file,'start_sect':start_sect,'beg_ind':beg_ind,'end_ind':end_ind})
cns_frame

Unnamed: 0,CNS_file,beg_ind,end_ind,start_sect
0,CNS_2007_Program.txt,0,29493,[1456]
1,CNS_2008_Program.txt,0,30363,"[1538, 1748, 1969, 2189, 2414, 2647, 2884, 311..."
2,CNS_2009_Program.txt,0,23882,"[2181, 2395, 2613, 2834, 3076, 3310, 3545, 379..."
3,CNS_2010_Program.txt,0,31628,"[3236, 3472, 3718, 3974, 4229, 4460, 4694, 495..."
4,CNS_2011_Program.txt,0,24580,"[3326, 3420, 3667, 3907, 4148, 4394, 4628, 487..."
5,CNS_2012_Program.txt,0,24793,"[3516, 3610, 3848, 4113, 4353, 4587, 4836, 507..."
6,CNS_2013_Program.txt,0,28381,"[2881, 3128, 3372, 3626, 3852, 4106, 4348, 459..."
7,CNS_2014_Program.txt,0,28084,"[3137, 3250, 3505, 3753, 4009, 4253, 4493, 475..."
8,CNS_2015_Program.txt,0,23909,"[3350, 3588, 3829, 4068, 4322, 4566, 4823, 506..."
9,CNS_2016_Program.txt,0,28305,"[3593, 3823, 4072, 4313, 4566, 4827, 5087, 533..."


For some reason the first file is problematic,CNS_2007, until cell 106 (excel) which is when it hits F1, and then everything works perfect

We do miss the very last index for 2016 but i think thats the only last index that is missed. Did random checks and this appears true.

FIGURED it out! so it looks like the sections in 2007 are messed up until F for numbering the 10's

EX:
A 10(doesnt work)
B 10(doesnt work)
...
F10(works)

and it's all of the numbers until F

solutions ideas
maybe just run a find and delete B ## and make it B## in the text doc.

So will need to write something to grab these? Maybe 

In [6]:
sect_abs=[]
abst=[]
title=[]
auth=[]
CNS_Year=[]

let_vec = ['A','B','C','D','E','F','G','H','I','J']
for x in range(len(CNS_files)):

    
    file = open(data_folder+CNS_files[x], 'r')# looks like this is the place where we call the specific year to parse
    data = file.read()
    data_list = data.split('\n')
    abs_start = [ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d][0]#tells you the ind where the first abstract is?
    abs_list = data_list[abs_start:]
    poster_beg_ind = data_list.index('\x0cPoster Session A')

    



    for j in range(0,len(let_vec)):

        #print(j)

        for i in range(0,200):

            try:
                cur_section= let_vec[j]
                cur_abs=i
                abs_beg_ind = abs_list.index(cur_section+'%i'%cur_abs)
                abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+1))
                last_index = cur_section+'%i'%(cur_abs+1)
                #print(cur_abs)

                section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
                
                start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
                start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed

                whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
                title_sect=section_abst[0:start_abst]#the title and author combined
                title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
                title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
                length_title_lst=' '.join(title_lst)
                remove_start_string=length_title_lst.replace(start_string,"")
                auth_sect=section_abst[len(length_title_lst):start_abst]



                sect_abs.append(cur_section+'%i'%cur_abs)
                abst.append(whole_abs)
                title.append(remove_start_string)
                
                auth.append(auth_sect)
                CNS_Year.append(CNS_file[x])


                last_good = i
            except ValueError:

                for k in range(2,51):
                    try:
                        #print('k = ',k)
                        #print(cur_section+'%i'%(cur_abs+k))
                        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+k))

                        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
                        #     print(section_abst)
                        #     print(cur_section+'%i'%(cur_abs+1))
                        #     print(i, abs_beg_ind, abs_end_ind)
                        start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
                        start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed

                        whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
                        title_sect=section_abst[0:start_abst]#the title and author combined
                        title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
                        title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
                        length_title_lst=' '.join(title_lst)
                        remove_start_string=length_title_lst.replace(start_string,"")
                        auth_sect=section_abst[len(length_title_lst):start_abst]


                        sect_abs.append(cur_section+'%i'%cur_abs)
                        abst.append(whole_abs)
                        title.append(remove_start_string)
                        
                        auth.append(auth_sect)
                        CNS_Year.append(CNS_file[x])


                        break
                    except:
                        #print('except')
                        pass


                    else:
                        #print('reached else')
                        pass

                #print('last index =',last_index)



    
            except:

                                abs_beg_ind = abs_list.index(last_index)
                                nxt = cur_section + 1
                                abs_end_ind = abs_list.index(let_vec[j+1]+'%i'%nxt)

                                section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
                                #     print(section_abst)
                                #     print(cur_section+'%i'%(cur_abs+1))
                                #     print(i, abs_beg_ind, abs_end_ind)
                                start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
                                start_string=last_index #the F4 like number at the beg of each title that isnt needed

                                whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
                                title_sect=section_abst[0:start_abst]#the title and author combined
                                title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
                                title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
                                length_title_lst=' '.join(title_lst)
                                remove_start_string=length_title_lst.replace(start_string,"")
                                auth_sect=section_abst[len(length_title_lst):start_abst]


                                sect_abs.append(last_index)
                                abst.append(whole_abs)
                                title.append(remove_start_string)
                                #auth.append(title_sect) old wrong way of getting author
                                auth.append(auth_sect)
                                CNS_Year.append(CNS_file[x])
                                last_index
                        



            else:
                    pass



    print(last_good)

147
131
103
126
126
99
135
150
131
174


Works. But has some stuff left in it. The above is the same but cleaned up

In [None]:
#this cell specifically needs to be copied and pasted for every journal and needs to go first because all the rest of the cells use it to initialze abs_list
sect_abs=[]
abst=[]
title=[]
auth=[]
CNS_Year=[]
i=1
let_vec = ['A','B','C','D','E','F','G','H','I']
for x in range(len(CNS_files)):
    letter_start_ind = data_list.index('\x0cPoster Session A')
    letter_end_ind = data_list.index('\x0cPoster Session B')
    a_b_range= range(letter_start_ind,letter_end_ind)
    a_b_range=list(a_b_range)
    
    file = open(data_folder+CNS_files[x], 'r')# looks like this is the place where we call the specific year to parse
    data = file.read()
    data_list = data.split('\n')
    abs_start = [ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d][0]#tells you the ind where the first abstract is?
    abs_list = data_list[abs_start:]
    poster_beg_ind = data_list.index('\x0cPoster Session A')

    z=1



    for j in range(0,len(let_vec)):

        print(j)

        for i in range(0,200):

            try:
                cur_section= let_vec[j]
                cur_abs=i
                abs_beg_ind = abs_list.index(cur_section+'%i'%cur_abs)
                abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+1))
                last_index = cur_section+'%i'%(cur_abs+1)
                print(cur_abs)

                section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
                #     print(section_abst)
                #     print(cur_section+'%i'%(cur_abs+1))
                #     print(i, abs_beg_ind, abs_end_ind)
                start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
                start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed

                whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
                title_sect=section_abst[0:start_abst]#the title and author combined
                title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
                title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
                length_title_lst=' '.join(title_lst)
                remove_start_string=length_title_lst.replace(start_string,"")
                auth_sect=section_abst[len(length_title_lst):start_abst]



                sect_abs.append(cur_section+'%i'%cur_abs)
                abst.append(whole_abs)
                title.append(remove_start_string)
                #auth.append(title_sect) old wrong way of getting author
                auth.append(auth_sect)
                CNS_Year.append(CNS_file[x])


                last_good = i
            except ValueError:

                for k in range(2,51):
                    try:
                        #print('k = ',k)
                        #print(cur_section+'%i'%(cur_abs+k))
                        abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+k))

                        section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
                        #     print(section_abst)
                        #     print(cur_section+'%i'%(cur_abs+1))
                        #     print(i, abs_beg_ind, abs_end_ind)
                        start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
                        start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed

                        whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
                        title_sect=section_abst[0:start_abst]#the title and author combined
                        title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
                        title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
                        length_title_lst=' '.join(title_lst)
                        remove_start_string=length_title_lst.replace(start_string,"")
                        auth_sect=section_abst[len(length_title_lst):start_abst]


                        sect_abs.append(cur_section+'%i'%cur_abs)
                        abst.append(whole_abs)
                        title.append(remove_start_string)
                        #auth.append(title_sect) old wrong way of getting author
                        auth.append(auth_sect)
                        CNS_Year.append(CNS_file[x])


                        break
                    except:
                        #print('except')
                        pass


                    else:
                        #print('reached else')
                        pass

                #print('last index =',last_index)



    #                 except ValueError:
    #                             abs_beg_ind = abs_list.index(last_index)
    #                             #nxt = cur_section + 1
    #                             abs_end_ind = abs_list.index('B'+'%i'%z)

    #                             section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
    #                             #     print(section_abst)
    #                             #     print(cur_section+'%i'%(cur_abs+1))
    #                             #     print(i, abs_beg_ind, abs_end_ind)
    #                             start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
    #                             start_string=last_index #the F4 like number at the beg of each title that isnt needed

    #                             whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
    #                             title_sect=section_abst[0:start_abst]#the title and author combined
    #                             title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
    #                             title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
    #                             length_title_lst=' '.join(title_lst)
    #                             remove_start_string=length_title_lst.replace(start_string,"")
    #                             auth_sect=section_abst[len(length_title_lst):start_abst]


    #                             sect_abs.append(cur_section+'%i'%cur_abs)
    #                             abst.append(whole_abs)
    #                             title.append(remove_start_string)
    #                             #auth.append(title_sect) old wrong way of getting author
    #                             auth.append(auth_sect)
    #                             last_index
    #                     # letter + last index up to B1 
    #                     # section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract

    #                             pass
            except:

                                abs_beg_ind = abs_list.index(last_index)
                                #nxt = cur_section + 1
                                abs_end_ind = abs_list.index(let_vec[j+1]+'%i'%z)

                                section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
                                #     print(section_abst)
                                #     print(cur_section+'%i'%(cur_abs+1))
                                #     print(i, abs_beg_ind, abs_end_ind)
                                start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
                                start_string=last_index #the F4 like number at the beg of each title that isnt needed

                                whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
                                title_sect=section_abst[0:start_abst]#the title and author combined
                                title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
                                title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
                                length_title_lst=' '.join(title_lst)
                                remove_start_string=length_title_lst.replace(start_string,"")
                                auth_sect=section_abst[len(length_title_lst):start_abst]


                                sect_abs.append(last_index)
                                abst.append(whole_abs)
                                title.append(remove_start_string)
                                #auth.append(title_sect) old wrong way of getting author
                                auth.append(auth_sect)
                                CNS_Year.append(CNS_file[x])
                                last_index
                        # letter + last index up to B1 
                        # section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract




            else:
                    pass



    print(last_good)

In [7]:
#making a datatable to see how the data fills the table which will help better understand the format i need to put the data into
try_table=pd.DataFrame({'sect_abs':sect_abs,'abst':abst,'title':title,'auth':auth, 'CNS_Year':CNS_Year})#'CNS_Year':CNS_Year
try_table


Unnamed: 0,CNS_Year,abst,auth,sect_abs,title
0,CNS_2007_Program.txt,Recent models of attention in typically develo...,"lary Gomes1, Martin Duff1, Virginia Wolfson1, ...",A1,ERP MEASURES OF AUDITORY SELECTIVE ATTENTION ...
1,CNS_2007_Program.txt,Unilateral neglect after right cerebral stroke...,"Georg Kerkhoff1,2, Christian Groh-Bordin1, In...",A2,BENEFITS OF OPTOKINETIC STIMULATION IN PATIEN...
2,CNS_2007_Program.txt,People easily extract the embedded metrical st...,"ung1, Denise H. Wu2, Daisy L. Hung1,2, Ovid J....",A3,WHEN ABSTRACT KNOWLEDGE MEETS PERCEPTUAL GROU...
3,CNS_2007_Program.txt,To investigate the impact of attention on proc...,"Yuliya Yoncheva, Jason Zevin, Urs Maurer, Bru...",A4,LEFT-LATERALIZED MODULATIONS OF INFERIOR FRON...
4,CNS_2007_Program.txt,Abstract stimulus features are encoded in sens...,"Alexandra Bendixen, Erich Schröger; Universit...",A5,EXTRACTION OF ABSTRACT REGULARITIES FROM DYNA...
5,CNS_2007_Program.txt,This experiment investigated brain responses i...,"ChunYu Tse, Kathy Low, Jason Agran, Guadalupe...",A6,SEQUENTIAL TONE DISCRIMINATION WITH IMPLICIT ...
6,CNS_2007_Program.txt,Spatially selective attention allows individua...,"Lisa Sanders, Lori Astheimer; University of M...",A7,EVENT RELATED POTENTIAL EVIDENCE OF RAPIDLY M...
7,CNS_2007_Program.txt,"In crossmodal spatial cueing (Posner, M., 1980...","Maja Trenner1, Markus Bauer2, Rüdiger Wenzel1...",A8,WHAT HAPPENS IN BETWEEN? MODULATIONS OF CORTI...
8,CNS_2007_Program.txt,Disruption of attention is a hallmark symptom ...,"n Wood1, Geoffrey Potts1, Laura Martin1, Delia...",B0,A9 DISRUPTION OF AUDITORY AND VISUAL ATTENTION...
9,CNS_2007_Program.txt,Decision-making is a fundamental capacity whic...,"Bruno Kopp, Sandra Tabeling, Carsten Moschner...",B1,NEUROCOGNITIVE MECHANISMS OF PERCEPTUAL DECIS...


In [None]:
try_table.to_csv("CNS_SCRAPED_DI.csv")

This works for one file at a time, the above loops through all the files in the folder

In [None]:
#this cell specifically needs to be copied and pasted for every journal and needs to go first because all the rest of the cells use it to initialze abs_list
sect_abs=[]
abst=[]
title=[]
auth=[]
i=1
letter_start_ind = data_list.index('\x0cPoster Session A')
letter_end_ind = data_list.index('\x0cPoster Session B')
a_b_range= range(letter_start_ind,letter_end_ind)
a_b_range=list(a_b_range)
file = open(data_folder+CNS_files[5], 'r')# looks like this is the place where we call the specific year to parse
data = file.read()
data_list = data.split('\n')
abs_start = [ind for ind, d in enumerate(data_list) if '\x0cPoster Session A' in d][0]#tells you the ind where the first abstract is?
abs_list = data_list[abs_start:]
poster_beg_ind = data_list.index('\x0cPoster Session A')

z=1


let_vec = ['A','B','C','D','E','F','G','H','I']

for j in range(0,len(let_vec)):
    
    print(j)

    for i in range(0,200):

        try:
            cur_section= let_vec[j]
            cur_abs=i
            abs_beg_ind = abs_list.index(cur_section+'%i'%cur_abs)
            abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+1))
            last_index = cur_section+'%i'%(cur_abs+1)
            print(cur_abs)

            section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
            #     print(section_abst)
            #     print(cur_section+'%i'%(cur_abs+1))
            #     print(i, abs_beg_ind, abs_end_ind)
            start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
            start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed

            whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
            title_sect=section_abst[0:start_abst]#the title and author combined
            title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
            title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
            length_title_lst=' '.join(title_lst)
            remove_start_string=length_title_lst.replace(start_string,"")
            auth_sect=section_abst[len(length_title_lst):start_abst]



            sect_abs.append(cur_section+'%i'%cur_abs)
            abst.append(whole_abs)
            title.append(remove_start_string)
            #auth.append(title_sect) old wrong way of getting author
            auth.append(auth_sect)
            
            
            last_good = i
        except ValueError:
            
            for k in range(2,51):
                try:
                    #print('k = ',k)
                    #print(cur_section+'%i'%(cur_abs+k))
                    abs_end_ind = abs_list.index(cur_section+'%i'%(cur_abs+k))
                    
                    section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
                    #     print(section_abst)
                    #     print(cur_section+'%i'%(cur_abs+1))
                    #     print(i, abs_beg_ind, abs_end_ind)
                    start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
                    start_string=cur_section+'%i'%cur_abs #the F4 like number at the beg of each title that isnt needed

                    whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
                    title_sect=section_abst[0:start_abst]#the title and author combined
                    title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
                    title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
                    length_title_lst=' '.join(title_lst)
                    remove_start_string=length_title_lst.replace(start_string,"")
                    auth_sect=section_abst[len(length_title_lst):start_abst]


                    sect_abs.append(cur_section+'%i'%cur_abs)
                    abst.append(whole_abs)
                    title.append(remove_start_string)
                    #auth.append(title_sect) old wrong way of getting author
                    auth.append(auth_sect)
                    
                    
                    break
                except:
                    #print('except')
                    pass

       
                else:
                    #print('reached else')
                    pass

            #print('last index =',last_index)



#                 except ValueError:
#                             abs_beg_ind = abs_list.index(last_index)
#                             #nxt = cur_section + 1
#                             abs_end_ind = abs_list.index('B'+'%i'%z)

#                             section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
#                             #     print(section_abst)
#                             #     print(cur_section+'%i'%(cur_abs+1))
#                             #     print(i, abs_beg_ind, abs_end_ind)
#                             start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
#                             start_string=last_index #the F4 like number at the beg of each title that isnt needed

#                             whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
#                             title_sect=section_abst[0:start_abst]#the title and author combined
#                             title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
#                             title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
#                             length_title_lst=' '.join(title_lst)
#                             remove_start_string=length_title_lst.replace(start_string,"")
#                             auth_sect=section_abst[len(length_title_lst):start_abst]


#                             sect_abs.append(cur_section+'%i'%cur_abs)
#                             abst.append(whole_abs)
#                             title.append(remove_start_string)
#                             #auth.append(title_sect) old wrong way of getting author
#                             auth.append(auth_sect)
#                             last_index
#                     # letter + last index up to B1 
#                     # section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract

#                             pass
        except:
            
                            abs_beg_ind = abs_list.index(last_index)
                            #nxt = cur_section + 1
                            abs_end_ind = abs_list.index(let_vec[j+1]+'%i'%z)

                            section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract 
                            #     print(section_abst)
                            #     print(cur_section+'%i'%(cur_abs+1))
                            #     print(i, abs_beg_ind, abs_end_ind)
                            start_abst=section_abst.index(' — ')#index with in the section where we first see this character - which denotes start of abs
                            start_string=last_index #the F4 like number at the beg of each title that isnt needed

                            whole_abs=section_abst[start_abst+3:len(section_abst)]#the abstract separated from the title and author
                            title_sect=section_abst[0:start_abst]#the title and author combined
                            title_auth_lst=title_sect.split(' ')#splits them up into list items so that the lamdas will work? have not tried to run without this
                            title_lst=[word for word in title_auth_lst if word.isupper()]#takes only all uppercase words
                            length_title_lst=' '.join(title_lst)
                            remove_start_string=length_title_lst.replace(start_string,"")
                            auth_sect=section_abst[len(length_title_lst):start_abst]


                            sect_abs.append(last_index)
                            abst.append(whole_abs)
                            title.append(remove_start_string)
                            #auth.append(title_sect) old wrong way of getting author
                            auth.append(auth_sect)
                            last_index
                    # letter + last index up to B1 
                    # section_abst=' '.join(abs_list[abs_beg_ind:abs_end_ind])#entire section including title author and abstract

            
            
            
        else:
                pass

            
            
print(last_good)

In [None]:
#making a datatable to see how the data fills the table which will help better understand the format i need to put the data into
try_table=pd.DataFrame({'sect_abs':sect_abs,'abst':abst,'title':title,'auth':auth})
try_table
