### Scraping abstract information
March 4, 2018
This notebook scrapes abstract text from:
- Proceedings of the Annual Cognitive Science Society meeting archive (html)
- Proceedings of Cognitive Neuroscience Society annual meeting (text converted from pdf)

Abstracts are then stored in a spreadsheet, containing information such as year, authors, title, and abstract.

In [1]:
import numpy as np
import csv
from bs4 import BeautifulSoup
import urllib
import re
import string
import pandas as pd
import os
import sys

In [2]:
def scrape_CS(home_url):
    #connect to home page url for that year
    CSurl = urllib.request.urlopen(home_url).read()
    soup = BeautifulSoup(CSurl, 'html.parser')
    all_links = soup.find_all('a', attrs={'href': re.compile("papers/*")})    
    year = home_url[-5:-1]    
    
    # enumerate through all paper links
    for link_idx, link in enumerate(all_links):
        # get soup from paper url
        if home_url not in str(link['href']):
            url_text = home_url + str(link['href'])
        else:
            url_text = str(link['href'])
    
        url = urllib.request.urlopen(url_text).read()
        soup = BeautifulSoup(url, 'html.parser')
    
        # scrape & parse
        authors = []
        affl = []
        title = ' '.join(soup.find_all('h1')[0].text.split())
        # exception rule for 2014 abstracts
        if '2014' in home_url:            
            abstr = ' '.join(soup.find_all('blockquote')[1].text.split())
        else:            
            abstr = ' '.join(soup.find_all('p', {"id": "abstract"})[0].text.split())            
        
        soup.find_all('ul')
        for ana in soup.find_all('em'):
            affl.append('>'+ana.text)
            if '2014' in home_url:
                # somebody fucked something up in 2014
                authors.append('>' + ana.previous_element.previous_element.split(',')[0])
            else:            
                authors.append('>' + ana.previous_element.split(',')[0])
        
        # do some gymnastics to get it into a pandas df and add as a row to CSV
        new_row = {'Year': str(year), 'Title': title,'Abstract': abstr,'Authors': ''.join(authors),'Affiliations': ''.join(affl), 'URL': url_text}
        df_cur = pd.Series(data=new_row).to_frame().T[['Year','Title','Abstract','Authors','Affiliations','URL']]
        return df_cur


In [1]:
#df_cur.to_csv(data_file, mode='a', header=False, index=False)
years = range(2010,2018)
base_url = 'https://mindmodeling.org/cogsci'
for year in years:
    home_url = base_url+str(year)+'/'
    print(home_url)
    scrape_CS

https://mindmodeling.org/cogsci2010/
https://mindmodeling.org/cogsci2011/
https://mindmodeling.org/cogsci2012/
https://mindmodeling.org/cogsci2013/
https://mindmodeling.org/cogsci2014/
https://mindmodeling.org/cogsci2015/
https://mindmodeling.org/cogsci2016/
https://mindmodeling.org/cogsci2017/


In [None]:
years = range(2010,2018)
base_url = 'https://mindmodeling.org/cogsci'

# get all paper links from cogsci conference
home_urls = ['https://mindmodeling.org/cogsci2017/',
             'https://mindmodeling.org/cogsci2016/',
             'https://mindmodeling.org/cogsci2015/',
             'https://mindmodeling.org/cogsci2014/',
             'https://mindmodeling.org/cogsci2013/',
             'https://mindmodeling.org/cogsci2012/',
             'https://mindmodeling.org/cogsci2011/',
             'https://mindmodeling.org/cogsci2010/']

for year in home_urls:
    # scrape all
    print(year)
    scrape_CS(home_url=year, data_file='../data/cogsci_abstracts.csv')

### gather COSYNE abstracts from text to csv

In [3]:
data_folder = '../data/COSYNE_programs/'
os.listdir(data_folder)
CNS_files = sorted([f for f in os.listdir(data_folder) if ('Cosyne' in f) and ('.txt' in f)])
print(CNS_files)

['Cosyne2008book.txt', 'Cosyne2009Book.txt', 'Cosyne2010-programme.txt', 'Cosyne2012_program_book.txt', 'Cosyne2013_program_book.txt', 'Cosyne2014_program_book.txt', 'Cosyne2015_program_book.txt', 'Cosyne2016_program_book.txt', 'Cosyne2017_program_book.txt', 'Cosyne2018_program_book.txt']


### This cell is for 2008

In [4]:
abstracts=[]
journal=[]

let_vec = ['I-','II-','III-']

file = open(data_folder+CNS_files[0], 'r')
data = file.read()
data_list = data.split('\n')
abs_start = [ind for ind, d in enumerate(data_list) if 'Thursday evening, Poster session I-1'in d][0]#'doi:10.3389/conf.neuro.06.2009.03.222' in d][0]

print(CNS_files[0])
abs_list = data_list[abs_start:]


mush=' '.join(abs_list)

for j in range(0,len(let_vec)):

            print(j)

            for i in range(0,200):

                try:
                    cur_section= let_vec[j]
                    cur_abs=i
                    abs_beg_ind = mush.index(cur_section+'%i'%cur_abs)
                    abs_end_ind = mush.index(cur_section+'%i'%(cur_abs+1))
                    last_index = cur_section+'%i'%(cur_abs+1)
                    print(cur_abs)

                    section_abst = mush[abs_beg_ind:abs_end_ind]
                    

                    abstracts.append(section_abst)
                    journal.append(CNS_files[0])

                    
                except:
                    'urmom'

Cosyne2008book.txt
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


### This cell is for 2008

In [5]:
file = open(data_folder+CNS_files[1], 'r')
data = file.read()
data_list = data.split('\n')
abs_start = [ind for ind, d in enumerate(data_list) if '\x0cI-1'in d][0]#'doi:10.3389/conf.neuro.06.2009.03.222' in d][0]

print(CNS_files[1])
abs_list = data_list[abs_start:]


mush=' '.join(abs_list)
for j in range(0,len(let_vec)):

            print(j)

            for i in range(0,200):

                try:
                    cur_section= let_vec[j]
                    add_dot = '.'
                    cur_abs=i
                    abs_beg_ind = mush.index(cur_section+'%i'%cur_abs+add_dot)
                    abs_end_ind = mush.index(cur_section+'%i'%(cur_abs+1)+add_dot)
                    last_index = cur_section+'%i'%(cur_abs+1)+add_dot
                    print(cur_abs)

                    section_abst = mush[abs_beg_ind:abs_end_ind]
                    

                    abstracts.append(section_abst)
                    journal.append(CNS_files[1])

                    
                except:
                    'urmom'

Cosyne2009Book.txt
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [6]:
data_folder = '../data/COSYNE_programs/'
os.listdir(data_folder)
CNS_files = sorted([f for f in os.listdir(data_folder) if ('Cosyne' in f) and ('.txt' in f)])[2:]#not include 08/09
print(CNS_files)

['Cosyne2010-programme.txt', 'Cosyne2012_program_book.txt', 'Cosyne2013_program_book.txt', 'Cosyne2014_program_book.txt', 'Cosyne2015_program_book.txt', 'Cosyne2016_program_book.txt', 'Cosyne2017_program_book.txt', 'Cosyne2018_program_book.txt']


### This cell is for 2010 - 2018

In [7]:
for x in range(len(CNS_files)):

    try:
        file = open(data_folder+CNS_files[x], 'r')
        data = file.read()
        data_list = data.split('\n')
        abs_start = [ind for ind, d in enumerate(data_list) if '– I-1' in d][0]

        print(CNS_files[x])
        abs_list = data_list[abs_start:]


        mush=' '.join(abs_list)


        for j in range(0,len(let_vec)):

            print(j)

            for i in range(0,200):

                try:
                    cur_section= let_vec[j]
                    add_dot = '.'
                    cur_abs=i
                    abs_beg_ind = mush.index(cur_section+'%i'%cur_abs+add_dot)
                    abs_end_ind = mush.index(cur_section+'%i'%(cur_abs+1)+add_dot)
                    last_index = cur_section+'%i'%(cur_abs+1)+add_dot
                    print(cur_abs)

                    section_abst = mush[abs_beg_ind:abs_end_ind]
                    

                    abstracts.append(section_abst)
                    journal.append(CNS_files[x])

                    
                except:
                    'urmom'
    except:
        file = open(data_folder+CNS_files[x], 'r')
        data = file.read()
        data_list = data.split('\n')
        abs_start = [ind for ind, d in enumerate(data_list) if 'I-1 –' in d][0]
        print(CNS_files[x])

        abs_list = data_list[abs_start:]


        mush=' '.join(abs_list)
        
        for j in range(0,len(let_vec)):

            print(j)

            for i in range(0,200):

                try:
                    cur_section= let_vec[j]
                    add_dot = '.'
                    cur_abs=i
                    abs_beg_ind = mush.index(cur_section+'%i'%cur_abs+add_dot)
                    abs_end_ind = mush.index(cur_section+'%i'%(cur_abs+1)+add_dot)
                    last_index = cur_section+'%i'%(cur_abs+1)+add_dot
                    print(cur_abs)

                    section_abst = mush[abs_beg_ind:abs_end_ind]

                    abstracts.append(section_abst)
                    journal.append(CNS_files[x])

                    
                except:
                    'urmom'
    finally:
        file = open(data_folder+CNS_files[x], 'r')
        data = file.read()
        data_list = data.split('\n')
        abs_start = [ind for ind, d in enumerate(data_list) if '– I-1' in d][0]
        print(CNS_files[x])

        abs_list = data_list[abs_start:]


        mush=' '.join(abs_list)
        
        for j in range(0,len(let_vec)):

            print(j)

            for i in range(0,200):

                try:
                    cur_section= let_vec[j]
                    add_dot = '.'
                    cur_abs=i
                    abs_beg_ind = mush.index(cur_section+'%i'%cur_abs+add_dot)
                    abs_end_ind = mush.index(cur_section+'%i'%(cur_abs+1)+add_dot)
                    last_index = cur_section+'%i'%(cur_abs+1)+add_dot
                    print(cur_abs)

                    section_abst = mush[abs_beg_ind:abs_end_ind]

                    abstracts.append(section_abst)
                    journal.append(CNS_files[x])

                    
                except:
                    'urmom'
    
    print('j')

Cosyne2010-programme.txt
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
Cosyne2010-programme.txt
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25


86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
j
Cosyne2016_program_book.txt
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96

In [8]:
try_table=pd.DataFrame({'journal':journal,'abstracts':abstracts})
try_table

Unnamed: 0,abstracts,journal
0,I-1 Bayesian inference accounts for the ﬁllin...,Cosyne2008book.txt
1,I-2 Cosyne 2008 Towards functional and anato...,Cosyne2008book.txt
2,I-3 A Neural System for Scale and Orientation...,Cosyne2008book.txt
3,I-4 Cosyne 2008 Decoding Frequency and Timin...,Cosyne2008book.txt
4,I-5 Concurrent increases in selectivity and t...,Cosyne2008book.txt
5,I-6 Cosyne 2008 Integration of distributed o...,Cosyne2008book.txt
6,I-7 Common neural mechanisms of intermediate ...,Cosyne2008book.txt
7,I-8 Cosyne 2008 Contrast-dependent suppressi...,Cosyne2008book.txt
8,I-9 Perception of a touch-induced visual illu...,Cosyne2008book.txt
9,I-10 Cosyne 2008 Interactions between chroma...,Cosyne2008book.txt


In [9]:
try_table.to_csv("COSYNE_SCRAPED_Final.csv")