In [None]:
### contains the code for scraping
### Legacy Publications non-archived OTH/WP/WS/WEB content types
### excluding pdf and excel direct downloads

In [130]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
import pickle as pkl

pd.set_option('display.max_rows', 500)

In [119]:
# input content file
CONTENT_FILE = '/Users/sjlee/Documents/github/ncses_data/2022-06-29-prod-cms-content.xlsx'
content = pd.read_excel(CONTENT_FILE, sheet_name='Legacy Publications')

In [120]:
# filter to just the urls that need to be coded
other = content[(content['Archived']=='N') & (content['Type'].isin(['OTH', 'WP', 'WS', 'WEB']))]
oth = other.drop_duplicates(['URL'])
oth.loc[:,'xls'] = oth['URL'].apply(lambda x: bool(re.search('.xlsx|.xls', x)))
oth.loc[:,'pdf'] = oth['URL'].apply(lambda x: bool(re.search('.pdf', x)))
web = oth[(~oth['xls']) & (~oth['pdf'])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [4]:
web['URL']

583             https://www.nsf.gov/statistics/profiles/
659               https://www.nsf.gov/statistics/states/
711    https://www.nsf.gov/statistics/seind14/index.c...
765      https://www.nsf.gov/statistics/2015/ncses15201/
768      https://www.nsf.gov/statistics/2015/ncses15200/
792      https://www.nsf.gov/statistics/2016/ncses16200/
844              https://www.nsf.gov/statistics/randdef/
891        https://www.nsf.gov/statistics/2020/nsf20304/
900        https://www.nsf.gov/statistics/2020/nsf20310/
903        https://www.nsf.gov/statistics/2020/nsf20313/
908        https://www.nsf.gov/statistics/2020/nsf20315/
911            https://www.nsf.gov/statistics/ffrdclist/
912      https://www.nsf.gov/statistics/2021/ncses21200/
913      https://www.nsf.gov/statistics/2021/ncses21201/
914      https://www.nsf.gov/statistics/2021/ncses21202/
918      https://www.nsf.gov/statistics/2022/ncses22205/
Name: URL, dtype: object

In [21]:
### note: url for text and page are different
def page_with_tabs(soup, row):
    url_base = 'https://www.nsf.gov'
    
    # text for page
    legacy = 'Y'
    pub_type = row['Type']
    asset_type = 'page'
    pub_date = row['Publish Date']
    pub_id = row['Legacy Pub ID']
    slug_path = pub_id
    page_title = soup.find_all('h1')[-1].text
    url_link = row['URL']
    page_text = soup.find_all('div', class_='col-sm-12')[1].find_all('p', class_=False)[0].text
    return_lst = [[legacy, pub_type, asset_type, pub_date, pub_id, slug_path, page_title, url_link, page_text]]
    
    # get tabs
    tabs = soup.select('.tabs')[0].find_all("li")
    hrefs = [(t.text, t.find('a').get('data-hash-id'), t.find('a').get('href')) for t in tabs]
    for (title, sec_id, href) in hrefs:
        tab_url = url_base + href
        tab_page = requests.get(tab_url)
        tab_soup = BeautifulSoup(tab_page.content, "html.parser")
        
        tab_slug_path = str(slug_path) + '->' + title.replace(' ', '-').lower()
        para = tab_soup.find_all('p')
        if len(para) > 0:
            text = ' '.join([p.text for p in para])
        else:
            text = ''
        one_lst = [legacy, pub_type, 'text', pub_date, pub_id, tab_slug_path, title, tab_url, text]
        return_lst.append(one_lst)
    
    return return_lst
    

In [25]:
def page_with_panels(soup, row):
    url_base = 'https://www.nsf.gov'
    
    # text for page
    legacy = 'Y'
    pub_type = row['Type']
    asset_type = 'page'
    pub_date = row['Publish Date']
    pub_id = row['Legacy Pub ID']
    slug_path = pub_id
    page_title = soup.find_all('h1')[-1].text
    url_link = row['URL']
    page_text = ''
    return_lst = [[legacy, pub_type, asset_type, pub_date, pub_id, slug_path, page_title, url_link, page_text]]
    
    # get tabs
    tabs = soup.find('div', id='publication-tab').select('.dst-chapter')
    hrefs = [(t.text.strip(), t.get('data-hash-id'), t.find('a').get('href')) for t in tabs]
    for (title, sec_id, href) in hrefs:
        tab_url = url_base + href
        tab_page = requests.get(tab_url)
        tab_soup = BeautifulSoup(tab_page.content, "html.parser")
        
        tab_slug_path = str(slug_path) + '->' + title.replace(' ', '-').lower()
        para = tab_soup.find_all('p')
        if len(para) > 0:
            text = ' '.join([p.text for p in para])
        else:
            text = ''
        one_lst = [legacy, pub_type, 'text', pub_date, pub_id, tab_slug_path, title, tab_url, text]
        return_lst.append(one_lst)
    
    return return_lst

In [26]:
def page_with_asset(soup, row):
    url_base = 'https://www.nsf.gov'
   
    # for page
    legacy = 'Y'
    pub_type = row['Type']
    pub_date = row['Publish Date']
    pub_id = row['Legacy Pub ID']
    slug_path = pub_id
    title = soup.find_all('h1')[-1].text
    
    url = row['URL']
    return_lst = []
    
    # content soup
    url_tail = soup.find('article').find('a').get('href')
    content_url = url_base + url_tail
    content_page = requests.get(content_url)
    content_soup = BeautifulSoup(content_page.content, "html.parser")
    
    # figure
    fig_title = content_soup.find('figcaption').text
    fig_title = re.sub('FIGURE [0-9]{1,3}. ', '', fig_title)
    source_tail = content_soup.find('a', class_='source-data').get('href')
    fig_url = url + source_tail
    fig_lst = [legacy, pub_type, 'figure', pub_date, pub_id, slug_path, fig_title, fig_url, '']
    return_lst.append(fig_lst)

    # text
    ps = content_soup.find('a', class_='source-data').find_all_next("p")
    text = ' '.join([p.text for p in ps if p.find('a')==None])
    text_lst = [legacy, pub_type, 'text', pub_date, pub_id, slug_path, title, url, text]
    return_lst.append(text_lst)
    
    return return_lst

In [28]:
archived = []
error = []
done = []
skip = []

cols = ['Legacy', 'Pub Type', 'Asset Type', 'Publish Date', 'Pub ID', 'Path', 'Title', 'URL', 'Text']
parsed_lst = []

for i, row in web.iterrows():
    url = row['URL']
    print(url)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    if bool(re.search('online archive', soup.text)):
        archived.append(url)
    elif re.search('offline', soup.text):
        error.append(url)
    else: 
        if bool(re.search('states', url)):
            return_lst = page_with_tabs(soup, row)
            parsed_lst.extend(return_lst)
            done.append(url)
        
        else:
            tabs = soup.find('div', id='publication-tab')
            if tabs != None:
                if len(tabs) > 3:
                    return_lst = page_with_panels(soup, row)
                    parsed_lst.extend(return_lst)
                    done.append(url)
                else:
                    try:
                        return_lst = page_with_asset(soup, row)
                        parsed_lst.extend(return_lst)
                        done.append(url)
                    except:
                        skip.append(url)
            else:
                skip.append(url)

df = pd.DataFrame(parsed_lst, columns=cols)

https://www.nsf.gov/statistics/profiles/
https://www.nsf.gov/statistics/states/
https://www.nsf.gov/statistics/seind14/index.cfm/state-data
https://www.nsf.gov/statistics/2015/ncses15201/
https://www.nsf.gov/statistics/2015/ncses15200/
https://www.nsf.gov/statistics/2016/ncses16200/
https://www.nsf.gov/statistics/randdef/
https://www.nsf.gov/statistics/2020/nsf20304/
https://www.nsf.gov/statistics/2020/nsf20310/
https://www.nsf.gov/statistics/2020/nsf20313/
https://www.nsf.gov/statistics/2020/nsf20315/
https://www.nsf.gov/statistics/ffrdclist/
https://www.nsf.gov/statistics/2021/ncses21200/
https://www.nsf.gov/statistics/2021/ncses21201/
https://www.nsf.gov/statistics/2021/ncses21202/
https://www.nsf.gov/statistics/2022/ncses22205/


In [106]:
df = pd.DataFrame(parsed_lst, columns=cols)

In [129]:
assets = df.groupby('Asset Type').size()
nums = {'Text': assets.text, 
 'Figure': assets.figure, 
 'Table': 0, 
 'Page': assets.page, 
 'Error': len(error), 
 'Archived': len(archived), 
 'Irregular': len(skip), 
 'Zip': 0, 
 'PDF': oth[oth['pdf']].shape[0], 
 'Excel': oth[oth['xls']].shape[0]}

In [132]:
with open('/Users/sjlee/Documents/github/ncses_data/legacy_other.pkl', 'wb') as f:
    pkl.dump(df, f)

In [133]:
with open('/Users/sjlee/Documents/github/ncses_data/legacy_other_nums.pkl', 'wb') as f:
    pkl.dump(nums, f)