In [None]:
# from bs4 import BeautifulSoup as BS
# from urllib.request import Request, urlopen
# import re
# import pandas as pd
# from urllib.error import HTTPError
from helper_funs import *
import os

### Setting target links

In [None]:
# Root of the papalencyclicals.net site and the page that lists all popes.
link_base = "https://www.papalencyclicals.net"
link_popelist = f"{link_base}/popelist"

In [None]:
# Download the pope-list page. `get_url_content` is not defined in this notebook
# (presumably provided by helper_funs and returning a parsed BeautifulSoup
# document -- the next cell calls `.find` on the result; TODO confirm).
sup = get_url_content(link_popelist)

In [None]:
# Find the <h3> heading that introduces the popes with available documents ...
available_heading = sup.find('h3', string=re.compile('.*whom documents are available.*'))
# ... step to the element two siblings after the heading's grandparent ...
links_container = available_heading.parent.parent.next_sibling.next_sibling
# ... and collect every anchor inside it.
popelinks_raw = links_container.find_all('a')

In [None]:
# Pair every anchor's text with its target: one [name, href] entry per pope.
popes_with_links = []
for anchor in popelinks_raw:
    popes_with_links.append([anchor.string, anchor['href']])

## Selecting popes and cleaning their names

In [None]:
# select popes here
selected_popes_with_links = popes_with_links[:10]
# list of selected popes (raw link texts as scraped)...
selected_popes = [popelink[0] for popelink in selected_popes_with_links]
# ... and their names clean of the "St." title and of the parenthesised
# historical info. The title pattern is anchored on word boundaries and the
# dot is escaped/optional: the former pattern '(St.)' treated '.' as a wildcard
# and removed "St" plus ANY following character anywhere in the name
# (e.g. "Stephen II" -> "phen II").
selected_popes_names = [
    re.sub(r'\bSt\b\.?', '', pope.split('(', 1)[0]).strip()
    for pope in selected_popes
]
# list of selected links
selected_links = [popelink[1] for popelink in selected_popes_with_links]
# Replace the raw names with the cleaned ones, keeping each link paired with its pope
selected_popes_with_links = [
    [name, link] for name, link in zip(selected_popes_names, selected_links)
]

## Creating subfolders for each pope

In [None]:
# create a subfolder for each pope; makedirs also creates the parent 'txts'
# folder when it is missing and does nothing when the folder already exists
for pope in selected_popes_names:
    os.makedirs(os.path.join('txts', pope), exist_ok=True)

# save list of selected popes as pickle; create the target folder first so
# that open() does not fail with FileNotFoundError on a fresh checkout
os.makedirs('pickles', exist_ok=True)
with open("pickles/selected_popes_names", "wb") as fp:   #Pickling
    pickle.dump(selected_popes_names, fp)

## Scraping and preparing links to the popes' personal sites

In [None]:
# prepare links to the popes' personal sites: names become lower-case,
# dash-separated slugs, except Francis who is addressed as 'francesco'
selected_popes_names_linkform = [re.sub(' ','-',pope).lower() if pope != 'Francis' else 'francesco' for pope in selected_popes_names]
pope_links =  ['https://www.vatican.va/content/'+pope+'/en/encyclicals.index.html' for pope in selected_popes_names_linkform]
# scrape links to the encyclicals of all selected popes
pope_docs_links = {}
for pope_name, pope_link in zip(selected_popes_names, pope_links):
    try:
        sup = get_url_content(pope_link)
        # the index page itself supplies the first batch of 'English' links
        pope_docs_links[pope_name] = [tag['href'] for tag in sup.find_all('a', string='English')]
        # some pages have several subpages (signalled by a <nav> element);
        # the remaining ones live at index.2, index.3, ...
        if sup.find_all('nav') != []:
            nav_info = sup.find('div', {'class': 'navigation-info'})
            # take the last NUMBER of the navigation text rather than its last
            # character, so that 10+ pages or trailing whitespace do not break
            # the count; without any number assume a single page
            page_numbers = re.findall(r'\d+', nav_info.get_text()) if nav_info is not None else []
            no_of_subpages = int(page_numbers[-1]) if page_numbers else 1
            for j in range(2, no_of_subpages + 1):
                subpage_link = pope_link.replace('index', 'index.' + str(j))
                sub_sup = get_url_content(subpage_link)
                pope_docs_links[pope_name].extend(
                    tag['href'] for tag in sub_sup.find_all('a', string='English')
                )
    except HTTPError:
        print('Something wrong with this link: '+pope_link)
        # keep an entry (possibly partial or empty) for every selected pope so
        # that later lookups by pope name do not raise KeyError
        pope_docs_links.setdefault(pope_name, [])

## Defining the naming convention for the files

In [None]:
# pretty naming of files
def get_enc_date(link):
    """Return the segment of ``link`` used as the encyclical's date.

    The segment ends just before the last underscore of ``link`` and starts
    right after the position reported by ``rfind_nth(link, '_', 2)``.
    NOTE(review): ``rfind_nth`` is not defined in this notebook; presumably it
    returns the index of the second-to-last underscore, which would make the
    result the text between the last two underscores -- confirm against
    helper_funs.
    """
    closing_sep = link.rfind('_')
    opening_sep = rfind_nth(link, '_', 2)
    return link[opening_sep + 1:closing_sep]
def get_enc_name(link):
    """Build a unified file name ``enc_<title>_<date>`` from an encyclical link.

    The title is the part of ``link`` between its last underscore and the
    ``.html`` extension; the date is taken from ``get_enc_date(link)``.
    A leading ``enciclica`` or ``enc`` marker in the title is normalised to
    ``enc``; otherwise ``enc_`` is prepended. Finally every dash is turned
    into an underscore.

    Parameters
    ----------
    link : str
        Link to the encyclical document.

    Returns
    -------
    str
        The file name without extension.
    """
    start = link.rfind('_')
    end = link.find('.html')
    # str.find returns -1 when '.html' is absent; slicing up to -1 would
    # silently drop the last character, so use the end of the string instead
    if end < 0:
        end = len(link)
    enc_name = link[start+1:end]

    enc_date = get_enc_date(link)

    # Only a marker at the START of the title counts as an existing prefix.
    # A plain substring test for 'enc' would also fire on titles that merely
    # contain those letters and leave them without the 'enc_' prefix.
    marker = re.match(r'enc(?:iclica)?(?=[-_]|$)', enc_name)
    if marker:
        enc_name = 'enc' + enc_name[marker.end():] + '_' + enc_date
    else:
        enc_name = 'enc_' + enc_name + '_' + enc_date

    return enc_name.replace('-', '_')

## Scraping the encyclicals to each subfolder

In [None]:
# scrape the encyclicals into each pope's own folder
for pope in selected_popes_names:
    dirname = 'txts/' + pope + '/'
    # a pope may be missing from pope_docs_links if the link collection failed
    # for them earlier; treat that as "no documents" instead of a KeyError
    for link in pope_docs_links.get(pope, []):
        full_link = 'https://www.vatican.va' + link
        try:
            sup = get_url_content(full_link)
            # attrs is an {attribute: value} dict; the former set literal
            # {'class','documento'} was a typo for this mapping
            txt = sup.find('div', {'class': 'documento'}).get_text()
            with open(dirname + f'{get_enc_name(full_link)}.txt', "w", encoding="utf-8") as text_file:
                text_file.write(txt)
        except Exception as err:
            # best-effort scraping: report the failing link with its cause and
            # carry on. Unlike a bare `except:`, this lets KeyboardInterrupt
            # stop the loop.
            print('Something wrong with this link: ' + full_link + ' (' + repr(err) + ')')