# Extract the plain text of papers in The Lancet Journal

In [27]:
import requests
from bs4 import BeautifulSoup
import re

from selenium import webdriver 
from selenium.webdriver.chrome.service import Service as ChromeService 
from webdriver_manager.chrome import ChromeDriverManager 

def get_dynamic_page(url):
    """Load ``url`` in Chrome through Selenium and return the page HTML.

    A Chrome session is started with the chromedriver binary installed by
    ``ChromeDriverManager`` (pinned to version 114.0.5735.90), the page is
    opened, and the browser's current page source is returned.

    Args:
        url: Address of the page to open.

    Returns:
        The value of ``driver.page_source`` after ``driver.get(url)``.
    """
    driver = webdriver.Chrome(
        service=ChromeService(
            ChromeDriverManager(version='114.0.5735.90').install()
        )
    )
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Always shut the session down; without this every call leaves a
        # Chrome window and a chromedriver process running.
        driver.quit()

# Article to scrape and its DOI; the '/' in the DOI is swapped for '_'
# before the DOI is used to build the output directory name.
url = 'https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(16)31919-5/fulltext'
doi = '10.1016/S0140-6736(16)31919-5'.replace('/', '_')
outdir = 'DOI_' + doi

# Fetch the page through the browser and show the HTML as the cell result.
html_text = get_dynamic_page(url)
html_text

'<html lang="en" class="pb-page wf-sourcesanspro-n4-active wf-active fonts-loaded" data-request-id="6109e9e7-20f8-4558-86cd-4461792f7570"><head data-pb-dropzone="head"><style>.hs-cta-wrapper p, .hs-cta-wrapper div { margin: 0; padding: 0; } a#cta_button_3798016_b265e8ac-1762-4765-aef5-64ad8cf1965c {\n    -webkit-font-smoothing: antialiased !important;\ncursor: pointer !important;\n-moz-user-select: none !important;\n-webkit-user-select: none !important;\n-o-user-select: none !important;\nuser-select: none !important;\ndisplay: inline-block !important;\nfont-weight: normal !important;\ntext-align: center !important;\ntext-decoration: none !important;\n-moz-transition: all .4s ease !important;\n-webkit-transition: all .4s ease !important;\n-o-transition: all .4s ease !important;\nbackground: rgb(72,116,131) !important;\nborder-radius: 5px !important;\nborder-width: 0px !important;\ncolor: rgb(255,255,255) !important;\nfont-family: sans-serif !important;\nheight: auto !important;\ntransit

In [29]:
import re
# Build a parse tree from the downloaded page
soup = BeautifulSoup(html_text, 'html.parser')

# Collect every <a> tag whose href matches the section-anchor pattern
section_href = re.compile('#seccestitle[0-9][0-9]*')
section_list = soup.find_all('a', href=section_href)

section_list

[<a class="w-slide__hide" href="#seccestitle10"><span>Summary</span></a>,
 <a class="w-slide__hide" href="#seccestitle70"><span>Introduction</span></a>,
 <a class="w-slide__hide" href="#seccestitle80"><span>Methods</span></a>,
 <a class="w-slide__hide" href="#seccestitle120"><span>Results</span></a>,
 <a class="w-slide__hide" href="#seccestitle130"><span>Discussion</span></a>,
 <a class="w-slide__hide" href="#seccestitle150"><span>References</span></a>]

In [94]:
import pprint
import pandas as pd
# Map each section title to the list of its paragraph strings.
sections = {}
for section in section_list:
    section_name = section.text
    # The link's href is '#<id>'; the heading is looked up by the bare id.
    section_id = section['href'].replace('#', '')
    heading = soup.find('h2', attrs={'id': section_id})
    if heading is None:
        # No matching <h2> on the page: skip this entry instead of failing
        # with AttributeError on ``None.parent``.
        print('Skipping section {!r}: no <h2 id="{}"> found'.format(
            section_name, section_id))
        continue
    section_body = heading.parent.find_all(
        'div', attrs={'class': 'section-paragraph'})

    # Keep only the text nodes that are direct children of each paragraph
    # div (recursive=False); text nested inside child tags is left out.
    # ``string=True`` is the current name of the deprecated ``text=True``.
    sections[section_name] = [
        ''.join(s.find_all(string=True, recursive=False))
        for s in section_body
    ]

pprint.pprint(sections['Discussion'])

['Raised blood pressure has transitioned from a risk factor largely affecting '
 'high-income countries to one that is now most prevalent in low-income '
 'countries in south Asia and sub-Saharan Africa, while being a persistent '
 'health issue in central and eastern Europe. Although favourable trends '
 'continue in high-income countries, and might also be happening in some '
 'middle-income regions, other low-income and middle-income regions are '
 'affected by rising, or at best stable but high, blood pressure. The number '
 'of people with raised blood pressure in the world has increased by 90% '
 'during these four decades, with the majority of the increase occurring in '
 'low-income and middle-income countries, and largely driven by the growth and '
 'ageing of the population.',
 'At the global level, we estimated lower mean systolic blood pressure in the '
 '1980s, and hence a smaller reduction over time, than reported by Danaei and '
 'colleagues, possibly because we had more

In [95]:
# Write sections to files

import os
# Create the output directory; exist_ok replaces the separate existence
# check, which could race with another process creating the directory.
os.makedirs(outdir, exist_ok=True)

for section_name, section_body in sections.items():
    # Make the section title usable as a file-name component: '.', ' ' and
    # '/' (which would otherwise be read as a path separator) become '_'.
    safe_name = section_name.replace('.', '_').replace(' ', '_').replace('/', '_')
    outfile = '{}/{}_section_{}.txt'.format(outdir, doi, safe_name)
    # Write as UTF-8 explicitly so non-ASCII characters in the article text
    # do not depend on the platform's default encoding.
    with open(outfile, 'w', encoding='utf-8') as fw:
        # Paragraphs are separated by a blank line; the file ends with '\n'.
        fw.write('{}\n'.format('\n\n'.join(section_body)))