# Clean Text from Research and White Papers

In [1]:
import os
import re
from glob import glob

import fitz
import pandas as pd
import requests

In [2]:
# Relative path to the parent of the current working directory
PROJ_ROOT = os.pardir

## About

### Objective

Clean research paper and white paper texts.

### Output

The output is a `DataFrame` containing the following columns:

1. `file_num`
   - (paper) file number
2. `file_name`
   - (paper) file name
3. `url`
   - web url of paper
4. `text`
   - raw text
5. `char_count`
   - approximate number of characters in raw text
6. `sentence_count_raw`
   - approximate number of sentences in raw text
7. `token_count`
   - approximate number of tokens in raw text (estimated as one token per four characters)
8. `text_cleaned`
   - cleaned text from paper
9. `type`
   - type of data source (`research_paper` or `white_paper`)

### Notes About Data Privacy

1. All papers were retrieved by searching Google Scholar and manually downloading the `.pdf` file. All `.pdf` files are stored locally and will be deleted on November 30, 2024.
2. Raw or processed text outputs are not shown here.

## User Inputs

In [3]:
# Name of the parquet file that receives the processed output
fname_processed = 'research_and_white_papers_cleaned.parquet'

# Web address of every paper, listed in file-number order (numbering starts at 1)
paper_url_list = [
    'https://iopscience.iop.org/article/10.1088/1755-1315/750/1/012032/pdf',
    'https://academic.oup.com/oocc/article-pdf/4/1/kgae005/57448622/kgae005.pdf',
    'https://pmc.ncbi.nlm.nih.gov/articles/PMC9448965/',
    'https://www.frontiersin.org/journals/marine-science/articles/10.3389/fmars.2021.704682/full',
    'https://www.sciencedirect.com/science/article/pii/S0048969723007295',
    'https://onlinelibrary.wiley.com/doi/10.1111/gcb.16192',
    'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0088720&type=printable',
    'https://www.nature.com/articles/s41598-023-33613-1',
    'https://www.nature.com/articles/s41467-023-40601-6',
    'https://pmc.ncbi.nlm.nih.gov/articles/PMC9545801/',
    'https://www.coris.noaa.gov/activities/reef_managers_guide/reef_managers_guide_ch4.pdf',
    'https://icriforum.org/documents/wcs-coral-reefs-whitepaper-2021/',
    'https://library.sprep.org/content/forecasting-climate-sanctuaries-securing-future-coral-reefs-executive-summary',
]
paper_urls = [
    {'file_num': position, 'url': address}
    for position, address in enumerate(paper_url_list, start=1)
]

# Non-greedy regex matching any square-bracketed span, e.g. "[12]"
references_pattern = r'\[.*?\]'

# Columns that together identify one paper
paper_id_cols = ['file_num', 'file_name']

# Columns (and their order) of the exported DataFrame
output_columns = paper_id_cols + [
    'url',
    'text',
    'char_count',
    'sentence_count_raw',
    'token_count',
    'text_cleaned',
    'type',
]

In [4]:
# Directory layout below the project root
data_dir = os.path.join(PROJ_ROOT, 'data')
raw_data_dir = os.path.join(data_dir, 'raw')
raw_data_dir_research = os.path.join(raw_data_dir, 'research')
raw_data_dir_white_papers = os.path.join(raw_data_dir, 'white-papers')
processed_data_dir = os.path.join(data_dir, 'processed')

# Full path of the exported parquet file
fpath_processed = os.path.join(processed_data_dir, fname_processed)

In [5]:
def numeric_sort(test_string):
    """Return the first integer found in the file name of a path.

    Used as a sort key so that, for example, ``10.pdf`` sorts after ``2.pdf``.
    Only the base name is inspected, so digits in parent directories
    (e.g. ``/home/user2/...``) do not influence the ordering.

    Parameters
    ----------
    test_string : str
        File name or file path.

    Returns
    -------
    int
        The first run of digits in the base name, as an integer.

    Raises
    ------
    ValueError
        If the base name contains no digits.
    """
    digit_runs = re.findall(r'\d+', os.path.basename(test_string))
    if not digit_runs:
        raise ValueError(f"no number found in file name of {test_string!r}")
    return int(digit_runs[0])

## Load Data, including Metadata

Get sorted list of papers

In [6]:
# research papers, ordered by the number that numeric_sort extracts from each path
filepaths_research = sorted(
    glob(os.path.join(raw_data_dir_research, "*.pdf")),
    key=numeric_sort,
)

# white papers, ordered the same way
filepaths_white_papers = sorted(
    glob(os.path.join(raw_data_dir_white_papers, "*.pdf")),
    key=numeric_sort,
)

# research papers come first, followed by the white papers
filepaths = [*filepaths_research, *filepaths_white_papers]

Load papers

In [7]:
%%time
# Build one record per PDF page. `file_num` is the 1-based position of the
# file in `filepaths`; `page_number` is 0-based.
records = []
for k, fpath in enumerate(filepaths, 1):
    # the context manager closes each PDF once its pages have been read
    with fitz.open(fpath) as pdf_document:
        for page_number, page in enumerate(pdf_document):
            # extract the page text once and reuse it for every derived field
            page_text = page.get_text()
            records.append(
                {
                    # metadata
                    "file_num": k,
                    "file_name": os.path.basename(fpath),
                    "page_number": page_number,
                    "page_char_count": len(page_text),
                    # rough sentence count: number of ". "-separated pieces
                    "page_sentence_count_raw": len(page_text.split(". ")),
                    # rough token count: one token per four characters
                    "page_token_count": round(len(page_text)/4),
                    # data
                    "text": page_text,
                }
            )
df = pd.DataFrame.from_records(records)
print(
    f"Extracted contents from {len(filepaths):,} .pdf files into "
    f"{len(df):,} pages"
)
df.head()

Extracted contents from 13 .pdf files into 177 pages
CPU times: user 5.73 s, sys: 34.4 ms, total: 5.76 s
Wall time: 5.77 s


Unnamed: 0,file_num,file_name,page_number,page_char_count,page_sentence_count_raw,page_token_count,text
0,1,1.pdf,0,857,4,214,IOP Conference Series: Earth\nand Environmenta...
1,1,1.pdf,1,3279,22,820,Content from this work may be used under the t...
2,1,1.pdf,2,2347,17,587,The 6th International Conference on Tropical a...
3,1,1.pdf,3,3459,25,865,The 6th International Conference on Tropical a...
4,1,1.pdf,4,2837,16,709,The 6th International Conference on Tropical a...


## Process Data

### Research Paper 1

In [8]:
# Ordered (old, new) substitutions applied to every page body of paper 1
page_substitutions_1 = [
    ('\n', ' '),
    ('\t', ''),
    ("\'", ''),
    ('–', '-'),
    ('°', ''),
    ('2.1.', ''),
    ('2.2.', ''),
    ('Data and Method', ''),
    ('Methodology', ''),
    ('(https://oceancolor.gsfc.nasa.gov/)', ''),
]

# Page 0 is skipped (page_number >= 1)
page_bodies_1 = []
for raw_page_1 in df.query("file_num == 1 and page_number >= 1")['text']:
    # keep the second chunk after splitting on the run of blank lines
    # (presumably this drops the page header; confirm against the PDF)
    page_body_1 = raw_page_1.split("\n \n \n \n \n \n \n")[1]
    for old_1, new_1 in page_substitutions_1:
        page_body_1 = page_body_1.replace(old_1, new_1)
    page_bodies_1.append(page_body_1)

# keep the span between the introduction heading and the reference list
main_text_1 = (
    ' '.join(page_bodies_1)
    .split('1. Introduction  ')[1]
    .split('  References')[0]
)

# strip bracketed spans, then replace double spaces with single ones (one pass)
text_1 = re.sub(references_pattern, '', main_text_1).replace('  ', ' ')

### Research Paper 2

In [9]:
# Ordered (old, new) substitutions applied to every page of paper 2.
# The full download notice contains 'kgae005', so it has to be removed BEFORE
# the bare 'kgae005' substitution; otherwise it can never match.
page_substitutions_2 = [
    ('\n', ' '),
    ('\xad', ''),
    (
        (
            'Downloaded from https://academic.oup.com/oocc/article/4/1/'
            'kgae005/7666987 by guest on 22 November 2024.'
        ),
        ''
    ),
    ('kgae005', ''),
    ('~', ' '),
    (' | Oxford Open Climate Change, 2024, Vol. 4, No. 1', ''),
    ('Goreau and Hayes | ', ''),
    (
        'Received: January 22, 2024. Revised: March 19, 2024. Accepted: March 19, 2024', ''
    ),
    (
        'The Author(s) 2024. Published by Oxford University Press.', ''
    ),
    (
        'This is an Open Access article distributed under the terms of the Creative Commons Attribution License', ''
    ),
    (
        'permits unrestricted reuse, distribution, and reproduction in any medium, provided the original work is properly cited.', ''
    ),
    ('Oxford Open Climate Change, 2024, 4(1)', ''),
    # fallbacks for notice fragments that did not match the full pattern
    ('Downloaded from', ''),
    ('Research Article', ''),
    ('by guest on 22 November 2024', ''),
]

cleaned_pages_2 = []
for raw_page_2 in df.query("file_num == 2 and page_number >= 0")['text']:
    page_text_2 = raw_page_2
    for old_2, new_2 in page_substitutions_2:
        page_text_2 = page_text_2.replace(old_2, new_2)
    cleaned_pages_2.append(page_text_2)

# keep the span between the introduction heading and the acknowledgements
text_2 = (
    ' '.join(cleaned_pages_2)
    .split(' Introduction: ')[1]
    .split('Acknowledgements ')[0]
)
# strip bracketed spans and any remaining URLs
text_2 = re.sub(references_pattern, '', text_2)
text_2 = re.sub(r'http\S+', '', text_2)
# remove one leftover fragment of the extracted text
text_2 = text_2.replace('    #     ( which    ,              ', '')

### Research Paper 3

In [10]:
# Ordered (old, new) substitutions applied to every page of paper 3
page_substitutions_3 = (
    [
        ('\xa0', ' '),
        ('\xad', ''),
        ('\u2009', ''),
        ('JOHNSON et al.', ''),
        ('\u2003', ' '),
        ('\u2002', ' '),
    ]
    # "N of 12" markers for N = 3 .. 10
    + [(f'{page_marker} of 12', '') for page_marker in range(3, 11)]
    + [
        ('|', ''),
        ('T A X O N O M Y  C L A S S I F I C A T I O N\n', ''),
        ('Applied ecology; Biodiversity ecology; Functional ecology; Global change ecology; \nMacroecology', ''),
        # newlines go last because the entries above match across line breaks
        ('\n', ' '),
    ]
)

cleaned_pages_3 = []
for raw_page_3 in df.query("file_num == 3 and page_number >= 0")['text']:
    page_text_3 = raw_page_3
    for old_3, new_3 in page_substitutions_3:
        page_text_3 = page_text_3.replace(old_3, new_3)
    cleaned_pages_3.append(page_text_3)

# keep the span between the introduction heading and the author contributions
text_3 = (
    ' '.join(cleaned_pages_3)
    .split('1    INTRODUCTION')[1]
    .split('AUTHOR CONTRIBUTIONS')[0]
)

### Research Paper 4

In [11]:
# Ordered (old, new) substitutions applied to every page of paper 4
page_substitutions_4 = (
    # journal footer, one variant per printed page number 1 .. 7
    [
        (
            f'Frontiers in Marine Science | www.frontiersin.org\n{footer_page}\nAugust 2021 | Volume 8 | Article 704682',
            '',
        )
        for footer_page in range(1, 8)
    ]
    + [
        ('Dao et al.\nSeawater Temperature on Coral Reefs', ''),
        ('www.blue-communities.org', ''),
        # newlines are replaced only after the multi-line entries above
        ('\n', ' '),
        ('ñ', 'n'),
    ]
)

cleaned_pages_4 = []
for raw_page_4 in df.query("file_num == 4 and page_number >= 0")['text']:
    page_text_4 = raw_page_4
    for old_4, new_4 in page_substitutions_4:
        page_text_4 = page_text_4.replace(old_4, new_4)
    cleaned_pages_4.append(page_text_4)

# keep the span between the first 'INTRODUCTION' and the data availability statement
text_4 = (
    ' '.join(cleaned_pages_4)
    .split('INTRODUCTION')[1]
    .split('DATA AVAILABILITY STATEMENT')[0]
)

### Research Paper 5

In [12]:
# Multi-line footer block removed from paper 5 (adjacent literals form one string)
author_footer_5 = (
    "⁎ Corresponding author.\n"
    "E-mail address: rvw@ﬁt.edu (R. van Woesik).\n"
    "1 Present address: School of Zoology and The Steinhardt Museum of Natural History, Tel\n"
    "Aviv University, Tel Aviv 69978, Israel.\n"
    "http://dx.doi.org/10.1016/j.scitotenv.2023.162113\n"
    "Received 11 December 2022; Received in revised form 2 February 2023; Accepted 4 February 2023\n"
    "Available online 9 February 2023\n"
    "0048-9697/© 2023 The Author(s). Published by Elsevier B.V. This is an open access article under the CC BY-NC license (http://creativecommons.org/licenses/by-nc/4.0/)."
)

# Multi-line journal banner removed from paper 5
journal_banner_5 = (
    "Contents lists available at ScienceDirect\n"
    "Science of the Total Environment\n"
    "journal homepage: www.elsevier.com/locate/scitotenv"
)

# Ordered (old, new) substitutions applied to every page of paper 5
page_substitutions_5 = [
    (author_footer_5, ""),
    (journal_banner_5, ""),
    ("Science of the Total Environment 871 (2023) 162113", ""),
    # double newlines are dropped, remaining single newlines become spaces
    ('\n\n', ''),
    ('\n', ' '),
]

cleaned_pages_5 = []
for raw_page_5 in df.query("file_num == 5 and page_number >= 0")['text']:
    page_text_5 = raw_page_5
    for old_5, new_5 in page_substitutions_5:
        page_text_5 = page_text_5.replace(old_5, new_5)
    cleaned_pages_5.append(page_text_5)

# keep the span between the introduction heading and the CRediT statement
text_5 = (
    ' '.join(cleaned_pages_5)
    .split('1. Introduction')[1]
    .split('CRediT authorship contribution statement')[0]
)

### Research Paper 6

In [13]:
# Ordered (old, new) substitutions applied to every page of paper 6
page_substitutions_6 = (
    [
        ('\xa0', ' '),
        ('\xad', ''),
        ('\u2009', ''),
        ('JOHNSON et al.', ''),
        ('\u2003', ' '),
        ('\u2002', ' '),
        ('VAN WOESIK et al.', ''),
        ('Funding information', ''),
        ('Division of Ocean Sciences, Grant/Award ', ''),
        ('Number: OCE 1829393 and OCE 1838667', ''),
    ]
    # the numbers 4231 .. 4241 (presumably printed page numbers; confirm)
    + [(str(printed_number), '') for printed_number in range(4231, 4242)]
    + [
        # section headings: drop the "N  |  " prefix (heading 5 is removed entirely)
        ('2  |  ENVIRONMENTAL', 'ENVIRONMENTAL'),
        ('3  |  DIFFERENTIAL THERMAL-STRESS', 'DIFFERENTIAL THERMAL-STRESS'),
        ('4  |  LINKING CORAL-BLEACHING', 'LINKING CORAL-BLEACHING'),
        ('5  |  LOOKING FORWARD', ''),
        ('6  |  FROM SCIENCE TO MANAGEMENT', 'FROM SCIENCE TO MANAGEMENT'),
        ('7  |  MESOSCALE SANCTUARIES', 'MESOSCALE SANCTUARIES'),
        ('8  |  CONCLUDING REMARKS', 'CONCLUDING REMARKS'),
        ('K E Y W O R D S', ''),
        ('CORAL \nBLEACHING', 'CORAL BLEACHING'),
        (', https://www.ncbi.nlm.nih.gov/geo/', ''),
        (', https://geome​-db.org/', ''),
        (' (https://www.bco-dmo.org)', ''),
        ('; https://coral​reefw​atch.noaa.gov/produ​ct/5km/index_5km_dhw.php', ''),
        ('climate change, conservation, coral bleaching, coral reefs, corals, global warming, mesoscale', ''),
        ('sanctuaries, networks, protected reefs, refugia, thermal stress', ''),
        # newlines go last because some entries above match across line breaks
        ('\n', ' '),
    ]
)

# Page 0 is skipped (page_number >= 1)
cleaned_pages_6 = []
for raw_page_6 in df.query("file_num == 6 and page_number >= 1")['text']:
    page_text_6 = raw_page_6
    for old_6, new_6 in page_substitutions_6:
        page_text_6 = page_text_6.replace(old_6, new_6)
    cleaned_pages_6.append(page_text_6)

# keep the span between the introduction heading and the acknowledgments
text_6 = (
    ' '.join(cleaned_pages_6)
    .split('1  |  INTRODUCTION ')[1]
    .split(' ACKNOWLEDGMENTS ')[0]
)
# remove the remaining space-padded pipe separators
for leftover_separator_6 in ('      |    ', '   |     '):
    text_6 = text_6.replace(leftover_separator_6, '')

### White Paper 1

In [14]:
# Ordered (old, new) substitutions applied to every page of white paper 1.
# Repeated entries are kept so the sequence of operations stays unchanged.
page_substitutions_7 = [
    ('CORAL BLEACHING – A REVIEW OF THE CAUSES AND CONSEQUENCES', ''),
    ('CORAL BLEACHING: SCIENCE', ''),
    ('A REEF MANAGER’S GUIDE TO CORAL BLEACHING', ''),
    # photo credits
    ('© Kirsten Michalek-Wagner', ''),
    ('© Chris Hawkins', ''),
    ('© Ove Hoegh-Guldberg', ''),
    ('© Christian Perthen', ''),
    ('© Paul Marshall', ''),
    ('© Yusri Yusuf', ''),
    ('© James Oliver', ''),
    ('© Australian Institute of Marine Science, Long Term Monitoring Program', ''),
    ('© Rohan Arthur', ''),
    ('© Ove Hoegh-Guldberg', ''),
    ('© Simon Albert', ''),
    ('© Yusri Yusuf', ''),
    ('4. CORAL BLEACHING', 'CORAL BLEACHING'),
    ('\n', ' '),
]

cleaned_pages_7 = []
for raw_page_7 in df.query("file_name == '7_WP_1.pdf' and page_number >= 0")['text']:
    page_text_7 = raw_page_7
    for old_7, new_7 in page_substitutions_7:
        page_text_7 = page_text_7.replace(old_7, new_7)
    cleaned_pages_7.append(page_text_7)

# all cleaned pages are kept; no leading or trailing section is cut off
text_7 = ' '.join(cleaned_pages_7)

### White Paper 2

In [15]:
# Ordered (old, new) substitutions applied to every page of white paper 2
page_substitutions_8 = (
    # list markers "1.  " .. "8.  " (digit, period, two spaces)
    [(f'{list_number}.  ', '') for list_number in range(1, 9)]
    + [
        ('▪', ''),
        ('\t', ''),
    ]
)

# Page 0 is skipped (page_number >= 1)
cleaned_pages_8 = []
for raw_page_8 in df.query("file_name == '8_WP_2.pdf' and page_number >= 1")['text']:
    page_text_8 = raw_page_8
    for old_8, new_8 in page_substitutions_8:
        page_text_8 = page_text_8.replace(old_8, new_8)
    cleaned_pages_8.append(page_text_8)

# keep the span between 'Abstract' and 'References'
text_8 = ' '.join(cleaned_pages_8).split('Abstract')[1].split('References')[0]

# drop the stand-alone numbers 3 .. 13 that sit on their own line
# (presumably printed page numbers; confirm), then flatten the line breaks
for line_number_8 in range(3, 14):
    text_8 = text_8.replace(f'\n {line_number_8}\n', '')
text_8 = text_8.replace('\n', ' ')

### White Paper 3

In [16]:
# Ordered (old, new) substitutions applied to every page of white paper 3.
# Repeated entries are kept so the sequence of operations stays unchanged.
page_substitutions_9 = [
    (
        'Authors: Tim McClanahan1, Emily Darling1, Remy Oddenyo2, Gautam Surya3,  Maria Beger4, Helen Fox5, \nStacy Jupiter6, Lizzie McLeod7, Lisa McManus8, Robert van Woesik9, Hedley Grantham3, Cheryl Logan10, \nJoseph Maina11, Vardhan Patankar12, Amelia Wenger1,13, Jens Zinke14',
        ''
    ),
    ('Forecasting Climate Sanctuaries for Securing the Future of Coral Reefs', ''),
    ('EXECUTIVE SUMMARY', ''),
    ('Vibrant Oceans Initiative Whitepaper | APRIL 2022', ''),
    ('•', ''),
    # numbered author affiliations
    ('1 Wildlife Conservation Society, Marine', ''),
    ('2 Wildlife Conservation Society, Kenya', ''),
    ('3 Wildlife Conservation Society, Forests and ', ''),
    ('Climate Change', ''),
    ('4 University of Leeds', ''),
    ('5 Coral Reef Alliance', ''),
    ('6 Wildlife Conservation Society, Melanesia', ''),
    ('7 The Nature Conservancy', ''),
    ('8 University of Hawaiʻi', ''),
    ('9 Florida Institute of Technology', ''),
    ('10 California State University Monterey Bay', ''),
    ('11 Macquarie University ', ''),
    ('12 Wildlife Conservation Society, India', ''),
    ('13 University of Queensland', ''),
    ('14 University of Leicester', ''),
    # box titles and section headings
    ('Box 1 | MODELING ADAPTATION POTENTIAL', ''),
    ('Box 2 | WHAT TO MONITOR?', ''),
    ('Background', ''),
    ('FORECASTING CLIMATE SANCTUARIES FOR ', ''),
    ('SECURING THE FUTURE OF CORAL REEFS\n', ''),
    ('Defining coral sanctuaries: the 50 Reefs approach', ''),
    ('Adding to the 50 Reefs', ''),
    ('Sanctuary models: a way forward', ''),
    ('Box 3 | LOCAL AND REGIONAL APPROACHES TO IMPLEMENTING A PORTFOLIO', ''),
    ('OF RESILIENT REEFS', ''),
    ('The future of climate sanctuaries for coral reefs', ''),
    ('Conclusions', ''),
    ('Recommendations', ''),
    ('Recommendations', ''),
    ('1interest', 'interest'),
    ('\t', ''),
    ('–', '-'),
    ('\n \n', ''),
    ('Photos by: Tom Vierus, Jocelyn Bentley, Mike Markovina, Emily Darling, The Ocean Image Bank | Design by: Haley Williams', ''),
]

cleaned_pages_9 = []
for raw_page_9 in df.query("file_name == '9_WP_3.pdf' and page_number >= 0")['text']:
    page_text_9 = raw_page_9
    for old_9, new_9 in page_substitutions_9:
        page_text_9 = page_text_9.replace(old_9, new_9)
    cleaned_pages_9.append(page_text_9)

# keep everything before the first 'References'
text_9 = ' '.join(cleaned_pages_9).split('References')[0]

# Fragments removed from the combined text, in order.
# NOTE(review): the stand-alone numbers cover 4 .. 10, 12 and 13; there is no
# entry for 11. Confirm whether that is intentional.
leftover_fragments_9 = (
    ['1\n', '2\n', '3\n']
    + [f'\n {line_number_9}\n' for line_number_9 in (4, 5, 6, 7, 8, 9, 10, 12, 13)]
    + ['\n 1', ' \n\n \n\n', '  \n\n\n']
)
for fragment_9 in leftover_fragments_9:
    text_9 = text_9.replace(fragment_9, '')
# finally turn the remaining line breaks into spaces
text_9 = text_9.replace('\n', ' ')

### Combine Cleaned Texts

In [None]:
# Cleaned text per paper, in file-number order. File numbers 7-10 have no
# cleaning step in this notebook, so their cleaned text is left as None.
research_texts = [
    text_1, text_2, text_3, text_4, text_5, text_6, None, None, None, None
]
white_paper_texts = [text_7, text_8, text_9]

cleaned_records = [
    {"file_num": file_num, "text_cleaned": cleaned_text, 'type': source_type}
    for file_num, (cleaned_text, source_type) in enumerate(
        [(paper_text, 'research_paper') for paper_text in research_texts]
        + [(paper_text, 'white_paper') for paper_text in white_paper_texts],
        start=1,
    )
]

df_cleaned = pd.DataFrame.from_records(cleaned_records)
# lower-case the cleaned text; missing entries stay missing
df_cleaned = df_cleaned.assign(
    text_cleaned=df_cleaned['text_cleaned'].str.lower()
)
df_cleaned

### Combine Cleaned Texts with Raw Metadata

In [18]:
# Collapse the page-level frame to one row per paper, then attach the cleaned
# text and the source URL.
# NOTE: this rebinds `df` and drops the page-level columns it reads, so it
# needs the page-level frame as input; re-run the load cell before running
# this cell a second time.
df = (
    df
    .assign(
        # Raw text of the whole paper: all of its pages joined in row order.
        # Grouping must be per paper only. Grouping by page-level columns as
        # well makes every group a single page, and the drop_duplicates()
        # below would then keep nothing but the first page's text.
        combined_text=lambda df: (
            df
            .groupby(paper_id_cols)
            ['text']
            .transform(lambda x: ' '.join(x))
        ),
        # per-paper totals of the page-level counts
        char_count=lambda df: (
            df
            .groupby(paper_id_cols)
            ['page_char_count']
            .transform('sum')
        ),
        sentence_count_raw=lambda df: (
            df
            .groupby(paper_id_cols)
            ['page_sentence_count_raw']
            .transform('sum')
        ),
        token_count=lambda df: (
            df
            .groupby(paper_id_cols)
            ['page_token_count']
            .transform('sum')
        ),
    )
    # every row of a paper now carries the same paper-level values,
    # so keeping the first row per paper loses nothing
    .drop_duplicates(paper_id_cols, ignore_index=True)
    .drop(
        columns=[
            'text',
            'page_number',
            'page_char_count',
            'page_sentence_count_raw',
            'page_token_count',
        ]
    )
    .merge(df_cleaned, on=['file_num'], how='left')
    .merge(pd.DataFrame.from_records(paper_urls), on='file_num', how='left')
    .rename(columns={'combined_text': "text"})
    [output_columns]
)
df

Unnamed: 0,file_num,file_name,url,text,char_count,sentence_count_raw,token_count,text_cleaned,type
0,1,1.pdf,https://iopscience.iop.org/article/10.1088/175...,IOP Conference Series: Earth\nand Environmenta...,34950,295,8738,the coral reef ecosystem is one of the biologi...,research_paper
1,2,2.pdf,https://academic.oup.com/oocc/article-pdf/4/1/...,2023 Record marine heat waves: coral reef blea...,22218,176,5555,"extreme 2023 high temperatures 2023, an el ni ...",research_paper
2,3,3.pdf,https://pmc.ncbi.nlm.nih.gov/articles/PMC9448965/,Ecology and Evolution. 2022;12:e9263. \n﻿ \n ...,50617,626,12655,coral reefs harbor the highest levels of biod...,research_paper
3,4,4.pdf,https://www.frontiersin.org/journals/marine-sc...,ORIGINAL RESEARCH\npublished: 09 August 2021\n...,40856,385,10212,coral reefs contain some of the highest level...,research_paper
4,5,5.pdf,https://www.sciencedirect.com/science/article/...,Oceanic differences in coral-bleaching respons...,79538,843,19882,"the intensity, frequency, and duration of mar...",research_paper
5,6,6.pdf,https://onlinelibrary.wiley.com/doi/10.1111/gc...,Glob Change Biol. 2022;28:4229–4250.\t\n﻿\n |...,134512,2149,33628,the relationship between scleractinian corals ...,research_paper
6,7,7.pdf,https://journals.plos.org/plosone/article/file...,Surviving Coral Bleaching Events: Porites Grow...,61718,368,15429,,research_paper
7,8,8.pdf,https://www.nature.com/articles/s41598-023-336...,1\nVol.:(0123456789)\nScientific Reports | ...,33433,320,8359,,research_paper
8,9,9.pdf,https://www.nature.com/articles/s41467-023-406...,Article\nhttps://doi.org/10.1038/s41467-023-40...,57846,637,14460,,research_paper
9,10,10.pdf,https://pmc.ncbi.nlm.nih.gov/articles/PMC9545801/,RESEARCH ARTICLE\nRecurring bleaching events d...,89270,675,22317,,research_paper


## Export to Disk

In [19]:
%%time
# create the output folder if it is missing (no effect when it already exists)
os.makedirs(processed_data_dir, exist_ok=True)
df.to_parquet(fpath_processed, index=False)

CPU times: user 10.3 ms, sys: 1 ms, total: 11.3 ms
Wall time: 13.4 ms
