# Clean Text from Research and White Papers

In [1]:
import os
from glob import glob

import fitz
import pandas as pd
import requests

In [2]:
# Relative path to the parent of the current working directory
PROJ_ROOT = os.pardir

## About

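Extract the text of each research paper and white paper PDF, strip page headers, footers and other boilerplate, and export the cleaned text together with per-document metadata.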

## User Inputs

In [3]:
# File name used for the exported parquet file
fname_processed = "research_and_white_papers_cleaned.parquet"

# Columns that together identify a single paper
paper_id_cols = [
    "file_num",
    "file_name",
]

In [4]:
# Data folders, all located under the project root
data_dir = os.path.join(PROJ_ROOT, "data")
raw_data_dir, processed_data_dir = (
    os.path.join(data_dir, subdir) for subdir in ("raw", "processed")
)

# Full path of the exported parquet file
fpath_processed = os.path.join(processed_data_dir, fname_processed)

## Load Data, including Metadata

In [5]:
%%time
def _extract_page_records(file_num, fpath):
    """Return one metadata-plus-text record for every page of a single PDF.

    Parameters
    ----------
    file_num : int
        Number assigned to the file (its 1-based position in ``filepaths``).
    fpath : str
        Path of the PDF to read.

    Returns
    -------
    list of dict
        One dict per page, holding the file identifiers, the page number
        (0-based), simple size metrics and the extracted page text.
    """
    file_name = os.path.basename(fpath)
    page_records = []
    # Open the document in a ``with`` block so that its file handle is
    # released as soon as all pages have been read.
    with fitz.open(fpath) as doc:
        for page_number, page in enumerate(doc):
            # Extract the text once and derive every metric from that string
            page_text = page.get_text()
            page_records.append(
                {
                    # metadata
                    "file_num": file_num,
                    "file_name": file_name,
                    "page_number": page_number,
                    "page_char_count": len(page_text),
                    # naive sentence count: pieces separated by ". "
                    "page_sentence_count_raw": len(page_text.split(". ")),
                    # character count divided by four (presumably a rough
                    # characters-per-token estimate)
                    "page_token_count": round(len(page_text) / 4),
                    # data
                    "text": page_text,
                }
            )
    return page_records


# Sorted so that file numbers are assigned in a reproducible order
filepaths = sorted(glob(os.path.join(raw_data_dir, "*.pdf")))
records = [
    record
    for k, fpath in enumerate(filepaths, 1)
    for record in _extract_page_records(k, fpath)
]
df = pd.DataFrame.from_records(records)
print(
    f"Extracted contents from {len(filepaths):,} .pdf files into "
    f"{len(df):,} pages"
)
df.head()

Extracted contents from 7 .pdf files into 99 pages
CPU times: user 1.77 s, sys: 11 ms, total: 1.78 s
Wall time: 1.78 s


Unnamed: 0,file_num,file_name,page_number,page_char_count,page_sentence_count_raw,page_token_count,text
0,1,1.pdf,0,857,4,214,IOP Conference Series: Earth\nand Environmenta...
1,1,1.pdf,1,3279,22,820,Content from this work may be used under the t...
2,1,1.pdf,2,2347,17,587,The 6th International Conference on Tropical a...
3,1,1.pdf,3,3459,25,865,The 6th International Conference on Tropical a...
4,1,1.pdf,4,2837,16,709,The 6th International Conference on Tropical a...


## Process Data

### Research Paper 1

In [6]:
def _clean_paper_1_page(page_text):
    """Return the second piece of a page split on the blank-line run, on one line.

    The page is split on the separator "\\n \\n \\n \\n \\n \\n \\n" and the
    piece at index 1 is kept, with every newline turned into a space.
    Raises IndexError when the separator is absent from the page.
    """
    # presumably the separator ends the running page header — confirm in the PDF
    pieces = page_text.split("\n \n \n \n \n \n \n")
    return pieces[1].replace("\n", " ")


# Pages 1 onward of file 1 (page 0 is left out)
_paper_1_pages = df.loc[(df["file_num"] == 1) & (df["page_number"] >= 1), "text"]
text_1 = (
    " ".join(_clean_paper_1_page(page_text) for page_text in _paper_1_pages)
    # keep what lies between the introduction heading and the references
    .split("1. Introduction  ")[1]
    .split("  References")[0]
)
# print(text_1)

### Research Paper 2

In [7]:
# Ordered (old, new) pairs applied to every page of paper 2
_paper_2_substitutions = [
    # newlines are flattened before the patterns below are applied
    ("\n", " "),
    (" | Oxford Open Climate Change, 2024, Vol. 4, No. 1", ""),
    ("Goreau and Hayes | ", ""),
    (
        "Downloaded from https://academic.oup.com/oocc/article/4/1/"
        "kgae005/7666987 by guest on 22 November 2024",
        "",
    ),
]


def _clean_paper_2_page(page_text):
    """Apply the paper-2 substitutions to one page, in listed order."""
    for old, new in _paper_2_substitutions:
        page_text = page_text.replace(old, new)
    return page_text


# All pages of file 2
_paper_2_pages = df.loc[(df["file_num"] == 2) & (df["page_number"] >= 0), "text"]
text_2 = (
    " ".join(_clean_paper_2_page(page_text) for page_text in _paper_2_pages)
    # keep what lies between the introduction and the acknowledgements
    .split(" Introduction: ")[1]
    .split("Acknowledgements ")[0]
)
# print(text_2)

### Research Paper 3

In [8]:
# Ordered (old, new) pairs applied to every page of paper 3
_paper_3_substitutions = (
    [
        ("\xa0", " "),  # non-breaking space
        ("\xad", ""),  # soft hyphen
        ("\u2009", ""),  # thin space
        ("JOHNSON et al.", ""),
        ("\u2003", " "),  # em space
        ("\u2002", " "),  # en space
    ]
    # "3 of 12" through "10 of 12", in ascending order
    + [(f"{page_no} of 12", "") for page_no in range(3, 11)]
    + [
        ("|", ""),
        ("T A X O N O M Y  C L A S S I F I C A T I O N\n", ""),
        (
            "Applied ecology; Biodiversity ecology; Functional ecology; "
            "Global change ecology; \nMacroecology",
            "",
        ),
        # newlines are flattened last; two patterns above contain "\n"
        ("\n", " "),
    ]
)


def _clean_paper_3_page(page_text):
    """Apply the paper-3 substitutions to one page, in listed order."""
    for old, new in _paper_3_substitutions:
        page_text = page_text.replace(old, new)
    return page_text


# All pages of file 3
_paper_3_pages = df.loc[(df["file_num"] == 3) & (df["page_number"] >= 0), "text"]
text_3 = (
    " ".join(_clean_paper_3_page(page_text) for page_text in _paper_3_pages)
    # keep what lies between the introduction heading and the contributions
    .split("1    INTRODUCTION")[1]
    .split("AUTHOR CONTRIBUTIONS")[0]
)
# print(text_3)

### Research Paper 4

In [9]:
# Ordered (old, new) pairs applied to every page of paper 4
_paper_4_substitutions = [
    # the three-line footer, once for each page label 1 through 7
    (
        "Frontiers in Marine Science | www.frontiersin.org\n"
        f"{page_label}\n"
        "August 2021 | Volume 8 | Article 704682",
        "",
    )
    for page_label in range(1, 8)
] + [
    ("Dao et al.\nSeawater Temperature on Coral Reefs", ""),
    ("www.blue-communities.org", ""),
    # newlines are flattened only after the multi-line patterns above
    ("\n", " "),
    ("ñ", "n"),
]


def _clean_paper_4_page(page_text):
    """Apply the paper-4 substitutions to one page, in listed order."""
    for old, new in _paper_4_substitutions:
        page_text = page_text.replace(old, new)
    return page_text


# All pages of file 4
_paper_4_pages = df.loc[(df["file_num"] == 4) & (df["page_number"] >= 0), "text"]
text_4 = (
    " ".join(_clean_paper_4_page(page_text) for page_text in _paper_4_pages)
    # keep what lies between the introduction and the data statement
    .split("INTRODUCTION")[1]
    .split("DATA AVAILABILITY STATEMENT")[0]
)
# print(text_4)

### Research Paper 5

In [10]:
# Ordered (old, new) pairs applied to every page of paper 5
_paper_5_substitutions = [
    # first-page footnote block (one source line per literal below)
    (
        "⁎ Corresponding author.\n"
        "E-mail address: rvw@ﬁt.edu (R. van Woesik).\n"
        "1 Present address: School of Zoology and The Steinhardt Museum of Natural History, Tel\n"
        "Aviv University, Tel Aviv 69978, Israel.\n"
        "http://dx.doi.org/10.1016/j.scitotenv.2023.162113\n"
        "Received 11 December 2022; Received in revised form 2 February 2023; Accepted 4 February 2023\n"
        "Available online 9 February 2023\n"
        "0048-9697/© 2023 The Author(s). Published by Elsevier B.V. This is an open access article under the CC BY-NC license (http://creativecommons.org/licenses/by-nc/4.0/).",
        "",
    ),
    (
        "Contents lists available at ScienceDirect\n"
        "Science of the Total Environment\n"
        "journal homepage: www.elsevier.com/locate/scitotenv",
        "",
    ),
    ("Science of the Total Environment 871 (2023) 162113", ""),
    # double newlines are dropped entirely, single ones become spaces
    ("\n\n", ""),
    ("\n", " "),
]


def _clean_paper_5_page(page_text):
    """Apply the paper-5 substitutions to one page, in listed order."""
    for old, new in _paper_5_substitutions:
        page_text = page_text.replace(old, new)
    return page_text


# All pages of file 5
_paper_5_pages = df.loc[(df["file_num"] == 5) & (df["page_number"] >= 0), "text"]
text_5 = (
    " ".join(_clean_paper_5_page(page_text) for page_text in _paper_5_pages)
    # keep what lies between the introduction and the CRediT statement
    .split("1. Introduction")[1]
    .split("CRediT authorship contribution statement")[0]
)
# print(text_5)

### Research Paper 6

In [11]:
# Ordered (old, new) pairs applied to every page of paper 6
_paper_6_substitutions = (
    [
        ("\xa0", " "),  # non-breaking space
        ("\xad", ""),  # soft hyphen
        ("\u2009", ""),  # thin space
        ("JOHNSON et al.", ""),
        ("\u2003", " "),  # em space
        ("\u2002", " "),  # en space
        ("VAN WOESIK et al.", ""),
        ("Funding information", ""),
        ("Division of Ocean Sciences, Grant/Award ", ""),
        ("Number: OCE 1829393 and OCE 1838667", ""),
    ]
    # "4231" through "4241", in ascending order.
    # NOTE(review): these are plain substring removals, so the same digits
    # would also be removed anywhere else they occur in the text — confirm.
    + [(str(page_label), "") for page_label in range(4231, 4242)]
    + [
        # section headings lose their "N  |  " prefix ...
        ("2  |  ENVIRONMENTAL", "ENVIRONMENTAL"),
        ("3  |  DIFFERENTIAL THERMAL-STRESS", "DIFFERENTIAL THERMAL-STRESS"),
        ("4  |  LINKING CORAL-BLEACHING", "LINKING CORAL-BLEACHING"),
        # ... except this one, which is removed completely.
        # NOTE(review): differs from the neighbouring headings — confirm intent.
        ("5  |  LOOKING FORWARD", ""),
        ("6  |  FROM SCIENCE TO MANAGEMENT", "FROM SCIENCE TO MANAGEMENT"),
        ("7  |  MESOSCALE SANCTUARIES", "MESOSCALE SANCTUARIES"),
        ("8  |  CONCLUDING REMARKS", "CONCLUDING REMARKS"),
        ("K E Y W O R D S", ""),
        ("CORAL \nBLEACHING", "CORAL BLEACHING"),
        # NOTE(review): the URL literals below may carry invisible characters
        # from the extracted PDF text; copy them, do not retype them.
        (", https://www.ncbi.nlm.nih.gov/geo/", ""),
        (", https://geome​-db.org/", ""),
        (" (https://www.bco-dmo.org)", ""),
        ("; https://coral​reefw​atch.noaa.gov/produ​ct/5km/index_5km_dhw.php", ""),
        (
            "climate change, conservation, coral bleaching, coral reefs, "
            "corals, global warming, mesoscale",
            "",
        ),
        ("sanctuaries, networks, protected reefs, refugia, thermal stress", ""),
        # newlines are flattened last; one pattern above contains "\n"
        ("\n", " "),
    ]
)


def _clean_paper_6_page(page_text):
    """Apply the paper-6 substitutions to one page, in listed order."""
    for old, new in _paper_6_substitutions:
        page_text = page_text.replace(old, new)
    return page_text


# Pages 1 onward of file 6 (page 0 is left out)
_paper_6_pages = df.loc[(df["file_num"] == 6) & (df["page_number"] >= 1), "text"]
text_6 = (
    " ".join(_clean_paper_6_page(page_text) for page_text in _paper_6_pages)
    # keep what lies between the introduction and the acknowledgments
    .split("1  |  INTRODUCTION ")[1]
    .split(" ACKNOWLEDGMENTS ")[0]
    # remove leftover space-padded "|" separators from the joined text
    .replace("      |    ", "")
    .replace("   |     ", "")
)
# print(text_6)

### White Paper 1

In [12]:
# Ordered (old, new) pairs applied to every page of white paper 1.
# NOTE(review): every pattern runs before newlines are flattened, so none of
# them can match text that is broken across lines in the raw page; the
# exported sample for this file still begins with the title — confirm that
# this is intended.
_paper_7_substitutions = [
    ("CORAL BLEACHING – A REVIEW OF THE CAUSES AND CONSEQUENCES", ""),
    ("CORAL BLEACHING: SCIENCE", ""),
    ("A REEF MANAGER’S GUIDE TO CORAL BLEACHING", ""),
    # photo credits (two of them are listed twice, as in the original chain)
    ("© Kirsten Michalek-Wagner", ""),
    ("© Chris Hawkins", ""),
    ("© Ove Hoegh-Guldberg", ""),
    ("© Christian Perthen", ""),
    ("© Paul Marshall", ""),
    ("© Yusri Yusuf", ""),
    ("© James Oliver", ""),
    ("© Australian Institute of Marine Science, Long Term Monitoring Program", ""),
    ("© Rohan Arthur", ""),
    ("© Ove Hoegh-Guldberg", ""),
    ("© Simon Albert", ""),
    ("© Yusri Yusuf", ""),
    ("4. CORAL BLEACHING", "CORAL BLEACHING"),
    # newlines are flattened last
    ("\n", " "),
]


def _clean_paper_7_page(page_text):
    """Apply the white-paper substitutions to one page, in listed order."""
    for old, new in _paper_7_substitutions:
        page_text = page_text.replace(old, new)
    return page_text


# All pages of file 7; unlike the research papers, no section is cut out
_paper_7_pages = df.loc[(df["file_num"] == 7) & (df["page_number"] >= 0), "text"]
text_7 = " ".join(_clean_paper_7_page(page_text) for page_text in _paper_7_pages)
# print(text_7)

### Combine Cleaned Texts

In [13]:
# One row per document: file number, cleaned text and document type.
# Files 1-6 are research papers; file 7 is the white paper.
df_cleaned = pd.DataFrame.from_records(
    [
        {
            "file_num": file_num,
            "text_cleaned": cleaned_text,
            "type": "white_paper" if file_num == 7 else "research_paper",
        }
        for file_num, cleaned_text in enumerate(
            [text_1, text_2, text_3, text_4, text_5, text_6, text_7], 1
        )
    ]
)
# Lower-case every cleaned text
df_cleaned["text_cleaned"] = df_cleaned["text_cleaned"].str.lower()

### Combine Cleaned Texts with Raw Metadata

In [14]:
# Collapse the page-level frame to one row per paper and attach the cleaned
# text. Grouping by the paper identifiers (rather than by page-level columns)
# makes combined_text hold the text of every page of a paper, joined in row
# order, instead of only the text of its first page.
# NOTE: this rebinds df, so the cell cannot be re-run without first
# re-running the cell that loads the page-level frame.
df = (
    df
    # sort=False keeps papers in their order of first appearance
    .groupby(paper_id_cols, as_index=False, sort=False)
    .agg(
        combined_text=("text", " ".join),
        char_count=("page_char_count", "sum"),
        sentence_count_raw=("page_sentence_count_raw", "sum"),
        token_count=("page_token_count", "sum"),
    )
    # left join: every paper is kept even if it has no cleaned text
    .merge(df_cleaned, on=["file_num"], how="left")
    .rename(columns={"text_cleaned": "combined_text_cleaned"})
)
df

Unnamed: 0,file_num,file_name,combined_text,char_count,sentence_count_raw,token_count,combined_text_cleaned,type
0,1,1.pdf,IOP Conference Series: Earth\nand Environmenta...,34950,295,8738,the coral reef ecosystem is one of the biologi...,research_paper
1,2,2.pdf,2023 Record marine heat waves: coral reef blea...,22218,176,5555,"extreme 2023 high temperatures 2023, an el ni~...",research_paper
2,3,3.pdf,Ecology and Evolution. 2022;12:e9263. \n﻿ \n ...,50617,626,12655,coral reefs harbor the highest levels of biod...,research_paper
3,4,4.pdf,ORIGINAL RESEARCH\npublished: 09 August 2021\n...,40856,385,10212,coral reefs contain some of the highest level...,research_paper
4,5,5.pdf,Oceanic differences in coral-bleaching respons...,79538,843,19882,"the intensity, frequency, and duration of mar...",research_paper
5,6,6.pdf,Glob Change Biol. 2022;28:4229–4250.\t\n﻿\n |...,134512,2149,33628,the relationship between scleractinian corals ...,research_paper
6,7,7_WP_1.pdf,CORAL BLEACHING –\nA REVIEW OF THE CAUSES\nAND...,60858,192,15215,coral bleaching – a review of the causes and c...,white_paper


## Export to Disk

In [15]:
%%time
# Create the output folder first: to_parquet does not create missing folders
os.makedirs(processed_data_dir, exist_ok=True)
# Write the per-paper frame to parquet without the row index
df.to_parquet(fpath_processed, index=False)

CPU times: user 8.39 ms, sys: 2.01 ms, total: 10.4 ms
Wall time: 10.1 ms
