In [None]:
import os
import re

import pandas as pd
import spacy
from nltk.tokenize import word_tokenize

In [72]:
# Load the papers to annotate; the CSV uses ";" as its field separator.
fulltext_csv_path = "data/papers_to_annotate_fulltext.csv"
df_fulltext = pd.read_csv(fulltext_csv_path, sep=";")

In [48]:
# Sentence endings after which a sentence break is considered spurious:
# the abbreviations "et al.", "cf.", "eg.", "Fig.", "in.", "m." and a
# four-digit number followed by ";" (as in a citation list).
# The negative lookbehind stops ordinary words that merely end in the same
# letters ("system.", "protein.", "leg.") from being treated as abbreviations.
_SPLIT_ENDING_RE = re.compile(
    r"(?:(?<![A-Za-z])(?:et al|cf|eg|Fig|in|m)\.|\d{4};)$"
)


def clean_tokenized_sentences(sentences):
    """Re-join sentences that were split after an abbreviation and drop fragments.

    A sentence ending in one of the known abbreviations (or in a four-digit
    number followed by ";") is joined with the sentence(s) that follow it,
    separated by a single space. Joining continues for as long as the
    accumulated text still ends in such an ending, so chains like
    "... et al." / "(2020) and Fig." / "3 shows ..." become one sentence.

    Afterwards, sentences are discarded when ``word_tokenize`` yields three or
    fewer tokens for them, or when every token is a single character.

    Args:
        sentences: List of sentence strings in document order.

    Returns:
        A new list of sentence strings; the input list is not modified.
    """
    joined_sentences = []
    position = 0
    total = len(sentences)

    while position < total:
        current_sentence = sentences[position]
        position += 1
        # Absorb following sentences while the text so far ends in an
        # abbreviation. Nothing is appended when there is no following
        # sentence, so the last sentence never gains a trailing space.
        while position < total and _SPLIT_ENDING_RE.search(current_sentence):
            current_sentence = current_sentence + " " + sentences[position]
            position += 1
        joined_sentences.append(current_sentence)

    # Keep only sentences with more than three tokens that are not made up
    # exclusively of single-character tokens. Tokenize each sentence once.
    kept_sentences = []
    for sentence in joined_sentences:
        tokens = word_tokenize(sentence)
        if len(tokens) > 3 and not all(len(token) == 1 for token in tokens):
            kept_sentences.append(sentence)

    return kept_sentences

In [33]:
def merge_sentences(sentences):
    """Merge sentences that were split inside a parenthesised passage.

    A sentence containing "(" but no ")" opens a group. Every following
    sentence is added to the group until one containing ")" is seen; the
    group is then emitted as a single sentence joined by single spaces.
    Sentences outside a group are passed through unchanged.

    No text is ever dropped:
      * a further unclosed "(" inside an open group is added to the group
        instead of replacing it;
      * if the input ends while a group is still open, the sentences of that
        group are emitted individually and unmerged, since no closing
        parenthesis justified joining them.

    Args:
        sentences: List of sentence strings in document order.

    Returns:
        A new list of sentence strings.
    """
    merged_sentences = []
    pending_parts = []  # sentences collected since an unclosed "("

    for sentence in sentences:
        if pending_parts:
            pending_parts.append(sentence)
            if ')' in sentence:
                # Closing parenthesis found: emit the whole group as one sentence
                merged_sentences.append(" ".join(pending_parts))
                pending_parts = []
        elif '(' in sentence and ')' not in sentence:
            # Opening parenthesis without a closing one: start a new group
            pending_parts.append(sentence)
        else:
            merged_sentences.append(sentence)

    # Input ended inside an open group: keep the text rather than losing it
    merged_sentences.extend(pending_parts)

    return merged_sentences

In [None]:
# spaCy pipelines that have already been loaded, keyed by model name.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name):
    """Return the spaCy pipeline for ``model_name``, loading it on first use only."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def sentence_tokenizing(text, model_name="en_core_web_sm"):
    """Split ``text`` into sentences and repair common mis-splits.

    The text is segmented with spaCy, then passed through
    ``clean_tokenized_sentences`` and ``merge_sentences``.

    The pipeline is loaded once per model name and reused on later calls, so
    applying this function to every row of a DataFrame does not reload the
    model for each row.

    Args:
        text: The document text to split.
        model_name: Name of the spaCy model to use.

    Returns:
        List of sentence strings.
    """
    nlp = _get_spacy_pipeline(model_name)
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    sentences = clean_tokenized_sentences(sentences)
    sentences = merge_sentences(sentences)
    return sentences

In [65]:
def remove_text(text):
    """Cut ``text`` off at the start of its trailing end-matter section.

    The markers "Funding", "Author statement" and "CRediT authorship" are
    tried in that order. For the first marker that occurs in ``text``, the
    text is truncated at that marker's last occurrence (the marker itself is
    removed too) and returned. If no marker occurs, ``text`` is returned
    unchanged.

    NOTE(review): the first marker found wins, so a later-listed marker is
    never considered once an earlier-listed one is present, even if it
    appears further along in the text. Confirm this priority is intended.
    """
    for marker in ("Funding", "Author statement", "CRediT authorship"):
        cut_position = text.rfind(marker)
        if cut_position != -1:
            return text[:cut_position]
    return text

In [67]:
# Truncate each paper's full text with remove_text and keep the result in a new column.
df_fulltext["text_cut"] = df_fulltext["full_text"].apply(remove_text)

In [80]:
# One list of sentences per paper, computed from the truncated text.
df_sentences = df_fulltext["text_cut"].apply(sentence_tokenizing)

In [None]:
# Write one text file per paper with one sentence per line. The file name is
# built from the paper title reduced to its alphanumeric characters.
os.makedirs("data/tokenized", exist_ok=True)  # the output folder may not exist yet

# Pair titles and sentence lists by position, so the loop does not rely on a
# manual counter or on the DataFrame's index labels.
for title, text in zip(df_fulltext["title"], df_sentences):
    title = ''.join(e for e in title if e.isalnum())
    file_name = "data/tokenized/tokenized_text_" + str(title) + ".txt"
    # Explicit UTF-8 so the result does not depend on the platform's default encoding
    with open(file_name, "w", encoding="utf-8") as f:
        f.writelines(line + '\n' for line in text)