In [1]:
import os
import re
import random
import nlp_project_functions as functions

# Folder the tokenisation cell reads the preprocessed sermon .txt files from.
# NOTE(review): this name shadows the builtin dir(); renaming it needs a
# matching change in the cell that reads it.
dir = "data/data_preprocessed"

# Row accumulators, one per corpus: files whose name starts with "E" feed
# orgel_list, every other file feeds aba_list.
orgel_list, aba_list = [], []

# NOTE(review): these two flags are not referenced by any visible cell;
# presumably left over from an earlier tagging approach - confirm before removing.
inside_per = inside_loc = False


In [2]:
# Tokenise every preprocessed sermon and collect one row per token, split by corpus.

# Start from empty accumulators so that re-running this cell does not append
# the whole corpus a second time.
orgel_list.clear()
aba_list.clear()

# os.listdir() returns entries in arbitrary order; sorting keeps the token
# order (and therefore the written data file) stable between runs.
for filename in sorted(os.listdir(dir)):
    if not filename.endswith('txt'):
        continue

    # Read with an explicit encoding instead of the platform default, matching
    # the utf8 encoding used for every file this notebook writes.
    with open(os.path.join(dir, filename), encoding='utf8') as f:
        sermon = f.read()

    sermon = functions.sermon_cleanup(sermon)

    # add spaces around tags
    sermon = re.sub(r'</(PERSON|LOCATION)>(\S)', r'</\1> \2', sermon)
    sermon = re.sub(r'(\S)<(PERSON|LOCATION)>', r'\1 <\2>', sermon)

    # add spaces around punctuation so each mark becomes its own token
    sermon = re.sub(r'(,|;|:|\.|\?|!|\))', r' \1 ', sermon)
    sermon = re.sub(r'(\()', r' \1 ', sermon)

    # filter out long names
    # NOTE(review): `name` is interpreted as a regex pattern here. If
    # get_long_names returns literal text, names containing regex
    # metacharacters will not match as intended - consider re.escape(name).
    long_names = functions.get_long_names(sermon)
    for name in long_names:
        sermon = re.sub(name, '', sermon)

    # create list of words, dropping the empty strings produced by
    # consecutive whitespace characters
    words = [word for word in re.split(r'\s', sermon) if word != ""]

    # Files whose name starts with "E" go to orgel_list, all others to aba_list.
    corpus_rows = orgel_list if filename.startswith("E") else aba_list
    corpus_rows.extend(functions.word_to_row(word) for word in words)

In [3]:
# Report how many token rows each corpus contributed.
orgel_token_count = len(orgel_list)
aba_token_count = len(aba_list)

print(f"Orgelpredigten Tokens: {orgel_token_count:,}")
print(f"Abacus Tokens: {aba_token_count:,}")

Orgelpredigten Tokens: 641,481
Abacus Tokens: 157,431


In [4]:
# Merge both corpora once, then pull the first and second field of every row
# into two parallel lists.
all_rows = orgel_list + aba_list
all_tokens = [row[0] for row in all_rows]
all_labels = [row[1] for row in all_rows]

In [5]:
# Keep every label that is not the outside tag "O".
named_entities = [label for label in all_labels if label != "O"]

In [6]:
# Share of tokens whose label is not "O".
# The total is deliberately not stored in a variable called `all`: that would
# shadow the builtin all() for every later cell in this kernel.
total_tokens = len(all_tokens)
ne = len(named_entities)

print(f"{ne:,} of {total_tokens:,} tokens (≈{ne / total_tokens:.2%}) are named entities.")

15,746 of 798,912 tokens (≈1.97%) are named entities.


In [7]:
# Pair each token with its label as a two-element list.
combined = [list(pair) for pair in zip(all_tokens, all_labels)]

In [8]:
# Serialise the rows as "token<TAB>label" lines. A row whose token is one of
# ".", "!", "?" or ":" is followed by an extra blank line, which marks the
# end of a sentence in the output.
all_data = "".join(
    f"{row[0]}\t{row[1]}\n\n" if row[0] in (".", "!", "?", ":") else f"{row[0]}\t{row[1]}\n"
    for row in combined
)

In [9]:
# Persist the full corpus only when functions.check_bio_validity accepts its lines.
# Failing loudly matters here: the next cell reads this file back, and after a
# silent skip it would pick up a stale copy from an earlier run (or crash on a
# missing file) with no hint about the real cause.
if not functions.check_bio_validity(all_data.split("\n")):
    raise ValueError("all_data failed the BIO validity check; nothing was written.")

# Create the output folder on a fresh checkout instead of failing in open().
os.makedirs('data/train_test_val', exist_ok=True)
with open('data/train_test_val/all_data.tsv', 'w', encoding='utf8') as f:
    f.write(all_data)
print("Data written to file!")

Data written to file!


In [10]:
# Sentences shorter than this many tokens are discarded.
MIN_SENTENCE_LEN = 3
# Sentences longer than this many tokens are handed to find_good_split.
MAX_SENTENCE_LEN = 300
# Fixed seed so the shuffle, and with it the train/test split, is reproducible.
SHUFFLE_SEED = 42

# read_conll_data yields two parallel sequences: per-sentence token lists and
# per-sentence label lists.
tokens, labels = functions.read_conll_data('data/train_test_val/all_data.tsv')

sentences = []
longest = 0
for token_i, label_i in zip(tokens, labels):
    # throw out 'sentences' under 3 words long
    if len(token_i) < MIN_SENTENCE_LEN:
        continue

    sentence = [[token, label] for token, label in zip(token_i, label_i)]

    # split up sentences that are over 300 tokens long
    if len(sentence) <= MAX_SENTENCE_LEN:
        new_sentences = [sentence]
    else:
        # list() so the result can be iterated twice even if the helper
        # returns a one-shot iterable.
        new_sentences = list(functions.find_good_split(sentence, index=len(sentence) // 2))
    sentences.extend(new_sentences)

    # Track the longest sentence over everything that was just added. The
    # previous version inspected sentences[-1] on every iteration, which
    # raised IndexError when the first input sentence was discarded and
    # ignored all but the last piece of a split.
    for new_sentence in new_sentences:
        if len(new_sentence) > longest:
            longest = len(new_sentence)
            print(f"Current longest sentence: {longest}")

# A dedicated Random instance gives a repeatable order without touching the
# state of the global random generator.
random.Random(SHUFFLE_SEED).shuffle(sentences)

Current longest sentence: 85
Current longest sentence: 91
Current longest sentence: 104
Current longest sentence: 117
Current longest sentence: 132
Current longest sentence: 185
Current longest sentence: 204
Current longest sentence: 212
Current longest sentence: 234
Current longest sentence: 254
Current longest sentence: 288
Current longest sentence: 294
Current longest sentence: 300


In [11]:
# The first 20% of the (already shuffled) sentences form the test set,
# the remainder the training set.
split_at = int(len(sentences) * 0.2)
test, train = sentences[:split_at], sentences[split_at:]

In [12]:
# Report the size of each split as an absolute count and as a share of all
# sentences. The test-data line previously lacked its closing "%)".
n_sentences = len(sentences)

print(f"Total number of sentences: {n_sentences}")
print(f"Sentences in training data: {len(train)} (= {len(train) / n_sentences:.3%})")
print(f"Sentences in test data: {len(test)} (= {len(test) / n_sentences:.3%})")

Total number of sentences: 29299
Sentences in training data: 23440 (= 80.003%)
Sentences in test data: 5859 (= 19.997


In [13]:
def two_d_list_to_conll(list):
    """Serialise sentences of (token, label) pairs into tab-separated text.

    Each pair becomes one "token<TAB>label" line, and every sentence is
    followed by a blank line.

    Args:
        list: Iterable of sentences, each an iterable of indexable items whose
            element 0 is the token and element 1 is the label. (The parameter
            name shadows the builtin; it is kept for compatibility.)

    Returns:
        The serialised text as a single string; "" for an empty input.

    A malformed item (too short or not indexable) is reported on stdout and
    skipped. Any other exception propagates to the caller instead of being
    swallowed.
    """
    lines = []
    for sentence_idx, sentence in enumerate(list):
        for token_idx, item in enumerate(sentence):
            try:
                lines.append(f"{item[0]}\t{item[1]}\n")
            except (LookupError, TypeError):
                # Only the failures a malformed row can cause are handled;
                # a bare except would also hide KeyboardInterrupt and real bugs.
                print(f"Problem in sentence {sentence_idx}, token {token_idx}.")
        lines.append("\n")
    return "".join(lines)

In [14]:
# Serialise both splits, then write each one to its own TSV file.
train_tsv, test_tsv = [two_d_list_to_conll(split) for split in (train, test)]

for tsv_path, tsv_text in (
    ('data/train_test_val/train.tsv', train_tsv),
    ('data/train_test_val/test.tsv', test_tsv),
):
    with open(tsv_path, 'w', encoding='utf8') as f:
        f.write(tsv_text)

In [15]:
%%bash

# Abort on the first failing command. Without this the cell's exit status is
# that of the last conversion only, so a failure on an earlier file is masked.
set -e

# List of files to iterate over
files=("train.tsv" "test.tsv")

# Convert each TSV file into spaCy's binary format next to the input.
# -c conll selects the converter, -n 10 groups every 10 sentences into a document.
for f in "${files[@]}"
do
    # Quote the path so a file name with spaces or glob characters stays one argument.
    python -m spacy convert "./data/train_test_val/$f" ./data/train_test_val/ -c conll -s -n 10
done

[38;5;3m⚠ Sentence boundaries found, automatic sentence segmentation with `-s`
disabled.[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (2344 documents):
data/train_test_val/train.spacy[0m
[38;5;3m⚠ Sentence boundaries found, automatic sentence segmentation with `-s`
disabled.[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (586 documents):
data/train_test_val/test.spacy[0m
