# Setup

Der Code unterhalb braucht nur gensim, kann mit `pip install gensim` installiert werden. 

In [1]:
import os
import xml.etree.ElementTree as ET
from gensim.models import Word2Vec
from gensim.models import FastText
import subprocess
import re


# Word2Vec trainieren

Der Code unterhalb ist nur relevant, wenn du das Modell neu trainieren willst - ein fertiges Modell ist allerdings schon geuploaded. 

Der Code zum Testen / Anwenden ist weiter unten. 

In [5]:
def cleanup(text:str, re_cleanup:bool, lowercase:bool)->str: 
    if lowercase: 
            text = text.lower()
    if re_cleanup: 
        # Replace all punctuation chars with a single whitespace, keep only normal words and digits
        text = re.sub(r"[^\w\n\d']+", ' ', text)
        text = re.sub(r"^\s+|\s+(?=\n)", '', text, flags=re.MULTILINE) # Delete leading/trailing whitespaces
    text = re.sub(r'[^(\n|\S)]+', ' ', text) # Clean up: spaces (except newlines) to single space
    return text


def extract_text_from_xml(file_path):
    try:
        # Parse the XML file
        tree = ET.parse(file_path)
        root = tree.getroot()

        # Ensure the XML has the expected format (root is <body>)
        if root.tag != 'body':
            return []

        # Extract text content from all <text> elements
        text_elements = root.findall('.//text')
        texts = [cleanup(elem.text, re_cleanup=True, lowercase=True) for elem in text_elements if elem.text]
        return texts
    except Exception as e:
        print(f"Error when reading file {file_path}")
        return []


def process_folder(folder_path):
    all_texts = []

    # Walk through all files and subdirectories
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            # Check if the file is an .xml file
            if file.endswith('.xml'):
                file_path = os.path.join(root, file)
                # Extract text from the XML file
                extracted_texts = extract_text_from_xml(file_path)
                all_texts.extend(extracted_texts)

    return all_texts

# Specify the folder to process
folder_path = 'XML'

# Process the folder and print the results
all_extracted_texts = process_folder(folder_path)

Error when reading file XML/._touchepasamastatue_output.xml
Error when reading file XML/._novelumcarcassonne_output.xml
Error when reading file XML/._normaux_output.xml
Error when reading file XML/._Nemesis2_output.xml
Error when reading file XML/._Hélix_output.xml
Error when reading file XML/._Mora_output.xml
Error when reading file XML/._Destoursetdeslys_output.xml
Error when reading file XML/._meduanocta_output.xml
Error when reading file XML/._Nemesis_output.xml
Error when reading file XML/._braves_output.xml
Error when reading file XML/._Alvarium_output.xml
Error when reading file XML/._natifs_output.xml
Error when reading file XML/._patriaalbiges_output.xml
Error when reading file XML/._ClermontNC_output.xml
Error when reading file XML/._Furie_output.xml
Error when reading file XML/._tenesoun_output.xml
Error when reading file XML/._GUD_output.xml
Error when reading file XML/._remparts_output.xml
Error when reading file XML/._Korser_output.xml
Error when reading file XML/._maquis

In [6]:
with open("training_corpus.txt", 'w') as fw: 
    fw.writelines('\n'.join(all_extracted_texts))

In [7]:
# Train Model
model = FastText(
    corpus_file="training_corpus.txt",
    vector_size=100,  # Dimensionality of the word embeddings
    window=5,         # Max distance between current and predicted word
    min_count=5,      # Ignores all words with total frequency < 3
    workers=4,        # Number of threads to run in parallel
    epochs=5,          # Number of training epochs
    sg=1
)

# 4) Save the trained model for later use
model.save("word2vec.model")

In [9]:
# Assume `model` is your trained Word2Vec model
vocabulary = model.wv.key_to_index

# Print the vocabulary (all words)
with open("vocabulary.txt", 'w') as fw: 
    fw.writelines('\n'.join(list(vocabulary.keys())))

# Modell anwenden

Wenn in fastText-Modell verwendet wird (ist beim aktuellen word2vec-Modell und dem zugehörigen Code oberhalb der Fall), können beliebige Sequenzen und Wörter getestet werden, unabhängig davon, ob diese in den Trainingsdaten waren. In diesem Fall wird die Eingabe in einzelne, kleinere Buchstabenfolgen zerlegt, die sich in den Trainingsdaten finden. Trotzdem ist das Ergebnis auf Wörtern aus dem Vokabular (kann mit dem Code direkt oberhalb in eine Datei geschrieben werden) vermutlich ab besten. 

Wenn ein normales Word2Vec-Modell verwendet wird (nicht empfohlen, schlechtere Ergebnisse), können nur Wörter aus dem Vokabular eingegeben werden.

In [12]:
# Load Model
model = FastText.load("word2vec.model")

In [23]:
# Find nearest neighbours for single word

similar_words = model.wv.most_similar("aujourd'hui", topn=20)
print(similar_words)

[("qu'aujourd'hui", 0.960066556930542), ("d'aujourd'hui", 0.955970048904419), ('aujourd', 0.816941499710083), ('hui', 0.7796117663383484), ('jourdan', 0.7659460306167603), ('jourdain', 0.7533005475997925), ('résonne', 0.7487667202949524), ("d'autrefois", 0.7469144463539124), ('launedujour', 0.745730459690094), ('toujours', 0.7379209399223328), ('béhourd', 0.7367293238639832), ('lourd', 0.7348629832267761), ('séjours', 0.7308028340339661), ('encourue', 0.7272129654884338), ('séjour', 0.7234699726104736), ("d'habitude", 0.7185230255126953), ("l'habitude", 0.7172859311103821), ('cauchemardesque', 0.7167842388153076), ('beaujolais', 0.7124072313308716), ('demain', 0.7122790813446045)]


In [26]:
# Check similarity between two words
similarity = model.wv.similarity("aujourd'hui", "")
print(similarity)


0.27673393
