# Setup

Der Code unterhalb braucht nur gensim, kann mit `pip install gensim` installiert werden. 

In [17]:
import os
import xml.etree.ElementTree as ET
from gensim.models import Word2Vec
import subprocess
import re


# Word2Vec trainieren

Der Code unterhalb ist nur relevant, wenn du das Modell neu trainieren willst - ein fertiges Modell ist allerdings schon geuploaded. 

Der Code zum Testen / Anwenden ist weiter unten. 

In [39]:
def cleanup(text:str, re_cleanup:bool, lowercase:bool)->str: 
    if lowercase: 
            text = text.lower()
    if re_cleanup: 
        # Replace all punctuation chars with a single whitespace, keep only normal words and digits
        text = re.sub(r"[^\w\n\d']+", ' ', text)
        text = re.sub(r"^\s+|\s+(?=\n)", '', text, flags=re.MULTILINE) # Delete leading/trailing whitespaces
    text = re.sub(r'[^(\n|\S)]+', ' ', text) # Clean up: spaces (except newlines) to single space
    return text


def extract_text_from_xml(file_path):
    try:
        # Parse the XML file
        tree = ET.parse(file_path)
        root = tree.getroot()

        # Ensure the XML has the expected format (root is <body>)
        if root.tag != 'body':
            return []

        # Extract text content from all <text> elements
        text_elements = root.findall('.//text')
        texts = [cleanup(elem.text, re_cleanup=True, lowercase=True) for elem in text_elements if elem.text]
        return texts
    except Exception as e:
        print(f"Error when reading file {file_path}")
        return []


def process_folder(folder_path):
    all_texts = []

    # Walk through all files and subdirectories
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            # Check if the file is an .xml file
            if file.endswith('.xml'):
                file_path = os.path.join(root, file)
                # Extract text from the XML file
                extracted_texts = extract_text_from_xml(file_path)
                all_texts.extend(extracted_texts)

    return all_texts

# Specify the folder to process
folder_path = 'XML'

# Process the folder and print the results
all_extracted_texts = process_folder(folder_path)

Error when reading file XML/._touchepasamastatue_output.xml
Error when reading file XML/._novelumcarcassonne_output.xml
Error when reading file XML/._normaux_output.xml
Error when reading file XML/._Nemesis2_output.xml
Error when reading file XML/._Hélix_output.xml
Error when reading file XML/._Mora_output.xml
Error when reading file XML/._Destoursetdeslys_output.xml
Error when reading file XML/._meduanocta_output.xml
Error when reading file XML/._Nemesis_output.xml
Error when reading file XML/._braves_output.xml
Error when reading file XML/._Alvarium_output.xml
Error when reading file XML/._natifs_output.xml
Error when reading file XML/._patriaalbiges_output.xml
Error when reading file XML/._ClermontNC_output.xml
Error when reading file XML/._Furie_output.xml
Error when reading file XML/._tenesoun_output.xml
Error when reading file XML/._GUD_output.xml
Error when reading file XML/._remparts_output.xml
Error when reading file XML/._Korser_output.xml
Error when reading file XML/._maquis

In [21]:
with open("training_corpus.txt", 'w') as fw: 
    fw.writelines('\n'.join(all_extracted_texts))

In [22]:
#Write all word to a file which appear at least k times
k = 2

command = f"cat training_corpus.txt | tr '[:upper:]' '[:lower:]' | tr -s '[:space:]' '\\n' | tr -d '[:punct:]' | grep -v '[0-9]' | sort | uniq -c | awk '$1 > {k} {{print $2}}' > words_preprocessed.txt"
subprocess.run(command, shell=True)


CompletedProcess(args="cat training_corpus.txt | tr '[:upper:]' '[:lower:]' | tr -s '[:space:]' '\\n' | tr -d '[:punct:]' | grep -v '[0-9]' | sort | uniq -c | awk '$1 > 2 {print $2}' > words_preprocessed.txt", returncode=0)

In [23]:
# Train Model
model = Word2Vec(
    corpus_file="training_corpus.txt",
    vector_size=100,  # Dimensionality of the word embeddings
    window=5,         # Max distance between current and predicted word
    min_count=3,      # Ignores all words with total frequency < 3
    workers=4,        # Number of threads to run in parallel
    epochs=5,          # Number of training epochs
    sg=1
)

# 4) Save the trained model for later use
model.save("word2vec.model")

# Modell anwenden

Eigentlich können nur Embeddings für einzelne Wörter erstellt werden - es können allerdings mehrere einzelne Embeddings aufaddiertt werden, auch wenn das Ergebnis dann eher so mittel ist. 

Generell wirkt das Ergebnis recht mittelmäßig, vmtl. weil nicht genug Daten vorhanden sind. 

In [40]:
# Load Model
model = Word2Vec.load("word2vec.model")

In [43]:
# Combine two vectors

# Get word vectors
v1 = model.wv['star']  
v2 = model.wv['wars'] 
# Combine word vectors
v_combined = (v1 + v2) / 2


similar_words = model.wv.most_similar(v_combined, topn=10)
print(similar_words)

[('wars', 0.9887760877609253), ('novel', 0.9866569638252258), ('wolf', 0.9847815632820129), ('originally', 0.9843636155128479), ('frost', 0.9841103553771973), ('civilians', 0.9840000867843628), ('illustrator', 0.9839940667152405), ('earlier', 0.9831578135490417), ('hunting', 0.9830578565597534), ('illustrated', 0.9830324649810791)]


In [38]:
# Find nearest neighbours for single word

similar_words = model.wv.most_similar("europe", topn=10)
print(similar_words)

[('invasion', 0.6447374820709229), ('ouest', 0.6436580419540405), ('occident', 0.6303271651268005), ('enracinement', 0.6251101493835449), ('turquie', 0.6235107779502869), ('ours', 0.6135778427124023), ('frontières', 0.61162930727005), ('islam', 0.6114954352378845), ('asie', 0.6085038781166077), ('mémorables', 0.6056879162788391)]


In [None]:
# Check similarity between two words
similarity = model.wv.similarity("word1", "word2")
print(similarity)
