In [108]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
import glob
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import os
from nltk.tokenize import word_tokenize
import spacy

In [109]:
# Load spaCy's small English pipeline once; pos_filter() below reuses this object.
nlp = spacy.load("en_core_web_sm")
# Maximum number of characters accepted in a single nlp(...) call.
nlp.max_length = 1_500_000

-----------

In [110]:
def remove_punctuation(text):
    """Strip ASCII punctuation from ``text``.

    Every character listed in ``string.punctuation`` is deleted. The
    typographic apostrophe (’) is not deleted but replaced by one space.

    Returns the resulting string.
    """
    # A single table does both jobs: punctuation characters map to None
    # (= delete), the curly apostrophe maps to a blank.
    mapping = dict.fromkeys(string.punctuation)
    mapping["’"] = " "
    return text.translate(str.maketrans(mapping))

# Further cleaning steps
def preprocess_text(text):
    """Lower-case ``text``, drop English stop words and lemmatize the rest.

    The lower-cased text is split on whitespace. Tokens found in NLTK's
    English stop-word list are discarded; every remaining token is passed
    through ``WordNetLemmatizer.lemmatize`` with its default arguments.

    Returns the processed tokens joined by single spaces.
    """
    english_stop_words = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()
    # Filter and lemmatize in one pass; token order is preserved.
    kept_lemmas = (
        wordnet.lemmatize(token)
        for token in text.lower().split()
        if token not in english_stop_words
    )
    return ' '.join(kept_lemmas)


def remove_extra_signs(text):
    """Delete typographic double quotation marks (“ and ”) from ``text``.

    All other characters are kept unchanged and in their original order.
    """
    return "".join(char for char in text if char not in ("“", "”"))

# Remove numbers
def remove_numbers(text):
    """Delete every digit from ``text`` and collapse whitespace runs.

    Digits are removed first; afterwards each run of whitespace
    (including line breaks) is replaced by a single space. Leading or
    trailing whitespace is collapsed to one space, not removed.
    """
    without_digits = re.sub(r"\d", "", text)
    return re.sub(r"\s+", " ", without_digits)

# Coarse-grained part-of-speech labels that spaCy can store in token.pos_
# (the Universal POS tag set plus spaCy's SPACE label for whitespace tokens).
VALID_POS_TAGS = frozenset({
    "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
    "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X", "SPACE",
})

# Ask which part of speech pos_filter() should keep. The answer is stripped and
# upper-cased so that e.g. " adj " is accepted as "ADJ".
input_pos = input("Bitte Wortart eingeben (z.B. NOUN, VERB oder ADJ)...").strip().upper()

# Fail early on typos: a tag that never equals token.pos_ would otherwise make
# pos_filter() return an empty string for every document without any warning.
if input_pos not in VALID_POS_TAGS:
    raise ValueError(
        f"Unbekannte Wortart {input_pos!r}; erlaubt sind: {', '.join(sorted(VALID_POS_TAGS))}"
    )

# Filter by part of speech
def pos_filter(text, pos=None, min_length=5, model=None):
    """Return the lower-cased lemmas of all tokens in ``text`` with a given POS tag.

    Args:
        text: The text to analyse.
        pos: Coarse-grained POS tag to keep, compared against ``token.pos_``
            (e.g. ``"NOUN"``, ``"VERB"``, ``"ADJ"``). Case and surrounding
            whitespace are ignored. When omitted, the module-level
            ``input_pos`` is used.
        min_length: Minimum number of characters a lemma must have to be
            kept. The default of 5 keeps exactly the lemmas that are longer
            than four characters.
        model: Callable that turns ``text`` into an iterable of tokens
            exposing ``pos_`` and ``lemma_``. When omitted, the module-level
            spaCy pipeline ``nlp`` is used.

    Returns:
        The selected lemmas in document order, joined by single spaces
        (an empty string if nothing matches).
    """
    # Globals are only looked up when the caller did not pass a value, so
    # existing calls of the form pos_filter(text) behave as before.
    wanted_pos = (input_pos if pos is None else pos).strip().upper()
    analyse = nlp if model is None else model

    # Single pass over the analysed text: keep tokens with the requested tag
    # whose lemma is long enough.
    lemmas = [
        token.lemma_.lower()
        for token in analyse(text)
        if token.pos_ == wanted_pos and len(token.lemma_) >= min_length
    ]
    return " ".join(lemmas)

Bitte Wortart eingeben (z.B. NOUN, VERB oder ADJ)... adj


In [111]:
def clean_all(text):
    """Run the complete cleaning pipeline on ``text`` and return the result.

    The steps are applied in this fixed order, each one receiving the
    output of the previous one: remove_punctuation, preprocess_text,
    remove_extra_signs, remove_numbers, pos_filter.
    """
    # NOTE(review): pos_filter runs last, i.e. on text that has already been
    # lower-cased and stripped of stop words and ASCII punctuation. spaCy's
    # tagger presumably relies on sentence context, so tagging the raw text
    # first may give more reliable POS labels - confirm before changing the
    # order, because that would alter every output file.
    pipeline = (
        remove_punctuation,
        preprocess_text,
        remove_extra_signs,
        remove_numbers,
        pos_filter,
    )
    for step in pipeline:
        text = step(text)
    return text

#### Alle Texte verarbeiten

In [112]:
# Collect the run parameters interactively. The answers are stripped so that a
# pasted path or folder name with a stray blank or newline still resolves.
# The prompts are asked in the same order as before.
input_directory = input("Bitte Eingabeverzeichnis einfügen...").strip()
pos_folder = input("Bitte all_words, nouns, adjectives oder verbs eingeben...").strip()
run_folder = input("Name Ausgabe-Ordner eingeben...").strip()
output_directory = "/home/simon/bachelorarbeit/vergleich_mk_trump/cleaned_docs/" + pos_folder + "/" + run_folder

# Create the output directory (including missing parents); exist_ok avoids the
# separate existence check and the race between checking and creating.
os.makedirs(output_directory, exist_ok=True)

# glob.escape keeps characters such as "[" or "*" in the directory name from
# being interpreted as wildcards; only the "*.txt" part is a pattern.
pattern = os.path.join(glob.escape(input_directory), "*.txt")
paths = glob.glob(pattern)

# Make an empty match visible instead of finishing without any output.
if not paths:
    print(f"Keine .txt-Dateien in {input_directory!r} gefunden.")

for path in paths:
    # Read the whole file first so the input handle is closed before the
    # cleaning starts. UTF-8 is set explicitly because the cleaning functions
    # match non-ASCII characters (’ “ ”) that must be decoded correctly
    # regardless of the platform's default encoding.
    with open(path, "r", encoding="utf-8") as source_file:
        text_raw = source_file.read()
    text_cleaned = clean_all(text_raw)

    # Save the cleaned text under the same file name in the output directory.
    output_file = os.path.join(output_directory, os.path.basename(path))
    with open(output_file, "w", encoding="utf-8") as target_file:
        target_file.write(text_cleaned)

    print(f"Bereinigter Text wurde erfolgreich in die Datei {output_file} gespeichert.")

    # Show the full cleaned text and its first ten tokens as a quick check.
    print(text_cleaned)
    print(text_cleaned.split()[:10])


Bitte Eingabeverzeichnis einfügen... /home/simon/Dokumente/Uni Würzburg/Digital Humanities/Bachelorarbeit/datasets/my_trump/00_cleaned_files/2024/youtube
Bitte all_words, nouns, adjectives oder verbs eingeben... adjectives
Name Ausgabe-Ordner eingeben... clean_trump


Bereinigter Text wurde erfolgreich in die Datei /home/simon/bachelorarbeit/vergleich_mk_trump/cleaned_docs/adjectives/clean_trump/FULL SPEECH: Trump Pledges To Enact The 'Largest Deportation In The History Of Our Country' In CPAC Speech.txt gespeichert.
urgent alien whole little migrant deadly interesting tough tough middle tough migrant category severe violent mental insane insane asylum mental terrorist human secure super military military great ready ready entire mental large right equal entire horrible handsome handsome handsome great unbelievable representative wonderful competent criminal tough tough large crazy mexic medical contagious sorry sorry
['urgent', 'alien', 'whole', 'little', 'migrant', 'deadly', 'interesting', 'tough', 'tough', 'middle']
Bereinigter Text wurde erfolgreich in die Datei /home/simon/bachelorarbeit/vergleich_mk_trump/cleaned_docs/adjectives/clean_trump/FULL SPEECH: Trump Pledges To Carry Out 'The Largest Domestic Deportation Operation In American History'